In [133]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import pickle

In [134]:
# lue data sekä jaa X ja y
df = pd.read_csv('./work/viikko6/datasets/housing.csv')
y = df['median_house_value']
X = df[['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'median_income', 'ocean_proximity']]

In [135]:
# count of Nan values
print(X.isnull().sum())

# unique ocean_proximity values
print(X['ocean_proximity'].unique())

# replace NaN values with mean
X['total_bedrooms'].fillna(X['total_bedrooms'].mean(), inplace=True)

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
median_income           0
ocean_proximity         0
dtype: int64
['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['total_bedrooms'].fillna(X['total_bedrooms'].mean(), inplace=True)


In [137]:
# dummies
X_org = X
ct = ColumnTransformer(transformers=[('encoder', 
OneHotEncoder(drop='first'), ['ocean_proximity'])], remainder='passthrough')
X = ct.fit_transform(X)

In [138]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, 
                                                    random_state = 0)

In [139]:
# Training the Multiple Linear Regression model on the Training set
model = LinearRegression()
model.fit(X_train, y_train)

In [140]:
# Predicting the Test set results
y_pred = model.predict(X_test)

In [141]:
# Regression metrics
mae=mean_absolute_error(y_test, y_pred) 
r2=r2_score(y_test, y_pred)
mea = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mea)

print(f'r2:  {round(r2,4)}')
print(f'mae: {round(mae,4)}')
print(f'rmse: {round(rmse,4)}')

r2:  0.6131
mae: 51619.5098
rmse: 71023.6036


In [142]:
# tallennetaan malli levylle
with open('housing-model.pickle', 'wb') as f:
    pickle.dump(model, f)
    
# tallennetaan encoderi
with open('housing-ct.pickle', 'wb') as f:
    pickle.dump(ct, f)