In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import seaborn as sns
sns.set()

In [None]:
# Read the housing CSV from the relative data/housing/ directory into a DataFrame.
data_raw = pd.read_csv("data/housing/housing.csv")

In [None]:
# List the column labels of the raw frame.
print(data_raw.columns)

In [None]:
# Summary statistics of the raw frame's columns.
data_raw.describe()

#### Plot scatter between feature pairs to check for non-linear dependencies

In [None]:
# For simplicity, drop the rows containing NaN values and the categorical column ocean_proximity
# random_sample = data_raw.dropna().drop(columns=['ocean_proximity']).sample(n=1000)
# sns.pairplot(random_sample);

In [None]:
#data = data_raw.where(data_raw['median_house_value'] != 500001.000000)

In [None]:
#data = data_raw.query('median_house_value != 500001.000000')

In [None]:
# Work on an independent copy so the feature-engineering cells below add their
# columns to `data` only and `data_raw` stays exactly as loaded from disk.
# (A plain `data = data_raw` binds both names to the same DataFrame, so every
# later `data[...] = ...` would also modify the raw frame.)
data = data_raw.copy()

### Encode input Data

In [None]:
# One-hot encode the categorical 'ocean_proximity' column in two stages.
data_str_ocean = data['ocean_proximity'].values

# Stage 1: map the category strings to integer codes (shape is kept as-is).
label_encoder = LabelEncoder()
data_ocean = label_encoder.fit_transform(data_str_ocean.ravel())
data_ocean = data_ocean.reshape(*data_str_ocean.shape)

# Stage 2: turn the integer codes (as a single column) into a dense
# indicator matrix.
onehot_encoder = OneHotEncoder()
ocean_codes_column = data_ocean.reshape(-1, 1)
data_ocean_bin = onehot_encoder.fit_transform(ocean_codes_column).toarray()

In [None]:
# Flag rows whose housing_median_age equals 52 (1 = flagged, 0 = otherwise).
mask = data['housing_median_age'] == 52
data['old_house'] = mask.astype('int64')

In [None]:
# Natural log of median_income, computed on the whole column at once.
data['median_income_log'] = np.log(data['median_income'])

In [None]:
# Ratio features derived from the raw count columns.
population = data['population']
data['density'] = population.div(data['households'])             # people per household
data['density_bedrooms'] = population.div(data['total_bedrooms'])  # people per bedroom
data['room_ratio'] = data['total_bedrooms'].div(data['total_rooms'])  # bedrooms per room

In [None]:
# Flag rows with median_income strictly above 10 (1 = flagged, 0 = otherwise).
mask = data['median_income'] > 10
data['rich_district'] = mask.astype('int64')

In [None]:
# Flag rows with median_income strictly below 2 (1 = flagged, 0 = otherwise).
mask = data['median_income'] < 2
data['poor_district'] = mask.astype('int64')

In [None]:
# Preview the first rows, including the engineered columns added in the cells above.
data.head()

### Assign input and target values, split into train and test sets

In [None]:
# Target vector: the median house value of each row.
target_column = 'median_house_value'
y = data[target_column].values

# Feature matrix: every remaining column except the raw categorical
# 'ocean_proximity' and the un-logged 'median_income'
# ('median_income_log' stays in).
excluded_columns = [target_column, 'ocean_proximity', 'median_income']
X = data.drop(columns=excluded_columns).values

In [None]:
# Append the one-hot encoded ocean_proximity columns to the right of the
# feature matrix.
# NOTE: X is rebuilt from itself, so re-running this cell without re-running
# the cell that defines X appends the one-hot columns a second time.
X = np.hstack((X, data_ocean_bin))

In [None]:
# Hold out 33% of the rows as the test split; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Replace the value 52 in feature column 2 of the training split with that
# column's training mean (the mean is taken before the replacement, so the
# 52 entries are part of it).
# Column 2 is presumably housing_median_age — TODO confirm against the column order of X.
age_mean = np.mean(X_train[:, 2])
inds = np.nonzero(X_train[:, 2] == 52)
X_train[inds[0], 2] = age_mean

In [None]:
# Apply the same replacement to the test split, reusing the mean computed
# from the training split (age_mean) rather than a test-set statistic.
inds = np.nonzero(X_test[:, 2] == 52)
X_test[inds[0], 2] = age_mean

### Fix NaN Values

In [None]:
# --- TRAIN
# Per-column means of the training split, ignoring NaN entries.
col_mean = np.nanmean(X_train, axis=0)

# (row indices, column indices) of the missing entries in the training split.
inds = np.nonzero(np.isnan(X_train))

# Fill each missing entry with the mean of its own column;
# inds[1] holds the column index of every missing entry.
X_train[inds] = col_mean[inds[1]]

# --- TEST
# Locate the missing entries in the test split.
inds = np.nonzero(np.isnan(X_test))

# Fill them with the *training* column means computed above.
X_test[inds] = col_mean[inds[1]]


### Train model

In [None]:
# Linear regression estimator created with its default settings.
lm = linear_model.LinearRegression()

In [None]:
# Fit the regression on the training split; the return value of fit() is kept as `model`.
model = lm.fit(X_train, y_train)

# Predict the target for the held-out test split.
y_pred = lm.predict(X_test)

# Report the test-set error metrics.
residuals = y_test - y_pred
print("MSE: ", mean_squared_error(y_test, y_pred))
print("RMSE: ", np.sqrt((residuals ** 2).mean()))
print("R2: ", r2_score(y_test, y_pred))

In [None]:
#pd.DataFrame([y_pred, y_test]).T

In [None]:
# Test RMSE expressed as a fraction of the mean test target value.
squared_errors = np.square(y_test - y_pred)
np.sqrt(squared_errors.mean()) / y_test.mean()