In [1]:
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestClassifier

In [21]:
df = pd.read_csv("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-California-Housing-Prices/master/Data/housing.csv")
df = df.sample(frac=0.2)
df.shape

(4128, 10)

In [22]:
df = df.dropna(axis=0)
df.drop(["longitude", "latitude"], axis = 1, inplace=True)

In [23]:
df.groupby("ocean_proximity").median_house_value.count()

ocean_proximity
<1H OCEAN     1843
INLAND        1254
NEAR BAY       447
NEAR OCEAN     538
Name: median_house_value, dtype: int64

In [24]:
print(df.shape)
df = df.merge(pd.get_dummies(df.ocean_proximity, drop_first=True, prefix="OCEAN_PROX"), 
              left_index=True, right_index=True, how="inner")
df.drop("ocean_proximity", axis = 1, inplace=True)
print(df.shape)

(4082, 8)
(4082, 10)


In [25]:
df.head(2)

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,OCEAN_PROX_INLAND,OCEAN_PROX_NEAR BAY,OCEAN_PROX_NEAR OCEAN
8079,43.0,1758.0,347.0,954.0,312.0,5.2606,198900.0,0,0,1
15723,37.0,1235.0,314.0,481.0,297.0,3.6875,492300.0,0,1,0


In [26]:
#df.info()

In [27]:
#df.columns

In [28]:
y = "median_house_value"
X = [x for x in df.columns if x != y]

X_train, X_test, y_train, y_test = train_test_split(df[X], df[y], test_size=0.20, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3265, 9) (817, 9) (3265,) (817,)


In [29]:
reg = LinearRegression(fit_intercept=True).fit(X_train, y_train)
y_pred = reg.predict(X_test)
yh  = [x for x in zip(y_test, y_pred)]
#print(yh)
rootMeanSquaredError = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE All: ", rootMeanSquaredError)

RMSE All:  73093.48702598235


## Variable Importance

In [30]:
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
df_X = df[X].copy()
df_X['randomVar'] = np.random.randint(1, 10, df_X.shape[0])
clf = clf.fit(df_X, df[y])
features = pd.DataFrame()
features['feature'] = df_X.columns
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
features = features.sort_values(by="importance", ascending=False).reset_index(drop=False)
features

Unnamed: 0,feature,importance
0,median_income,0.166278
1,population,0.151361
2,total_rooms,0.149481
3,total_bedrooms,0.143259
4,households,0.143076
5,housing_median_age,0.125689
6,randomVar,0.087551
7,OCEAN_PROX_NEAR OCEAN,0.015456
8,OCEAN_PROX_NEAR BAY,0.012373
9,OCEAN_PROX_INLAND,0.005476


In [31]:
randomVarIndex = features[features.feature=="randomVar"].index.values[0]

In [32]:
feat_positive = list(features[features.index < randomVarIndex].feature.values)
feat_positive

['median_income',
 'population',
 'total_rooms',
 'total_bedrooms',
 'households',
 'housing_median_age']

In [33]:
reg = LinearRegression(fit_intercept=True).fit(X_train[feat_positive], y_train)
y_pred = reg.predict(X_test[feat_positive])
yh  = [x for x in zip(y_test, map(int, y_pred))]
#print(yh)
rootMeanSquaredError = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE Better than random: ", rootMeanSquaredError)

RMSE Better than random:  77531.88378964817


In [34]:
# Compare variable importance with predictive capacity of each var with intercept, Mean RMSE with train-test loop 

## Linear regression brute force eval

In [35]:
y = "median_house_value"
X = [x for x in df.columns if x != y]

In [36]:
X

['housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'OCEAN_PROX_INLAND',
 'OCEAN_PROX_NEAR BAY',
 'OCEAN_PROX_NEAR OCEAN']

In [37]:
def split_fit_eval(df, y):
    X = [x for x in df.columns if x != y]
    res = []
    elements = np.arange(2,len(X)+1,1)
    ucombin=[]
    for e in elements:
        ucombin.append(list(itertools.combinations(X, e)))
    comb_flat_list = [list(item) for sublist in ucombin for item in sublist]
    for enum, x in enumerate(comb_flat_list):
        if enum % 100 == 0:
            print(enum)
        rmse = []
        for rs in range(10):
            X_train, X_test, y_train, y_test = train_test_split(df[x], df[y], test_size=0.20, random_state=rs)
            reg = LinearRegression(fit_intercept=True).fit(X_train, y_train)
            y_pred = reg.predict(X_test)
            rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
        res.append((x, np.mean(rmse)))
    res = pd.DataFrame(res, columns=["var", "rmse"])
    res = res.sort_values(by="rmse").reset_index(drop=True)
    res["nvars"] = res["var"].apply(lambda x : len(x))
    return res

In [38]:
r = split_fit_eval(df=df, y="median_house_value")

0
100
200
300
400
500


In [39]:
r.head(10)

Unnamed: 0,var,rmse,nvars
0,"[housing_median_age, total_rooms, total_bedroo...",70081.676558,9
1,"[housing_median_age, total_rooms, total_bedroo...",70114.722237,8
2,"[housing_median_age, total_rooms, total_bedroo...",70131.572561,8
3,"[housing_median_age, total_bedrooms, populatio...",70156.506044,8
4,"[housing_median_age, total_rooms, total_bedroo...",70177.110081,7
5,"[housing_median_age, total_bedrooms, populatio...",70184.629934,7
6,"[housing_median_age, total_rooms, total_bedroo...",70184.767531,8
7,"[housing_median_age, total_rooms, total_bedroo...",70191.777152,7
8,"[housing_median_age, total_rooms, population, ...",70211.747251,8
9,"[housing_median_age, total_bedrooms, populatio...",70211.766642,7


In [40]:
r["var"].loc[0]

['housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'OCEAN_PROX_INLAND',
 'OCEAN_PROX_NEAR BAY',
 'OCEAN_PROX_NEAR OCEAN']

In [41]:
#r["var"].loc[0]

In [42]:
features

Unnamed: 0,feature,importance
0,median_income,0.166278
1,population,0.151361
2,total_rooms,0.149481
3,total_bedrooms,0.143259
4,households,0.143076
5,housing_median_age,0.125689
6,randomVar,0.087551
7,OCEAN_PROX_NEAR OCEAN,0.015456
8,OCEAN_PROX_NEAR BAY,0.012373
9,OCEAN_PROX_INLAND,0.005476


## Var Importance - Correlations

In [43]:
y = "median_house_value"
df_corr = df.corr()
df_corr = df_corr[y]
df_corr = df_corr.reset_index(drop=False)
df_corr[y] = df_corr[y].apply(lambda x : abs(x))
df_corr.sort_values(by=y, ascending=False).reset_index(drop=True)

Unnamed: 0,index,median_house_value
0,median_house_value,1.0
1,median_income,0.679166
2,OCEAN_PROX_INLAND,0.478666
3,OCEAN_PROX_NEAR BAY,0.175265
4,OCEAN_PROX_NEAR OCEAN,0.135948
5,total_rooms,0.128108
6,housing_median_age,0.091837
7,households,0.067624
8,total_bedrooms,0.051856
9,population,0.028141


## Remove explinatory vars with high correlations 

In [44]:
df_corr = df.corr().reset_index(drop=False)
df_corr

Unnamed: 0,index,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,OCEAN_PROX_INLAND,OCEAN_PROX_NEAR BAY,OCEAN_PROX_NEAR OCEAN
0,housing_median_age,1.0,-0.362212,-0.313661,-0.29554,-0.295916,-0.132817,0.091837,-0.24802,0.243376,0.026648
1,total_rooms,-0.362212,1.0,0.92882,0.873551,0.92274,0.186739,0.128108,0.034637,-0.026096,-0.019428
2,total_bedrooms,-0.313661,0.92882,1.0,0.88615,0.984084,-0.01452,0.051856,-0.001668,-0.018296,-0.002609
3,population,-0.29554,0.873551,0.88615,1.0,0.909086,0.000619,-0.028141,-0.007217,-0.061201,-0.038163
4,households,-0.295916,0.92274,0.984084,0.909086,1.0,0.00747,0.067624,-0.028319,-0.008658,-0.003569
5,median_income,-0.132817,0.186739,-0.01452,0.000619,0.00747,1.0,0.679166,-0.228834,0.067027,0.010214
6,median_house_value,0.091837,0.128108,0.051856,-0.028141,0.067624,0.679166,1.0,-0.478666,0.175265,0.135948
7,OCEAN_PROX_INLAND,-0.24802,0.034637,-0.001668,-0.007217,-0.028319,-0.228834,-0.478666,1.0,-0.233513,-0.25945
8,OCEAN_PROX_NEAR BAY,0.243376,-0.026096,-0.018296,-0.061201,-0.008658,0.067027,0.175265,-0.233513,1.0,-0.13663
9,OCEAN_PROX_NEAR OCEAN,0.026648,-0.019428,-0.002609,-0.038163,-0.003569,0.010214,0.135948,-0.25945,-0.13663,1.0


In [45]:
df_corr_melt = pd.melt(df_corr, id_vars=['index'], value_vars=[col for col in df_corr.columns if col != "index" ])
df_corr_melt["value"] = df_corr_melt["value"].apply(lambda x : abs(x))
df_corr_melt = df_corr_melt.sort_values(by="value", ascending=False)
df_corr_melt = df_corr_melt.loc[df_corr_melt["index"]!=df_corr_melt["variable"], :]
df_corr_melt = df_corr_melt[df_corr_melt["value"]>0.5]
df_corr_melt

Unnamed: 0,index,variable,value
42,total_bedrooms,households,0.984084
24,households,total_bedrooms,0.984084
12,total_bedrooms,total_rooms,0.92882
21,total_rooms,total_bedrooms,0.92882
14,households,total_rooms,0.92274
41,total_rooms,households,0.92274
43,population,households,0.909086
34,households,population,0.909086
23,population,total_bedrooms,0.88615
32,total_bedrooms,population,0.88615
