In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.feature_extraction import DictVectorizer
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the raw housing table from disk and preview its first five rows.
raw_df = pd.read_csv(filepath_or_buffer='data/housing.csv')
raw_df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Main Dataframe

In [3]:
# Input columns used for modelling (numeric columns plus the categorical
# ocean_proximity) and the regression target.
feats = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]
target = 'median_house_value'

# Keep only the selected columns and replace every missing value with 0.
df = raw_df.loc[:, [*feats, target]].fillna(value=0)
df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY,452600.0
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY,358500.0
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY,352100.0
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY,341300.0
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY,342200.0
...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND,78100.0
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND,77100.0
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND,92300.0
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND,84700.0


## Data Preparation

In [4]:
# Engineer three ratio features from the raw count columns.
df['rooms_per_household'] = df['total_rooms'].div(df['households'])
df['bedrooms_per_room'] = df['total_bedrooms'].div(df['total_rooms'])
df['population_per_household'] = df['population'].div(df['households'])
df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY,452600.0,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY,358500.0,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY,352100.0,8.288136,0.129516,2.802260
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY,341300.0,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY,342200.0,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND,78100.0,5.045455,0.224625,2.560606
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND,77100.0,6.114035,0.215208,3.122807
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND,92300.0,5.205543,0.215173,2.325635
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND,84700.0,5.329513,0.219892,2.123209


### Question 1
#### What is the most frequent observation (mode) for the column ocean_proximity?

In [5]:
# Frequency of each ocean_proximity category, most common first;
# the first row is the mode.
df['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

### Question 2

#### What are the two features that have the biggest correlation in this dataset?

In [6]:
# Extend the feature list with the engineered ratio columns.
# Only names that are not already present are appended, so re-running this
# cell does not duplicate entries in `feats`.
derived_feats = ['rooms_per_household', 'bedrooms_per_room', 'population_per_household']
feats = feats + [name for name in derived_feats if name not in feats]
feats

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'ocean_proximity',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [7]:
# Split the column names by type. `numerical` is every feature that is not
# categorical, followed by the regression target.
categorical = ['ocean_proximity']
numerical = [col for col in [*feats, target] if col not in categorical]
df.loc[:, numerical]

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,median_house_value
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,6.984127,0.146591,2.555556,452600.0
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,6.238137,0.155797,2.109842,358500.0
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,8.288136,0.129516,2.802260,352100.0
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,5.817352,0.184458,2.547945,341300.0
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,6.281853,0.172096,2.181467,342200.0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,5.045455,0.224625,2.560606,78100.0
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,6.114035,0.215208,3.122807,77100.0
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,5.205543,0.215173,2.325635,92300.0
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,5.329513,0.219892,2.123209,84700.0


In [8]:
# Pairwise Pearson correlation between all numerical columns (target included).
corr = df.loc[:, numerical].corr(method='pearson')
corr

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,median_house_value
latitude,1.0,-0.924664,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,0.106389,-0.104112,0.002366,-0.14416
longitude,-0.924664,1.0,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.02754,0.084836,0.002476,-0.045967
housing_median_age,0.011173,-0.108197,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,-0.153277,0.125396,0.013191,0.105623
total_rooms,-0.0361,0.044568,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.133798,-0.174583,-0.024581,0.134153
total_bedrooms,-0.065318,0.068082,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.002717,0.122205,-0.028019,0.049148
population,-0.108785,0.099773,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.072213,0.031397,0.069863,-0.02465
households,-0.071035,0.05531,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,-0.080598,0.059818,-0.027309,0.065843
median_income,-0.079809,-0.015176,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.326895,-0.573836,0.018766,0.688075
rooms_per_household,0.106389,-0.02754,-0.153277,0.133798,0.002717,-0.072213,-0.080598,0.326895,1.0,-0.387465,-0.004852,0.151948
bedrooms_per_room,-0.104112,0.084836,0.125396,-0.174583,0.122205,0.031397,0.059818,-0.573836,-0.387465,1.0,0.003047,-0.238759


## Prepare the Dataset

In [9]:
# Binary classification target: 1 when a row's median_house_value exceeds the
# mean computed over the whole frame, 0 otherwise.
binary_target = 'above_average'
house_value_mean = df['median_house_value'].mean()
df[binary_target] = df['median_house_value'].gt(house_value_mean).astype(int)
targets = [target, binary_target]

In [10]:
# Hold out `tst_size` of the rows for testing, then split the remainder so the
# validation part is the same fraction of the full data: the second split uses
# tst_size / (1 - tst_size) of what is left after the test rows are removed.
tst_size = 0.2
new_tst_size = tst_size / (1 - tst_size)

df_full_train, df_test = train_test_split(df, test_size=tst_size, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=new_tst_size, random_state=42)

tuple(len(part) for part in (df_train, df_val, df_test))

(12384, 4128, 4128)

In [11]:
# Separate the targets from the feature frames.
#
# The targets are looked up in `df` by row index rather than read from the
# split frames, and the drop is non-inplace with errors='ignore'. Together this
# makes the cell safe to re-run: the original in-place drop removed the target
# columns from df_train/df_val/df_test, so a second execution raised KeyError.
# NOTE(review): relies on `df` having a unique index that the split preserved.
y_train = df.loc[df_train.index, targets]
y_val = df.loc[df_val.index, targets]
y_test = df.loc[df_test.index, targets]

df_train = df_train.drop(columns=targets, errors='ignore')
df_val = df_val.drop(columns=targets, errors='ignore')
df_test = df_test.drop(columns=targets, errors='ignore')

### Question 3
#### Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only. What is the value of mutual information?

In [12]:
# Mutual information between ocean_proximity and the binarized price,
# computed on the training split only and rounded to two decimals.
mi_score = mutual_info_score(
    df_train['ocean_proximity'].values,
    y_train[binary_target].values,
)
round(mi_score, 2)

0.1

### Question 4
#### Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [13]:
# Train a logistic regression on the one-hot encoded training rows and report
# validation accuracy rounded to two decimals.
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(df_train.to_dict(orient='records'))
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42).fit(X_train, y_train[binary_target])

# Encode validation rows with the vectorizer fitted on the training data.
# Calling fit_transform here would re-learn the feature mapping from the
# validation rows, which is not guaranteed to match the columns the model
# was trained on.
X_val = vec.transform(df_val.to_dict(orient='records'))
y_pred = model.predict(X_val)
round((y_val[binary_target] == y_pred).mean(), 2)

0.84

### Question 5

#### Which of the following features has the smallest difference (between the baseline accuracy and the accuracy of a model trained without that feature)?

- total_rooms
- total_bedrooms
- population
- households


In [14]:
# Baseline: logistic regression trained on every feature, scored on validation.
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(df_train.to_dict(orient='records'))
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42).fit(X_train, y_train[binary_target])
# Validation rows are only transformed, so they share the train-fitted encoding.
y_pred = model.predict(vec.transform(df_val.to_dict(orient='records')))
baseline = (y_val[binary_target] == y_pred).mean()

# Ablation: retrain with one candidate feature removed at a time and record
# how far the validation accuracy moves from the baseline.
accuracies = []  # validation accuracy per ablated model, in `excluded` order
diffs = {}       # removed feature -> baseline accuracy minus ablated accuracy
excluded = ['total_rooms', 'total_bedrooms', 'population', 'households']
for e in excluded:
    cur_feats = [f for f in feats if f != e]
    # Each ablated model gets its own vectorizer, fitted on the reduced
    # training columns and reused (transform only) for validation. The
    # original encoded validation with the outer `vec` via fit_transform,
    # which re-fitted and overwrote the baseline vectorizer.
    cur_vec = DictVectorizer(sparse=False)
    X_sel_train = cur_vec.fit_transform(df_train[cur_feats].to_dict(orient='records'))
    cur_model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42).fit(X_sel_train, y_train[binary_target])
    X_sel_val = cur_vec.transform(df_val[cur_feats].to_dict(orient='records'))
    preds = cur_model.predict(X_sel_val)
    cur_acc = (y_val[binary_target] == preds).mean()
    accuracies.append(cur_acc)
    diffs[e] = baseline - cur_acc

In [15]:
# Baseline accuracy minus the accuracy obtained without each feature,
# keyed by the removed feature's name.
diffs

{'total_rooms': -0.0007267441860464574,
 'total_bedrooms': 0.0004844961240310086,
 'population': 0.009932170542635732,
 'households': 0.0031492248062016115}

### Question 6
#### Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

In [16]:
# Ridge regression on log1p(price): try each alpha and keep the one with the
# lowest validation RMSE (rounded to 3 decimals).
y_train_cur = np.log1p(y_train[target].values)
y_val_cur = np.log1p(y_val[target].values)

vec = DictVectorizer(sparse=False)
X_train_cur = vec.fit_transform(df_train.to_dict(orient='records'))
# Transform only: validation must use the encoding learned from the training
# rows, not a vectorizer re-fitted on the validation rows.
X_val_cur = vec.transform(df_val.to_dict(orient='records'))

alphas = [0, 0.01, 0.1, 1, 10]
selected_rmse = np.finfo(np.float64).max
selected_alpha = -1
for a in alphas:
    my_model = Ridge(alpha=a, solver="sag", random_state=42).fit(X_train_cur, y_train_cur)
    cur_preds = my_model.predict(X_val_cur)
    # RMSE is the square root of the MSE; computed explicitly because the
    # `squared=False` argument is deprecated in newer scikit-learn releases.
    cur_rmse = round(float(np.sqrt(mean_squared_error(y_val_cur, cur_preds))), 3)
    print(f"Current Alpha Option is {a} with RMSE = {cur_rmse}")
    # Strict "<" on the rounded score keeps the first (smallest) alpha on ties,
    # because `alphas` is listed in increasing order.
    if cur_rmse < selected_rmse:
        selected_rmse = cur_rmse
        selected_alpha = a
selected_alpha, selected_rmse

Current Alpha Option is 0 with RMSE = 0.524
Current Alpha Option is 0.01 with RMSE = 0.524
Current Alpha Option is 0.1 with RMSE = 0.524
Current Alpha Option is 1 with RMSE = 0.524
Current Alpha Option is 10 with RMSE = 0.524


(0, 0.524)