In [5]:
import pandas as pd

In [6]:
df = pd.read_csv('housing.csv')

In [7]:
col_list = ['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']

# Data preparation

In [8]:
df_features = df[col_list]
df_features.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [9]:
df_features.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [10]:
df_features = df_features.fillna(0)
df_features.isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [11]:
df_features['rooms_per_household'] = df_features['total_rooms']/df_features['households']
df_features.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853


In [12]:
df_features['bedrooms_per_room'] = df_features.total_bedrooms/df_features.total_rooms
df_features['population_per_household '] = df_features.population /df_features.households 
df_features.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


# Question 1

In [13]:
df_features.ocean_proximity = df_features.ocean_proximity.str.lower().replace(' ', '_')

df_features.ocean_proximity.mode()

0    <1h ocean
Name: ocean_proximity, dtype: object

# Split the data

In [14]:
from sklearn.model_selection import train_test_split
import numpy as np

In [145]:
df_train_full, df_test = train_test_split(df_features, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

len(df_train), len(df_test), len(df_val)

(12384, 4128, 4128)

In [16]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values

In [146]:
del df_train['median_house_value']
del df_val['median_house_value']

# Question 2

In [18]:
from sklearn.metrics import mutual_info_score

In [19]:
# get numeric features from train dataset 
df_train_numeric = list(df_train.select_dtypes(include=[np.number]).columns.values)

In [20]:
from itertools import combinations

f_pairs = list(combinations(df_train_numeric, 2))
f_pairs

[('latitude', 'longitude'),
 ('latitude', 'housing_median_age'),
 ('latitude', 'total_rooms'),
 ('latitude', 'total_bedrooms'),
 ('latitude', 'population'),
 ('latitude', 'households'),
 ('latitude', 'median_income'),
 ('latitude', 'rooms_per_household'),
 ('latitude', 'bedrooms_per_room'),
 ('latitude', 'population_per_household '),
 ('longitude', 'housing_median_age'),
 ('longitude', 'total_rooms'),
 ('longitude', 'total_bedrooms'),
 ('longitude', 'population'),
 ('longitude', 'households'),
 ('longitude', 'median_income'),
 ('longitude', 'rooms_per_household'),
 ('longitude', 'bedrooms_per_room'),
 ('longitude', 'population_per_household '),
 ('housing_median_age', 'total_rooms'),
 ('housing_median_age', 'total_bedrooms'),
 ('housing_median_age', 'population'),
 ('housing_median_age', 'households'),
 ('housing_median_age', 'median_income'),
 ('housing_median_age', 'rooms_per_household'),
 ('housing_median_age', 'bedrooms_per_room'),
 ('housing_median_age', 'population_per_household 

In [21]:
op1 = df_train.corrwith(df_train["population_per_household "])
op1.total_rooms

-0.029451679411510782

In [22]:
pd.__version__

'1.4.3'

In [23]:
corr_info = df_train.corrwith(df_train['households'])
corr_info

latitude                    -0.063529
longitude                    0.049762
housing_median_age          -0.306119
total_rooms                  0.921441
total_bedrooms               0.979399
population                   0.906841
households                   1.000000
median_income                0.011925
rooms_per_household         -0.085832
bedrooms_per_room            0.058004
population_per_household    -0.032522
dtype: float64

# Make median_house_value binary

In [26]:
mean = y_train.mean() # median_house_value
#df_features.describe()

In [84]:
y_train.shape
#above_average = np.where(y_train > mean, 1, 0)
above_average = (y_train > mean).astype(int)

In [89]:
y_val_mean = y_val.mean()

In [90]:
y_val = (y_val > y_val_mean).astype(int)

# Question 3

In [28]:
# get categorical features from train dataset 
df_train_categorical = list(df_train.select_dtypes(exclude=[np.number, "number","bool_"]).columns)
df_train_categorical

['ocean_proximity']

In [29]:
from sklearn.feature_extraction import DictVectorizer

In [30]:
train_dict = df_train[df_train_categorical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [31]:
X_train = dv.transform(train_dict)
X_train.shape

(12384, 5)

In [32]:
from sklearn.metrics import mutual_info_score
above_average.shape

(12384,)

In [33]:
def calculate_mi(series):
    return mutual_info_score(series, above_average)

df_mi = df_train[df_train_categorical].apply(calculate_mi)

#df_mi = mutual_info_score(df_train[df_train_categorical], above_average)

df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

In [34]:
round(df_mi, 2)

Unnamed: 0,MI
ocean_proximity,0.1


# Question 4

In [67]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[df_train_categorical + df_train_numeric].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[df_train_categorical + df_train_numeric].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [68]:
X_train.shape

(12384, 16)

In [69]:
from sklearn.linear_model import LogisticRegression

In [85]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [86]:
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_train, above_average)

In [87]:
y_pred = model.predict_proba(X_val)[:, 1]

In [91]:
th_decision = (y_pred >= 0.5)
th_decision

array([False, False,  True, ...,  True,  True, False])

In [92]:
y_val

array([0, 0, 1, ..., 1, 1, 0])

In [108]:
round((y_val == th_decision).mean(), 2)

0.84

In [56]:
df_train[df_train_numeric + df_train_categorical].describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
count,12384.0,12384.0,12384.0,12384.0,12384.0,12384.0,12384.0,12384.0,12384.0,12384.0,12384.0
mean,35.642003,-119.579831,28.585837,2633.74895,537.035368,1427.289648,499.044089,3.87009,5.411837,0.213091,3.033264
std,2.130977,2.005077,12.608072,2137.057303,412.252061,1140.978664,375.749834,1.888237,2.077484,0.058301,7.07441
min,32.56,-124.35,1.0,2.0,2.0,3.0,2.0,0.4999,0.888889,0.1,0.692308
25%,33.93,-121.8,18.0,1461.0,298.0,790.0,282.0,2.5625,4.443167,0.175329,2.427379
50%,34.26,-118.5,29.0,2127.0,438.0,1170.0,412.0,3.5521,5.223123,0.203165,2.816215
75%,37.72,-118.01,37.0,3150.0,647.25,1726.0,606.0,4.7404,6.046054,0.239753,3.28002
max,41.95,-114.31,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,61.8125,1.0,599.714286


# Question 5

In [97]:
small = ['total_rooms', 'total_bedrooms', 'population', 'households']

In [98]:
dicts_train_small = df_train[small].to_dict(orient='records')
dicts_val_small = df_val[small].to_dict(orient='records')

In [99]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

In [101]:
dv_small.get_feature_names_out()

array(['households', 'population', 'total_bedrooms', 'total_rooms'],
      dtype=object)

In [112]:
X_train_small = dv_small.transform(dicts_train_small)
#X_train_small = df_train[small]

In [113]:
model_small = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, above_average)

In [114]:
X_val_small = df_val[small]
y_pred = model_small.predict_proba(X_val_small)[:, 1]



In [115]:
th_decision_small = (y_pred >= 0.5)

In [116]:
(th_decision_small == True).sum()

3891

In [117]:
acc_original = round((y_val == th_decision).mean(), 2)

0.84

In [119]:
df_train[small].head()

Unnamed: 0,total_rooms,total_bedrooms,population,households
17244,1467.0,381.0,1404.0,374.0
8817,6097.0,794.0,2248.0,806.0
19686,1317.0,309.0,856.0,337.0
3545,4759.0,924.0,1884.0,915.0
17019,2769.0,387.0,994.0,395.0


In [138]:
acc_values = []
small = ['total_rooms', 'total_bedrooms', 'population', 'households']
#small = ['total_rooms', 'total_bedrooms']

#for feature in small:
df_train_small = df_train[small].copy()
df_val_small = df_val[small].copy()

del df_train_small['population']
del df_val_small['population']

dicts_train_small = df_train_small.to_dict(orient='records')
dicts_val_small = df_train_small.to_dict(orient='records')

dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

dv_small.get_feature_names_out()

array(['households', 'total_bedrooms', 'total_rooms'], dtype=object)

In [None]:
dicts_train_small = df_train_small.to_dict(orient='records')
dicts_val_small = df_train_small.to_dict(orient='records')

dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

X_train_small = dv_small.transform(dicts_train_small)

model_small = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, above_average)

X_val_small = df_val_small
y_pred = model_small.predict_proba(X_val_small)[:, 1]

th_decision_small = (y_pred >= 0.5)
acc_current = (y_val == th_decision).mean()

print(acc_current)
acc_values.append(acc_current)

In [155]:
small = ['total_rooms', 'total_bedrooms']

df_train_small = df_train[small].copy()
df_val_small = df_val[small].copy()

for f in small:
    mean_t = df_train[f].values.mean()
    df_train_small[f] = (df_train[f].values > mean_t).astype(int)
    
    mean_v = df_val[f].values.mean()
    df_val_small[f] = (df_val[f].values > mean_v).astype(int)
    



In [156]:
df_train_small.head()

Unnamed: 0,total_rooms,total_bedrooms
17244,0,0
8817,1,1
19686,0,0
3545,1,1
17019,1,0


In [157]:
model_small = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model_small.fit(df_train_small, above_average)

In [159]:
X_val_small = df_val_small
y_pred = model_small.predict_proba(X_val_small)[:, 1]

th_decision_small = (y_pred >= 0.5)
acc_current = (y_val == th_decision).mean()

print(acc_current)
acc_values.append(acc_current)

0.8362403100775194
