In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
housing_data = pd.read_csv('./Data Required for Projects/California_Housing.csv')
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [16]:
housing_data = housing_data.fillna(0)

In [17]:
housing_data['rooms_per_household'] = housing_data['total_rooms']/housing_data['households']
housing_data['bedrooms_per_room'] = housing_data['total_bedrooms']/housing_data['total_rooms']
housing_data['population_per_household'] = housing_data['population']/housing_data['households']
print(housing_data.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                  41          880           129.0   
1    -122.22     37.86                  21         7099          1106.0   
2    -122.24     37.85                  52         1467           190.0   
3    -122.25     37.85                  52         1274           235.0   
4    -122.25     37.85                  52         1627           280.0   

   population  households  median_income  median_house_value ocean_proximity  \
0         322         126         8.3252              452600        NEAR BAY   
1        2401        1138         8.3014              358500        NEAR BAY   
2         496         177         7.2574              352100        NEAR BAY   
3         558         219         5.6431              341300        NEAR BAY   
4         565         259         3.8462              342200        NEAR BAY   

   rooms_per_household  bedrooms_per_room  population_per_household 

In [18]:
print(housing_data['ocean_proximity'].mode())

0    <1H OCEAN
Name: ocean_proximity, dtype: object


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [20]:
X = housing_data.drop('median_house_value', axis=1)
y = housing_data['median_house_value']
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
categorical = list(X_full_train.dtypes[X_full_train.dtypes==object].index)
numerical = list(X_full_train.dtypes[X_full_train.dtypes!=object].index)
print(categorical)
print(numerical)

['ocean_proximity']
['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'rooms_per_household', 'bedrooms_per_room', 'population_per_household']


In [22]:
print(housing_data[numerical].corr())

                          longitude  latitude  housing_median_age  \
longitude                  1.000000 -0.924664           -0.108197   
latitude                  -0.924664  1.000000            0.011173   
housing_median_age        -0.108197  0.011173            1.000000   
total_rooms                0.044568 -0.036100           -0.361262   
total_bedrooms             0.068082 -0.065318           -0.317063   
population                 0.099773 -0.108785           -0.296244   
households                 0.055310 -0.071035           -0.302916   
median_income             -0.015176 -0.079809           -0.119034   
rooms_per_household       -0.027540  0.106389           -0.153277   
bedrooms_per_room          0.084836 -0.104112            0.125396   
population_per_household   0.002476  0.002366            0.013191   

                          total_rooms  total_bedrooms  population  households  \
longitude                    0.044568        0.068082    0.099773    0.055310   
latitude 

In [23]:
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)

In [24]:
X_train['above_avg'] = np.where(y_train > y_train.mean(), 1, 0)
print(X_train.head())

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
17244    -119.67     34.43                  39         1467           381.0   
8817     -118.32     33.74                  24         6097           794.0   
19686    -121.62     39.13                  41         1317           309.0   
3545     -118.63     34.24                   9         4759           924.0   
17019    -122.30     37.52                  38         2769           387.0   

       population  households  median_income ocean_proximity  \
17244        1404         374         2.3681       <1H OCEAN   
8817         2248         806        10.1357      NEAR OCEAN   
19686         856         337         1.6719          INLAND   
3545         1884         915         4.8333       <1H OCEAN   
17019         994         395         5.5902      NEAR OCEAN   

       rooms_per_household  bedrooms_per_room  population_per_household  \
17244             3.922460           0.259714                  3.

In [45]:
X_val['above_avg'] = np.where(y_val > y_val.mean(), 1, 0)
print(X_val.head())

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
2071     -119.85     36.74                  35         1191           190.0   
2612     -124.16     41.02                  23         1672           385.0   
10838    -117.92     33.67                  14         6224          1679.0   
4061     -118.45     34.15                  10         1091           260.0   
10767    -117.90     33.63                  28         2370           352.0   

       population  households  median_income ocean_proximity  \
2071          537         182         3.5375          INLAND   
2612         1060         390         2.1726      NEAR OCEAN   
10838        3148        1589         4.2071       <1H OCEAN   
4061          517         266         4.1727       <1H OCEAN   
10767         832         347         7.1148       <1H OCEAN   

       rooms_per_household  bedrooms_per_room  population_per_household  \
2071              6.543956           0.159530                  2.

In [46]:
X_test['above_avg'] = np.where(y_test > y_test.mean(), 1, 0)
print(X_test.head())

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
20046    -119.01     36.06                  25         1505             0.0   
3024     -119.46     35.14                  30         2943             0.0   
15663    -122.44     37.80                  52         3830             0.0   
20484    -118.72     34.28                  17         3051             0.0   
9814     -121.93     36.62                  34         2351             0.0   

       population  households  median_income ocean_proximity  \
20046        1392         359         1.6812          INLAND   
3024         1565         584         2.5313          INLAND   
15663        1310         963         3.4801        NEAR BAY   
20484        1705         495         5.7376       <1H OCEAN   
9814         1063         428         3.7250      NEAR OCEAN   

       rooms_per_household  bedrooms_per_room  population_per_household  \
20046             4.192201                0.0                  3.

In [25]:
from sklearn.metrics import mutual_info_score

mi_score = mutual_info_score(X_train['above_avg'], X_train['ocean_proximity'])
print(round(mi_score, 2))

0.1


In [26]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

dv = DictVectorizer(sparse=False)
train_dicts = X_train[categorical+numerical].to_dict(orient='records')
train_vectors = dv.fit_transform(train_dicts)

In [48]:
y_train = X_train['above_avg']
y_val = X_val['above_avg']
y_test = X_test['above_avg']

X_train = X_train.drop('above_avg', axis=1)
X_val = X_val.drop('above_avg', axis=1)
X_test = X_test.drop('above_avg', axis=1)

In [49]:
model = LogisticRegression()
model.fit(train_vectors, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [50]:
val_dicts = X_val[categorical+numerical].to_dict(orient='records')
val_vectors = dv.transform(val_dicts)
y_pred = model.predict(val_vectors)
print(y_pred.shape)

(4128,)


In [81]:
from sklearn.metrics import accuracy_score

original_accuracy = accuracy_score(y_pred, y_val)
orig_acc_score = round(original_accuracy, 3)
print(orig_acc_score)

0.809


In [82]:
def accuracy_without_feature(feature):
    X_train_without_feature = X_train.drop(feature, axis=1).to_dict(orient='records')
    X_val_without_feature = X_val.drop(feature, axis=1).to_dict(orient='records')
    X_test_without_feature = X_test.drop(feature, axis=1).to_dict(orient='records')
    
    dv = DictVectorizer(sparse=False)
    X_train_without_feature_vectors = dv.fit_transform(X_train_without_feature)
    X_val_without_feature_vectors = dv.transform(X_val_without_feature)
    
    lr = LogisticRegression()
    lr.fit(X_train_without_feature_vectors, y_train)
    y_pred_without_feature = lr.predict(X_val_without_feature_vectors)
    
    return accuracy_score(y_pred_without_feature, y_val)

In [83]:
import warnings

features_list = list(X_train.columns)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for feature in features_list:
        acc_score = round(accuracy_without_feature(feature), 3)
        print(f'Accuracy without the feature {feature} is: {acc_score} and the accuracy difference is: {orig_acc_score - acc_score}')

Accuracy without the feature longitude is: 0.816 and the accuracy difference is: -0.006999999999999895
Accuracy without the feature latitude is: 0.823 and the accuracy difference is: -0.013999999999999901
Accuracy without the feature housing_median_age is: 0.798 and the accuracy difference is: 0.01100000000000001
Accuracy without the feature total_rooms is: 0.834 and the accuracy difference is: -0.02499999999999991
Accuracy without the feature total_bedrooms is: 0.828 and the accuracy difference is: -0.018999999999999906
Accuracy without the feature population is: 0.827 and the accuracy difference is: -0.017999999999999905
Accuracy without the feature households is: 0.83 and the accuracy difference is: -0.020999999999999908
Accuracy without the feature median_income is: 0.755 and the accuracy difference is: 0.05400000000000005
Accuracy without the feature ocean_proximity is: 0.8 and the accuracy difference is: 0.009000000000000008
Accuracy without the feature rooms_per_household is: 0.

In [87]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [92]:
from sklearn.linear_model import Ridge

X = housing_data.drop('median_house_value', axis=1)
y = housing_data['median_house_value']
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)
y_train_values = np.log1p(y_train.values)
y_val_values = np.log1p(y_val.values)

for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(train_vectors, y_train_values)
    pred = model.predict(val_vectors)
    print(f"RMSE score for alpha {a} is: {rmse(pred, y_val)}")

RMSE score for alpha 0 is: 239035.852094729
RMSE score for alpha 0.01 is: 239035.852094729
RMSE score for alpha 0.1 is: 239035.85209472908
RMSE score for alpha 1 is: 239035.85209473036
RMSE score for alpha 10 is: 239035.85209474585
