In [32]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [33]:
# Read the housing CSV from the local data directory into a DataFrame,
# then preview the first five rows.
housing_data = pd.read_csv(filepath_or_buffer='./data/housing.csv')
housing_data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [34]:
# Discard every row that contains at least one missing value, then report
# the remaining (rows, columns) so the effect of the filter is visible.
housing_data = housing_data.dropna(axis='index', how='any')
housing_data.shape

(20433, 10)

In [35]:
# Remove every row whose median_house_value is exactly 500001
# (presumably the dataset's top-coded ceiling -- TODO confirm).
# The index labels of the matching rows are collected first and then dropped.
housing_data = housing_data.drop(
    index=housing_data.index[housing_data['median_house_value'] == 500001]
)
housing_data.shape

(19475, 10)

In [36]:
# Replace the categorical ocean_proximity column with one indicator column
# per category, then preview the widened frame.
housing_data = pd.get_dummies(data=housing_data, columns=['ocean_proximity'])
housing_data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [37]:
# Median of the house values that remain after filtering; used as the
# threshold for the binary classification label.
median = housing_data.median_house_value.median()

In [38]:
# Binary target: True where the house value is strictly greater than the
# median computed above, False otherwise.
housing_data['above_median'] = housing_data['median_house_value'].gt(median)
housing_data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0,True
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0,True
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0,True
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0,True
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0,True


In [39]:
# Target vector: the boolean above/below-median label.
Y = housing_data['above_median']
# Feature matrix: every column except the label and the raw house value
# the label was derived from (keeping it would leak the answer).
X = housing_data.drop(columns=['median_house_value', 'above_median'])

In [40]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split -- and any metric computed from
# it -- is identical on every Restart & Run All instead of changing per run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [41]:
# Show the shape of each split as a tuple:
# (train features, test features, train labels, test labels).
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((15580, 13), (3895, 13), (15580,), (3895,))

In [42]:
# Configure a logistic-regression classifier that uses the liblinear solver,
# then train it on the training split. fit() returns the estimator itself,
# so rebinding the name keeps the cell free of displayed output.
logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(X_train, Y_train)

In [43]:
# Mean accuracy of the fitted model on the data it was trained on,
# formatted to two decimals. (Label typo "Trainig" corrected.)
"Training score: {:.2f}".format(logistic_model.score(X_train, Y_train))

'Trainig score: 0.82'

In [44]:
# Predicted above/below-median labels for the held-out test features.
y_pred = logistic_model.predict(X=X_test)

In [45]:
# Side-by-side view of the model's predictions and the true labels.
# The frame inherits its row index from Y_test, so each row maps back to
# the original housing record.
df_pred_actual = pd.DataFrame(data=dict(predicted=y_pred, actual=Y_test))
df_pred_actual.head(n=5)

Unnamed: 0,predicted,actual
5193,False,False
9085,False,False
6058,True,True
15211,True,True
6998,True,True


In [46]:
# Fraction of held-out rows whose predicted label matches the true label,
# formatted to two decimals.
"Testing score: {:.2f}".format(accuracy_score(y_true=Y_test, y_pred=y_pred))

'Testing score: 0.82'