In [1]:
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
# Read the housing CSV from a path relative to the notebook's working directory.
housing_data = pd.read_csv('datasets/housing.csv')

# Preview five randomly chosen rows; no seed is passed, so the rows differ between runs.
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
995,-121.75,37.71,11.0,12070.0,2220.0,5826.0,2125.0,4.8624,192400.0,INLAND
10644,-117.81,33.56,24.0,6258.0,1003.0,1730.0,752.0,10.9601,500001.0,<1H OCEAN
6467,-118.05,34.1,30.0,2143.0,427.0,1107.0,416.0,4.2321,252200.0,INLAND
2828,-119.05,35.4,18.0,1894.0,319.0,846.0,317.0,3.8611,126400.0,INLAND
17832,-121.86,37.41,16.0,2938.0,589.0,1718.0,568.0,5.5073,178900.0,<1H OCEAN


In [3]:
# Discard every row that has a missing value in any column.
housing_data = housing_data.dropna(axis=0, how='any')

In [4]:
# (rows, columns) of the frame after the rows with missing values were removed.
housing_data.shape

(20433, 10)

In [5]:
# Per-column non-null counts for the rows whose median_house_value is exactly 500001
# (presumably a top-coded ceiling value — confirm against the dataset description).
housing_data[housing_data['median_house_value'].eq(500001)].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [6]:
# Keep only the rows whose median_house_value is not exactly 500001
# (presumably a top-coded ceiling rather than a real price — confirm against the dataset description).
housing_data = housing_data.loc[housing_data['median_house_value'] != 500001]

In [7]:
# (rows, columns) of the frame after the rows valued at exactly 500001 were removed.
housing_data.shape

(19475, 10)

In [8]:
# Distinct category labels present in the ocean_proximity column, in order of first appearance.
pd.unique(housing_data['ocean_proximity'])

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [9]:
# Replace the categorical ocean_proximity column with one indicator column per
# category (named ocean_proximity_<category>), leaving the other columns unchanged.
housing_data = pd.get_dummies(housing_data, columns=['ocean_proximity'])

In [10]:
# Preview five random rows to check the new indicator columns (unseeded, so rows vary per run).
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
17437,-120.45,34.64,17.0,1226.0,277.0,484.0,224.0,3.2167,112500.0,0,0,0,0,1
13763,-117.12,34.06,38.0,281.0,55.0,151.0,52.0,1.3906,120800.0,0,1,0,0,0
88,-122.27,37.8,10.0,105.0,42.0,125.0,39.0,0.9722,137500.0,0,0,0,1,0
6349,-117.94,34.15,33.0,859.0,144.0,421.0,138.0,4.4821,220100.0,0,1,0,0,0
10350,-117.67,33.61,23.0,3588.0,577.0,1695.0,569.0,6.1401,243200.0,1,0,0,0,0


In [13]:
# Median of median_house_value over the remaining rows; used below as the
# threshold that defines the binary above_median label.
median = housing_data['median_house_value'].median()

median

173800.0

In [14]:
# Boolean label: True where median_house_value is strictly greater than the median computed above.
housing_data['above_median'] = housing_data['median_house_value'] > median

In [15]:
# Preview five random rows to check the new above_median column (unseeded, so rows vary per run).
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
8720,-118.35,33.83,36.0,1102.0,193.0,522.0,172.0,6.1187,342000.0,1,0,0,0,0,True
6684,-118.07,34.16,39.0,1804.0,265.0,730.0,276.0,6.4761,397500.0,0,1,0,0,0,True
1450,-121.95,37.96,18.0,2739.0,393.0,1072.0,374.0,6.1436,259500.0,0,1,0,0,0,True
9520,-123.2,39.14,17.0,1620.0,396.0,878.0,399.0,1.8042,109200.0,1,0,0,0,0,False
12258,-116.95,33.79,8.0,10997.0,2205.0,5060.0,1949.0,2.1979,95300.0,0,1,0,0,0,False


In [16]:
# Target: the boolean above/below-median label.
Y = housing_data['above_median']
# Features: every column except the label and the raw value the label was derived from.
X = housing_data.drop(columns=['median_house_value', 'above_median'])

In [17]:
# List the feature column names to confirm the target columns are no longer present.
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The shuffle is seeded so the same rows
# land in the train and test sets on every run; without a seed the split, and
# therefore every score computed from it, changes each time the notebook is run.
# Outputs saved from earlier unseeded runs will not match the seeded results exactly.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [19]:
# Shapes of the training and test feature matrices, in that order.
tuple(part.shape for part in (x_train, x_test))

((15580, 13), (3895, 13))

In [20]:
# Shapes of the training and test label vectors, in that order.
tuple(part.shape for part in (y_train, y_test))

((15580,), (3895,))

In [22]:
from sklearn.linear_model import LogisticRegression
# The 'liblinear' solver is chosen here for a binary target on a fairly small dataset.
logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(x_train, y_train)

In [24]:
# Score of the fitted model on the same rows it was trained on; compare with the
# held-out test score further down to see how well the model generalizes.
print("Training score: ", logistic_model.score(x_train, y_train))

Training score:  0.820410783055199


In [25]:
# Predicted above_median labels for the held-out test rows.
y_pred = logistic_model.predict(x_test)

In [26]:
# Side-by-side table of predictions and true labels, indexed like y_test.
df_pred_actual = y_test.to_frame(name='actual')
# Put the predictions in the first column so the layout reads predicted | actual.
df_pred_actual.insert(0, 'predicted', y_pred)
df_pred_actual.head(10)

Unnamed: 0,predicted,actual
10387,True,True
18127,True,True
3434,False,False
11666,True,True
7743,True,True
6546,True,False
13887,False,False
17488,True,False
13986,False,False
9215,False,False


In [27]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test rows: compares the true labels (y_test) with the
# model's predictions (y_pred).
print("Testing_score: ", accuracy_score(y_test, y_pred))

Testing_score:  0.8184852374839537
