LOAD DATA

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
%matplotlib inline


In [2]:
# Location of the housing CSV. The default is the original author's local
# path; set the HOUSING_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get('HOUSING_CSV', r'C:\Users\pogbe\Downloads\housing.csv')

data = pd.read_csv(DATA_PATH)
print(data.shape)
data.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


HANDLE MISSING DATA

In [6]:
# Fill missing total_bedrooms values with the column mean.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on data['total_bedrooms']: an in-place call on a
# column selection is chained assignment, which pandas deprecates and which
# does not update `data` when Copy-on-Write is enabled.
# NOTE(review): the mean is computed over the whole dataset, before the
# train/test split further down, so test rows influence the imputed value.
data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_bedrooms'].mean())

# Per-column count of remaining missing values (expected to be all zeros).
data.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

CONVERT CATEGORICAL DATA

In [11]:
# Encode the ocean_proximity labels as integer codes in one pass over the
# column (instead of five separate full-column replace calls).
# replace() leaves values that are not keys untouched, so re-running this
# cell on an already-encoded column is a no-op.
# The explicit int64 cast fixes the resulting dtype rather than relying on
# pandas' implicit object->int downcast, and raises right here if a label
# outside the mapping is still present.
# NOTE(review): the codes 1-5 are arbitrary identifiers, not a ranking;
# models that treat the column as numeric will read an order into them.
data['ocean_proximity'] = (
    data['ocean_proximity']
    .replace({
        'NEAR BAY': 1,
        '<1H OCEAN': 2,
        'INLAND': 3,
        'NEAR OCEAN': 4,
        'ISLAND': 5,
    })
    .astype('int64')
)

In [12]:
# Show the first five rows of the frame.
data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,1,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,1,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,1,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,1,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,1,342200


1c - EXTRACT INPUT (X) AND OUTPUT (y)

In [16]:
# Feature matrix: the nine predictor columns, listed explicitly.
X = data[[
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]]

# Target vector: the value the models below are trained to predict.
y = data['median_house_value']

# Summary statistics of the target.
y.describe()

count     20640.000000
mean     206855.816909
std      115395.615874
min       14999.000000
25%      119600.000000
50%      179700.000000
75%      264725.000000
max      500001.000000
Name: median_house_value, dtype: float64

SPLIT DATA

In [18]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

STANDARDIZE DATA

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_trainstd = scaler.transform(X_train)
X_teststd = scaler.transform(X_test)

LINEAR REGRESSION

In [22]:
# Fit a linear regression on the standardized training features and print
# its score() on the standardized test set.
reg = LinearRegression().fit(X_trainstd, y_train)
print(reg.score(X=X_teststd, y=y_test))

0.6145634396339879


In [23]:
# Test-set RMSE of the linear model.
# The root is taken explicitly with np.sqrt: the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn
# releases, while this form works on every version.
y_pred = reg.predict(X_teststd)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

71068.9474556865


DECISION TREE REGRESSION

In [24]:
# Fit a decision tree (fixed seed for repeatable results) on the
# standardized training features and report its test-set RMSE.
regr = DecisionTreeRegressor(random_state=0)
regr.fit(X_trainstd, y_train)
y_pred = regr.predict(X_teststd)

# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): that
# argument is deprecated and absent from newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

69264.28041019065


RANDOM FOREST REGRESSION

In [25]:
# Fit a random forest (fixed seed for repeatable results) on the
# standardized training features and report its test-set RMSE.
regres = RandomForestRegressor(random_state=0)
regres.fit(X_trainstd, y_train)
y_pred = regres.predict(X_teststd)

# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): that
# argument is deprecated and absent from newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

50215.5889258326


LINEAR REGRESSION WITH MEDIAN_INCOME ONLY

In [35]:
# Single-feature design matrix: median_income copied out of the frame and
# reshaped into one column, as scikit-learn estimators expect 2-D input.
XX = data['median_income'].to_numpy(copy=True).reshape(-1, 1)
yy = data['median_house_value']

# Same 80/20 split settings (test_size, random_state) as the full-feature split.
XX_train, XX_test, yy_train, yy_test = train_test_split(
    XX,
    yy,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Linear regression using median_income as the only feature (unscaled),
# evaluated by RMSE on the held-out rows.
regression = LinearRegression()
regression.fit(XX_train, yy_train)
yy_pred = regression.predict(XX_test)

# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): that
# argument is deprecated and absent from newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(yy_test, yy_pred))
print(rmse)

84209.01241414454
