In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
# 2019 cleaned data for machine learning
path_2019=os.path.join('output_census', 'ca_2019_ml.csv')
ca_2019=pd.read_csv(path_2019)

In [3]:
#Things to do
#choose relevent columns
#get dummy data
#train test split
#multiple linear regression
#lasso regression
#random forest
#tune model using gridsearch cv
#test ensembles

In [4]:
ca_2019

Unnamed: 0,City,Population,House Value,Household Income,Poverty Rate,Unemployment Rate,Monthly Owner Cost,Monthly Rent,Public Transport Rate,Personal Transport Rate,College Rate,White Population Rate,Uneducated Rate
0,Los Angeles,59832.0,359000.0,43360.0,25.496390,3.929335,1853.0,1150.0,11.425133,67.471780,2.744351,41.882270,3.546597
1,Los Angeles,53302.0,345900.0,37285.0,31.188323,4.110540,1813.0,1187.0,13.388805,68.219869,2.615286,42.973997,2.986755
2,Los Angeles,73730.0,362800.0,40598.0,29.959311,4.002441,1892.0,1212.0,12.446959,69.500121,2.513224,33.116777,3.754238
3,Los Angeles,60541.0,1063200.0,49675.0,17.784642,2.685783,3489.0,1307.0,18.395309,66.077288,18.390843,35.676649,2.175385
4,Los Angeles,39732.0,777100.0,38491.0,25.999195,3.289540,3149.0,1235.0,27.870066,52.237287,16.767341,23.336354,2.126749
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,Tahoe City,2147.0,666300.0,73875.0,5.076851,0.000000,2419.0,1054.0,0.000000,75.699440,36.609222,96.413600,0.000000
1456,Olympic Valley,1145.0,911900.0,76851.0,3.930131,1.135371,2745.0,1492.0,2.197802,72.527473,28.034934,93.886463,0.000000
1457,Tahoe Vista,580.0,750000.0,73611.0,20.862069,0.000000,3375.0,1625.0,0.000000,97.763578,18.448276,100.000000,0.000000
1458,South Lake Tahoe,29327.0,442300.0,57396.0,10.815972,3.808777,1953.0,1049.0,2.833476,70.593649,15.050977,83.943124,0.692195


In [5]:
ca_2019.columns

Index(['City', 'Population', 'House Value', 'Household Income', 'Poverty Rate',
       'Unemployment Rate', 'Monthly Owner Cost', 'Monthly Rent',
       'Public Transport Rate', 'Personal Transport Rate', 'College Rate',
       'White Population Rate', 'Uneducated Rate'],
      dtype='object')

In [6]:
#seting city as a index
ca_2019.set_index('City', inplace=True)

In [7]:
#dcreating features matrix and response vectors
X=ca_2019.drop('House Value', axis=1) #selecting all columns except 'House Value'
y=ca_2019['House Value'].values  ##selecting 'House Value'

In [8]:
#features scaling using standard scaler
from sklearn.preprocessing import StandardScaler
std_scaler=StandardScaler()
scaled_X=std_scaler.fit_transform(X)

In [9]:
scaled_X.shape

(1460, 11)

In [10]:
#splitting data in train and test
X_train, X_test, y_train, y_test = train_test_split(
                               scaled_X, y, test_size=0.2, random_state=42)

## Linear Regression

In [11]:
#using linear regression
# Make a linear regression instance
lr=LinearRegression()
# Training the model on the data, storing the information learned from the data
# Model is learning the relationship between X and y 
lr.fit(X_train, y_train)


LinearRegression()

In [12]:
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.8607789185273351
0.8241552523056297


In [15]:
#test the model in test data
y_pred=lr.predict(X_test)

In [16]:
#regression matrices-mean squared error(give you the prediction error)
lin_mse=mean_squared_error(y_test, y_pred)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

166339.4899058907

In [17]:
#regression matrices-mean absolute error(give you the prediction error)
lin_mae=mean_absolute_error(y_test, y_pred)
lin_mae

103315.59489383496

In [21]:
cross_val_score(lr, X_train, y_train, scoring='neg_mean_absolute_error',cv=5)

array([-96770.77735665, -91440.28913294, -98186.66455641, -89568.07046732,
       -85129.35208968])

In [20]:
score=cross_val_score(lr, X_train, y_train, scoring='neg_mean_squared_error',cv=5)
rmse_score=np.sqrt(-score) 
rmse_score

array([156136.66693834, 141589.94105545, 156950.65420068, 142785.41910185,
       125674.10919709])

## Laso Regression

In [18]:
#using lassoregression(lasso make some features exactly zero)
#alpha=0 no regularization
lr_l=Lasso()
lr_l.fit(X_train, y_train)


Lasso()

In [22]:
cross_val_score(lr_l, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)

array([-95308.33155207, -93686.34282313, -85900.96676039])

In [None]:
#choosing the best alpha
alpha=[]
error=[]
for i in range(1,100):
    alpha.append(i/10)
    lrl=Lasso(alpha=(i/10))
    error.append(np.mean(cross_val_score(lrl, X_train, y_train, scoring='neg_mean_absolute_error',
                                         cv=3)))
plt.plot(alpha,error) 

In [None]:
lrl=Lasso(alpha=10)
lrl.fit(X_train, y_train)
cross_val_score(lrl, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)


## Random Forest

In [23]:
rf=RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

In [24]:
#test the model in test data
y_pred=rf.predict(X_test)

In [25]:
#regression matrices-mean squared error(give you the prediction error)
rf_mse=mean_squared_error(y_test, y_pred)
rf_rmse=np.sqrt(lin_mse)
rf_rmse

166339.4899058907

In [26]:
cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)

array([-83556.17502564, -78397.48524422, -69283.98789203])

In [27]:
#tune model using gridsearch cv#

In [None]:
parameters = {'n_estimators':range(10,300,10), 'criterion': ('mse','mae'), 'max_features':('auto', 'sqrt', 'log2')}
gs=GridSearchCV(rf, parameters, scoring='neg_mean_absolute_error', cv=3)
gs.fit(X_train, y_train)