In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**Read Data**

In [5]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
df_house_data=pd.read_csv("kc_house_data.csv", dtype=dtype_dict)
df_house_train=pd.read_csv("kc_house_train_data.csv", dtype=dtype_dict)
df_house_test=pd.read_csv("kc_house_test_data.csv", dtype=dtype_dict)

In [6]:
from sklearn import linear_model

def simple_regression_scikit(input_feature, output):
    clf = linear_model.LinearRegression()
    clf.fit(input_feature,output)
    return clf
    
    
model = simple_regression_scikit(df_house_train[['sqft_living']], df_house_train[['price']])

(model.intercept_[0],model.coef_[0][0])


(-47116.079072893248, 281.95883963034214)

In [7]:
predictions = model.predict(df_house_train[['sqft_living']])
from sklearn.metrics import mean_squared_error
MSE=mean_squared_error(df_house_train[['price']],predictions)
#MSE = RSS/n
RSS=MSE*len(predictions) 
print(RSS)

1.20191835418e+15


In [8]:
df_house_train.index

RangeIndex(start=0, stop=17384, step=1)

In [9]:
df_house_train.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [10]:
df_house_train['bedrooms_squared']=df_house_train['bedrooms']*df_house_train['bedrooms']
df_house_test['bedrooms_squared']=df_house_test['bedrooms']*df_house_test['bedrooms']
df_house_train[['bedrooms_squared','bedrooms']].head()
df_house_train['bed_bath_rooms']=df_house_train['bedrooms']*df_house_train['bathrooms']
df_house_test['bed_bath_rooms']=df_house_test['bedrooms']*df_house_test['bathrooms']
df_house_train['log_sqft_living']=np.log(df_house_train['sqft_living'])
df_house_test['log_sqft_living']=np.log(df_house_test['sqft_living'])
df_house_train['lat_plus_long']=df_house_train['lat']+df_house_train['long']
df_house_test['lat_plus_long']=df_house_test['lat']+df_house_test['long']
df_house_train[['log_sqft_living','sqft_living']].head()

Unnamed: 0,log_sqft_living,sqft_living
0,7.07327,1180.0
1,7.851661,2570.0
2,6.646391,770.0
3,7.5807,1960.0
4,7.426549,1680.0


In [11]:
model1 = linear_model.LinearRegression().fit(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long']], df_house_train[['price']])
model2 = linear_model.LinearRegression().fit(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms']], df_house_train[['price']])
model3 = linear_model.LinearRegression().fit(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms','bedrooms_squared','log_sqft_living','lat_plus_long']], df_house_train[['price']])

In [12]:
model1.coef_, model2.coef_, model3.coef_


(array([[  3.12258646e+02,  -5.95865332e+04,   1.57067421e+04,
           6.58619264e+05,  -3.09374351e+05]]),
 array([[  3.06610053e+02,  -1.13446368e+05,  -7.14613083e+04,
           6.54844630e+05,  -2.94298969e+05,   2.55796520e+04]]),
 array([[  5.29422820e+02,   3.45142296e+04,   6.70607813e+04,
           5.34085611e+05,  -4.06750711e+05,  -8.57050439e+03,
          -6.78858667e+03,  -5.61831484e+05,   1.27334900e+05]]))

In [24]:
predictions_model1 = model1.predict(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long']])
predictions_model2 = model2.predict(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms']])
predictions_model3 = model3.predict(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms','bedrooms_squared','log_sqft_living','lat_plus_long']])

predictions_model1_test = model1.predict(df_house_test[['sqft_living', 'bedrooms', 'bathrooms','lat','long']])
predictions_model2_test = model2.predict(df_house_test[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms']])
predictions_model3_test = model3.predict(df_house_test[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms','bedrooms_squared','log_sqft_living','lat_plus_long']])

In [30]:
RSS_model1 = ((df_house_train[['price']] - predictions_model1)**2).sum()
RSS_model2 = ((df_house_train[['price']] - predictions_model2)**2).sum()
RSS_model3 = ((df_house_train[['price']] - predictions_model3)**2).sum()

RSS_test_model1 = ((df_house_test[['price']] - predictions_model1_test)**2).sum()
RSS_test_model2 = ((df_house_test[['price']] - predictions_model2_test)**2).sum()
RSS_test_model3 = ((df_house_test[['price']] - predictions_model3_test)**2).sum()

print(RSS_model1, RSS_model2, RSS_model3)
print(RSS_test_model1, RSS_test_model2, RSS_test_model3)

price    9.678800e+14
dtype: float64 price    9.584196e+14
dtype: float64 price    9.034365e+14
dtype: float64
price    2.255005e+14
dtype: float64 price    2.233775e+14
dtype: float64 price    2.592363e+14
dtype: float64
