In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics,model_selection
from sklearn.linear_model import LinearRegression

In [3]:
# Read the training and test CSV files into DataFrames.
train = pd.read_csv(filepath_or_buffer='kc_house_train_data.csv')
test = pd.read_csv(filepath_or_buffer='kc_house_test_data.csv')

In [4]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
## add new columns
def func(df):
    """Add four derived feature columns to ``df`` in place and return it.

    Columns written (in this order):
        bed_sq   -- ``bedrooms`` multiplied by itself
        bed_bath -- ``bedrooms`` multiplied by ``bathrooms``
        log_sqft -- natural logarithm of ``sqft_living``
        lat_long -- ``lat`` plus ``long``

    The frame passed in is modified; the returned object is that same frame.
    """
    beds = df['bedrooms']
    df['bed_sq'] = beds.mul(beds)
    df['bed_bath'] = beds.mul(df['bathrooms'])
    df['log_sqft'] = np.log(df['sqft_living'])
    df['lat_long'] = df['lat'].add(df['long'])
    return df

In [6]:
# Add the derived feature columns to both the training and the test frame.
train = func(df=train)
test = func(df=test)

In [7]:
# Preview the first five rows again, now including the derived columns.
train.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bed_sq,bed_bath,log_sqft,lat_long
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340,5650,9,3.0,7.07327,-74.7458
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690,7639,9,6.75,7.851661,-74.598
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720,8062,4,2.0,6.646391,-74.4951
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360,5000,16,12.0,7.5807,-74.8722
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800,7503,9,6.0,7.426549,-74.4282


In [8]:
## mean of each derived column on the test data, one value per line
print(
    test['bed_sq'].mean(),
    test['bed_bath'].mean(),
    test['log_sqft'].mean(),
    test['lat_long'].mean(),
    sep='\n',
)

12.4466777015843
7.5039016315913925
7.550274679645921
-74.65333355403185


In [13]:
def model(df, cols):
    """Fit a ``LinearRegression`` of ``price`` on the given feature columns.

    Parameters
    ----------
    df : DataFrame containing the columns in ``cols`` and a ``price`` column.
    cols : column labels to use as predictors.

    Returns
    -------
    The value returned by ``LinearRegression().fit`` on those features.
    """
    features = df.loc[:, cols]
    target = df.loc[:, 'price']
    return LinearRegression().fit(features, target)

In [14]:
# Three nested feature sets: each one extends the previous list.
cols1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
cols2 = cols1 + ['bed_bath']
cols3 = cols2 + ['bed_sq', 'log_sqft', 'lat_long']

In [15]:
# Fit one regression per feature set, all on the training frame.
model1 = model(df=train, cols=cols1)
model2 = model(df=train, cols=cols2)
model3 = model(df=train, cols=cols3)

In [16]:
## model 1: fitted coefficients, then intercept
print(model1.coef_, model1.intercept_, sep='\n')

[ 3.12258646e+02 -5.95865332e+04  1.57067421e+04  6.58619264e+05
 -3.09374351e+05]
-69075726.79256983


In [17]:
## model 2: fitted coefficients, then intercept
print(model2.coef_, model2.intercept_, sep='\n')

[ 3.06610053e+02 -1.13446368e+05 -7.14613083e+04  6.54844630e+05
 -2.94298969e+05  2.55796520e+04]
-66867968.87107886


In [18]:
## model 3: fitted coefficients, then intercept
print(model3.coef_, model3.intercept_, sep='\n')

[ 5.29422820e+02  3.45142296e+04  6.70607813e+04  5.34085611e+05
 -4.06750711e+05 -8.57050439e+03 -6.78858667e+03 -5.61831484e+05
  1.27334900e+05]
-62036084.98609827


In [19]:
#predictions
def pred(df, cols, model):
    """Return ``model.predict`` evaluated on the ``cols`` columns of ``df``.

    Parameters
    ----------
    df : DataFrame holding the feature columns.
    cols : column labels to select, in the order the model expects.
    model : any object exposing a ``predict`` method.
    """
    features = df.loc[:, cols]
    predictions = model.predict(features)
    return predictions

In [20]:
# Predictions of each fitted model on the training data.
pred1 = pred(df=train, cols=cols1, model=model1)
pred2 = pred(df=train, cols=cols2, model=model2)
pred3 = pred(df=train, cols=cols3, model=model3)

In [21]:
def error(y, y_pred):
    """Return ``metrics.mean_squared_error`` of ``y`` against ``y_pred``."""
    mse = metrics.mean_squared_error(y, y_pred)
    return mse

In [22]:
# Mean squared error of each model on the training data, one value per line.
y_train = train['price']
err1 = error(y=y_train, y_pred=pred1)
err2 = error(y=y_train, y_pred=pred2)
err3 = error(y=y_train, y_pred=pred3)
print(err1, err2, err3, sep='\n')

55676481997.78795
55132284576.28106
51969423323.1983


In [23]:
# Predictions of each fitted model on the test data.
pred1 = pred(df=test, cols=cols1, model=model1)
pred2 = pred(df=test, cols=cols2, model=model2)
pred3 = pred(df=test, cols=cols3, model=model3)

In [25]:
# Mean squared error of each model on the test data, one value per line.
y_test = test['price']
err1 = error(y=y_test, y_pred=pred1)
err2 = error(y=y_test, y_pred=pred2)
err3 = error(y=y_test, y_pred=pred3)
print(err1, err2, err3, sep='\n')

53322409504.72697
52820397960.857765
61299673494.249214
