In [11]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [16]:
# Load the house-sales data from the CSV file.
df_housing = pd.read_csv('../data/kc_house_data.csv')
# Correlation of every numeric column with price. The frame still contains the
# string-typed 'date' column at this point, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so restrict to numeric columns first.
# The result is kept in a variable because only the last expression of a cell
# is rendered.
price_corr = df_housing.select_dtypes(include='number').corr()['price']
df_housing.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [9]:
# Remove the 'id' and 'date' columns, which are not used as model features.
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because the columns are already gone.
df_housing = df_housing.drop(columns=['id', 'date'], errors='ignore')
df_housing.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [7]:
# Check for missing values: info() prints each column's non-null count and
# dtype, so a count below the total row count would reveal nulls.
df_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [10]:
# One-hot encode the categorical-style columns; get_dummies replaces each
# listed column with one indicator column per distinct value.
categorical_columns = ['zipcode', 'condition', 'grade']
df_housing = pd.get_dummies(df_housing, columns=categorical_columns)

In [12]:
# Separate the target from the predictors, then hold out 20% of the rows for
# testing. The fixed random_state makes the split reproducible.
features = df_housing.drop(columns='price')
target = df_housing['price']
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=777
)

In [19]:
# Standardise the continuous features. The scaler is fitted on the training
# split only, and the same fitted scaler is then applied to both splits.
columns_toscale = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]
scaler = StandardScaler().fit(x_train[columns_toscale])


def _standardise(frame):
    """Return `frame` with `columns_toscale` transformed by `scaler`.

    The scaled columns come first, followed by the remaining columns of
    `frame` unchanged; the original row index is preserved.
    """
    scaled_part = pd.DataFrame(
        scaler.transform(frame[columns_toscale]),
        columns=columns_toscale,
        index=frame.index,
    )
    return scaled_part.join(frame.drop(columns=columns_toscale))


scaled_x_train = _standardise(x_train)
scaled_x_test = _standardise(x_test)

In [20]:
# Fit an ordinary least-squares linear regression on the training split.
# NOTE(review): this trains on the unscaled x_train; the scaled_x_train frame
# built in the scaling cell is not used here. Confirm which was intended. If
# the scaled frame is adopted, the prediction cells must use the scaled frames
# as well.
model = LinearRegression().fit(x_train, y_train)

In [26]:
# Training-set error of the fitted model. This is an in-sample figure, so it
# will tend to be more optimistic than the held-out error.
y_pred = model.predict(x_train)
rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_train, y_pred=y_pred)
# Render the metrics as the cell output; previously they were computed but
# never shown.
pd.Series({'rmse': rmse, 'mae': mae}, name='train')

In [27]:
# Held-out error of the fitted model on the test split.
y_pred_test = model.predict(x_test)
rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))
mae_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
# Render the metrics as the cell output; previously they were computed but
# never shown.
pd.Series({'rmse': rmse_test, 'mae': mae_test}, name='test')