## Multiple Linear Regression

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

#### Read in *house_clean.csv* and use this data for modeling

In [2]:
house=pd.read_csv('./datasets/house_Clean.csv')

In [3]:
house.head()

Unnamed: 0.1,Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,0,109,533352170,60,RL,0.0,13517,Pave,,IR1,...,0,0,,,,0,3,2010,WD,130500
1,1,544,531379050,60,RL,43.0,11492,Pave,,IR1,...,0,0,,,,0,4,2009,WD,220000
2,2,153,535304180,20,RL,68.0,7922,Pave,,Reg,...,0,0,,,,0,1,2010,WD,109000
3,3,318,916386060,60,RL,73.0,9802,Pave,,Reg,...,0,0,,,,0,4,2010,WD,174000
4,4,255,906425045,50,RL,82.0,14235,Pave,,IR1,...,0,0,,,,0,3,2010,WD,138500


In [4]:
house.drop(columns='Unnamed: 0')

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,0.0,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2037,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,0,,,,0,1,2008,WD,298751
2038,785,905377130,30,RL,0.0,12342,Pave,,IR1,Lvl,...,0,0,,,,0,3,2009,WD,82500
2039,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,0,,,,0,3,2009,WD,177000
2040,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,0,,,,0,11,2009,WD,144000


# Features decided on

Overall Qual

Gr Liv Area

Total Bsmt SF

Garage Cars

Year Built

Year Remod/Add

Full Bath

#### Baseline model

In [30]:
features = house[['Overall Qual', 'Gr Liv Area',
       'Total Bsmt SF', 'Garage Cars', 'Year Built','Year Remod/Add','Full Bath']]
X = features
y= house['SalePrice']

### Model Prep: Train_test_split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=424)

#### Instantiate & fit the Model

In [32]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

#### Model Evaluation & metrics

In [36]:
lr.score(X_train, y_train),lr.score(X_test, y_test)

(0.8313465989974215, 0.815098221429693)

In [37]:
#3.03-lesson-regression-metrics
print(lr.intercept_)
print(lr.coef_)

-1290158.355445915
[ 17999.88110639     62.31965999     45.99417103  11292.25823644
    300.93544475    317.51155237 -14527.08753366]


In [38]:
#3.03-lesson-regression-metrics
[f'{X.columns[i]}: {lr.coef_[i]}' for i in range(len(X.columns))]

['Overall Qual: 17999.881106393437',
 'Gr Liv Area: 62.319659992173236',
 'Total Bsmt SF: 45.99417103236554',
 'Garage Cars: 11292.258236438076',
 'Year Built: 300.93544475369254',
 'Year Remod/Add: 317.51155236803237',
 'Full Bath: -14527.08753365628']

### Model 2 Prep: Train_test_split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .2 , random_state=420)

### Standard Scaler

In [44]:
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

### Model Prep: Instantiate the model

In [45]:
lr2=LinearRegression()

In [46]:
lr2.fit(X_train, y_train)

LinearRegression()

In [47]:
lr2.score(X_train, y_train),lr2.score(X_test, y_test)

(0.8276668780845685, 0.8243261752563309)

## Adding Cross Validation

In [48]:
features = house[['Overall Qual', 'Gr Liv Area',
       'Total Bsmt SF', 'Garage Cars', 'Year Built','Year Remod/Add','Full Bath']]
X = features
y= house['SalePrice']

### Cross Validation

In [49]:
# Running cross validation on the training data
cross_val_score(estimator=lr2, X=X_train, y=y_train)

array([0.83338349, 0.82315614, 0.81675579, 0.83330651, 0.818313  ])

In [50]:
#mean of CV on training data, STD of CV on training data
cross_val_score(estimator=lr2,X=X_train, y=y_train).mean(), cross_val_score(estimator=lr2, X=X_train, y=y_train).std()

(0.8249829862120241, 0.007146500243956576)

In [51]:
cross_val_score(estimator=lr2, X=X_test, y=y_test)

array([0.83243878, 0.82464794, 0.8556893 , 0.84483279, 0.76713909])

### Model Fitting and Evaluation

In [52]:
#fit the model on all training data
lr2.fit(X_train, y_train)

LinearRegression()

In [53]:
#training Score
lr2.score(X_train, y_train)

0.8276668780845685

In [54]:
#testing Score
lr2.score(X_test, y_test)

0.8243261752563309

### Kaggle Submission

In [62]:
kaggle = pd.read_csv('./datasets/test.csv', keep_default_na=False, na_values=[''])

In [63]:
kaggle.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [64]:
kaggle.shape

(878, 80)

In [65]:
Kfeatures = kaggle[['Overall Qual', 'Gr Liv Area',
       'Total Bsmt SF', 'Garage Cars', 'Year Built','Year Remod/Add','Full Bath']]

In [66]:
Kfeatures.isnull().sum()

Overall Qual      0
Gr Liv Area       0
Total Bsmt SF     0
Garage Cars       0
Year Built        0
Year Remod/Add    0
Full Bath         0
dtype: int64

In [67]:
Kfeatures.shape

(878, 7)

In [72]:
kaggle_preds4 = lr.predict(Kfeatures)

### Make a new Data frame

In [74]:
preds_dict = {
    'Id': kaggle['Id'],
    'SalePrice': kaggle_preds4
}

In [75]:
kaggle_preds4_tocsv = pd.DataFrame(preds_dict)

In [76]:
kaggle_preds4_tocsv.to_csv('Kaggle_preds4.csv',index=False,)