In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import RidgeCV, LassoCV

import warnings

warnings.filterwarnings('ignore')

train = pd.read_csv('../datasets/train.csv')

In [2]:
train.drop(columns = ['Pool QC', 'Pool Area'], inplace = True)

In [3]:
train.drop(columns = ['Alley'], inplace = True)

In [4]:
train.drop(columns = ['Misc Feature', 'Misc Val'], inplace = True)

In [5]:
train.drop(columns = ['Fence'], inplace = True)

In [6]:
train['Mas Vnr Type'].fillna('None', inplace = True)
train['Mas Vnr Area'].fillna(0, inplace = True)

In [7]:
train['Bsmt Qual'].fillna('NA', inplace = True)
train['Bsmt Cond'].fillna('NA', inplace = True)
train['Bsmt Exposure'].fillna('NA', inplace = True)
train['BsmtFin Type 1'].fillna('NA', inplace = True)
train['BsmtFin Type 2'].fillna('NA', inplace = True)
train['BsmtFin SF 1'].fillna(0, inplace = True)
train['BsmtFin SF 2'].fillna(0, inplace = True)
train['Bsmt Unf SF'].fillna(0, inplace = True)
train['Total Bsmt SF'].fillna(0, inplace = True)
train['Bsmt Full Bath'].fillna(0, inplace = True)
train['Bsmt Half Bath'].fillna(0, inplace = True)

In [8]:
train['Garage Type'].fillna('NA', inplace = True)
train['Garage Finish'].fillna('NA', inplace = True)
train['Garage Qual'].fillna('NA', inplace = True)
train['Garage Cond'].fillna('NA', inplace = True)
train['Garage Cars'].fillna(0, inplace = True)
train['Garage Area'].fillna(0, inplace = True)
train['Bsmt Unf SF'].fillna(0, inplace = True)
train.drop(columns = 'Garage Yr Blt', inplace = True)

In [9]:
train['Fireplace Qu'].fillna('NA', inplace = True)


In [10]:
train['Lot Frontage'].fillna(0, inplace = True)


In [11]:
train.drop([960, 1885], inplace = True)

In [12]:
train.drop([471, 694, 1854], inplace = True)

In [13]:
train['porch sf'] = train['Wood Deck SF'] + train['Open Porch SF'] + \
                    train['Enclosed Porch'] + train['3Ssn Porch'] + \
                    train['Screen Porch']

In [14]:
train['age at sale'] = train['Yr Sold'] - train['Year Built']

In [15]:
train['age remod'] = train['Yr Sold'] - train['Year Remod/Add']

In [16]:
train['total baths'] = train['Full Bath'] + train['Half Bath'] + train['Bsmt Full Bath'] + train['Bsmt Half Bath']

In [17]:
train['garage area_cars'] = train['Garage Area'] * train['Garage Cars']

In [18]:
train['finish bsmt sf'] = train['BsmtFin SF 1'] + train['BsmtFin SF 2']

In [21]:
poly = PolynomialFeatures(include_bias = False)

In [20]:
features = ['Overall Qual', 'Gr Liv Area', 'garage area_cars', 'age at sale',
          'age remod', 'porch sf', 'finish bsmt sf', 'total baths',
          'Mas Vnr Area', 'Fireplaces','Bedroom AbvGr', 'Kitchen AbvGr', 
          'Lot Area', 'Lot Frontage', 'Overall Cond']
X = train[features]

In [22]:
X_poly = poly.fit_transform(X)

In [23]:
poly.get_feature_names(features)

['Overall Qual',
 'Gr Liv Area',
 'garage area_cars',
 'age at sale',
 'age remod',
 'porch sf',
 'finish bsmt sf',
 'total baths',
 'Mas Vnr Area',
 'Fireplaces',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'Lot Area',
 'Lot Frontage',
 'Overall Cond',
 'Overall Qual^2',
 'Overall Qual Gr Liv Area',
 'Overall Qual garage area_cars',
 'Overall Qual age at sale',
 'Overall Qual age remod',
 'Overall Qual porch sf',
 'Overall Qual finish bsmt sf',
 'Overall Qual total baths',
 'Overall Qual Mas Vnr Area',
 'Overall Qual Fireplaces',
 'Overall Qual Bedroom AbvGr',
 'Overall Qual Kitchen AbvGr',
 'Overall Qual Lot Area',
 'Overall Qual Lot Frontage',
 'Overall Qual Overall Cond',
 'Gr Liv Area^2',
 'Gr Liv Area garage area_cars',
 'Gr Liv Area age at sale',
 'Gr Liv Area age remod',
 'Gr Liv Area porch sf',
 'Gr Liv Area finish bsmt sf',
 'Gr Liv Area total baths',
 'Gr Liv Area Mas Vnr Area',
 'Gr Liv Area Fireplaces',
 'Gr Liv Area Bedroom AbvGr',
 'Gr Liv Area Kitchen AbvGr',
 'Gr Liv Area Lot

In [28]:
pd.DataFrame(X_poly,
             columns = poly.get_feature_names(features))

Unnamed: 0,Overall Qual,Gr Liv Area,garage area_cars,age at sale,age remod,porch sf,finish bsmt sf,total baths,Mas Vnr Area,Fireplaces,...,Kitchen AbvGr^2,Kitchen AbvGr Lot Area,Kitchen AbvGr Lot Frontage,Kitchen AbvGr Overall Cond,Lot Area^2,Lot Area Lot Frontage,Lot Area Overall Cond,Lot Frontage^2,Lot Frontage Overall Cond,Overall Cond^2
0,6.0,1479.0,950.0,34.0,5.0,44.0,533.0,3.0,289.0,0.0,...,1.0,13517.0,0.0,8.0,182709289.0,0.0,108136.0,0.0,0.0,64.0
1,7.0,2122.0,1118.0,13.0,12.0,74.0,637.0,4.0,132.0,1.0,...,1.0,11492.0,43.0,5.0,132066064.0,494156.0,57460.0,1849.0,215.0,25.0
2,5.0,1057.0,246.0,57.0,3.0,52.0,731.0,2.0,0.0,0.0,...,1.0,7922.0,68.0,7.0,62758084.0,538696.0,55454.0,4624.0,476.0,49.0
3,5.0,1444.0,800.0,4.0,3.0,100.0,0.0,3.0,0.0,0.0,...,1.0,9802.0,73.0,5.0,96079204.0,715546.0,49010.0,5329.0,365.0,25.0
4,6.0,1445.0,968.0,110.0,17.0,59.0,0.0,2.0,0.0,0.0,...,1.0,14235.0,82.0,8.0,202635225.0,1167270.0,113880.0,6724.0,656.0,64.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2041,8.0,1728.0,1040.0,1.0,1.0,276.0,1011.0,3.0,0.0,1.0,...,1.0,11449.0,79.0,5.0,131079601.0,904471.0,57245.0,6241.0,395.0,25.0
2042,4.0,861.0,1078.0,69.0,59.0,158.0,262.0,1.0,0.0,0.0,...,1.0,12342.0,0.0,5.0,152324964.0,0.0,61710.0,0.0,0.0,25.0
2043,6.0,1913.0,684.0,81.0,59.0,0.0,0.0,2.0,0.0,1.0,...,1.0,7558.0,57.0,6.0,57123364.0,430806.0,45348.0,3249.0,342.0,36.0
2044,4.0,1200.0,294.0,53.0,53.0,329.0,905.0,2.0,0.0,2.0,...,1.0,10400.0,80.0,5.0,108160000.0,832000.0,52000.0,6400.0,400.0,25.0


In [25]:
def evaluate_model(model, X, y): 
    
    y_train_hat = ols.predict(X)

    mse = mean_squared_error(y, y_train_hat)
    rmse = mse ** .5
    r2 = r2_score(y, y_train_hat)

    print(f'R^2: {r2}')
    print(f'MSE: {mse}')
    print(f'RMSE: {rmse}')

In [None]:
list(train.columns)

In [29]:

y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

ols = LinearRegression()
ols.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
evaluate_model(ols, X_train, y_train)

In [None]:
evaluate_model(ols, X_test, y_test)

In [30]:
cross_val_score(ols, X_poly, y_train).mean()

ValueError: Found input variables with inconsistent numbers of samples: [2046, 1534]