In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
# Read the train and test CSVs from the datasets directory, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train.csv', '../datasets/test.csv')
)

In [2]:
# One-hot encode the three quality columns; drop_first=True omits the
# first level of each encoded column.
train = pd.get_dummies(
    train,
    columns=['Kitchen Qual', 'Overall Qual', 'Exter Qual'],
    drop_first=True,
)

In [3]:
# train = train[train['Garage Cars'].notna()]

In [4]:
# train = train[train['Garage Area'].notna()]

In [3]:
# Fill missing 'Garage Cars' values with 0 and assign the result back to the column.
# Calling fillna(inplace=True) on the `train['Garage Cars']` selection is chained
# assignment: it warns on pandas 2.x and leaves `train` unchanged under Copy-on-Write.
train['Garage Cars'] = train['Garage Cars'].fillna(0)

In [4]:
# Fill missing 'Garage Area' values with 0 and assign the result back to the column.
# Calling fillna(inplace=True) on the `train['Garage Area']` selection is chained
# assignment: it warns on pandas 2.x and leaves `train` unchanged under Copy-on-Write.
train['Garage Area'] = train['Garage Area'].fillna(0)

In [5]:
# Combined bath count: 'Full Bath' and 'Half Bath' are summed element-wise.
train['tot baths'] = train['Full Bath'].add(train['Half Bath'])

In [6]:
# Show every column name of the training frame as a plain list.
train.columns.tolist()

['Id',
 'PID',
 'MS SubClass',
 'MS Zoning',
 'Lot Frontage',
 'Lot Area',
 'Street',
 'Alley',
 'Lot Shape',
 'Land Contour',
 'Utilities',
 'Lot Config',
 'Land Slope',
 'Neighborhood',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 'Roof Style',
 'Roof Matl',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Mas Vnr Area',
 'Exter Cond',
 'Foundation',
 'Bsmt Qual',
 'Bsmt Cond',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'BsmtFin SF 1',
 'BsmtFin Type 2',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 'Heating',
 'Heating QC',
 'Central Air',
 'Electrical',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Functional',
 'Fireplaces',
 'Fireplace Qu',
 'Garage Type',
 'Garage Yr Blt',
 'Garage Finish',
 'Garage Cars',
 'Garage Area',
 'Garage Qual',
 'Garage Cond',
 'Paved Drive

In [7]:
# Design matrix: six raw columns followed by the one-hot quality dummies.
X = train[[
    # raw columns
    'Year Built', 'Gr Liv Area', 'Garage Cars',
    '1st Flr SF', 'TotRms AbvGrd', 'tot baths',
    # Kitchen Qual dummies
    'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA',
    # Overall Qual dummies
    'Overall Qual_2', 'Overall Qual_3', 'Overall Qual_4',
    'Overall Qual_5', 'Overall Qual_6', 'Overall Qual_7',
    'Overall Qual_8', 'Overall Qual_9', 'Overall Qual_10',
    # Exter Qual dummies
    'Exter Qual_Fa', 'Exter Qual_Gd', 'Exter Qual_TA',
]]
# Regression target.
y = train['SalePrice']

# Hold-out split with a fixed seed (default split proportions).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit ordinary least squares on the training portion; fit() returns the estimator.
ols = LinearRegression().fit(X_train, y_train)

def evaluate_model(model, X, y):
    """Print R^2, MSE and RMSE of ``model``'s predictions on ``X`` against ``y``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X : features passed to ``model.predict``.
    y : true target values aligned with ``X``.

    Returns
    -------
    None. The three metrics are printed, one per line.
    """
    # Score the estimator that was passed in, not the notebook-global `ols`.
    y_hat = model.predict(X)

    mse = mean_squared_error(y, y_hat)
    rmse = mse ** .5
    r2 = r2_score(y, y_hat)

    print(f'R^2: {r2}')
    print(f'MSE: {mse}')
    print(f'RMSE: {rmse}')

In [8]:
# Metrics on the training portion of the split.
evaluate_model(model=ols, X=X_train, y=y_train)

R^2: 0.8204154804234052
MSE: 1135785926.4720051
RMSE: 33701.42321137203


In [9]:
# Metrics on the held-out portion of the split.
evaluate_model(model=ols, X=X_test, y=y_test)

R^2: 0.8603740254270887
MSE: 857306936.8714385
RMSE: 29279.804249199457


In [10]:
# Mean cross-validated score on the training portion (default folds and scorer).
cross_val_score(estimator=ols, X=X_train, y=y_train).mean()

0.7971181913778542

In [11]:
# One-hot encode the same three quality columns as the training frame, but keep
# every level. drop_first=True drops whichever level sorts first *in this frame*,
# which is the training baseline only if the test data happens to contain it.
# Keeping all dummies leaves the choice of columns to the feature selection below.
test = pd.get_dummies(
    test,
    columns=['Kitchen Qual', 'Overall Qual', 'Exter Qual'],
    drop_first=False,
)

In [12]:
# Show every column name of the test frame as a plain list.
test.columns.tolist()

['Id',
 'PID',
 'MS SubClass',
 'MS Zoning',
 'Lot Frontage',
 'Lot Area',
 'Street',
 'Alley',
 'Lot Shape',
 'Land Contour',
 'Utilities',
 'Lot Config',
 'Land Slope',
 'Neighborhood',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 'Roof Style',
 'Roof Matl',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Mas Vnr Area',
 'Exter Cond',
 'Foundation',
 'Bsmt Qual',
 'Bsmt Cond',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'BsmtFin SF 1',
 'BsmtFin Type 2',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 'Heating',
 'Heating QC',
 'Central Air',
 'Electrical',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Functional',
 'Fireplaces',
 'Fireplace Qu',
 'Garage Type',
 'Garage Yr Blt',
 'Garage Finish',
 'Garage Cars',
 'Garage Area',
 'Garage Qual',
 'Garage Cond',
 'Paved Drive

In [12]:
# test = test[test['Garage Cars'].notna()]

In [13]:
# test = test[test['Garage Area'].notna()]

In [13]:
# Combined bath count: 'Full Bath' and 'Half Bath' are summed element-wise.
test['tot baths'] = test['Full Bath'].add(test['Half Bath'])

In [16]:
# Apply the same missing-value rule used for the training frame before predicting.
test['Garage Cars'] = test['Garage Cars'].fillna(0)

# The model must see exactly the columns it was fit on, in the same order.
# A hand-written list that differs from the training features (e.g. adding
# 'Kitchen Qual_Po' or omitting 'Overall Qual_2') makes the coefficients line up
# with the wrong columns, or makes predict() raise on a feature-name mismatch.
# Reindexing to the training columns drops levels the model never saw and adds
# all-zero columns for levels that do not occur in the test data.
# NOTE(review): a row is encoded as the training baseline only when all of its
# dummies for that variable are 0 -- confirm the test dummies were built without
# dropping a level that the training encoding kept.
X_kaggle = test.reindex(columns=X.columns, fill_value=0)
test['SalePrice'] = ols.predict(X_kaggle)

In [17]:
# Export Id and predicted SalePrice only, without the index column.
test.loc[:, ['Id', 'SalePrice']].to_csv('../output_csvs/7_5_final_preds.csv', index=False)