In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import plotly.express as px
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.decomposition import PCA
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#______________________________________________________________________________________________
#Read test and train files, create dataframes to hold dependent and independent variables

#read in train and test data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

#used to view dataframes
#train.head()
#test.head()

#get a list of column names from train
original_features = []
for i in train.columns:
  original_features.append(i)

#fill null values with 0
train.fillna(0, inplace = True)
test.fillna(0,inplace = True)

#set train_x to quantitative variables of train dataset
train_x = train.loc[:, ('MSSubClass','LotFrontage','LotArea','OverallQual','OverallCond','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces',
                'GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','YrSold','MoSold')]

#set train_y = to sale price data
train_y = train.loc[:, ('SalePrice')]

#set test_x to test data
test_x = test.loc[:, ('MSSubClass','LotFrontage','LotArea','OverallQual','OverallCond','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces',
                'GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','YrSold','MoSold')]

#______________________________________________________________________________________________
#This section runs a principle component analysis to select features that contribute the most to the variance of SalesPrice


#get a list of column names from train
original_features = []
for i in train.columns:
  original_features.append(i)

#create list of feature names
initial_feature_names = []
for col in train_x.columns:
  initial_feature_names.append(col)
print(initial_feature_names)

#run pca
pca = PCA(n_components=len(train_x.columns))
pca.fit(train_x,train_y)

#get number of components and most important components
n_pcs = pca.components_.shape[0]
most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]

#list of names in order of variance explained
most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]
print(most_important_names)

#list of variances
variances = pca.explained_variance_ratio_
print(variances)

#graph
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x = range(1, exp_var_cumul.shape[0] + 1),
    y = exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)

#______________________________________________________________________________________________



['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'YrSold', 'MoSold']
['LotArea', 'GrLivArea', '2ndFlrSF', 'BsmtUnfSF', 'GarageYrBlt', '1stFlrSF', 'MasVnrArea', 'GarageArea', 'WoodDeckSF', 'TotalBsmtSF', 'OpenPorchSF', 'EnclosedPorch', 'MSSubClass', 'LotFrontage', '1stFlrSF', 'YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold', 'TotRmsAbvGrd', 'OverallCond', 'OverallQual', 'Fireplaces', 'BedroomAbvGr', 'HalfBath', 'BsmtFullBath', 'GarageCars', 'FullBath', 'BsmtHalfBath', 'KitchenAbvGr']
[9.85517857e-01 5.09407248e-03 3.39083240e-03 2.77565235e-03
 1.97025539e-03 4.68762214e-04 2.65873368e-04 1.83663885e-04
 1.39819643e-04 7.88444439e-05 4.0253928

In [12]:
#remove everything except for "Lot Area" from both independent variable datasets

train_x = train.loc[:, ('LotArea','OverallQual')]

test_x = test.loc[:, ('LotArea','OverallQual')]

In [None]:
#This runs recursive feature elimination and cross-validated selection to find the best number of features
#uses SVR model
estimator = SVR(kernel="linear")
selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(train_x, train_y)
print(selector.support_)

In [18]:
#remove "Lot Area" from both independent variable datasets

train_x = train.loc[:, ('MSSubClass','LotFrontage','OverallQual','OverallCond','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces',
                'GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','YrSold','MoSold')]

test_x = test.loc[:, ('MSSubClass','LotFrontage','OverallQual','OverallCond','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces',
                'GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','YrSold','MoSold')]

In [None]:
#This runs recursive feature elimination and cross-validated selection to find the best number of features
#uses XGBregressor model
estimator = XGBRegressor(kernel="linear")
selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(train_x, train_y)
print(selector.support_)

In [16]:
#Remove according to XGBRegressor
train_x = train.loc[:, ('LotArea','OverallQual','OverallCond','YearBuilt','MasVnrArea','BsmtUnfSF','TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','GrLivArea','BsmtHalfBath','FullBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces',
                'GarageYrBlt','GarageCars','GarageArea','OpenPorchSF')]

#set test_x to test data
test_x = test.loc[:, ('LotArea','OverallQual','OverallCond','YearBuilt','MasVnrArea','BsmtUnfSF','TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','GrLivArea','BsmtHalfBath','FullBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces',
                'GarageYrBlt','GarageCars','GarageArea','OpenPorchSF')]

In [21]:
#This runs recursive feature elimination and cross-validated selection to find the best number of features
#uses random forest model
estimator = RandomForestRegressor(max_depth=30, random_state=0)
selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(train_x, train_y)
print(selector.support_)

[False False  True  True False  True False False  True False  True  True
  True  True False False False False False False False False False  True
  True False False False False False]


In [23]:
#set according to Random forrest regressor @ 5
train_x = train.loc[:, ('LotArea','OverallQual','YearBuilt','BsmtFinSF1','TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','GrLivArea','FullBath','GarageCars','GarageArea')]

#set test_x to test data
test_x = test.loc[:, ('LotArea','OverallQual','YearBuilt','BsmtFinSF1','TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','GrLivArea','FullBath','GarageCars','GarageArea')]

In [None]:
#This runs XGBClassifier

model = XGBClassifier()
model.fit(train_x, train_y)

print(model)

y_pred = model.predict(test_x)
predictions = [value for value in y_pred]

print(predictions)

#get results and export them to a csv
results = test.loc[:, ('Id')].copy()
final = results.to_frame()
final['SalePrice'] = predictions
final.head()
final.to_csv('results.csv')

#SVR Score: 0.22837

In [None]:
#This runs a linear regressiono model

reg = LinearRegression().fit(train_x, train_y)
reg.score(train_x, train_y)

print(reg.coef_)
print(reg.intercept_)

y_pred = reg.predict(test_x)
predictions = [value for value in y_pred]

print(predictions)

#get results and export them to a csv
results = test.loc[:, ('Id')].copy()
final = results.to_frame()
final['SalePrice'] = predictions
final.head()
final.to_csv('results.csv')

#SVR Score: 0.33024

In [24]:
#This runs XGBRegressor

model = XGBRegressor()
model.fit(train_x, train_y)

print(model)

y_pred = model.predict(test_x)
predictions = [value for value in y_pred]

print(predictions)

#get results and export them to a csv
results = test.loc[:, ('Id')].copy()
final = results.to_frame()
final['SalePrice'] = predictions
final.head()
final.to_csv('results.csv')

#SVR Score: 0.14440
#XGB Score: 0.14606
#RFR Score: 0.15827

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)
[128860.45, 160853.4, 175137.9, 183360.36, 209008.38, 177820.8, 162769.67, 165347.9, 186436.98, 126430.08, 204200.84, 95805.45, 97345.39, 152847.23, 139265.0, 379629.16, 268075.22, 309618.28, 273816.28, 471451.66, 325886.84, 216223.98, 170285.12, 169147.88, 176088.03, 194843.81, 338088.03, 250012.25, 199064.47, 209905.02, 190979.28, 104324.4, 177858.02, 300201.62, 295101.44, 223664.19, 170350.34, 159873.95, 160875.39, 146132.05, 168995.16, 146878.77, 287050.94, 226973.3, 215969.64, 193250.22, 236553.67, 195911.83

In [23]:
#This runs Random Forest Regressor

regr = RandomForestRegressor(max_depth=29, random_state=0)
regr.fit(train_x, train_y)

print(regr)

y_pred = regr.predict(test_x)
predictions = [value for value in y_pred]

print(predictions)

#get results and export them to a csv
results = test.loc[:, ('Id')].copy()
final = results.to_frame()
final['SalePrice'] = predictions
final.head()
final.to_csv('results.csv')

#SVR Score: 0.17825
#XGB Score: 0.17890
#depth 29:  0.14852

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=29, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)
[129155.65, 159284.5, 176536.2, 183505.0, 200309.07, 183587.22, 164535.96, 173691.8, 180271.98, 122395.0, 195579.2, 105455.0, 96508.93, 150667.0, 136592.7214285714, 389058.08, 260021.6, 310123.42, 248023.68, 441757.49, 322085.2, 204969.09, 169713.04, 170088.54, 170712.8, 208745.12, 349776.38, 252932.59, 197794.72, 199712.32, 187020.31, 101819.24, 183325.0, 296422.95, 313441.2, 221440.8, 186442.07, 152854.09000000003, 152552.6906666667, 152263.22, 173100.4, 147820.8106666667, 297123.84, 226242.92, 216422

In [None]:
#this runs bayesian Ridge

reg = linear_model.BayesianRidge()
reg.fit(train_x, train_y)

print(reg)

y_pred = reg.predict(test_x)
predictions = [value for value in y_pred]

print(predictions)

#get results and export them to a csv
results = test.loc[:, ('Id')].copy()
final = results.to_frame()
final['SalePrice'] = predictions
final.head()
final.to_csv('results.csv')

#SVR Score: 0.32992

In [13]:
#this runs svr

regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(train_x, train_y)

print(regr)

y_pred = regr.predict(test_x)
predictions = [value for value in y_pred]

print(predictions)

#get results and export them to a csv
results = test.loc[:, ('Id')].copy()
final = results.to_frame()
final['SalePrice'] = predictions
final.head()
final.to_csv('results.csv')

#SVR Score: 0.41620

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svr',
                 SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2,
                     gamma='scale', kernel='rbf', max_iter=-1, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)
[162891.65893784232, 162964.7124632912, 163052.71752377605, 163074.20830286067, 163033.29422010723, 163072.89127807535, 163018.19668198586, 163060.41250672968, 163025.62090209962, 162914.51012893577, 163061.3383644192, 162962.75541373712, 162953.06347651445, 163014.89795896213, 162913.23516468, 163113.41712866066, 163103.3795996992, 163110.0466168473, 163105.21630647866, 163034.39456301212, 163149.07651536714, 163080.33958755835, 163037.6782566094, 163038.81782008772, 163054.38011936177, 163095.53512880253, 163100.236465123, 163104.59005607155, 163048.89469803436, 163062.9426969725, 163091.3727576072, 162

In [None]:
#Code for visualizatinon
SVR = 29
XGB = 21
RFR = 11

