In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics


In [6]:
df = pd.read_csv('datasets/draft2_no_dummies_train.csv')

In [17]:
correlations = pd.DataFrame(df.corr()['saleprice'].sort_values(ascending = False))
correlations.head()

Unnamed: 0,saleprice
saleprice,1.0
overall_qual,0.799028
exter_qual,0.710894
gr_liv_area,0.69816
kitchen_qual,0.69109


In [19]:
correlations.rename(columns = {'saleprice': 'corr'}, inplace = True)
correlations.head()

Unnamed: 0,corr
saleprice,1.0
overall_qual,0.799028
exter_qual,0.710894
gr_liv_area,0.69816
kitchen_qual,0.69109


In [74]:
corr_above50 = correlations[correlations['corr']>.5].index.tolist()

In [75]:
print(type(corr_above50))
corr_above50

<class 'list'>


['saleprice',
 'overall_qual',
 'exter_qual',
 'gr_liv_area',
 'kitchen_qual',
 'total_bsmt_sf',
 '1st_flr_sf',
 'year_built',
 'year_remod/add',
 'full_bath',
 'mas_vnr_area',
 'totrms_abvgrd']

In [77]:
corr_above50.remove('saleprice')

In [78]:
print(corr_above50)

['overall_qual', 'exter_qual', 'gr_liv_area', 'kitchen_qual', 'total_bsmt_sf', '1st_flr_sf', 'year_built', 'year_remod/add', 'full_bath', 'mas_vnr_area', 'totrms_abvgrd']


In [79]:
X = df[corr_above50]
y = df['saleprice']

In [80]:
X.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd
0,6,3,1479,3,725.0,725,1976,2005,2,289.0,6
1,7,3,2122,3,913.0,913,1996,1997,2,132.0,8
2,5,2,1057,3,1057.0,1057,1953,2007,1,0.0,5
3,5,2,1444,2,384.0,744,2006,2007,2,0.0,7
4,6,2,1445,2,676.0,831,1900,1993,2,0.0,6


In [81]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y, random_state = 42)

In [82]:
lr1 = LinearRegression()

In [83]:
lr1.fit(X_train_1, y_train_1)

LinearRegression()

In [88]:
cross_val_score(lr1, X_train_1, y_train_1)

array([0.6901737 , 0.84367594, 0.83813819, 0.83513768, 0.67845218])

In [89]:
cross_val_score(lr1, X_train_1, y_train_1).mean()

0.7771155387544387

In [84]:
lr1.score(X_train_1, y_train_1)

0.7916893745017548

In [85]:
lr1.score(X_test_1, y_test_1)

0.8436992861320319

I'm very happy with this first model. The $r^2$ score for the training data is reasonably high, and the mean of the cross-validation score is also close. The $r^2$ score for the test data is actually higher than for the training data, so the model is not overfit.

In [86]:
preds1 = lr1.predict(X_test_1)
preds1[:5]

array([296047.90410374, 213511.41933565, 156257.17375894, 137364.00208469,
       264998.51556001])

In [97]:
null_pred = y.mean()
null_pred

180839.1701183432

In [96]:
y_test_1.shape

(507,)

In [107]:
null_pred_list = ([null_pred] * 507)
print(len(null_pred_list))
null_pred_list[:5]

507


[180839.1701183432,
 180839.1701183432,
 180839.1701183432,
 180839.1701183432,
 180839.1701183432]

In [105]:
type(null_pred_list)

list

In [94]:
#root mean squared error for model
metrics.mean_squared_error(y_test_1, preds1, squared = False)

30006.834937032247

In [106]:
#root mean squared error for null
metrics.mean_squared_error(y_test_1, null_pred_list, squared = False)

75922.17182221451

In [108]:
#the mean of the test data.
y_test_1.mean()

178989.57001972388

In [109]:
30_006.83/178_989.57

0.16764569019300957

The model RMSE (\\$30,006.83) is higher than I'd want in a real world scenario, as it represents an error of 16\% of the mean sale price in our test data, but the test RMSE represents a significant improvement over the null RMSE (\\$75,922.17)

In [199]:
lr1.coef_

array([14589.5622664 , 14540.97737392,    41.04146751, 13768.58550128,
          16.88916597,    17.74760125,   261.09029334,   189.72632137,
       -2329.75197393,    42.47498633,  1234.45199744])

Hank helped me work out how to put these together in the 4.04 exercise. I'm referencing the code from that exercise here.

In [202]:
pd.Series(lr1.coef_, index = X.columns)

overall_qual      14589.562266
exter_qual        14540.977374
gr_liv_area          41.041468
kitchen_qual      13768.585501
total_bsmt_sf        16.889166
1st_flr_sf           17.747601
year_built          261.090293
year_remod/add      189.726321
full_bath         -2329.751974
mas_vnr_area         42.474986
totrms_abvgrd      1234.451997
dtype: float64

In [203]:
df['year_built'].describe()

count    2028.000000
mean     1971.357495
std        30.147866
min      1872.000000
25%      1953.000000
50%      1973.000000
75%      2000.000000
max      2010.000000
Name: year_built, dtype: float64

# **COME BACK HERE TO LOOK AT THE COEFFICIENTS FURTHER FOR INFERENCE**

## First Kaggle Attempt

In [184]:
kaggledf = pd.read_csv('datasets/draft2_no_dummies_kaggle_test.csv')
kaggledf.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street_surface,alley,lot_shape,land_contour,...,mo_sold,yr_sold,sale_type,saleprice,has_bsmt,has_garage,has_mas_vnr,has_pool,has_alley,has_fireplace
0,2658,902301120,190,RM,69.0,9142,1,Grvl,0,Lvl,...,4,2006,WD,0,1,1,0,0,1,0
1,2718,905108090,90,RL,0.0,9662,1,,1,Lvl,...,8,2006,WD,0,1,1,0,0,0,0
2,2414,528218130,60,RL,58.0,17104,1,,1,Lvl,...,9,2006,New,0,1,1,0,0,0,1
3,1989,902207150,30,RM,60.0,8520,1,,0,Lvl,...,7,2007,WD,0,1,1,0,0,0,0
4,625,535105100,20,RL,0.0,9500,1,,1,Lvl,...,7,2009,WD,0,1,1,1,0,0,1


In [185]:
X_k = kaggledf[corr_above50]

In [186]:
X_k.shape

(878, 11)

In [187]:
predsk1 = lr1.predict(X_k)

In [188]:
predsk1.shape

(878,)

In [189]:
predsk1[:5]

array([140870.02041902, 200288.52311187, 202516.40581009, 126787.62345763,
       173086.28988017])

In [190]:
kaggledf['saleprice'] = predsk1

In [191]:
lr1_submission = kaggledf[['id', 'saleprice']]

In [192]:
lr1_submission.head()

Unnamed: 0,id,saleprice
0,2658,140870.020419
1,2718,200288.523112
2,2414,202516.40581
3,1989,126787.623458
4,625,173086.28988


I used [this site](https://www.geeksforgeeks.org/change-the-data-type-of-a-column-or-a-pandas-series/) to remind myself of a method to change a column's datatype.

In [193]:
lr1_submission = lr1_submission.astype({'id': int})

In [194]:
lr1_submission.head()

Unnamed: 0,id,saleprice
0,2658,140870.020419
1,2718,200288.523112
2,2414,202516.40581
3,1989,126787.623458
4,625,173086.28988


In [195]:
lr1_submission.shape

(878, 2)

In [196]:
lr1_submission.set_index('id', inplace = True)

In [197]:
lr1_submission.head(2)

Unnamed: 0_level_0,saleprice
id,Unnamed: 1_level_1
2658,140870.020419
2718,200288.523112


In [198]:
lr1_submission.to_csv('kaggle_submissions/lr1_submission.csv')

# Feature engineering - Interaction Terms 1 using Polynomial Features
I worked through this, evaluated it, then prepped the Kaggle test data.

In [207]:
poly1 = PolynomialFeatures(include_bias = False)

In [208]:
poly1

PolynomialFeatures(include_bias=False)

In [219]:
X_poly_train1 = poly1.fit_transform(X_train_1)

In [None]:
X_poly_test1 = poly1.fit_transform(X_test_1)

In [232]:
lr1_2 = LinearRegression()

In [233]:
lr1_2.fit(X_poly_train1, y_train_1)

LinearRegression()

In [234]:
lr1_2.score(X_poly_train1, y_train_1)

0.8960460555737353

In [235]:
lr1_2.score(X_poly_test1, y_test_1)

0.8758629643693665

In [236]:
cross_val_score(lr1_2, X_poly_train1, y_train_1).mean()

0.8140987984337507

In [271]:
# for Kaggle DF

X_k_poly_1 = poly1.fit_transform(X_k)

In [272]:
predsk2 = lr1_2.predict(X_k_poly_1)

In [278]:
lr1_2submission = lr1_submission.copy()

In [279]:
lr1_2submission.head()

Unnamed: 0_level_0,saleprice
id,Unnamed: 1_level_1
2658,140870.020419
2718,200288.523112
2414,202516.40581
1989,126787.623458
625,173086.28988


In [280]:
lr1_2submission['saleprice'] = predsk2

In [281]:
lr1_2submission.head()

Unnamed: 0_level_0,saleprice
id,Unnamed: 1_level_1
2658,148307.435681
2718,152657.518099
2414,164377.607226
1989,123334.485195
625,171071.038972


In [282]:
lr1_2submission.to_csv('kaggle_submissions/lr1_2submission.csv')

-----

The mean cross val above is a bit lower than the train score. I'm going to try playing with taking out a couple things I think might be overfit. See below.

After determining removing Overall Quality didn't help, I decided to proceed with evalyating lr1_2.

In [263]:
preds1_2 = lr1_2.predict(X_poly_test1)

In [265]:
y_test_1.shape

(507,)

In [268]:
len(X_poly_test1)

507

In [269]:
metrics.mean_squared_error(y_test_1, preds1_2, squared = False)

26741.78505207465

In [270]:
#root mean squared error for null
metrics.mean_squared_error(y_test_1, null_pred_list, squared = False)

75922.17182221451

This RMSE of \\$26,741 represents an improvement over lr_1's RMSE (\\$30,006.83) and a significant improvement over the null RMSE (\\$75,922.17).

-------

# Playing with Removing Overall Quality both without and with interaction terms.

I suspected overall quality wasn't independent of other features here, but removing it didn't really seem to help that. See below.

In [226]:
corr_50plusdf = df[corr_above50]
corr_50plusdf.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd
0,6,3,1479,3,725.0,725,1976,2005,2,289.0,6
1,7,3,2122,3,913.0,913,1996,1997,2,132.0,8
2,5,2,1057,3,1057.0,1057,1953,2007,1,0.0,5
3,5,2,1444,2,384.0,744,2006,2007,2,0.0,7
4,6,2,1445,2,676.0,831,1900,1993,2,0.0,6


In [227]:
leaner_corr50df = corr_50plusdf.drop(columns = 'overall_qual')

In [228]:
leaner_corr50df.head(2)

Unnamed: 0,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd
0,3,1479,3,725.0,725,1976,2005,2,289.0,6
1,3,2122,3,913.0,913,1996,1997,2,132.0,8


In [229]:
X_2 = leaner_corr50df

In [230]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y)

In [237]:
lr2 = LinearRegression()

In [238]:
lr2.fit(X_train_2, y_train_2)

LinearRegression()

In [239]:
lr2.score(X_train_2, y_train_2)

0.7649574977818029

In [240]:
lr2.score(X_test_2, y_test_2)

0.8254198781269102

This model is a little worse than the one without it. But I'll see if I wind up a little less overfit with the interaction terms.

In [241]:
poly2 = PolynomialFeatures(include_bias = False)

In [245]:
X_poly_train2 = poly2.fit_transform(X_train_2)

In [258]:
X_poly_test2 = poly2.fit_transform(X_test_2)

In [247]:
lr2_1 = LinearRegression()

In [255]:
lr2_1.fit(X_poly_train2, y_train_2)

LinearRegression()

In [256]:
lr2_1.score(X_poly_train2, y_train_2)

0.8711776033067937

In [259]:
lr2_1.score(X_poly_test2, y_test_2)

0.8644453036617653

In [262]:
cross_val_score(lr2_1, X_poly_train2, y_train_2).mean()

0.7976298133133373

-----

# Building DataFrames with the lr1_2 Interaction Terms
I'm going to prepare to concatenate the training data with the Kaggle test data so I can add some categorical variables.

In [284]:
corr_50plusdf.head(2)

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd
0,6,3,1479,3,725.0,725,1976,2005,2,289.0,6
1,7,3,2122,3,913.0,913,1996,1997,2,132.0,8


In [305]:
corr_50_interactions = poly1.fit_transform(corr_50plusdf)

I took this code from the feature engineering lesson:

In [307]:
corr_50_interactions = pd.DataFrame(corr_50_interactions, columns = poly1.get_feature_names_out(corr_50plusdf.columns))


In [308]:
corr_50_interactions.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,year_remod/add^2,year_remod/add full_bath,year_remod/add mas_vnr_area,year_remod/add totrms_abvgrd,full_bath^2,full_bath mas_vnr_area,full_bath totrms_abvgrd,mas_vnr_area^2,mas_vnr_area totrms_abvgrd,totrms_abvgrd^2
0,6.0,3.0,1479.0,3.0,725.0,725.0,1976.0,2005.0,2.0,289.0,...,4020025.0,4010.0,579445.0,12030.0,4.0,578.0,12.0,83521.0,1734.0,36.0
1,7.0,3.0,2122.0,3.0,913.0,913.0,1996.0,1997.0,2.0,132.0,...,3988009.0,3994.0,263604.0,15976.0,4.0,264.0,16.0,17424.0,1056.0,64.0
2,5.0,2.0,1057.0,3.0,1057.0,1057.0,1953.0,2007.0,1.0,0.0,...,4028049.0,2007.0,0.0,10035.0,1.0,0.0,5.0,0.0,0.0,25.0
3,5.0,2.0,1444.0,2.0,384.0,744.0,2006.0,2007.0,2.0,0.0,...,4028049.0,4014.0,0.0,14049.0,4.0,0.0,14.0,0.0,0.0,49.0
4,6.0,2.0,1445.0,2.0,676.0,831.0,1900.0,1993.0,2.0,0.0,...,3972049.0,3986.0,0.0,11958.0,4.0,0.0,12.0,0.0,0.0,36.0


In [309]:
corr_50_interactions.shape

(2028, 77)

In [321]:
corr_50_interactions.head(1)

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,year_remod/add full_bath,year_remod/add mas_vnr_area,year_remod/add totrms_abvgrd,full_bath^2,full_bath mas_vnr_area,full_bath totrms_abvgrd,mas_vnr_area^2,mas_vnr_area totrms_abvgrd,totrms_abvgrd^2,saleprice
0,6.0,3.0,1479.0,3.0,725.0,725.0,1976.0,2005.0,2.0,289.0,...,4010.0,579445.0,12030.0,4.0,578.0,12.0,83521.0,1734.0,36.0,130500


I referenced [this webpage](https://www.geeksforgeeks.org/how-to-add-column-from-another-dataframe-in-pandas/) to review how to return the 'saleprice' column to the new dataframe.

In [287]:
extracted_column = df['saleprice']

In [310]:
corr_50_interactions = corr_50_interactions.join(extracted_column)

In [311]:
corr_50_interactions.shape

(2028, 78)

# This is where I left of the night 6/15/22
I want to add neighborhood, zoning, and house type into this dataframe to cast dummies. I have to do the same beneath as well.

-----

In [312]:
X_k.shape

(878, 11)

In [314]:
X_k.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd
0,6,2,1928,1,1020,908,1910,1950,2,0.0,9
1,5,2,1967,2,1967,1967,1977,1977,2,0.0,10
2,7,3,1496,3,654,664,2006,2006,2,0.0,7
3,5,3,968,2,968,968,1923,2006,1,0.0,5
4,6,2,1394,2,1394,1394,1963,1963,1,247.0,6


In [316]:
kaggle_interactions = poly1.fit_transform(X_k)

In [319]:
kaggle_interactions = pd.DataFrame(kaggle_interactions, 
                                   columns = poly1.get_feature_names_out(X_k.columns))


In [320]:
kaggle_interactions.head(1)

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,year_remod/add^2,year_remod/add full_bath,year_remod/add mas_vnr_area,year_remod/add totrms_abvgrd,full_bath^2,full_bath mas_vnr_area,full_bath totrms_abvgrd,mas_vnr_area^2,mas_vnr_area totrms_abvgrd,totrms_abvgrd^2
0,6.0,2.0,1928.0,1.0,1020.0,908.0,1910.0,1950.0,2.0,0.0,...,3802500.0,3900.0,0.0,17550.0,4.0,0.0,18.0,0.0,0.0,81.0


In [322]:
kaggle_interactions.shape

(878, 77)

In [323]:
kaggle_interactions['saleprice'] = 0

In [324]:
kaggle_interactions.shape

(878, 78)

In [325]:
kaggle_interactions.head(1)

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,year_remod/add full_bath,year_remod/add mas_vnr_area,year_remod/add totrms_abvgrd,full_bath^2,full_bath mas_vnr_area,full_bath totrms_abvgrd,mas_vnr_area^2,mas_vnr_area totrms_abvgrd,totrms_abvgrd^2,saleprice
0,6.0,2.0,1928.0,1.0,1020.0,908.0,1910.0,1950.0,2.0,0.0,...,3900.0,0.0,17550.0,4.0,0.0,18.0,0.0,0.0,81.0,0
