In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn import metrics

In [2]:
df = pd.read_csv('datasets/draft2_no_dummies_train.csv')

# Building Model 1
I'm using the most highly correlated features to predict the target, Sale Price. I experimented with using features that had 50+% correlation down to 30% correlation and found that 40% correlation produced the best effect.

In [3]:
correlations = pd.DataFrame(df.corr()['saleprice'].sort_values(ascending = False))
correlations.head()

Unnamed: 0,saleprice
saleprice,1.0
overall_qual,0.799028
exter_qual,0.710894
gr_liv_area,0.69816
kitchen_qual,0.69109


In [32]:
correlations.rename(columns = {'saleprice': 'corr'}, inplace = True)
correlations.head(16)

Unnamed: 0,corr
saleprice,1.0
overall_qual,0.799028
exter_qual,0.710894
gr_liv_area,0.69816
kitchen_qual,0.69109
total_bsmt_sf,0.630484
1st_flr_sf,0.620491
year_built,0.571165
year_remod/add,0.549456
full_bath,0.538883


In [6]:
corr_above40 = correlations[correlations['corr']>.4].index.tolist()

In [28]:
print(type(corr_above40))
corr_above40

<class 'list'>


['overall_qual',
 'exter_qual',
 'gr_liv_area',
 'kitchen_qual',
 'total_bsmt_sf',
 '1st_flr_sf',
 'year_built',
 'year_remod/add',
 'full_bath',
 'mas_vnr_area',
 'totrms_abvgrd',
 'has_fireplace',
 'fireplaces',
 'heating_qc',
 'bsmtfin_sf_1']

In [8]:
corr_above40.remove('saleprice')

Also removing 'has_mas_vnr' because that is covered by the 'mas_vnr_area' column, so this is redundant.

In [11]:
corr_above40.remove('has_mas_vnr')

In [12]:
print(corr_above40)

['overall_qual', 'exter_qual', 'gr_liv_area', 'kitchen_qual', 'total_bsmt_sf', '1st_flr_sf', 'year_built', 'year_remod/add', 'full_bath', 'mas_vnr_area', 'totrms_abvgrd', 'has_fireplace', 'fireplaces', 'heating_qc', 'bsmtfin_sf_1']


In [13]:
X = df[corr_above40]
y = df['saleprice']

In [14]:
X.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd,has_fireplace,fireplaces,heating_qc,bsmtfin_sf_1
0,6,3,1479,3,725.0,725,1976,2005,2,289.0,6,0,0,4,533.0
1,7,3,2122,3,913.0,913,1996,1997,2,132.0,8,1,1,4,637.0
2,5,2,1057,3,1057.0,1057,1953,2007,1,0.0,5,0,0,2,731.0
3,5,2,1444,2,384.0,744,2006,2007,2,0.0,7,0,0,3,0.0
4,6,2,1445,2,676.0,831,1900,1993,2,0.0,6,0,0,2,0.0


In [15]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y, random_state = 42)

In [16]:
lr1 = LinearRegression()

In [17]:
lr1.fit(X_train_1, y_train_1)

LinearRegression()

In [18]:
cross_val_score(lr1, X_train_1, y_train_1)

array([0.68400273, 0.84782123, 0.8538537 , 0.84922161, 0.69500178])

In [19]:
cross_val_score(lr1, X_train_1, y_train_1).mean()

0.7859802107600512

In [20]:
lr1.score(X_train_1, y_train_1)

0.8040499309496271

In [21]:
lr1.score(X_test_1, y_test_1)

0.8622194033550727

I'm very happy with this first model. The $r^2$ score for the training data is reasonably high, and the mean of the cross-validation score is also close. The $r^2$ score for the test data is actually higher than for the training data, so the model is not overfit.

In [22]:
preds1 = lr1.predict(X_test_1)
preds1[:5]

array([303852.08846031, 211464.37724694, 166577.81371864, 149912.62686094,
       269658.48171225])

In [23]:
null_pred = y.mean()
null_pred

180839.1701183432

In [24]:
y_test_1.shape

(507,)

In [25]:
null_pred_list = ([null_pred] * 507)
print(len(null_pred_list))
null_pred_list[:5]

507


[180839.1701183432,
 180839.1701183432,
 180839.1701183432,
 180839.1701183432,
 180839.1701183432]

In [26]:
type(null_pred_list)

list

In [37]:
print(f'Model 1 RMSE: {metrics.mean_squared_error(y_test_1, preds1, squared = False)}')
print(f'Null Model RMSE: {metrics.mean_squared_error(y_test_1, null_pred_list, squared = False)}')

Model 1 RMSE: 28173.041763341123
Null Model RMSE: 75922.17182221451


In [34]:
#the mean of the test data.
y_test_1.mean()

178989.57001972388

In [35]:
28173.04/178_989.57

0.15740045635061306

The model RMSE (\\$28173.04) is higher than I'd want in a real world scenario, as it represents an error of ~16\% of the mean sale price in our test data, but the test RMSE represents a significant improvement over the null RMSE (\\$75,922.17)

Hank helped me work out how to put these together in the 4.04 exercise. I'm referencing the code from that exercise here.

In [38]:
pd.Series(lr1.coef_, index = X.columns)

overall_qual      13620.995238
exter_qual        16175.977610
gr_liv_area          31.802816
kitchen_qual      12846.681221
total_bsmt_sf         9.143823
1st_flr_sf           14.209274
year_built          203.805858
year_remod/add      167.284788
full_bath          1179.576240
mas_vnr_area         37.212760
totrms_abvgrd      2790.762566
has_fireplace      8175.984633
fireplaces         4412.431100
heating_qc         2359.093249
bsmtfin_sf_1         16.260862
dtype: float64

Thinking ahead to recommendations for homeoweners, some of these (for example, Year Built) will be hard to act directly upon. That said, even Year Built is actionable in the sense that if you're thinking to sell, there's evidence that waiting a year as a small cost of approximately $204/year.

# **COME BACK HERE TO LOOK AT THE COEFFICIENTS FURTHER FOR INFERENCE**

## First Kaggle Attempt

In [39]:
kaggledf = pd.read_csv('datasets/draft2_no_dummies_kaggle_test.csv')
kaggledf.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street_surface,alley,lot_shape,land_contour,...,mo_sold,yr_sold,sale_type,saleprice,has_bsmt,has_garage,has_mas_vnr,has_pool,has_alley,has_fireplace
0,2658,902301120,190,RM,69.0,9142,1,Grvl,0,Lvl,...,4,2006,WD,0,1,1,0,0,1,0
1,2718,905108090,90,RL,0.0,9662,1,,1,Lvl,...,8,2006,WD,0,1,1,0,0,0,0
2,2414,528218130,60,RL,58.0,17104,1,,1,Lvl,...,9,2006,New,0,1,1,0,0,0,1
3,1989,902207150,30,RM,60.0,8520,1,,0,Lvl,...,7,2007,WD,0,1,1,0,0,0,0
4,625,535105100,20,RL,0.0,9500,1,,1,Lvl,...,7,2009,WD,0,1,1,1,0,0,1


In [40]:
X_k = kaggledf[corr_above40]

In [41]:
X_k.shape

(878, 15)

In [42]:
predsk1 = lr1.predict(X_k)

In [43]:
predsk1.shape

(878,)

In [44]:
predsk1[:5]

array([134782.85156245, 177559.02004736, 217028.48005454, 117346.59365737,
       180492.13822964])

In [45]:
kaggledf['saleprice'] = predsk1

In [46]:
lr1_submission = kaggledf[['id', 'saleprice']]

In [47]:
lr1_submission.head()

Unnamed: 0,id,saleprice
0,2658,134782.851562
1,2718,177559.020047
2,2414,217028.480055
3,1989,117346.593657
4,625,180492.13823


I used [this site](https://www.geeksforgeeks.org/change-the-data-type-of-a-column-or-a-pandas-series/) to remind myself of a method to change a column's datatype, as it wasn't an integer.

In [48]:
lr1_submission = lr1_submission.astype({'id': int})

In [49]:
lr1_submission.head()

Unnamed: 0,id,saleprice
0,2658,134782.851562
1,2718,177559.020047
2,2414,217028.480055
3,1989,117346.593657
4,625,180492.13823


In [50]:
lr1_submission.shape

(878, 2)

In [51]:
lr1_submission.set_index('id', inplace = True)

In [52]:
lr1_submission.head(2)

Unnamed: 0_level_0,saleprice
id,Unnamed: 1_level_1
2658,134782.851562
2718,177559.020047


In [53]:
lr1_submission.to_csv('kaggle_submissions/lr1_submission.csv')

NOTE: I didn't actually submit this version because I'd worked through this whole notebook with the features that have 50%+ correlation. Because these models only get better, I waited until the last one to submit the improved one.

# Model 2 - Interaction Terms using Polynomial Features
I worked through this, evaluated it, then prepped the Kaggle test data.

In [54]:
poly1 = PolynomialFeatures(include_bias = False)

In [55]:
poly1

PolynomialFeatures(include_bias=False)

In [56]:
X_poly_train1 = poly1.fit_transform(X_train_1)

In [57]:
X_poly_test1 = poly1.fit_transform(X_test_1)

In [70]:
lr2 = LinearRegression()

In [71]:
lr2.fit(X_poly_train1, y_train_1)

LinearRegression()

In [72]:
lr2.score(X_poly_train1, y_train_1)

0.919698795940492

In [73]:
lr2.score(X_poly_test1, y_test_1)

0.8989234571729905

In [74]:
cross_val_score(lr2, X_poly_train1, y_train_1).mean()

0.829746516796377

In [79]:
preds2 = lr2.predict(X_poly_test1)

In [82]:
print(f'Model 1 RMSE: {metrics.mean_squared_error(y_test_1, preds1, squared = False)}')
print(f'Model 2 RMSE: {metrics.mean_squared_error(y_test_1, preds2, squared = False)}')
print(f'Null Model RMSE: {metrics.mean_squared_error(y_test_1, null_pred_list, squared = False)}')

Model 1 RMSE: 28173.041763341123
Model 2 RMSE: 24130.420945490245
Null Model RMSE: 75922.17182221451


The train and test $r^2$ scores are very close. I'd like mean cross-validation score to be closer to the train $r^2$ as the difference suggests there could be some overfitting.

The Model 2 RMSE of \\24130.42 represents an improvement over lr_1's RMSE (\\28173.043) and, again, a significant improvement over the null RMSE (\\$75,922.17).

Overall, this model represents a step forward.

-----

Prepping data for Kaggle competition:

In [83]:
# for Kaggle DF

X_k_poly_1 = poly1.fit_transform(X_k)

In [90]:
predsk2 = lr2.predict(X_k_poly_1)

In [91]:
lr2submission = lr1_submission.copy()

In [92]:
lr2submission.head()

Unnamed: 0_level_0,saleprice
id,Unnamed: 1_level_1
2658,134782.851562
2718,177559.020047
2414,217028.480055
1989,117346.593657
625,180492.13823


In [93]:
lr2submission['saleprice'] = predsk2

In [94]:
lr2submission.head()

Unnamed: 0_level_0,saleprice
id,Unnamed: 1_level_1
2658,151214.664938
2718,143522.519955
2414,175626.697975
1989,113027.564225
625,192296.594042


In [95]:
lr2submission.to_csv('kaggle_submissions/lr1_2submission.csv')

Once again, I didn't actually submit this model because there's a better result ahead, which I know from having worked through these models starting from the model with correlations of 50+%.

-----

-------

# Playing with Removing Overall Quality both without and with interaction terms.

I suspected overall quality wasn't independent of other features here, but removing it didn't really seem to help that. See below.

In [291]:
corr_50plusdf = df[corr_above50]
corr_50plusdf.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd
0,6,3,1479,3,725.0,725,1976,2005,2,289.0,6
1,7,3,2122,3,913.0,913,1996,1997,2,132.0,8
2,5,2,1057,3,1057.0,1057,1953,2007,1,0.0,5
3,5,2,1444,2,384.0,744,2006,2007,2,0.0,7
4,6,2,1445,2,676.0,831,1900,1993,2,0.0,6


In [292]:
leaner_corr50df = corr_50plusdf.drop(columns = 'overall_qual')

In [293]:
leaner_corr50df.head(2)

Unnamed: 0,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd
0,3,1479,3,725.0,725,1976,2005,2,289.0,6
1,3,2122,3,913.0,913,1996,1997,2,132.0,8


In [294]:
X_2 = leaner_corr50df

In [295]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y)

In [296]:
lr2 = LinearRegression()

In [297]:
lr2.fit(X_train_2, y_train_2)

LinearRegression()

In [298]:
lr2.score(X_train_2, y_train_2)

0.7947318589774361

In [299]:
lr2.score(X_test_2, y_test_2)

0.7461271379751175

This model is a little worse than the one without it. But I'll see if I wind up a little less overfit with the interaction terms.

In [300]:
poly2 = PolynomialFeatures(include_bias = False)

In [301]:
X_poly_train2 = poly2.fit_transform(X_train_2)

In [302]:
X_poly_test2 = poly2.fit_transform(X_test_2)

In [303]:
lr2_1 = LinearRegression()

In [304]:
lr2_1.fit(X_poly_train2, y_train_2)

LinearRegression()

In [305]:
lr2_1.score(X_poly_train2, y_train_2)

0.8747131096569097

In [306]:
lr2_1.score(X_poly_test2, y_test_2)

0.8273624043743294

In [307]:
cross_val_score(lr2_1, X_poly_train2, y_train_2).mean()

0.7353224766129915

-----

# Building DataFrames with the lr1_2 Interaction Terms
I'm going to prepare to concatenate the training data with the Kaggle test data so I can add some categorical variables.

In [333]:
corr_50plusdf.head(2)

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd
0,6,3,1479,3,725.0,725,1976,2005,2,289.0,6
1,7,3,2122,3,913.0,913,1996,1997,2,132.0,8


In [334]:
corr_50_interactions = poly1.fit_transform(corr_50plusdf)

I took this code from the feature engineering lesson:

In [335]:
corr_50_interactions = pd.DataFrame(corr_50_interactions, columns = poly1.get_feature_names_out(corr_50plusdf.columns))


In [336]:
corr_50_interactions.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,year_remod/add^2,year_remod/add full_bath,year_remod/add mas_vnr_area,year_remod/add totrms_abvgrd,full_bath^2,full_bath mas_vnr_area,full_bath totrms_abvgrd,mas_vnr_area^2,mas_vnr_area totrms_abvgrd,totrms_abvgrd^2
0,6.0,3.0,1479.0,3.0,725.0,725.0,1976.0,2005.0,2.0,289.0,...,4020025.0,4010.0,579445.0,12030.0,4.0,578.0,12.0,83521.0,1734.0,36.0
1,7.0,3.0,2122.0,3.0,913.0,913.0,1996.0,1997.0,2.0,132.0,...,3988009.0,3994.0,263604.0,15976.0,4.0,264.0,16.0,17424.0,1056.0,64.0
2,5.0,2.0,1057.0,3.0,1057.0,1057.0,1953.0,2007.0,1.0,0.0,...,4028049.0,2007.0,0.0,10035.0,1.0,0.0,5.0,0.0,0.0,25.0
3,5.0,2.0,1444.0,2.0,384.0,744.0,2006.0,2007.0,2.0,0.0,...,4028049.0,4014.0,0.0,14049.0,4.0,0.0,14.0,0.0,0.0,49.0
4,6.0,2.0,1445.0,2.0,676.0,831.0,1900.0,1993.0,2.0,0.0,...,3972049.0,3986.0,0.0,11958.0,4.0,0.0,12.0,0.0,0.0,36.0


In [337]:
corr_50_interactions.shape

(2028, 77)

In [338]:
corr_50_interactions.head(1)

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,year_remod/add^2,year_remod/add full_bath,year_remod/add mas_vnr_area,year_remod/add totrms_abvgrd,full_bath^2,full_bath mas_vnr_area,full_bath totrms_abvgrd,mas_vnr_area^2,mas_vnr_area totrms_abvgrd,totrms_abvgrd^2
0,6.0,3.0,1479.0,3.0,725.0,725.0,1976.0,2005.0,2.0,289.0,...,4020025.0,4010.0,579445.0,12030.0,4.0,578.0,12.0,83521.0,1734.0,36.0


I referenced [this webpage](https://www.geeksforgeeks.org/how-to-add-column-from-another-dataframe-in-pandas/) to review how to return the 'saleprice' column to the new dataframe.

In [339]:
extracted_column = df['saleprice']

In [340]:
corr_50_interactions = corr_50_interactions.join(extracted_column)

In [341]:
corr_50_interactions.shape

(2028, 78)

In [342]:
df.head(1)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street_surface,alley,lot_shape,land_contour,...,mo_sold,yr_sold,sale_type,saleprice,has_bsmt,has_garage,has_mas_vnr,has_pool,has_alley,has_fireplace
0,109,533352170,60,RL,0.0,13517,1,,1,Lvl,...,3,2010,WD,130500,1,1,1,0,0,0


-----

# Building the DF to Include first Categorical Variables (for Model 3)

In [343]:
corr_50_interactions.shape

(2028, 78)

In [344]:
corr_50_interactions_cats1 = corr_50_interactions.copy()
corr_50_interactions_cats1 = corr_50_interactions_cats1.join(df['neighborhood'])
corr_50_interactions_cats1 = corr_50_interactions_cats1.join(df['ms_zoning'])
corr_50_interactions_cats1 = corr_50_interactions_cats1.join(df['house_style'])
corr_50_interactions_cats1 = corr_50_interactions_cats1.join(df['bldg_type'])

In [345]:
corr_50_interactions_cats1.shape

(2028, 82)

In [346]:
X_k.shape

(878, 11)

In [347]:
X_k.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,totrms_abvgrd
0,6,2,1928,1,1020,908,1910,1950,2,0.0,9
1,5,2,1967,2,1967,1967,1977,1977,2,0.0,10
2,7,3,1496,3,654,664,2006,2006,2,0.0,7
3,5,3,968,2,968,968,1923,2006,1,0.0,5
4,6,2,1394,2,1394,1394,1963,1963,1,247.0,6


In [348]:
kaggle_interactions = poly1.fit_transform(X_k)

In [349]:
kaggle_interactions = pd.DataFrame(kaggle_interactions, 
                                   columns = poly1.get_feature_names_out(X_k.columns))


In [350]:
kaggle_interactions.head(1)

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,year_remod/add^2,year_remod/add full_bath,year_remod/add mas_vnr_area,year_remod/add totrms_abvgrd,full_bath^2,full_bath mas_vnr_area,full_bath totrms_abvgrd,mas_vnr_area^2,mas_vnr_area totrms_abvgrd,totrms_abvgrd^2
0,6.0,2.0,1928.0,1.0,1020.0,908.0,1910.0,1950.0,2.0,0.0,...,3802500.0,3900.0,0.0,17550.0,4.0,0.0,18.0,0.0,0.0,81.0


In [351]:
kaggle_interactions.shape

(878, 77)

In [352]:
kaggle_interactions['saleprice'] = 0

In [353]:
kaggle_interactions.shape

(878, 78)

In [354]:
kaggle_interactions.head(1)

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,year_remod/add full_bath,year_remod/add mas_vnr_area,year_remod/add totrms_abvgrd,full_bath^2,full_bath mas_vnr_area,full_bath totrms_abvgrd,mas_vnr_area^2,mas_vnr_area totrms_abvgrd,totrms_abvgrd^2,saleprice
0,6.0,2.0,1928.0,1.0,1020.0,908.0,1910.0,1950.0,2.0,0.0,...,3900.0,0.0,17550.0,4.0,0.0,18.0,0.0,0.0,81.0,0


In [355]:
kaggle_interactions_cats1 = kaggle_interactions.copy()
kaggle_interactions_cats1 = kaggle_interactions_cats1.join(kaggledf['neighborhood'])
kaggle_interactions_cats1 = kaggle_interactions_cats1.join(kaggledf['ms_zoning'])
kaggle_interactions_cats1 = kaggle_interactions_cats1.join(kaggledf['house_style'])
kaggle_interactions_cats1 = kaggle_interactions_cats1.join(kaggledf['bldg_type'])

In [356]:
kaggle_interactions_cats1.shape

(878, 82)

-----

Now, I'll concatenate the two dataframes to prepare them to cast dummy columns for these categories. I needed to review the documentation on pd.concat to get this written correctly.

In [357]:
for_dummy_df = pd.concat([corr_50_interactions_cats1, kaggle_interactions_cats1])

In [358]:
for_dummy_df.shape

(2906, 82)

-----

Now, I'll cast dummies for those 4 columns.

In [400]:
cats1 = ['neighborhood', 'ms_zoning', 'house_style', 'bldg_type']

In [402]:
all_with_dummies = pd.get_dummies(columns = cats1, drop_first = True, data = for_dummy_df)

In [403]:
all_with_dummies.shape

(2906, 122)

-----

Now, I'm going to break this dataframe back into the training set and the Kaggle test set.

In [404]:
dummydf = all_with_dummies.iloc[:2028]

In [405]:
dummydf.shape

(2028, 122)

In [406]:
kaggledummydf = all_with_dummies.iloc[2028:]

In [407]:
kaggledummydf.shape

(878, 122)

In [408]:
kaggle_interactions_cats1.shape

(878, 82)

In [409]:
kaggledummydf.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,bldg_type_2fmCon,bldg_type_Duplex,bldg_type_Twnhs,bldg_type_TwnhsE
0,6.0,2.0,1928.0,1.0,1020.0,908.0,1910.0,1950.0,2.0,0.0,...,0,0,0,1,0,0,1,0,0,0
1,5.0,2.0,1967.0,2.0,1967.0,1967.0,1977.0,1977.0,2.0,0.0,...,1,0,0,0,0,0,0,1,0,0
2,7.0,3.0,1496.0,3.0,654.0,664.0,2006.0,2006.0,2.0,0.0,...,0,0,0,1,0,0,0,0,0,0
3,5.0,3.0,968.0,2.0,968.0,968.0,1923.0,2006.0,1.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,6.0,2.0,1394.0,2.0,1394.0,1394.0,1963.0,1963.0,1.0,247.0,...,1,0,0,0,0,0,0,0,0,0


In [410]:
dummydf.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,bldg_type_2fmCon,bldg_type_Duplex,bldg_type_Twnhs,bldg_type_TwnhsE
0,6.0,3.0,1479.0,3.0,725.0,725.0,1976.0,2005.0,2.0,289.0,...,0,0,0,1,0,0,0,0,0,0
1,7.0,3.0,2122.0,3.0,913.0,913.0,1996.0,1997.0,2.0,132.0,...,0,0,0,1,0,0,0,0,0,0
2,5.0,2.0,1057.0,3.0,1057.0,1057.0,1953.0,2007.0,1.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,5.0,2.0,1444.0,2.0,384.0,744.0,2006.0,2007.0,2.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,6.0,2.0,1445.0,2.0,676.0,831.0,1900.0,1993.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [411]:
dummydf.shape

(2028, 122)

In [412]:
corr_50_interactions_cats1.shape

(2028, 82)

The two new dfs are the same length as the old ones and the heads match so the operation appears successful.

-----

Building new model with these features.

In [413]:
X_3 = dummydf.drop(columns = 'saleprice')
y_3 = dummydf['saleprice']

In [414]:
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, y_3, random_state = 42)

In [415]:
lr3 = LinearRegression()

In [416]:
lr3.fit(X_train_3, y_train_3) 

LinearRegression()

In [417]:
lr3.score(X_train_3, y_train_3)

0.9158845453666599

In [418]:
lr3.score(X_test_3, y_test_3)

0.883567675633403

The train and test $r^2$ scores are acceptably close together.

In [419]:
cross_val_score(lr3, X_train_3, y_train_3).mean()

0.8313814632461625

I'd like the mean cross-validation score and the model $r^2$ score to be closer together. This suggests that the model is a little overfit, though the test $r^2$ was pretty close to the train $r^2$ score, so I think the difference is acceptable.

In [420]:
preds3 = lr3.predict(X_test_3)
preds3[:5]

array([288281.62678192, 184814.0822951 , 167322.59077021, 163158.81702115,
       292255.95126152])

In [421]:
print(f'Model 3 RMSE: {metrics.mean_squared_error(y_test_3, preds3, squared = False)}')
print(f'Model 1_2 (previous best) RMSE: {metrics.mean_squared_error(y_test_1, preds1_2, squared = False)}')
print(f'Null Model RMSE: {metrics.mean_squared_error(y_test_3, null_pred_list, squared = False)}')

Model 3 RMSE: 25898.612176127735
Model 1_2 (previous best) RMSE: 26741.78505207465
Null Model RMSE: 75922.17182221451


Model 3 represents an improvement over Model 1_2, both models being quite a bit better than the null model.

-----

Preparing Kaggle Data on Model 3 for Submission

In [422]:
X_k_3 = kaggledummydf.drop(columns = 'saleprice')

In [423]:
predsk3 = lr3.predict(X_k_3)
predsk3[:5]

array([139731.85870896, 147996.872743  , 166464.59007285, 116251.3772477 ,
       165325.60262951])

In [506]:
lr3_submission = pd.DataFrame(predsk3)

In [507]:
lr3_submission.head()

Unnamed: 0,0
0,139731.858709
1,147996.872743
2,166464.590073
3,116251.377248
4,165325.60263


In [508]:
lr3_submission = lr3_submission.join(kaggledf['id'])
lr3_submission.rename(columns = {'id': 'Id',
                                 0: 'SalePrice'}, inplace = True)
lr3_submission.set_index('Id', inplace = True)

In [509]:
lr3_submission.head(2)

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,139731.858709
2718,147996.872743


In [510]:
lr3_submission.columns

Index(['SalePrice'], dtype='object')

In [511]:
lr3_submission.to_csv('kaggle_submissions/lr3_submission.csv')

-----

# Adding In Additional Categorical Variables

I'd like to add:
* Masonry Veneer Type _Will this refine that feature? Area is already included because of high correlation_
* Miscellaneous Features _Could this help predict higher value homes?_
* Lot Configuration _Seems logical that could be an important feature_
* Condition 1 _will this help predict diminished value?
* Condition 2 _ditto_

In [430]:
corr_50_interactions_cats1.shape

(2028, 82)

In [395]:
corr_50_interactions_cats2 = corr_50_interactions_cats1.copy()
corr_50_interactions_cats2 = corr_50_interactions_cats2.join(df['mas_vnr_type'])
corr_50_interactions_cats2 = corr_50_interactions_cats2.join(df['misc_feature'])
corr_50_interactions_cats2 = corr_50_interactions_cats2.join(df['lot_config'])
corr_50_interactions_cats2 = corr_50_interactions_cats2.join(df['condition_1'])
corr_50_interactions_cats2 = corr_50_interactions_cats2.join(df['condition_2'])
corr_50_interactions_cats2.shape

(2028, 87)

In [396]:
kaggle_interactions_cats1.shape

(878, 82)

In [397]:
kaggle_interactions_cats2 = kaggle_interactions_cats1.copy()
kaggle_interactions_cats2 = kaggle_interactions_cats2.join(kaggledf['mas_vnr_type'])
kaggle_interactions_cats2 = kaggle_interactions_cats2.join(kaggledf['misc_feature'])
kaggle_interactions_cats2 = kaggle_interactions_cats2.join(kaggledf['lot_config'])
kaggle_interactions_cats2 = kaggle_interactions_cats2.join(kaggledf['condition_1'])
kaggle_interactions_cats2 = kaggle_interactions_cats2.join(kaggledf['condition_2'])
kaggle_interactions_cats2.shape

(878, 87)

In [398]:
for_dummy_df2 = pd.concat([corr_50_interactions_cats2, kaggle_interactions_cats2])
for_dummy_df2.shape

(2906, 87)

In [399]:
878+2028

2906

In [432]:
cats_new1 = ['mas_vnr_type', 'misc_feature', 'lot_config', 'condition_1', 'condition_2']
cats2 = cats1 + cats_new1
cats2

['neighborhood',
 'ms_zoning',
 'house_style',
 'bldg_type',
 'mas_vnr_type',
 'misc_feature',
 'lot_config',
 'condition_1',
 'condition_2']

In [433]:
all_with_dummies2 = pd.get_dummies(data = for_dummy_df2, columns = cats2, drop_first= True)
all_with_dummies2.shape

(2906, 150)

In [434]:
dummydf2 = all_with_dummies2.iloc[:2028]
dummydf2.shape

(2028, 150)

In [436]:
kaggledummydf2 = all_with_dummies2.iloc[2028:]
kaggledummydf2.shape

(878, 150)

In [438]:
corr_50_interactions_cats2.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,saleprice,neighborhood,ms_zoning,house_style,bldg_type,mas_vnr_type,misc_feature,lot_config,condition_1,condition_2
0,6.0,3.0,1479.0,3.0,725.0,725.0,1976.0,2005.0,2.0,289.0,...,130500,Sawyer,RL,2Story,1Fam,BrkFace,No Addl Features,CulDSac,RRAe,Norm
1,7.0,3.0,2122.0,3.0,913.0,913.0,1996.0,1997.0,2.0,132.0,...,220000,SawyerW,RL,2Story,1Fam,BrkFace,No Addl Features,CulDSac,Norm,Norm
2,5.0,2.0,1057.0,3.0,1057.0,1057.0,1953.0,2007.0,1.0,0.0,...,109000,NAmes,RL,1Story,1Fam,,No Addl Features,Inside,Norm,Norm
3,5.0,2.0,1444.0,2.0,384.0,744.0,2006.0,2007.0,2.0,0.0,...,174000,Timber,RL,2Story,1Fam,,No Addl Features,Inside,Norm,Norm
4,6.0,2.0,1445.0,2.0,676.0,831.0,1900.0,1993.0,2.0,0.0,...,138500,SawyerW,RL,1.5Fin,1Fam,,No Addl Features,Inside,Norm,Norm


In [439]:
dummydf2.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,condition_1_RRAn,condition_1_RRNe,condition_1_RRNn,condition_2_Feedr,condition_2_Norm,condition_2_PosA,condition_2_PosN,condition_2_RRAe,condition_2_RRAn,condition_2_RRNn
0,6.0,3.0,1479.0,3.0,725.0,725.0,1976.0,2005.0,2.0,289.0,...,0,0,0,0,1,0,0,0,0,0
1,7.0,3.0,2122.0,3.0,913.0,913.0,1996.0,1997.0,2.0,132.0,...,0,0,0,0,1,0,0,0,0,0
2,5.0,2.0,1057.0,3.0,1057.0,1057.0,1953.0,2007.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,5.0,2.0,1444.0,2.0,384.0,744.0,2006.0,2007.0,2.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,6.0,2.0,1445.0,2.0,676.0,831.0,1900.0,1993.0,2.0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [440]:
kaggle_interactions_cats2.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,saleprice,neighborhood,ms_zoning,house_style,bldg_type,mas_vnr_type,misc_feature,lot_config,condition_1,condition_2
0,6.0,2.0,1928.0,1.0,1020.0,908.0,1910.0,1950.0,2.0,0.0,...,0,OldTown,RM,2Story,2fmCon,,No Addl Features,Inside,Norm,Norm
1,5.0,2.0,1967.0,2.0,1967.0,1967.0,1977.0,1977.0,2.0,0.0,...,0,Sawyer,RL,1Story,Duplex,,No Addl Features,Inside,Norm,Norm
2,7.0,3.0,1496.0,3.0,654.0,664.0,2006.0,2006.0,2.0,0.0,...,0,Gilbert,RL,2Story,1Fam,,No Addl Features,Inside,Norm,Norm
3,5.0,3.0,968.0,2.0,968.0,968.0,1923.0,2006.0,1.0,0.0,...,0,OldTown,RM,1Story,1Fam,,No Addl Features,Inside,Norm,Norm
4,6.0,2.0,1394.0,2.0,1394.0,1394.0,1963.0,1963.0,1.0,247.0,...,0,NAmes,RL,1Story,1Fam,BrkFace,No Addl Features,Inside,Norm,Norm


In [441]:
kaggledummydf2.head()

Unnamed: 0,overall_qual,exter_qual,gr_liv_area,kitchen_qual,total_bsmt_sf,1st_flr_sf,year_built,year_remod/add,full_bath,mas_vnr_area,...,condition_1_RRAn,condition_1_RRNe,condition_1_RRNn,condition_2_Feedr,condition_2_Norm,condition_2_PosA,condition_2_PosN,condition_2_RRAe,condition_2_RRAn,condition_2_RRNn
0,6.0,2.0,1928.0,1.0,1020.0,908.0,1910.0,1950.0,2.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,5.0,2.0,1967.0,2.0,1967.0,1967.0,1977.0,1977.0,2.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,7.0,3.0,1496.0,3.0,654.0,664.0,2006.0,2006.0,2.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,5.0,3.0,968.0,2.0,968.0,968.0,1923.0,2006.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,6.0,2.0,1394.0,2.0,1394.0,1394.0,1963.0,1963.0,1.0,247.0,...,0,0,0,0,1,0,0,0,0,0


The new dummy dataframes match the dataframes that went into them in length and the heads match, so it appears the process produced the correct dataframes.

In [444]:
X_4 = dummydf2.drop(columns = ['saleprice'])
y_4 = dummydf2['saleprice']

In [446]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_4, y_4, random_state = 42)

In [447]:
lr4 = LinearRegression()

In [448]:
lr4.fit(X_train_4, y_train_4)

LinearRegression()

In [449]:
lr4.score(X_train_4, y_train_4)

0.9205387679596472

In [450]:
cross_val_score(lr4, X_train_4, y_train_4).mean()

0.693702061544305

In [455]:
cross_val_score(lr4, X_train_4, y_train_4)

array([0.15992466, 0.78297104, 0.8942531 , 0.87571695, 0.75564455])

In [456]:
cross_val_score(lr4, X_4, y_4)

array([0.77259391, 0.91116963, 0.44750051, 0.87683711, 0.86236042])

In [457]:
cross_val_score(lr4, X_4, y_4).mean()

0.7740923145454642

In [451]:
lr4.score(X_test_4, y_test_4)

0.8856980244382155

In [452]:
preds4 = lr4.predict(X_test_4)

In [453]:
metrics.mean_squared_error(y_test_4, preds4, squared = False)

25660.586407816525

In [474]:
print(f'Model 4 RMSE: {metrics.mean_squared_error(y_test_4, preds4, squared = False)}')
print(f'Model 3 RMSE: {metrics.mean_squared_error(y_test_3, preds3, squared = False)}')
print(f'Model 1_2 (previous best) RMSE: {metrics.mean_squared_error(y_test_1, preds1_2, squared = False)}')
print(f'Null Model RMSE: {metrics.mean_squared_error(y_test_3, null_pred_list, squared = False)}')

Model 4 RMSE: 25660.586407816525
Model 3 RMSE: 25898.612176127735
Model 1_2 (previous best) RMSE: 26741.78505207465
Null Model RMSE: 75922.17182221451


This model doesn't represent much of an improvement, and I'm concerned about the discrepancy between the $r^2$ and the cross-validation score suggests it's overfit, anyway

-----

# Ridge Model (Model 5)
Working from Model 3 (lr3) features.

In [463]:
sc = StandardScaler()
Z_train_3 = sc.fit_transform(X_train_3)
Z_test_3 = sc.transform(X_test_3)

In [466]:
alphas = np.logspace(1, 5, 100)
ridge_cv1 = RidgeCV(alphas = alphas, cv = 5)
ridge_cv1.fit(Z_train_3, y_train_3)

RidgeCV(alphas=array([1.00000000e+01, 1.09749877e+01, 1.20450354e+01, 1.32194115e+01,
       1.45082878e+01, 1.59228279e+01, 1.74752840e+01, 1.91791026e+01,
       2.10490414e+01, 2.31012970e+01, 2.53536449e+01, 2.78255940e+01,
       3.05385551e+01, 3.35160265e+01, 3.67837977e+01, 4.03701726e+01,
       4.43062146e+01, 4.86260158e+01, 5.33669923e+01, 5.85702082e+01,
       6.42807312e+01, 7.05480231e+0...
       1.17681195e+04, 1.29154967e+04, 1.41747416e+04, 1.55567614e+04,
       1.70735265e+04, 1.87381742e+04, 2.05651231e+04, 2.25701972e+04,
       2.47707636e+04, 2.71858824e+04, 2.98364724e+04, 3.27454916e+04,
       3.59381366e+04, 3.94420606e+04, 4.32876128e+04, 4.75081016e+04,
       5.21400829e+04, 5.72236766e+04, 6.28029144e+04, 6.89261210e+04,
       7.56463328e+04, 8.30217568e+04, 9.11162756e+04, 1.00000000e+05]),
        cv=5)

In [467]:
ridge_cv1.best_score_

0.8627475268009327

In [469]:
print(f'Training score: {ridge_cv1.score(Z_train_3, y_train_3)}')
print(f'Test score: {ridge_cv1.score(Z_test_3, y_test_3)}')

Training score: 0.9023260412362292
Test score: 0.8959410316415662


In [477]:
cross_val_score(ridge_cv1, Z_train_3, y_train_3).mean()

0.8335054335859085

In [472]:
preds5 = ridge_cv1.predict(Z_test_3)
preds5[:5]

array([298484.9215135 , 195836.99662535, 164914.19257921, 166734.94786266,
       285707.82864303])

In [473]:
metrics.mean_squared_error(y_test_3, preds5, squared = False)

24483.836265119164

In [475]:
print(f'Model 5 (Ridge) RMSE: {metrics.mean_squared_error(y_test_3, preds5, squared = False)}')
print(f'Model 4 RMSE: {metrics.mean_squared_error(y_test_4, preds4, squared = False)}')
print(f'Model 3 RMSE: {metrics.mean_squared_error(y_test_3, preds3, squared = False)}')
print(f'Model 1_2 (previous best) RMSE: {metrics.mean_squared_error(y_test_1, preds1_2, squared = False)}')
print(f'Null Model RMSE: {metrics.mean_squared_error(y_test_3, null_pred_list, squared = False)}')

Model 5 (Ridge) RMSE: 24483.836265119164
Model 4 RMSE: 25660.586407816525
Model 3 RMSE: 25898.612176127735
Model 1_2 (previous best) RMSE: 26741.78505207465
Null Model RMSE: 75922.17182221451


The Ridge model represents an improvement in RMSE, though the $r^2$ score is similar to model 3.

-----

# Model 6, Lasso Model (using features developed for Model 3)

I worked off the Regularization lesson to set this up.

In [481]:
l_alphas = np.logspace(-3, 0, 100)

lasso_cv1 = LassoCV(alphas = l_alphas)

lasso_cv1.fit(Z_train_3, y_train_3)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]))

In [482]:
lasso_cv1.score(Z_train_3, y_train_3)

0.913956531997295

In [483]:
lasso_cv1.score(Z_test_3, y_test_3)

0.8853056655803074

In [484]:
cross_val_score(lasso_cv1, Z_train_3, y_train_3).mean()

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

0.8420701529018556

The train and test $r^2$ scores are close to each other. The mean cross-validation score of 84.2% is close to the $r^2$. The model is looking well-fit.

In [485]:
preds6 = lasso_cv1.predict(Z_test_3)
preds6[:5]

array([290851.48621169, 184317.3961959 , 169577.06229471, 162153.6505127 ,
       291298.22644188])

In [486]:
metrics.mean_squared_error(y_test_3, preds6, squared = False)

25704.59060373853

In [487]:
print(f'Model 6 (Lasso) RMSE: {metrics.mean_squared_error(y_test_3, preds6, squared = False)}')
print(f'Model 5 (Ridge) RMSE: {metrics.mean_squared_error(y_test_3, preds5, squared = False)}')
print(f'Model 4 RMSE: {metrics.mean_squared_error(y_test_4, preds4, squared = False)}')
print(f'Model 3 RMSE: {metrics.mean_squared_error(y_test_3, preds3, squared = False)}')
print(f'Model 1_2 (previous best) RMSE: {metrics.mean_squared_error(y_test_1, preds1_2, squared = False)}')
print(f'Null Model RMSE: {metrics.mean_squared_error(y_test_3, null_pred_list, squared = False)}')

Model 6 (Lasso) RMSE: 25704.59060373853
Model 5 (Ridge) RMSE: 24483.836265119164
Model 4 RMSE: 25660.586407816525
Model 3 RMSE: 25898.612176127735
Model 1_2 (previous best) RMSE: 26741.78505207465
Null Model RMSE: 75922.17182221451


The RMSE is actually up a bit from the Ridge model. For the purposes of the Kaggle competition, I'm going to make my last entry from the Ridge model.

-----

# Prepping Kaggle Data with Model 5 (Ridge)

In [488]:
Z_kaggle_3 = sc.transform(X_k_3)

In [489]:
predsk5 = ridge_cv1.predict(Z_kaggle_3)
predsk5[:5]

array([131295.17047598, 156471.27267381, 176139.64724935, 116834.4104863 ,
       166376.03975157])

In [512]:
ridge_cv1_submission = lr3_submission.copy()

ridge_cv1_submission.head(2)

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,139731.858709
2718,147996.872743


In [513]:
ridge_cv1_submission['SalePrice'] = predsk5

In [515]:
ridge_cv1_submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,131295.170476
2718,156471.272674
2414,176139.647249
1989,116834.410486
625,166376.039752


In [516]:
ridge_cv1_submission.to_csv('kaggle_submissions/ridge_cv1_submission.csv')