In [28]:
import numpy as np
import pandas as pd
import patsy

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [29]:
df = pd.read_csv('/Users/omarcarr/Desktop/Notebooks/DSI-US-5/Projects/Project-2/train.csv')
df_test = pd.read_csv('/Users/omarcarr/Desktop/Notebooks/DSI-US-5/Projects/Project-2/test.csv')

In [30]:
df.shape

(2051, 81)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
Id                 2051 non-null int64
PID                2051 non-null int64
MS SubClass        2051 non-null int64
MS Zoning          2051 non-null object
Lot Frontage       1721 non-null float64
Lot Area           2051 non-null int64
Street             2051 non-null object
Alley              140 non-null object
Lot Shape          2051 non-null object
Land Contour       2051 non-null object
Utilities          2051 non-null object
Lot Config         2051 non-null object
Land Slope         2051 non-null object
Neighborhood       2051 non-null object
Condition 1        2051 non-null object
Condition 2        2051 non-null object
Bldg Type          2051 non-null object
House Style        2051 non-null object
Overall Qual       2051 non-null int64
Overall Cond       2051 non-null int64
Year Built         2051 non-null int64
Year Remod/Add     2051 non-null int64
Roof Style         20

In [32]:
df.corr()['SalePrice'].sort_values(ascending=False).head(20)

SalePrice         1.000000
Overall Qual      0.800207
Gr Liv Area       0.697038
Garage Area       0.650270
Garage Cars       0.648220
Total Bsmt SF     0.628925
1st Flr SF        0.618486
Year Built        0.571849
Year Remod/Add    0.550370
Full Bath         0.537969
Garage Yr Blt     0.533922
Mas Vnr Area      0.512230
TotRms AbvGrd     0.504014
Fireplaces        0.471093
BsmtFin SF 1      0.423519
Lot Frontage      0.341842
Open Porch SF     0.333476
Wood Deck SF      0.326490
Lot Area          0.296566
Bsmt Full Bath    0.283662
Name: SalePrice, dtype: float64

In [33]:
feature_cols = ['Overall Qual', 'Garage Cars', 'Year Built', 'Lot Area', '1st Flr SF']

In [34]:
X = df[feature_cols]
y = df['SalePrice']

In [35]:
X.shape

(2051, 5)

In [36]:
y.shape

(2051,)

In [37]:
X.isnull().sum()

Overall Qual    0
Garage Cars     1
Year Built      0
Lot Area        0
1st Flr SF      0
dtype: int64

In [38]:
y.isnull().sum()

0

In [39]:
X[X['Garage Cars'].isnull()]

Unnamed: 0,Overall Qual,Garage Cars,Year Built,Lot Area,1st Flr SF
1712,5,,1923,9060,942


In [40]:
X['Garage Cars'].value_counts()

2.0    1136
1.0     524
3.0     263
0.0     113
4.0      13
5.0       1
Name: Garage Cars, dtype: int64

In [41]:
X['Garage Cars'] = X['Garage Cars'].fillna(value=0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [42]:
X.isnull().sum()

Overall Qual    0
Garage Cars     0
Year Built      0
Lot Area        0
1st Flr SF      0
dtype: int64

In [43]:
X.shape

(2051, 5)

In [44]:
y.shape

(2051,)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [46]:
lr = LinearRegression()

In [47]:
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.7408381524691537

In [48]:
lr.score(X_test, y_test)

0.7856757427134273

In [49]:
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [50]:
df_test.isnull().sum()

Id                  0
PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      160
Lot Area            0
Street              0
Alley             821
Lot Shape           0
Land Contour        0
Utilities           0
Lot Config          0
Land Slope          0
Neighborhood        0
Condition 1         0
Condition 2         0
Bldg Type           0
House Style         0
Overall Qual        0
Overall Cond        0
Year Built          0
Year Remod/Add      0
Roof Style          0
Roof Matl           0
Exterior 1st        0
Exterior 2nd        0
Mas Vnr Type        1
Mas Vnr Area        1
Exter Qual          0
Exter Cond          0
                 ... 
Full Bath           0
Half Bath           0
Bedroom AbvGr       0
Kitchen AbvGr       0
Kitchen Qual        0
TotRms AbvGrd       0
Functional          0
Fireplaces          0
Fireplace Qu      422
Garage Type        44
Garage Yr Blt      45
Garage Finish      45
Garage Cars         0
Garage Area         0
Garage Qua

In [51]:
X_kaggle = df_test[feature_cols]

preds = lr.predict(X_kaggle)

In [52]:
ids = df_test['Id']

In [53]:
preds

array([138705.47419887, 187798.07697582, 207279.19248654, 130850.67940581,
       189299.45851273,  67041.72712382,  96958.27804267, 139570.22454544,
       195654.66643233, 188241.3119605 , 184532.28011387, 162340.53679916,
       189002.20889951, 249801.78528016, 150128.92492793, 137989.88735352,
       163843.5500615 , 126857.25820013, 207158.25149823, 221495.76081994,
       127094.39806866, 134656.34371251, 203543.17413886, 189524.5673254 ,
       221212.7738571 , 117965.30035604, 102322.55138733, 110038.99363183,
       173530.14023462,  11778.00540129,  87959.48524824,  93159.36829379,
       235177.5189983 , 153045.25847766, 233983.21851976, 199433.35598311,
        86792.451245  , 107916.39364978, 135443.20988441, 222378.61314981,
       157689.22462229, 210060.1534302 , 152827.73985531, 145296.62281595,
       221431.67950056,  59792.63540053, 209203.28064171, 121680.88885386,
       115759.073465  , 124089.17369299, 113945.67857711, 221720.90659633,
       285160.53081589, 1

In [54]:
preds_df = pd.DataFrame({
    'Id': ids,
    'SalePrice': preds
})

In [55]:
import datetime

In [56]:
now = str(datetime.datetime.now())

In [57]:
f'predictions_{now}'

'predictions_2018-08-23 00:30:07.264839'

In [58]:
now = str(datetime.datetime.now())
preds_df.to_csv(f'kaggle_preds_{now}', index=False)

In [59]:
pd.read_csv('kaggle_preds_2018-08-23 00:31:25.234333')

Unnamed: 0,Id,SalePrice
0,2658,138705.474199
1,2718,187798.076976
2,2414,207279.192487
3,1989,130850.679406
4,625,189299.458513
5,333,67041.727124
6,1327,96958.278043
7,858,139570.224545
8,95,195654.666432
9,1568,188241.311960
