In [1]:
import numpy as np
import pandas as pd
import patsy

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
df = pd.read_csv('/Users/omarcarr/Desktop/Notebooks/DSI-US-5/Projects/Project-2/train.csv')
df_test = pd.read_csv('/Users/omarcarr/Desktop/Notebooks/DSI-US-5/Projects/Project-2/test.csv')

In [3]:
df.shape

(2051, 81)

In [4]:
df.corr()['SalePrice'].sort_values(ascending=False).head(20)

SalePrice         1.000000
Overall Qual      0.800207
Gr Liv Area       0.697038
Garage Area       0.650270
Garage Cars       0.648220
Total Bsmt SF     0.628925
1st Flr SF        0.618486
Year Built        0.571849
Year Remod/Add    0.550370
Full Bath         0.537969
Garage Yr Blt     0.533922
Mas Vnr Area      0.512230
TotRms AbvGrd     0.504014
Fireplaces        0.471093
BsmtFin SF 1      0.423519
Lot Frontage      0.341842
Open Porch SF     0.333476
Wood Deck SF      0.326490
Lot Area          0.296566
Bsmt Full Bath    0.283662
Name: SalePrice, dtype: float64

In [71]:
feature_cols = ['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Garage Area', 'Total Bsmt SF', '1st Flr SF', 'Year Built', 'Full Bath']

In [72]:
X = df[feature_cols]
y = df['SalePrice']

In [73]:
X.shape

(2051, 8)

In [74]:
y.shape

(2051,)

In [75]:
X.isnull().sum()

Overall Qual     0
Gr Liv Area      0
Garage Cars      1
Garage Area      1
Total Bsmt SF    1
1st Flr SF       0
Year Built       0
Full Bath        0
dtype: int64

In [76]:
X['Total Bsmt SF'] = X['Total Bsmt SF'].fillna(value=0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [77]:
X['Garage Cars'] = X['Garage Cars'].fillna(value=0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [78]:
X['Garage Area'] = X['Garage Area'].fillna(value=0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [79]:
X.isnull().sum()

Overall Qual     0
Gr Liv Area      0
Garage Cars      0
Garage Area      0
Total Bsmt SF    0
1st Flr SF       0
Year Built       0
Full Bath        0
dtype: int64

In [80]:
X.shape

(2051, 8)

In [81]:
y.shape

(2051,)

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [83]:
lr = LinearRegression()

In [84]:
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.7717127965485745

In [85]:
lr.score(X_test, y_test)

0.8343568423026974

In [86]:
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [89]:
X_kaggle = df_test[feature_cols].fillna(0.0)

In [90]:
X_kaggle.isnull().sum()

Overall Qual     0
Gr Liv Area      0
Garage Cars      0
Garage Area      0
Total Bsmt SF    0
1st Flr SF       0
Year Built       0
Full Bath        0
dtype: int64

In [91]:
preds = lr.predict(X_kaggle)

In [92]:
ids = df_test['Id']

In [93]:
preds

array([ 1.64858440e+05,  2.13448283e+05,  1.94546583e+05,  1.14701228e+05,
        1.86520538e+05,  8.96794086e+04,  1.00244861e+05,  1.44899897e+05,
        2.14387145e+05,  1.73604596e+05,  1.75325804e+05,  1.37389904e+05,
        1.79462538e+05,  2.85071837e+05,  1.62487413e+05,  1.20227590e+05,
        1.68225670e+05,  1.24534549e+05,  1.99860374e+05,  1.98248447e+05,
        1.25308690e+05,  1.28350353e+05,  2.10232827e+05,  1.55441181e+05,
        2.11456212e+05,  1.13514551e+05,  1.14467142e+05,  1.19088338e+05,
        1.55455003e+05,  1.80538272e+04,  1.02515078e+05,  1.02017612e+05,
        2.57613853e+05,  1.48158097e+05,  2.27260070e+05,  1.91964260e+05,
        1.14439152e+05,  8.51714910e+04,  1.04206276e+05,  2.15838390e+05,
        1.62935329e+05,  2.20180399e+05,  1.57182164e+05,  1.38482488e+05,
        2.19723330e+05,  8.96875110e+04,  2.21802593e+05,  1.15105440e+05,
        1.12218698e+05,  1.23210774e+05,  1.11979793e+05,  2.15912195e+05,
        2.75516237e+05,  

In [94]:
preds_df = pd.DataFrame({
    'Id': ids,
    'SalePrice': preds
})

In [95]:
import datetime

In [96]:
now = str(datetime.datetime.now())

In [97]:
f'predictions_{now}'

'predictions_2018-08-23 13:32:59.948211'

In [98]:
now = str(datetime.datetime.now())
preds_df.to_csv(f'kaggle_preds_{now}', index=False)

In [99]:
pd.read_csv('kaggle_preds_2018-08-23 13:33:02.802196')

Unnamed: 0,Id,SalePrice
0,2658,164858.439763
1,2718,213448.282779
2,2414,194546.583114
3,1989,114701.228265
4,625,186520.538408
5,333,89679.408581
6,1327,100244.861100
7,858,144899.897013
8,95,214387.145070
9,1568,173604.596174
