In [30]:
import numpy as np
import pandas as pd
import patsy

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
df = pd.read_csv('/Users/omarcarr/Desktop/Notebooks/DSI-US-5/Projects/Project-2/train.csv')
df_test = pd.read_csv('/Users/omarcarr/Desktop/Notebooks/DSI-US-5/Projects/Project-2/test.csv')

In [3]:
df.shape

(2051, 81)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
Id                 2051 non-null int64
PID                2051 non-null int64
MS SubClass        2051 non-null int64
MS Zoning          2051 non-null object
Lot Frontage       1721 non-null float64
Lot Area           2051 non-null int64
Street             2051 non-null object
Alley              140 non-null object
Lot Shape          2051 non-null object
Land Contour       2051 non-null object
Utilities          2051 non-null object
Lot Config         2051 non-null object
Land Slope         2051 non-null object
Neighborhood       2051 non-null object
Condition 1        2051 non-null object
Condition 2        2051 non-null object
Bldg Type          2051 non-null object
House Style        2051 non-null object
Overall Qual       2051 non-null int64
Overall Cond       2051 non-null int64
Year Built         2051 non-null int64
Year Remod/Add     2051 non-null int64
Roof Style         20

In [5]:
df.corr()['SalePrice'].sort_values(ascending=False).head(20)

SalePrice         1.000000
Overall Qual      0.800207
Gr Liv Area       0.697038
Garage Area       0.650270
Garage Cars       0.648220
Total Bsmt SF     0.628925
1st Flr SF        0.618486
Year Built        0.571849
Year Remod/Add    0.550370
Full Bath         0.537969
Garage Yr Blt     0.533922
Mas Vnr Area      0.512230
TotRms AbvGrd     0.504014
Fireplaces        0.471093
BsmtFin SF 1      0.423519
Lot Frontage      0.341842
Open Porch SF     0.333476
Wood Deck SF      0.326490
Lot Area          0.296566
Bsmt Full Bath    0.283662
Name: SalePrice, dtype: float64

In [6]:
feature_cols = ['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Garage Area', 'Year Built', 'Mas Vnr Area', 'Lot Area']

In [7]:
X = df[feature_cols]
y = df['SalePrice']

In [8]:
X.shape

(2051, 7)

In [9]:
y.shape

(2051,)

In [10]:
X.isnull().sum()

Overall Qual     0
Gr Liv Area      0
Garage Cars      1
Garage Area      1
Year Built       0
Mas Vnr Area    22
Lot Area         0
dtype: int64

In [11]:
X[X['Mas Vnr Area'].isnull()]

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Cars,Garage Area,Year Built,Mas Vnr Area,Lot Area
22,8,2253,2.0,575.0,2005,,12867
41,10,2076,3.0,850.0,2006,,13891
86,7,1200,2.0,555.0,2003,,10083
212,7,1436,2.0,529.0,2008,,7993
276,6,914,0.0,0.0,2002,,8050
338,8,1880,3.0,880.0,2007,,12217
431,8,2031,2.0,577.0,2002,,9473
451,8,1460,2.0,480.0,2006,,10037
591,7,1241,2.0,569.0,2006,,4274
844,8,1550,2.0,528.0,2006,,5330


In [12]:
X['Mas Vnr Area'] = X['Mas Vnr Area'].fillna(value=0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [13]:
X['Garage Cars'] = X['Garage Cars'].fillna(value=0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [14]:
X['Garage Area'] = X['Garage Area'].fillna(value=0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [15]:
X.isnull().sum()

Overall Qual    0
Gr Liv Area     0
Garage Cars     0
Garage Area     0
Year Built      0
Mas Vnr Area    0
Lot Area        0
dtype: int64

In [16]:
X.shape

(2051, 7)

In [17]:
y.shape

(2051,)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [19]:
ss = StandardScaler()
Xs = ss.fit_transform(X)

In [21]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [22]:
knn.score(X_test, y_test)

0.001949317738791423

In [23]:
print(np.mean(y_test))

182692.03118908382


In [25]:
%%time

knn_params = {
    'n_neighbors':[1,3,5,9,11,15,17,19,21,23,25,27],
    'weights':['uniform','distance'],
    'metric':['euclidean','manhattan']
}

knn_gridsearch = GridSearchCV(KNeighborsClassifier(), knn_params, cv=7, verbose=1, n_jobs=3)

knn_gridsearch = knn_gridsearch.fit(X_train, y_train)

Fitting 7 folds for each of 48 candidates, totalling 336 fits




CPU times: user 330 ms, sys: 42 ms, total: 372 ms
Wall time: 7.41 s


[Parallel(n_jobs=3)]: Done 336 out of 336 | elapsed:    7.1s finished


In [26]:
knn_gridsearch.best_score_

0.013003901170351105

In [27]:
knn_gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}

In [28]:
best_knn = knn_gridsearch.best_estimator_
best_knn.score(X_test, y_test)

0.01364522417153996

In [29]:
print('baseline:', np.mean(y_test))
print('default KNN:', knn.score(X_test, y_test))

baseline: 182692.03118908382
default KNN: 0.001949317738791423


In [31]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.09947984395318596
0.003898635477582846


In [32]:
gs_params = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(0,5,100)
}

lr_gridsearch = GridSearchCV(LogisticRegression(), gs_params, cv=5, verbose=1)

In [None]:
%%time
lr_gridsearch = lr_gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits




In [None]:
X_kaggle = df_test[feature_cols]
X_kaggle['Mas Vnr Area'] = X_kaggle['Mas Vnr Area'].fillna(value=0.0)
preds = lr.predict(X_kaggle)

In [42]:
ids = df_test['Id']

In [43]:
preds

array([166970.48771741, 178306.08100383, 216841.98284209, 108808.12043813,
       179335.37300653,  89378.81563445,  93322.9926649 , 129797.98552543,
       215460.96509123, 167095.22148801, 166356.2008656 , 127366.39419832,
       163497.64864366, 284191.59536025, 161454.97710284, 129919.88576237,
       177848.95972232, 118016.66662046, 186137.42066693, 207398.7313423 ,
       116954.32401798, 129366.91873116, 188009.57117505, 167956.11779379,
       202827.71675467, 112207.36169418, 115742.22454775, 115430.54409682,
       157835.87661013,  24165.33454299, 103426.94773797,  96553.05708721,
       240084.97258395, 135917.16987932, 224171.68289512, 213754.78824656,
        94364.44796394,  94113.61336022, 114421.90087197, 205668.64643008,
       174686.84460996, 224415.54519808, 142847.47452451, 135750.65589118,
       214343.13974971,  84159.4912233 , 236469.40227922, 112519.69184423,
       106068.58171915, 115127.70461773, 108020.26293427, 232344.7745017 ,
       265929.26308915, 1

In [44]:
preds_df = pd.DataFrame({
    'Id': ids,
    'SalePrice': preds
})

In [45]:
import datetime

In [46]:
now = str(datetime.datetime.now())

In [47]:
f'predictions_{now}'

'predictions_2018-08-23 01:22:52.209881'

In [48]:
now = str(datetime.datetime.now())
preds_df.to_csv(f'kaggle_preds_{now}', index=False)

In [49]:
pd.read_csv('kaggle_preds_2018-08-23 00:31:25.234333')

Unnamed: 0,Id,SalePrice
0,2658,138705.474199
1,2718,187798.076976
2,2414,207279.192487
3,1989,130850.679406
4,625,189299.458513
5,333,67041.727124
6,1327,96958.278043
7,858,139570.224545
8,95,195654.666432
9,1568,188241.311960
