In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns

from sklearn.linear_model import LinearRegression, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

%matplotlib inline

In [26]:
df_data = pd.read_csv('train.csv')
final_data = pd.read_csv('test.csv')

In [27]:
def does_does_not(cell):
    if cell >= 1:
        return 1
    else:
        return 0

def clean(df):
    df.rename(columns= {col: col.lower().replace(' ','_') for col in df.columns}, inplace=True)

    df['deck_porch'] = df['wood_deck_sf'] + df['open_porch_sf'] + df['enclosed_porch'] + df['3ssn_porch'] + df['screen_porch']
    df['fence'] = df['fence'].notnull().astype('int')
#     df['overall_cond_qual'] = df['overall_cond'] + df['overall_qual']

    df.drop([
    'alley', 'lot_frontage', 'mas_vnr_type', 'fireplace_qu', 'pool_qc', 'misc_feature',
    'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1', 'bsmtfin_type_2',
    'bsmtfin_sf_2', 'bsmt_unf_sf', 'bsmt_full_bath', 'bsmt_half_bath',
    'garage_type', 'garage_yr_blt', 'garage_finish', 'garage_area', 'garage_qual', 'garage_cond',
    'wood_deck_sf', 'open_porch_sf', 'enclosed_porch', '3ssn_porch', 'screen_porch',
#     'overall_cond', 'overall_qual'
    ], axis=1, inplace=True)

    features_replace = ['pool_area' , 'garage_cars', 'fireplaces', 'total_bsmt_sf', 'mas_vnr_area', 'deck_porch']
    for i in features_replace:
        df[i] = df[i].map(does_does_not)

    df.rename(columns={
    'pool_area': 'has_pool',
    'garage_cars': 'has_garage', 
    'fireplaces': 'has_fireplace',
    'total_bsmt_sf': 'has_bsmt',
    'mas_vnr_area': 'has_mas_vnr',
    'fence': 'has_fence'}, inplace=True)


    df['lot_shape'] =df['lot_shape'].replace({'IR3':'IR2'})
    df['condition_1'] =df['condition_1'].replace({'PosA':'Pos', 'PosN':'Pos', 'RRAe':'RRe', 'RRNe':'RRe', 'RRNn':'RRn', 'RRAn':'RRn'})
    df['condition_2'] =df['condition_2'].replace({'PosA':'Pos', 'PosN':'Pos', 'RRAe':'RRe', 'RRNe':'RRe', 'RRNn':'RRn', 'RRAn':'RRn'})
#     df['year_built'] = pd.to_datetime(df['year_built'], format='%Y')
#     df['year_remod/add'] = pd.to_datetime(df['year_remod/add'], format='%Y')
#     df['mo_sold'] = pd.to_datetime(df['mo_sold'], format='%m')
#     df['yr_sold'] = pd.to_datetime(df['yr_sold'], format='%Y')
    df['exter_qual'] = df['exter_qual'].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
    df['exter_cond'] = df['exter_cond'].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
    df['heating_qc'] = df['heating_qc'].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
    df['electrical'] = df['electrical'].replace({'SBrkr': 5, 'FuseA': 4, 'FuseF': 3, 'FuseP': 2, 'Mix': 1})
    df['kitchen_qual'] = df['kitchen_qual'].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})

    df.drop(['gr_liv_area', '2nd_flr_sf', 'bedroom_abvgr'], axis=1, inplace=True)

    df = pd.get_dummies(df, columns=[
    'neighborhood',
    'lot_shape',
    'ms_subclass',
    'ms_zoning',
    'land_contour',
    'lot_config',
    'utilities',
    'land_slope',
    'condition_1',
    'condition_2',
    'bldg_type',
    'exterior_1st',
    'exterior_2nd',
    'foundation',
    'heating',
    'central_air',
    'full_bath',
    'half_bath',
    'kitchen_abvgr',
    'functional',
    'paved_drive',
    'sale_type',
    'street',
    'house_style',
    'roof_style',
    'roof_matl',
    'electrical'
    ], prefix=[
    'n_',
    'ls_',
    'mssub_',
    'mszon_',
    'landcon_',
    'lotfig_',
    'ut_',
    'landm_',
    'cond1_',
    'cond2_',
    'btype_',
    'ext1_',
    'ext2_',
    'found_',
    'heat_',
    'ac_',
    'fb_',
    'hb_',
    'kita_',
    'funct_',
    'paved_',
    'stype_',
    'st_',
    'hstyle_',
    'rst_',
    'rmat_',
    'ele_'
    ])
    
    return df 

In [28]:
df = clean(df_data)
final = clean(final_data)

In [29]:
def mia_columns(df1, df2):
    mia_df2 = list(set(df1.columns) - set(df2.columns))
    print(mia_df2)
    mia_df1 = list(set(df2.columns) - set(df1.columns))
    print(mia_df1)
    
    for i in mia_df2:
        df2[i] = 0
    
    for i in mia_df1:
        df1[i] = 0

In [None]:
# n_jobs = -1 --> split over multiple cores

In [30]:
mia_columns(df, final)

['rmat__ClyTile', 'ele__1', 'ext1__Stone', 'ele__5', 'cond2__Artery', 'heat__OthW', 'n__GrnHill', 'sale_condition', 'saleprice', 'cond2__RRn', 'mssub__150', 'rmat__Membran', 'funct__Sev', 'heat__Wall', 'ext1__CBlock', 'ele__4', 'ext1__ImStucc', 'ele__2', 'mszon__A (agr)', 'ext2__Stone', 'ut__NoSeWa', 'n__Landmrk', 'ele__3', 'funct__Sal', 'cond2__RRe']
['rmat__Roll', 'ele__3.0', 'ele__5.0', 'heat__Floor', 'ext1__PreCast', 'ext2__Other', 'ele__2.0', 'ele__4.0', 'rmat__Metal', 'ext2__PreCast', 'stype__VWD']


In [31]:
y = df['saleprice']
X = df.drop(['id', 'pid', 'sale_condition', 'saleprice'], axis=1)
# select all but id, pid, sale_condition, saleprice

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [33]:
lr = LinearRegression()
lr.fit(X_train_sc, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [34]:
print('train', lr.score(X_train_sc, y_train))
print('test', lr.score(X_test_sc, y_test))

train 0.892002798672
test -2.31711171869e+22


In [37]:
enet = ElasticNet()
params = {
    'alpha': np.arange(.1, .4, .005),
    'l1_ratio': np.arange(.001, .01, .05)
}
gs = GridSearchCV(enet, params, n_jobs=-1)
gs.fit(X_train_sc, y_train)
gs.best_params_

{'alpha': 0.26000000000000012, 'l1_ratio': 0.001}

In [38]:
gs.score(X_test_sc, y_test)

0.85969130329704457

In [39]:
pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score').head()



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_alpha,param_l1_ratio,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
32,0.050467,0.002005,0.830379,0.886592,0.26,0.001,"{'alpha': 0.26, 'l1_ratio': 0.001}",1,0.856013,0.883818,0.84946,0.882883,0.785576,0.893074,0.002632,0.001638,0.031762,0.004599
31,0.051136,0.001337,0.830377,0.886852,0.255,0.001,"{'alpha': 0.255, 'l1_ratio': 0.001}",2,0.856038,0.884091,0.84952,0.883169,0.785485,0.893296,0.002165,0.000945,0.031824,0.004572
33,0.054478,0.001338,0.830376,0.886332,0.265,0.001,"{'alpha': 0.265, 'l1_ratio': 0.001}",3,0.855982,0.883546,0.849396,0.882598,0.785662,0.892852,0.003692,0.000472,0.031701,0.004626
30,0.051137,0.001337,0.83037,0.887112,0.25,0.001,"{'alpha': 0.25, 'l1_ratio': 0.001}",4,0.85606,0.884363,0.849575,0.883454,0.785387,0.893518,0.005114,0.000473,0.031887,0.004545
34,0.049799,0.001671,0.830368,0.886072,0.27,0.001,"{'alpha': 0.27, 'l1_ratio': 0.001}",5,0.855947,0.883273,0.849328,0.882313,0.785742,0.892629,0.002501,0.000473,0.03164,0.004653


In [40]:
# X_final = final.drop(['id', 'pid', 'sale_condition', 'saleprice'], axis=1)
# X_final_sc = ss.transform(X_final)
# predictions = gs.predict(X_final_sc)

In [41]:
# final['SalePrice'] = predictions
# final[['id','SalePrice']].to_csv('export_clean_attempt2.csv', index=False)

## Testing Decision Tree

In [22]:
from sklearn.tree import DecisionTreeRegressor

In [71]:
dtr = DecisionTreeRegressor(random_state=101)
dtr.fit(X_train_sc, y_train)
params = {
    'max_depth': np.arange(1, 6, 1),
    'max_features': [1, 4, 'sqrt', 'log2']
}
gs = GridSearchCV(dtr, params, n_jobs=-1)
gs.fit(X_train_sc, y_train)
gs.best_params_



# max_depth grid search over
# max_features 

# possibly try ensambling trees

{'max_depth': 4, 'max_features': 'log2'}

In [72]:
gs.score(X_test_sc, y_test)

0.49367651000132406

In [73]:
gs.score(X_train_sc, y_train)

0.51339071915400081

In [80]:
# X_final = final.drop(['id', 'pid', 'sale_condition', 'saleprice', 'SalePrice'], axis=1)
# X_final_sc = ss.transform(X_final)
# predictions = gs.predict(X_final_sc)

In [81]:
# final['SalePrice'] = predictions
# final[['id','SalePrice']].to_csv('export_dtree.csv', index=False)

## Random Forest

In [67]:
from sklearn.ensemble import RandomForestRegressor

In [91]:
rf = RandomForestRegressor(random_state=101)

rf.fit(X_train_sc, y_train)
params = {
#     'criterion': ['mse', 'mae'],
    'max_depth': np.arange(1, 6, 1),
    'max_features': [1, 4, 'sqrt', 'log2']
}
gs = GridSearchCV(rf, params, n_jobs=-1)
gs.fit(X_train_sc, y_train)
gs.best_params_

{'criterion': 'mse', 'max_depth': 5, 'max_features': 'sqrt'}

In [92]:
gs.score(X_test_sc, y_test)
# 0.7113133578831945

0.7113133578831945

In [93]:
gs.score(X_train_sc, y_train)
# 0.73776835169906929

0.73776835169906929

In [88]:
# X_final = final.drop(['id', 'pid', 'sale_condition', 'saleprice', 'SalePrice'], axis=1)
# X_final_sc = ss.transform(X_final)
# predictions = gs.predict(X_final_sc)

In [89]:
# final['SalePrice'] = predictions
# final[['id','SalePrice']].to_csv('export_rf.csv', index=False)

## Gradient Boosting

In [94]:
from sklearn.ensemble import GradientBoostingRegressor

In [113]:
gb = GradientBoostingRegressor(random_state=101)

gb.fit(X_train_sc, y_train)
params = {
    'loss': ['huber'], #['ls', 'lad', 'huber', 'quantile']
    'max_depth': np.arange(4, 6, 1),
    'criterion': ['mae'], #['mse', 'mae', 'friedman_mse']
    'max_features': [1, 4, 'sqrt', 'log2'],
    'n_estimators': [300, 500, 1000]
}
gs = GridSearchCV(gb, params, n_jobs=-1)
gs.fit(X_train_sc, y_train)
gs.best_params_

{'criterion': 'mae',
 'loss': 'huber',
 'max_depth': 5,
 'max_features': 'log2',
 'n_estimators': 1000}

In [114]:
gs.score(X_test_sc, y_test)
# plain 0.90252878976979123
# huber 0.89292379875671601
# {'loss': 'huber', 'max_depth': 5} 0.90589232431861111
# {'criterion': 'mae', 'loss': 'huber', 'max_depth': 3} 0.90597834698026458
# {'criterion': 'mae', 'loss': 'huber', 'max_depth': 5, 'max_features': 'sqrt'} 0.9045417183283615
# {'criterion': 'mae',
# 'loss': 'huber', 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 300} 0.90794842532518771
# {'criterion': 'mae', 'loss': 'huber', 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 1000} 0.90790961372937073

0.90790961372937073

In [115]:
gs.score(X_train_sc, y_train)
# 0.93918822856034878
# 0.9309919929623941
# 0.97217982680214243
# 0.92993912736295514
# 0.94199221769889019
# 0.96241756998979489
# 0.98467198273141898

0.98467198273141898

In [116]:
# X_final = final.drop(['id', 'pid', 'sale_condition', 'saleprice', 'SalePrice'], axis=1)
# X_final_sc = ss.transform(X_final)
# predictions = gs.predict(X_final_sc)

In [117]:
# final['SalePrice'] = predictions
# final[['id','SalePrice']].to_csv('export_gb.csv', index=False)

## SVR

In [118]:
from sklearn.svm import SVR

In [141]:
svr = SVR()

gb.fit(X_train_sc, y_train)
params = {
    'kernel': ['linear'],
    'degree': [3],
    'C': [10, 15, 20],
}
gs = GridSearchCV(svr, params, n_jobs=-1)
gs.fit(X_train_sc, y_train)
gs.best_params_

{'C': 20, 'degree': 3, 'kernel': 'linear'}

In [142]:
gs.score(X_test_sc, y_test)
# -0.061633291974839681
# linear 0.16649637247328819
# c=3 0.41431378303430599

0.76676970456362969

In [143]:
gs.score(X_train_sc, y_train)
# -0.053018817800281282
# 0.17806148048421822
# 0.42678741656518404

0.77592513773713501

In [144]:
# X_final = final.drop(['id', 'pid', 'sale_condition', 'saleprice', 'SalePrice'], axis=1)
# X_final_sc = ss.transform(X_final)
# predictions = gs.predict(X_final_sc)

In [145]:
# final['SalePrice'] = predictions
# final[['id','SalePrice']].to_csv('export_svr2.csv', index=False)

## KNN

In [147]:
from sklearn.neighbors import KNeighborsRegressor

In [161]:
knn = KNeighborsRegressor()

gb.fit(X_train_sc, y_train)
params = {
    'n_neighbors': np.arange(7, 10, 1),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
gs = GridSearchCV(knn, params, n_jobs=-1)
gs.fit(X_train_sc, y_train)
gs.best_params_

{'algorithm': 'auto', 'n_neighbors': 8, 'weights': 'distance'}

In [162]:
gs.score(X_test_sc, y_test)
# 0.7484117445075622

0.77611852510862611

In [163]:
gs.score(X_train_sc, y_train)
# 0.81229272409369702

0.99999909579182777

In [152]:
# X_final = final.drop(['id', 'pid', 'sale_condition', 'saleprice', 'SalePrice'], axis=1)
# X_final_sc = ss.transform(X_final)
# predictions = gs.predict(X_final_sc)

In [153]:
# final['SalePrice'] = predictions
# final[['id','SalePrice']].to_csv('export_knn2.csv', index=False)