In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

import remap_values as rv

import cPickle

In [2]:
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

In [3]:
# Load the raw training table from the local input directory.
inp_df = pd.read_csv("inp_data/train.csv")

# Run the project's cleaning helper in training mode, then shuffle the rows
# with a fixed seed so the row order is the same on every run.
my_df = shuffle(rv.run_clean(inp_df, train=True), random_state=0)

# Separate the target column from the remaining columns; both are copies,
# so later edits to my_df do not leak into them.
label_df = my_df['SalePrice'].copy()
feature_df = my_df.drop(['SalePrice'], axis=1).copy()

In [4]:
# Read the test table and pass it through the project's cleaning helper
# (called here without the train=True flag).
test_df = rv.run_clean(pd.read_csv("inp_data/test.csv"))

In [5]:
# Column layout the fitted models were trained on.
train_cols = feature_df.columns

# Give the test frame an all-zero 'HouseStyle_5' column (presumably a dummy
# column that the cleaned test data lacks -- confirm against rv.run_clean),
# then keep exactly the training columns, in training order.
test_df = test_df.assign(HouseStyle_5=0)[train_cols].copy()

In [6]:
from sklearn.linear_model import Ridge # alpha 1
from sklearn.linear_model import Lasso # alpha 1
from sklearn.linear_model import BayesianRidge # alpha_1, alpha_2, lambda_1, lambda_2, both def 1e-6
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge # alpha , kernel, gamma
from sklearn.ensemble     import GradientBoostingRegressor

In [7]:
# Candidate hyper-parameters for an RBF-kernel support vector regressor.
gen_params = dict(
    kernel=['rbf'],
    C=[0.5, 1.0, 5.0],
    gamma=[0.005, 0.01, 0.05],
    epsilon=[0.005, 0.01, 0.05],
)

# rv.optimize_fit is a project helper; nf=10 is presumably the number of
# cross-validation folds (the recorded output lists ten folds).
svr_rbf_reg = rv.optimize_fit(
    SVR(),
    feature_df.values,
    label_df.values,
    gen_params,
    nf=10,
)

Fold  1 accuracy: 0.8883  ,  0.8714  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
Fold  2 accuracy: 0.8024  ,  0.8771  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
Fold  3 accuracy: 0.9252  ,  0.8654  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
Fold  4 accuracy: 0.8861  ,  0.8707  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
Fold  5 accuracy: 0.8109  ,  0.8747  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
Fold  6 accuracy: 0.8842  ,  0.8698  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
Fold  7 accuracy: 0.8853  ,  0.8701  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
Fold  8 accuracy: 0.8935  ,  0.8687  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
Fold  9 accuracy: 0.9068  ,  0.8647  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
Fold 10 accuracy: 0.8777  ,  0.8707  {'epsilon': 0.01, 'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
 
Clf  0 Mean Accuracy: 0.8761

In [12]:
# Log-spaced candidates: 10 raised to exponents from -4 up to (not including)
# -3 in steps of 0.2. The same values are tried for both alpha and gamma.
log_grid = 10 ** np.arange(-4, -3, 0.2)

gen_params = {
    'kernel': ['rbf'],
    'alpha': log_grid,
    'gamma': log_grid.copy(),
}

# Search the grid for an RBF kernel ridge regressor with the project helper.
kRBF_reg = rv.optimize_fit(
    KernelRidge(),
    feature_df.values,
    label_df.values,
    gen_params,
    nf=10,
)

Fold  1 accuracy: 0.8685  ,  0.8659  {'alpha': 0.00039810717055349773, 'gamma': 0.00063095734448019429, 'kernel': 'rbf'}
Fold  2 accuracy: 0.7934  ,  0.8738  {'alpha': 0.00025118864315095823, 'gamma': 0.00039810717055349773, 'kernel': 'rbf'}
Fold  3 accuracy: 0.9182  ,  0.8581  {'alpha': 0.00063095734448019429, 'gamma': 0.00063095734448019429, 'kernel': 'rbf'}
Fold  4 accuracy: 0.8727  ,  0.8635  {'alpha': 0.00063095734448019429, 'gamma': 0.00063095734448019429, 'kernel': 'rbf'}
Fold  5 accuracy: 0.8383  ,  0.8693  {'alpha': 0.00063095734448019429, 'gamma': 0.00063095734448019429, 'kernel': 'rbf'}
Fold  6 accuracy: 0.8925  ,  0.8657  {'alpha': 0.00063095734448019429, 'gamma': 0.00063095734448019429, 'kernel': 'rbf'}
Fold  7 accuracy: 0.8744  ,  0.8691  {'alpha': 0.00063095734448019429, 'gamma': 0.00063095734448019429, 'kernel': 'rbf'}
Fold  8 accuracy: 0.8810  ,  0.8669  {'alpha': 0.00063095734448019429, 'gamma': 0.00063095734448019429, 'kernel': 'rbf'}
Fold  9 accuracy: 0.9032  ,  0.8

In [13]:
# Candidate hyper-parameters for gradient boosting with the Huber loss.
# 'alpha' here is the estimator's own parameter, not a regularization weight.
gen_params = dict(
    loss=['huber'],
    n_estimators=[300, 400],
    max_depth=[2, 3, 4],
    min_samples_leaf=[3],
    alpha=[0.8, 0.85, 0.9, 0.95],
)

# Search the grid with the project helper, as for the other regressors.
boost_reg = rv.optimize_fit(
    GradientBoostingRegressor(),
    feature_df.values,
    label_df.values,
    gen_params,
    nf=10,
)

Fold  1 accuracy: 0.8229  ,  0.8790  {'n_estimators': 400, 'alpha': 0.9, 'min_samples_leaf': 3, 'max_depth': 3, 'loss': 'huber'}
Fold  2 accuracy: 0.8033  ,  0.8734  {'n_estimators': 400, 'alpha': 0.85, 'min_samples_leaf': 3, 'max_depth': 2, 'loss': 'huber'}
Fold  3 accuracy: 0.9158  ,  0.8678  {'n_estimators': 400, 'alpha': 0.8, 'min_samples_leaf': 3, 'max_depth': 4, 'loss': 'huber'}
Fold  4 accuracy: 0.8806  ,  0.8713  {'n_estimators': 300, 'alpha': 0.85, 'min_samples_leaf': 3, 'max_depth': 4, 'loss': 'huber'}
Fold  5 accuracy: 0.8393  ,  0.8702  {'n_estimators': 300, 'alpha': 0.9, 'min_samples_leaf': 3, 'max_depth': 2, 'loss': 'huber'}
Fold  6 accuracy: 0.8881  ,  0.8726  {'n_estimators': 300, 'alpha': 0.8, 'min_samples_leaf': 3, 'max_depth': 3, 'loss': 'huber'}
Fold  7 accuracy: 0.8756  ,  0.8731  {'n_estimators': 400, 'alpha': 0.8, 'min_samples_leaf': 3, 'max_depth': 3, 'loss': 'huber'}
Fold  8 accuracy: 0.8943  ,  0.8653  {'n_estimators': 400, 'alpha': 0.85, 'min_samples_leaf': 3

In [None]:
#svr_rbf_reg
#boost_reg

In [14]:
# Persist the three fitted models, one binary pickle file per model.
for pkl_name, fitted in [('svr_rbf_reg4.pkl', svr_rbf_reg),
                         ('boost_reg4.pkl', boost_reg),
                         ('krbf_reg4.pkl', kRBF_reg)]:
    with open(pkl_name, 'wb') as fid:
        cPickle.dump(fitted, fid)

In [6]:
# Restore two previously fitted models from disk.
#
# The files are opened in binary mode ('rb'). The pickles in this notebook are
# written with 'wb', and reading pickle data through a text-mode handle can
# corrupt it on platforms that translate newlines.
#
# NOTE: unpickling can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open('svr_rbf_reg2.pkl', 'rb') as fid:
    svr_rbf_reg = cPickle.load(fid)
with open('boost_reg2.pkl', 'rb') as fid:
    boost_reg = cPickle.load(fid)

In [15]:
# Predict on the cleaned test features and raise 10 to the model output
# (presumably the models were fitted on log10(SalePrice) -- confirm in
# rv.run_clean). Despite its name, lin_pred holds the gradient boosting
# predictions.
lin_pred = 10 ** boost_reg.predict(test_df)
rbf_pred = 10 ** svr_rbf_reg.predict(test_df)
krbf_pred = 10 ** kRBF_reg.predict(test_df)

In [16]:
# Re-read the raw test file to recover the 'Id' column for the submissions.
# 'Fence' is read alongside it and dropped again before the frames are built.
id_df = pd.read_csv("inp_data/test.csv")[['Id', 'Fence']]
id_only = id_df.drop(['Fence'], axis=1)

# One two-column frame (Id, SalePrice) per model.
lin_df = id_only.assign(SalePrice=lin_pred)
rbf_df = id_only.assign(SalePrice=rbf_pred)
kbf_df = id_only.assign(SalePrice=krbf_pred)

In [17]:
# Write each submission frame to its own CSV file, without the index column.
for csv_name, frame in (('boost_4.csv', lin_df),
                        ('rbf_4.csv', rbf_df),
                        ('krbf_4.csv', kbf_df)):
    frame.to_csv(csv_name, index=False)

In [6]:
# Load two pickled models (binary mode), in the order they are unpacked below.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
loaded = []
for pkl_name in ('svr_lin.pkl', 'svr_rbf.pkl'):
    with open(pkl_name, 'rb') as fid:
        loaded.append(cPickle.load(fid))
svr_lin, svr_rbf = loaded

In [26]:
# Predictions on the training features, left in the models' output scale
# (no 10** applied here). These rebind lin_pred / rbf_pred, which earlier
# held the test-set predictions.
lin_pred = boost_reg.predict(feature_df)
rbf_pred = svr_rbf_reg.predict(feature_df)

In [27]:
# Residuals on the training set: actual label minus model prediction.
actual = label_df.values
lin_diff = actual - lin_pred
rbf_diff = actual - rbf_pred

In [28]:
# Attach predictions and residuals to the training frame as new columns,
# in this order: LinPred, RBFPred, LinDiff, RBFDiff.
for col_name, col_values in [('LinPred', lin_pred),
                             ('RBFPred', rbf_pred),
                             ('LinDiff', lin_diff),
                             ('RBFDiff', rbf_diff)]:
    my_df[col_name] = col_values

In [29]:
# Scatter each model's training-set prediction against the actual label,
# one figure per prediction column.
preds = ['LinPred','RBFPred']
for pred in preds:
    # Plot the column for the current iteration; a hard-coded 'LinPred' here
    # would draw the same figure twice and never show 'RBFPred'.
    my_df.plot(kind='scatter', x='SalePrice', y=pred)
    plt.show()

  if self._edgecolors == str('face'):


In [46]:
# Keep the training rows whose RBF-SVR residual exceeds 0.09 in absolute
# value (in the label's units), as an independent copy.
large_resid = abs(my_df['RBFDiff']) > 0.09
outliers = my_df[large_resid].copy()

# Alternative / additional filters tried during exploration (disabled):
#outliers =    my_df[    my_df['Electrical'] == 0 ].copy()
#outliers = outliers[ outliers['CentralAir'] == 0 ]
#outliers = outliers[ outliers['SaleCondition'] != 0 ]
#outliers = outliers[ outliers['MSZoning'  ] != 2./3 ]

In [51]:
# Names of all my_df columns except the ones listed here. Only the disabled
# loop at the bottom of this cell uses this array.
cols = my_df.drop(
    ['InsideSF', 'OutsideSF', 'GarageArea', 'LotArea', 'LotFrontage',
     'SalePrice', 'RBFPred'],
    axis=1
).columns.values
#sns.lmplot( 'SalePrice', 'RBFPred', data=my_df, hue='LotFrontage', fit_reg = False)#, palette="rainbow" )

# Columns inspected for the outlier rows. The last three entries are the
# prediction, the residual and the target; the earlier ones serve as hues.
lowScat = [
    'CentralAir', 'Electrical', 'SaleCondition',
    'InsideSF', 'OutsideSF', 'LotArea',
    'RBFPred', 'RBFDiff', 'SalePrice',
]
scat_df = outliers[lowScat].copy()

# One predicted-vs-actual scatter per hue column (everything but the last
# three entries), with a y = x reference line and fixed axis limits.
for col in lowScat[:-3]:
    g = sns.lmplot(x='SalePrice', y='RBFPred', data=scat_df,
                   hue=col, fit_reg=False, palette="rainbow")
    g.fig.suptitle(col)
    plt.plot([4, 6], [4, 6], color='b')
    plt.xlim(4.3, 5.7)
    plt.ylim(4.3, 5.7)
    plt.show()

# Disabled: the same scatter over the full training frame, coloured by every
# column in `cols`.
#for col in cols:
#    g = sns.lmplot( 'SalePrice', 'RBFPred', data=my_df, hue=col, fit_reg=False)
#    g.fig.suptitle( col )
#    plt.show()
#    plt.clf()

In [38]:
# 50-bin histogram of every outlier column except the last two entries of
# lowScat, one figure per column.
hist_cols = lowScat[:-2]
for col in hist_cols:
    scat_df[col].plot(bins=50, title=col, kind='hist')
    plt.show()

In [None]:
#0.051, 0.091,, 0.15, 0.2
#Zoning 3 or 4? whichever makes 0.6..., 0.6, 0.6
#CentralAir 1? 1, 0, lots of 0
#Exterior1st 0? 0, 0, 0
#Electrical 0? 0, 0, 0
#SaleCond 0? 0/1/5, 1, 1/5