In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns

from sklearn.linear_model import LinearRegression, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

%matplotlib inline

In [2]:
df_data = pd.read_csv('train.csv')
final_data = pd.read_csv('test.csv')

In [3]:
def does_does_not(cell):
    if cell >= 1:
        return 1
    else:
        return 0

def clean(df):
    # CLEAN COL NAMES
    df.rename(columns= {col: col.lower().replace(' ','_') for col in df.columns}, inplace=True)
    
#     df = df.loc[df['gr_liv_area'] < 4000].copy()

    # CREATION OF DECK/PROCH COLUMN AND SETTING FENCE TO INT
    df['deck_porch'] = df['wood_deck_sf'] + df['open_porch_sf'] + df['enclosed_porch'] + df['3ssn_porch'] + df['screen_porch']
    df['fence'] = df['fence'].notnull().astype('int')
#     df['overall_cond_qual'] = df['overall_cond'] + df['overall_qual']

    # DROPPING COLUMS THAT ARE GOING TO BE BINARY
    df.drop([
    'alley', 'lot_frontage', 'mas_vnr_type', 'fireplace_qu', 'pool_qc', 'misc_feature',
    'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1', 'bsmtfin_type_2',
    'bsmtfin_sf_2', 'bsmt_unf_sf', 'bsmt_full_bath', 'bsmt_half_bath',
    'garage_type', 'garage_yr_blt', 'garage_finish', 'garage_area', 'garage_qual', 'garage_cond',
    'wood_deck_sf', 'open_porch_sf', 'enclosed_porch', '3ssn_porch', 'screen_porch',
#     'overall_cond', 'overall_qual'
    ], axis=1, inplace=True)

    # MAKING COLUMNS BINARY
    features_replace = ['pool_area' , 'garage_cars', 'fireplaces', 'mas_vnr_area', 'deck_porch'] # 'total_bsmt_sf', 
    for i in features_replace:
        df[i] = df[i].map(does_does_not)

    # RENAMING BINARY COLUMNS
    df.rename(columns={
    'pool_area': 'has_pool',
    'garage_cars': 'has_garage', 
    'fireplaces': 'has_fireplace',
#     'total_bsmt_sf': 'has_bsmt',
    'mas_vnr_area': 'has_mas_vnr',
    'fence': 'has_fence'}, inplace=True)


    # COMBINING SIMILAR/SAMLL VALES
    df['lot_shape'] =df['lot_shape'].replace({'IR3':'IR2'})
    df['condition_1'] =df['condition_1'].replace({'PosA':'Pos', 'PosN':'Pos', 'RRAe':'RRe', 'RRNe':'RRe', 'RRNn':'RRn', 'RRAn':'RRn'})
    df['condition_2'] =df['condition_2'].replace({'PosA':'Pos', 'PosN':'Pos', 'RRAe':'RRe', 'RRNe':'RRe', 'RRNn':'RRn', 'RRAn':'RRn'})
#     df['year_built'] = pd.to_datetime(df['year_built'], format='%Y')
#     df['year_remod/add'] = pd.to_datetime(df['year_remod/add'], format='%Y')
#     df['mo_sold'] = pd.to_datetime(df['mo_sold'], format='%m')
#     df['yr_sold'] = pd.to_datetime(df['yr_sold'], format='%Y')
    
    # CATEROGRY TO NUMBERICAL
    df['exter_qual'] = df['exter_qual'].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
    df['exter_cond'] = df['exter_cond'].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
    df['heating_qc'] = df['heating_qc'].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
    df['electrical'] = df['electrical'].replace({'SBrkr': 5, 'FuseA': 4, 'FuseF': 3, 'FuseP': 2, 'Mix': 1})
    df['kitchen_qual'] = df['kitchen_qual'].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})

    # DROPPING SIMILAR COLUMNS
    df.drop(['1st_flr_sf', '2nd_flr_sf', 'bedroom_abvgr'], axis=1, inplace=True) # , 'gr_liv_area'

    # DUMMIES FOR CAT COLUMNS
    df = pd.get_dummies(df, columns=[
    'neighborhood',
    'lot_shape',
    'ms_subclass',
    'ms_zoning',
    'land_contour',
    'lot_config',
    'utilities',
    'land_slope',
    'condition_1',
    'condition_2',
    'bldg_type',
    'exterior_1st',
    'exterior_2nd',
    'foundation',
    'heating',
    'central_air',
    'full_bath',
    'half_bath',
    'kitchen_abvgr',
    'functional',
    'paved_drive',
    'sale_type',
    'street',
    'house_style',
    'roof_style',
    'roof_matl',
    'electrical'
    ], prefix=[
    'n_',
    'ls_',
    'mssub_',
    'mszon_',
    'landcon_',
    'lotfig_',
    'ut_',
    'landm_',
    'cond1_',
    'cond2_',
    'btype_',
    'ext1_',
    'ext2_',
    'found_',
    'heat_',
    'ac_',
    'fb_',
    'hb_',
    'kita_',
    'funct_',
    'paved_',
    'stype_',
    'st_',
    'hstyle_',
    'rst_',
    'rmat_',
    'ele_'
    ])
    
    return df 

In [4]:
df = clean(df_data)
final = clean(final_data)
df = df.loc[df['gr_liv_area'] < 4000].copy()

In [5]:
def mia_columns(df1, df2):
    mia_df2 = list(set(df1.columns) - set(df2.columns))
    print(mia_df2)
    mia_df1 = list(set(df2.columns) - set(df1.columns))
    print(mia_df1)
    
    for i in mia_df2:
        df2[i] = 0
    
    for i in mia_df1:
        df2.drop([i], axis=1, inplace=True)
    
    df1['total_bsmt_sf'].fillna(0, inplace=True)
    df2['total_bsmt_sf'].fillna(0, inplace=True)

In [6]:
mia_columns(df, final)

['saleprice', 'mszon__A (agr)', 'mssub__150', 'ext1__CBlock', 'ext2__Stone', 'ele__3', 'ext1__ImStucc', 'ext1__Stone', 'ele__2', 'cond2__RRn', 'cond2__RRe', 'heat__Wall', 'sale_condition', 'rmat__ClyTile', 'funct__Sal', 'n__Landmrk', 'ele__5', 'heat__OthW', 'ut__NoSeWa', 'cond2__Artery', 'ele__4', 'funct__Sev', 'n__GrnHill', 'rmat__Membran', 'ele__1']
['ext2__Other', 'ele__3.0', 'rmat__Metal', 'ext2__PreCast', 'ext1__PreCast', 'ele__2.0', 'heat__Floor', 'ele__4.0', 'stype__VWD', 'ele__5.0', 'rmat__Roll']


In [7]:
df['sale_condition'] = df['sale_condition'].map(lambda x: 1 if x == 'Abnorml' else 0)

In [8]:
y = df['sale_condition']
X = df.drop(['id', 'pid', 'sale_condition', 'saleprice'], axis=1)

In [9]:
# Baseline Accuracy
y.value_counts(normalize = True)

0    0.935578
1    0.064422
Name: sale_condition, dtype: float64

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [60]:
X_train, X_test, y_train, y_test= train_test_split(X, y)

In [61]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('pca', PCA()),
    ('knn', KNeighborsClassifier())
])

In [62]:
params = {
    
}
gs = GridSearchCV(pipe, param_grid=params, n_jobs=-1, verbose=1)
gs.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   30.1s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=-1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [63]:
print(gs.best_score_, gs.best_params_)

0.935546875 {}


In [64]:
X_final = final.drop(['id', 'pid', 'sale_condition', 'saleprice'], axis=1) # , 'Sale Condition'
pred = gs.predict(X_final)

In [66]:
final['Sale Condition'] = pred
final[['id','Sale Condition']].to_csv('class_knn.csv', index=False) 