### Basic Setup

In [0]:
#!pip install --upgrade Cython
#!pip install --upgrade git+https://github.com/statsmodels/statsmodels
  
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math


In [10]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization; complete the flow interactively.
# Authorization codes are credentials: never paste them into the notebook,
# because they end up in the saved file and in version control.
drive.mount('/content/drive')

# Folder on the mounted Drive that holds train.csv and test.csv.
DATA_DIR = '/content/drive/My Drive'

train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')
train.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FileNotFoundError: ignored

### Helper Functions used for data manipulation

In [0]:
# function that adds a bunch of new fields
def beefup(ogdf):
    """Return a copy of ``ogdf`` extended with derived feature columns.

    The input frame is left untouched. New columns are appended in a fixed
    order: bathroom totals, remodel flags, porch/deck/yard areas, sale season,
    indoor area and room statistics, basement-finish totals and several 0/1
    indicator columns.

    Parameters
    ----------
    ogdf : pandas.DataFrame
        Frame holding the raw columns read below (e.g. ``FullBath``,
        ``YearBuilt``, ``MoSold``, ``Fence``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``ogdf`` with the extra columns.
    """
    out = ogdf.copy()

    def as_flag(mask):
        # Element-wise truthiness -> 1 / 0.
        return mask.apply(lambda hit: 1 if hit else 0)

    def exceeds(name, limit):
        # 1 where the column value is strictly greater than ``limit``, else 0.
        return out[name].apply(lambda value: 1 if value > limit else 0)

    # Bathroom counts (above ground / basement).
    out['TotalBath'] = out['FullBath'] + out['HalfBath']
    out['BsmtTotBath'] = out['BsmtHalfBath'] + out['BsmtFullBath']

    # Flags for a remodel year / garage year later than the build year.
    out['RemodHome'] = as_flag(out['YearRemodAdd'] > out['YearBuilt'])
    out['RemodGarage'] = as_flag(out['GarageYrBlt'] > out['YearBuilt'])

    # Outdoor areas; the deck total is the porch total plus the wood deck.
    out['TotalPorchSF'] = (
        out['OpenPorchSF'] + out['EnclosedPorch'] + out['3SsnPorch'] + out['ScreenPorch']
    )
    out['TotalPorchDeckSF'] = out['TotalPorchSF'] + out['WoodDeckSF']
    out['YardSF'] = out['LotArea'] - out['GrLivArea']

    # Month sold -> season label, applied one season at a time.
    season = out['MoSold']
    for months, label in (([12, 1, 2], "Winter"),
                          ([3, 4, 5], "Spring"),
                          ([6, 7, 8], "Summer"),
                          ([9, 10, 11], "Fall")):
        season = season.replace(months, label)
    out['Season'] = season

    # Indoor area and average above-ground area per room.
    out['IndoorSF'] = (
        out['1stFlrSF'] + out['2ndFlrSF'] + out['LowQualFinSF'] + out['TotalBsmtSF']
    )
    out['AvgSFRm'] = (out['1stFlrSF'] + out['2ndFlrSF']) / out['TotRmsAbvGrd']

    out['MultiKitchen'] = exceeds('KitchenAbvGr', 1)
    out['MultiFireplace'] = exceeds('Fireplaces', 1)
    out['ExtraRooms'] = out['TotRmsAbvGrd'] - out['BedroomAbvGr'] - out['KitchenAbvGr']

    out['BsmtFinSF'] = out['BsmtFinSF1'] + out['BsmtFinSF2']
    out['BsmtFinYN'] = exceeds('BsmtFinSF', 0)

    out['PoolYN'] = exceeds('PoolArea', 0)
    out['FireplaceYN'] = exceeds('Fireplaces', 0)
    out['FenceYN'] = as_flag(out['Fence'].notnull())

    out['ExteriorMixedYN'] = as_flag(out['Exterior1st'] != out['Exterior2nd'])

    return out


In [0]:
#%% function fixes NAs by imputing 0, mean/mode or setting to "Unknown"/"None"
def nafix(df):
    """Return a copy of ``df`` with sparse columns dropped and all NAs filled.

    Steps, in order:

    1. Drop ``PoolQC``, ``MiscFeature``, ``Alley``, ``Fence`` and ``Id``.
    2. Fill a missing ``GarageYrBlt`` with the row's ``YearRemodAdd``.
    3. Fill NAs with the string ``"None"`` in the listed basement / garage /
       fireplace / masonry columns (those that are present).
    4. Fill remaining NAs with ``0`` in numeric columns and with
       ``"Unknown"`` in every other column.

    Every fill is written back by assignment. The chained
    ``frame[col].fillna(..., inplace=True)`` form is deprecated in pandas 2.x
    and has no effect on the frame once Copy-on-Write is enabled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy containing no missing values.

    Raises
    ------
    KeyError
        If one of the dropped columns, ``GarageYrBlt`` or ``YearRemodAdd`` is
        absent from ``df``.
    """
    fixdf = df.copy()

    # specific column drops
    fixdf = fixdf.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'Id'], axis = 1)

    # specific quants: a missing garage year falls back to the remodel year
    fixdf['GarageYrBlt'] = fixdf['GarageYrBlt'].fillna(fixdf['YearRemodAdd'])

    # specific cats: NA in these columns is recorded as the string "None"
    catnonecol = ['MasVnrType', 'FireplaceQu',
                  'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
                  'GarageType','GarageFinish','GarageQual','GarageCond']
    for col in catnonecol:
        if col in fixdf.columns:
            fixdf[col] = fixdf[col].fillna("None")

    # Catch the rest of the NAs: numeric -> 0, anything else -> "Unknown".
    # The numeric set is computed after the "None" fill so a column that just
    # became text is not treated as numeric.
    numeric_cols = set(fixdf.select_dtypes(include="number").columns)
    for col in fixdf.columns:
        if col in numeric_cols:
            fixdf[col] = fixdf[col].fillna(0)
        elif fixdf[col].isnull().any():
            fixdf[col] = fixdf[col].fillna("Unknown")

    return fixdf

In [0]:
#%% function that converts quant to category
def q_to_cat(ogdf, cols_to_convert):
    """Return a copy of ``ogdf`` with the named columns cast to strings.

    Names in ``cols_to_convert`` that are not columns of ``ogdf`` are ignored.
    The input frame is left unchanged.
    """
    converted = ogdf.copy()
    targets = [name for name in converted if name in cols_to_convert]
    for name in targets:
        converted[name] = converted[name].astype(str)
    return converted

In [0]:
#%% function does CATsuggested action
def optcat(ogdf, y, train_or_test):
    """Encode categorical columns according to the ``catdf`` suggestions.

    ``catdf(df, y, train_or_test)['suggest']`` assigns one action per
    categorical column; each action is applied as follows:

    * ``"quantify"`` - NA becomes 0 and the labels None/Po/Fa/TA/Gd/Ex become
      0..5.
    * ``"1vA"``      - 1 where the value equals the column mode, else 0.
    * ``"ignore"``   - the column is dropped.
    * ``"dummify"`` / ``"binary"`` - one-hot encoded with the first level
      dropped.

    Parameters
    ----------
    ogdf : pandas.DataFrame
        Frame to encode; it is not modified.
    y : passed through to ``catdf`` unchanged.
    train_or_test : passed through to ``catdf`` unchanged.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``ogdf``.
    """
    # NOTE(review): catdf is imported from the project module CatAnalysis; this
    # notebook also defines a catdf in a later cell - confirm which one is meant.
    from CatAnalysis import catdf

    df = ogdf.copy()
    suggestdf = catdf(df, y, train_or_test)['suggest']

    def cols_for(action):
        # Column names whose suggested action equals ``action``.
        return list(suggestdf[suggestdf == action].index.values)

    quantifycols = cols_for("quantify")
    ovacols = cols_for("1vA")
    ignorecols = cols_for("ignore")
    dumcols = cols_for('dummify') + cols_for('binary')

    quality_scale = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

    for col in list(df.columns):
        if col in quantifycols:
            df[col] = df[col].fillna(0).replace(quality_scale)
        elif col in ovacols:
            df[col] = df[col].eq(df[col].mode()[0]).mul(1)

    # Drop ignored columns in one step. The previous code assigned the result
    # of df.drop(col, axie = 1) back to the column: the misspelled keyword
    # raised TypeError, and the assignment would not have removed the column.
    df = df.drop(columns = [col for col in ignorecols if col in df.columns])

    df = pd.get_dummies(df, columns = dumcols, drop_first = True)

    return df

In [0]:
#%% Feature Manipulation GO!!!!
    
def thewholeshabang(ogdf, y, tot):
    """Run the full feature pipeline on ``ogdf`` and return the result.

    Order of steps: ``beefup`` (derived columns), ``q_to_cat`` (stringify
    ``MSSubClass``, ``OverallCond``, ``MoSold``), ``nafix`` (NA handling) and
    ``optcat`` (categorical encoding, which receives ``y`` and ``tot``).
    """
    stringified = ['MSSubClass', 'OverallCond', 'MoSold']
    enriched = beefup(ogdf)
    recoded = q_to_cat(enriched, stringified)
    cleaned = nafix(recoded)
    return optcat(cleaned, y, tot)


In [0]:
### function to get metrics for CAT cols - fits 2xLR: dummy & one.v.all (one is top/mode)
def catdf(df,y,tot):
    """Summarise each non-numeric column and, in train mode, suggest an encoding.

    Numeric columns are dropped first. For every remaining column the result
    holds the number of unique values, the set of values, the mode, the mode's
    share of rows in percent, and the NA count.

    When ``tot == "train"`` three linear regressions of ``y`` are scored per
    column (R^2 rounded to 4 places): on dummy variables, on a mode-vs-rest
    indicator, and - for the ordinal quality columns only - on a 0-5 numeric
    recode. The suggestion is then chosen in this order:

    * ``"ignore"``   - at least 90% of the rows are NA
    * ``"binary"``   - exactly two unique values
    * ``"1vA"``      - the mode indicator is within 0.01 R^2 of the dummies
    * ``"quantify"`` - the numeric recode is within 0.01 R^2 of the dummies
    * ``"dummify"``  - otherwise

    For any other ``tot`` the three scores and the suggestion are the
    placeholder string ``"test"`` and ``y`` is not used.

    Side effect: in train mode the finished table is written once to
    ``FeatureSuggestion.csv``. Test-mode calls no longer write the file, so
    they cannot overwrite the saved train suggestions with placeholders.

    Parameters
    ----------
    df : pandas.DataFrame
    y : array-like regression target (train mode only).
    tot : str, ``"train"`` to fit and suggest.

    Returns
    -------
    pandas.DataFrame
        One row per non-numeric column with the columns ``unique``, ``set``,
        ``mode``, ``mode%``, ``NAs``, ``dummyLRscore``, ``ovaLRscore``,
        ``quantLRscore`` and ``suggest``.
    """
    # Ordinal quality / condition columns that can be mapped to a number.
    quantcol = ['BsmtQual', 'BsmtCond', 'KitchenQual', 'ExterQual', 'ExterCond',
                'GarageQual', 'GarageCond', 'HeatingQC', 'FireplaceQu', 'PoolQC',
                'OverallCond', 'OverallQual']

    is_train = tot == "train"
    if is_train:
        # Imported once, and only when models are actually fitted.
        from sklearn import linear_model

    def fit_r2(features):
        # R^2 (4 decimals) of a linear regression of y on ``features``.
        model = linear_model.LinearRegression()
        model.fit(features, y)
        return round(model.score(features, y), 4)

    df = df.drop(df._get_numeric_data().columns, axis = 1)
    nrows = df.shape[0]

    dfdf = pd.DataFrame(columns = ["unique", "set",
                                   "mode", "mode%", "NAs",
                                   "dummyLRscore", "ovaLRscore",
                                   "quantLRscore", "suggest"])
    for col in df:
        # Describe only this column instead of re-describing the whole frame
        # on every iteration.
        stats = df[col].describe()

        xunique = stats['unique']
        xset = df[col].unique()

        xmode = stats['top']
        xmodep = round((stats['freq'] / nrows) * 100, 2)
        xnas = nrows - stats['count']

        if is_train:
            xcorr = fit_r2(pd.get_dummies(df[col], drop_first=True))
            xcorr2 = fit_r2(df[col].eq(xmode).mul(1).values.reshape(-1, 1))
            xcorr3 = 0

            # only if in QUANTABLE columns
            if col in quantcol:
                if col in ['OverallCond', 'OverallQual']:
                    xquant = df[col].astype(int).values.reshape(-1, 1)
                else:
                    xquant = df[col].fillna(0).replace('None', 0).replace('Po', 1).replace('Fa',2).replace('TA', 3).replace('Gd',4).replace('Ex',5).values.reshape(-1, 1)
                xcorr3 = fit_r2(xquant)

            # determine action based on metrics
            if xnas >= nrows*.9:
                xaction = "ignore"
            elif xunique == 2:
                xaction = "binary"
            elif (xcorr2 + .01) > xcorr:
                xaction = "1vA"
            elif (xcorr3 + .01) > xcorr:
                xaction = "quantify"
            else:
                xaction = "dummify"

        else:
            xcorr = "test"
            xcorr2 = "test"
            xcorr3 = "test"
            xaction = "test"

        dfdf.loc[col] = [xunique, xset, xmode, xmodep, xnas, xcorr, xcorr2, xcorr3, xaction]

    # save results so can duplicate on test set later (train mode only, once)
    if is_train and len(dfdf) > 0:
        dfdf.to_csv("FeatureSuggestion.csv", index = True)

    return dfdf

In [0]:
# function to create QUANTDF 
def quantdf(df, y, tot):
    """Build a per-column summary table for the numeric columns of ``df``.

    For each numeric column the row holds: the "min to max" range as text,
    mean, standard deviation, NA count, number of entries not equal to 0, the
    length of ``detect_outlier(column)`` (a helper defined outside this
    block), the correlation with ``y`` (5 decimals) and the R^2 of a
    one-variable linear regression on ``y`` (4 decimals, NAs replaced by 0 for
    the fit only). The last two are the string ``"na/test"`` unless
    ``tot == "train"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``range``, ``mean``, ``std``,
        ``NAs``, ``non0``, ``outliers``, ``corr`` and ``LRfit``.
    """
    numeric = df._get_numeric_data()
    training = tot == "train"

    summary = pd.DataFrame(columns = ["range", "mean", "std",
                                      "NAs", "non0",
                                      "outliers", "corr", "LRfit"])

    for name in numeric:
        series = numeric[name]

        span = "{} to {}".format(series.min(), series.max())
        mean_value = series.mean()
        std_value = series.std()

        missing = series.isnull().sum()
        nonzero = sum(series != 0)

        outlier_count = len(detect_outlier(series))

        if training:
            from sklearn import linear_model
            features = series.fillna(0).values.reshape(-1,1) # fillNA for LR only
            model = linear_model.LinearRegression()
            model.fit(features, y)
            r2 = round(model.score(features, y),4)
            correlation = round(series.corr(y),5)
        else:
            r2 = "na/test"
            correlation = "na/test"

        summary.loc[name] = [span, mean_value, std_value,
                             missing, nonzero,
                             outlier_count, correlation, r2]

    return summary

### Tree function

In [0]:
# function that takes a TRAIN DF, runs through random forest & optimizes, and returns key metrics
def treerun(df, logyn):
    """Tune a random forest on ``df`` and report hold-out metrics.

    ``df`` is split 80/20 (``random_state=88``). ``max_features`` is tuned
    with 5-fold cross-validation on the training part only, and the refitted
    best estimator is scored on the held-out part. Previously the search was
    fitted on the full data, so the "hold-out" rows had already been seen
    during training and the reported score was optimistic.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns plus the target column ``SalePrice``.
    logyn : bool
        If true the model is fitted on ``log(SalePrice)``; the reported error
        is then in log units as well.

    Returns
    -------
    tuple
        ``(score, mean_abs_err, seconds, best_max_features, top5)`` where
        ``score`` is the hold-out R^2 rounded to 4 places, ``mean_abs_err`` the
        hold-out mean absolute error rounded to 2 places, ``seconds`` the
        elapsed wall time and ``top5`` the five ``(feature, importance)`` pairs
        with the largest importances (rounded to 2 places).
    """
    import time
    import math
    from sklearn import ensemble
    from sklearn.model_selection import GridSearchCV, train_test_split

    stime = time.time()

    x = df.drop('SalePrice', axis = 1)
    y = np.log(df['SalePrice']) if logyn else df['SalePrice']
    cols = list(x.columns)
    n_features = x.shape[1]

    # Tree setup. max_features must be an int to mean "number of features";
    # a float is interpreted by scikit-learn as a fraction of the features.
    rf = ensemble.RandomForestRegressor(
        random_state = 0,
        n_estimators = 100,
        max_features = max(1, int(round(math.sqrt(n_features)))),  # sqrt(numcol)
    )

    # Test/Train split
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 88)

    # CV to optimize max features; candidates are capped at the number of
    # available columns so narrow frames do not produce invalid settings.
    grid = [{"max_features": range(1, min(24, n_features) + 1)}]
    grid_search = GridSearchCV(rf, grid, cv=5, n_jobs=-1)
    grid_search.fit(xtrain, ytrain)

    # Evaluate the refitted best estimator on rows it has never seen.
    bestrf = grid_search.best_estimator_
    bestscore = round(bestrf.score(xtest, ytest), 4)
    besterr = round(np.mean(np.abs(bestrf.predict(xtest) - ytest)), 2)

    # Rank importance
    rank = [(name, round(weight, 2)) for name, weight in zip(cols, bestrf.feature_importances_)]
    rank = sorted(rank, key = lambda pair: pair[1], reverse = True)

    # return score, mean abs error, time, best max_features, top 5
    return bestscore, besterr, time.time() - stime, grid_search.best_params_['max_features'], rank[:5]

### Trying different datasets with random forest and recording score in RESULTSDF

In [0]:
# Create the datasets for model comparison
# NOTE(review): these names come from the project module FeatureAdd and shadow
# the functions of the same name defined earlier in this notebook - confirm
# that the two stay in sync.
from FeatureAdd import nafix, thewholeshabang, beefup, q_to_cat
df_dumcat = pd.get_dummies(nafix(train.copy()), drop_first = True)
df_nocat = train.copy()._get_numeric_data().fillna(0)
df_optcat = thewholeshabang(train.copy(), train['SalePrice'], "train")
df_qcat = nafix(train.copy())

# QCAT - C->Q for Poor-Excellent categories, and then dropping all categories.
# infer_objects() turns a fully recoded column into a numeric dtype; without
# it newer pandas keeps the column as object and the numeric filter below
# would silently drop every recoded feature.
quantcol = ['BsmtQual', 'BsmtCond', 'KitchenQual', 'ExterQual', 'ExterCond', 'GarageQual', 'GarageCond', 'HeatingQC', 'FireplaceQu', 'PoolQC']
quality_scale = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
for col in quantcol:
    if col in df_qcat.columns:
        df_qcat[col] = df_qcat[col].replace(quality_scale).infer_objects()
df_qcat = df_qcat._get_numeric_data()

# Setup ResultsDf to store various model results
resultsdf = pd.DataFrame(columns = ['Score', 'Mean Abs Err', 'Comptime', 'MaxFeatures', 'Top5'])

# Each dataset is run twice: on the raw target and on the log target.
#   Nocat  (38f)  - dropping all categories
#   Qcat   (46f)  - ordinal categories recoded to numbers, other categories dropped
#   Dumcat (249f) - dummifying EVERYTHING
#   Optcat (242f) - engineered features + per-column encoding suggestions
datasets = [("Nocat", df_nocat),
            ("Qcat", df_qcat),
            ("Dumcat", df_dumcat),
            ("Optcat", df_optcat)]
for label, frame in datasets:
    for suffix, logyn in (("", False), ("_log", True)):
        xscore, xerr, xtime, xmaxf, xtop5 = treerun(frame, logyn = logyn)
        resultsdf.loc[label + suffix] = [xscore, xerr, xtime, xmaxf, xtop5]

# Save Resultsdf
resultsdf.to_csv('RandomForest_ModelCompare.csv', index=True)

In [0]:
# Display the model-comparison table (one row per dataset / target variant).
resultsdf

### Fitting TEST with best model above

In [0]:
#%% Code to use model on test!

test = pd.read_csv("test.csv")
df = test.copy()

# Go through same feature engineering as train set
df = beefup(df)
df = nafix(df)
df = q_to_cat(df, ['MSSubClass', 'OverallCond', 'MoSold'])

# Take the suggest list from the previously run CatAnalysis for the TRAIN set
suggestdf = pd.read_csv("FeatureSuggestion.csv", index_col = 0)['suggest']

# split into different feature sets
quantifycols = list(suggestdf[suggestdf == "quantify"].index.values)
ovacols = list(suggestdf[suggestdf == "1vA"].index.values)
ignorecols = list(suggestdf[suggestdf == "ignore"].index.values)
dumcols = list(suggestdf[suggestdf == 'dummify'].index.values)
dumcols = dumcols + list(suggestdf[suggestdf == 'binary'].index.values)

# perform the suggested action
quality_scale = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
for col in list(df.columns):
    if col in quantifycols:
        df[col] = df[col].fillna(0).replace(quality_scale)
    elif col in ovacols:
        # NOTE(review): this compares against the mode of the TEST column;
        # confirm whether the train-set mode should be used instead.
        df[col] = df[col].eq(df[col].mode()[0]).mul(1)

# Ignored columns are removed outright; assigning the result of df.drop()
# back to the column does not drop it.
df = df.drop(columns = [col for col in ignorecols if col in df.columns])

# Only dummify suggested columns that are still present in the frame.
df = pd.get_dummies(df, columns = [col for col in dumcols if col in df.columns], drop_first = True)

# Now run transformed DF through tree
ytrain = np.log(df_optcat['SalePrice'])
xtrain = df_optcat.drop('SalePrice', axis = 1)

# Predict on the TRANSFORMED test frame (not the raw CSV), laid out exactly
# like the training matrix: dummy columns that never occur in the test set
# are added as all-zero, and columns unknown to the model are discarded.
xtest = df.reindex(columns = xtrain.columns, fill_value = 0)

# with just dummy all
#ytrain = df_dumcat['SalePrice']
#xtrain = df_dumcat.drop(['SalePrice', 'Id'], axis = 1)
#df = nafix(df)
#xtest = pd.get_dummies(df, drop_first = True)

# Tree Setup
from sklearn import ensemble
rf = ensemble.RandomForestRegressor()
rf.set_params(random_state = 0, n_estimators = 100, max_features = 18) # as per results
rf.fit(xtrain, ytrain)

# Predict! The model was fitted on log(SalePrice), so undo the log.
submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': np.round(np.exp(rf.predict(xtest)), 2),
})

submission.to_csv("RF_Submission.csv", index = False)

# which columns are missing?
#diffcol = list(set(list(xtrain.columns)) - set(list(xtest.columns)))
#len(list(set(list(xtrain.columns)) - set(list(xtest.columns))))
#for i in diffcol[:10]:
#    xtest[i] = 0
