In [278]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from IPython.display import display

from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np


PATH = 'C:\\Kaggle\\House_Prices\\data\\'
df = pd.read_csv(f'{PATH}train.csv')
df_test = pd.read_csv(f'{PATH}test.csv')
df.sort_values(by='YrSold',axis=0,inplace=True)

df_test.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
1458,2919,60,RL,74.0,9627,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,11,2006,WD,Normal


In [279]:
def convertToCategory(df_):
    cat_num_cols=['MSSubClass','OverallQual','OverallCond']
    df_[cat_num_cols] = df_[cat_num_cols].astype('category')
    return df_


df = convertToCategory(df)
df_test = convertToCategory(df_test)

In [280]:
def replaceZerosWithNans(df_):
    cols=['FullBath','BedroomAbvGr','KitchenAbvGr','GarageCars','GarageArea']

    for col in cols:
        df_[col].replace([0],[None],inplace=True)
        
replaceZerosWithNans(df)
replaceZerosWithNans(df_test)

In [252]:
def processTrainSet(df_):                     
    # Drop columns where 90% or above are missing
    df_.drop(['PoolQC','Fence','MiscFeature','Alley'], axis=1,inplace=True)#since 90% cof rows are missing values

    # Drop Id column from df
    id_col = df_['Id']
    df_.drop(['Id'], axis=1, inplace=True)

    # Separate Target solumn from train set
    target = np.log(df_['SalePrice'])
    df_.drop(['SalePrice'], axis = 1, inplace=True)
                     
    low_variance_cols=['LandContour','LandSlope','BldgType',
                'RoofMatl','ExterCond','BsmtCond','CentralAir','Functional','GarageQual','GarageCond','PavedDrive',
                'SaleType','SaleCondition','MSZoning','Street', 'RoofStyle','KitchenAbvGr','Electrical','BsmtFinType2',
                'Heating']

    for c in low_variance_cols:
        if(c in df):
            df.drop(c,inplace=True,axis=1)
            
    return target

y = processTrainSet(df)    

In [253]:
def handleNulls_NumercialCols(df_):
    dt_ser=df_.dtypes
    int_cols = list(dt_ser[dt_ser == 'int64'].index)
    float_cols=list(dt_ser[dt_ser == 'float64'].index)
    num_cols = int_cols + float_cols
    df_num = df_[num_cols]
    cols_with_nulls=list(df_num.isnull().sum()[df_num[num_cols].isnull().sum() > 0].index)
 

    for col in cols_with_nulls:
        df_[col].fillna(df_[col].mean(), inplace=True)   
        
    return df_
    
df = handleNulls_NumercialCols(df)
df_test = handleNulls_NumercialCols(df_test)

In [254]:
def handleNulls_CatCols(df_) :
    dt_ser=df_.dtypes
    cat_cols=list(dt_ser[dt_ser == 'object'].index)
    
    if len(cat_cols) > 0:
        df_[cat_cols]=df_[cat_cols].astype('category')
    
    dt_ser=df_.dtypes
    cat_cols=list(dt_ser[dt_ser == 'category'].index)

    cat_counts_ser = df_[cat_cols].isnull().sum()
    cats_with_null = cat_counts_ser[cat_counts_ser > 0].index

    if(len(cats_with_null)>0 ):
        df_summary=df_[cats_with_null].describe().T
        top_values= (df_summary['top']).to_dict()
    
    for c in cats_with_null:
        if(c in df_):
            df_[c].fillna(top_values.get(c), inplace=True) 

    return df_

df = handleNulls_CatCols(df)       
df_test = handleNulls_CatCols(df_test)
df[['BedroomAbvGr','FullBath','GarageCars','GarageArea']].isnull().any()

BedroomAbvGr    False
FullBath        False
GarageCars      False
GarageArea      False
dtype: bool

In [255]:
ordered_dict={
                'GarageCond':['Ex', 'Gd', 'Fa', 'Po', 'TA'], 
              'GarageQual':['Ex', 'Gd', 'Fa', 'Po', 'TA'], 
              'BsmtCond':['Gd', 'Fa', 'Po', 'TA'], 
              'BsmtQual':['Ex', 'Gd', 'Fa', 'TA'], 
              'ExterQual':['Ex', 'Gd', 'Fa', 'TA'], 
              'ExterCond':['Ex', 'Gd', 'Fa', 'TA'],
              'HeatingQC':['Ex', 'Gd', 'Fa', 'Po', 'TA'],
              'BsmtExposure':['Gd', 'Av', 'Mn','No'],
            # 'Utilities':['ELO','NoSeWa','NoSewr','AllPub'],
            # 'LandSlope':['Gtl','Mod','Sev'],
             'LandContour':['Lvl','Bnk','HLS','Low'],
             'BldgType':['TwnhsI','TwnhsE','Duplx','1FmCon','1Fam'],
             'KitchenQual':['Ex','Gd','TA','Fa','Po'],
             'Functional':['Typ','Min1','Min2','Mod','Maj1','Maj2','Sev','Sal'],
             'FireplaceQu':['Ex','Gd','TA','Fa','Po','NA'],
             'GarageFinish':['Fin','RFn','Unf','NA'],
             'PavedDrive':['Y','P','N'],
             'OverallQual':[1,2,3,4,5,6,7,8,9,10],
             'OverallCond':[1,2,3,4,5,6,7,8,9,10],
              'LotShape':['Reg','IR1','IR2','IR3']
             }

def handleOrdinals(df_):
    for c in ordered_dict.keys():
         if(c in df_):
            df_[c].cat.set_categories(ordered_dict.get(c), ordered=True, inplace=True)
    return df_

df = handleOrdinals(df)
df_test = handleOrdinals(df_test)

In [256]:
df[['BedroomAbvGr','FullBath','GarageCars','GarageArea']].isnull().any()

BedroomAbvGr    False
FullBath        False
GarageCars      False
GarageArea      False
dtype: bool

In [257]:
def catsToNumericals(df_):
    cat_num_cols=['BedroomAbvGr','FullBath','GarageCars','GarageArea']
    df_[cat_num_cols]=df_[cat_num_cols].astype('int64', inplace = True)
    return df_

df = catsToNumericals(df)
df_test = catsToNumericals(df_test)

In [258]:
def oneHotEncode_NonOrdinals(df_):
    dt_ser = df_.dtypes
    cat_cols = list(dt_ser[dt_ser == 'category'].index)
    nominal_cols = [item for item in set(df_[cat_cols].columns) if item not in set(ordered_dict.keys())]
    df_dummies = pd.get_dummies(df_[nominal_cols])
    
    all_cols=df_.columns
    dummy_cols=df_dummies.columns

    to_join = []

    for c in dummy_cols:
        if (c not in all_cols):
            to_join.append(c)
        
    df_ = df_.join(df_dummies[to_join])
        
    to_drop = []

    for c in nominal_cols:
        if c in all_cols:
            df_.drop([c],axis=1,inplace=True)
                     
    return df_;                         

df = oneHotEncode_NonOrdinals(df)
df_test = oneHotEncode_NonOrdinals(df_test)

In [259]:
def setCodesForCategoricals(df_):
    dt_ser=df_.dtypes
    cat_cols=list(dt_ser[dt_ser == 'category'].index)


    for c in df_[cat_cols]:
        df_[c]=df_[c].cat.codes
                     
    return df_
        
df = setCodesForCategoricals(df)
df_test = setCodesForCategoricals(df_test)

In [260]:
df.tail()

Unnamed: 0,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,...,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood
353,60.0,8520,0,5,7,1928,2003,0.0,3,3,...,0,0,0,1,1,0,0,0,0,0
1322,107.0,10186,1,6,4,1992,1992,0.0,1,2,...,0,0,0,1,0,0,1,0,0,0
162,95.0,12182,0,6,4,2005,2005,226.0,1,2,...,0,0,0,0,0,0,1,0,0,0
158,100.0,12552,0,6,4,2004,2005,0.0,1,2,...,0,0,0,0,0,0,1,0,0,0
1284,50.0,9638,0,5,6,1919,1990,0.0,3,3,...,0,0,0,1,0,0,1,0,0,0


In [261]:
def constructNewFeats(df_):
    
    df_['Age'] = df_['YrSold'] - df_['YearBuilt']     
    df_['GarageAge']=df_['YrSold'] - df_['GarageYrBlt']
    df_['TtlBsmtFinSF']=df_['BsmtFinSF1'] + df_['BsmtFinSF2']
    df_['TtlSqFtAboveGround']=df_['1stFlrSF'] + df_['2ndFlrSF']
    df_['YearsElapsedRemod']=df_['YrSold'] - df_['YearRemodAdd']
    df_['TtlSf']=df_['1stFlrSF'] + df_['2ndFlrSF'] + df_['TotalBsmtSF']
    df_['TtlBaths']=df_['FullBath'] + df_['BsmtFullBath']

    df_['TtlFullBaths']= df_['FullBath'] + df_['BsmtFullBath']
    df_['TtlHalfBaths'] = df_['HalfBath'] + df_['BsmtHalfBath']

    l_to_drop = ['GrLivArea','BsmtFinSF1','1stFlrSF','YearBuilt']


    for c in l_to_drop:
        if (c in df_raw.columns):
            df_.drop([c], axis=1, inplace=True)
                     
    return df_                     
            
df = constructNewFeats(df)
df_test = constructNewFeats(df_test)

In [262]:
df.tail()

Unnamed: 0,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,BsmtExposure,...,Foundation_Wood,Age,GarageAge,TtlBsmtFinSF,TtlSqFtAboveGround,YearsElapsedRemod,TtlSf,TtlBaths,TtlFullBaths,TtlHalfBaths
353,60.0,8520,0,5,7,2003,0.0,3,3,3,...,0,82,5.0,0,720,7,1344,1,1,0
1322,107.0,10186,1,6,4,1992,0.0,1,2,3,...,0,18,18.0,674,1923,18,2673,3,3,1
162,95.0,12182,0,6,4,2005,226.0,1,2,2,...,0,5,5.0,1201,1541,5,3082,2,2,0
158,100.0,12552,0,6,4,2005,0.0,1,2,3,...,0,6,6.0,222,1947,5,2938,2,2,1
1284,50.0,9638,0,5,6,1990,0.0,3,3,3,...,0,91,41.0,0,2447,20,3251,2,2,0


In [264]:
max_features_= 0.7
m = RandomForestRegressor(n_estimators=90, min_samples_leaf=3, max_features = max_features_)
m.fit(df, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.7, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [275]:
print(df.shape)
print(df_test.shape)

(1460, 172)
(1459, 246)


In [276]:
feature_difference = set(df_test) - set(df)

import numpy as np

rows = df_test.shape[0]
cols =  len(feature_difference)

zero_matrix = np.zeros((rows,cols))


cols_names = list(feature_difference)

feature_difference_df = pd.DataFrame(data=zero_matrix, columns=cols_names)
df_test = df_test.join(feature_difference_df)


df_test.shape

(1459, 246)

In [None]:
preds = m.predict(df_test)
my_submission = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': preds})
my_submission.to_csv('submission.csv', index=False)