In [1]:
import preprocessing as PRE
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import scipy.stats as stats
from scipy.special import boxcox1p

import warnings
warnings.filterwarnings('ignore')

### Encode and Split

In [35]:
def encode(df):
    """Return a copy of ``df`` with selected columns label-encoded.

    Two steps are applied to the copy:

    1. ``MSSubClass``, ``OverallQual``, ``YrSold`` and ``MoSold`` are cast
       element-wise to ``str``.
    2. Every column listed in ``col_to_encode`` (which includes the four
       columns above) is replaced by the integer codes produced by a
       ``LabelEncoder`` fitted on that column alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``col_to_str`` and
        ``col_to_encode``; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``; the input
        is not modified.
    """
    df = df.copy()

    # Numerical to String
    col_to_str = ['MSSubClass', 'OverallQual', 'YrSold', 'MoSold']

    # Categorical to LabelEncode
    col_to_encode = ['BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
            'ExterQual', 'ExterCond','HeatingQC', 'BsmtFinType1',
            'BsmtFinType2', 'BsmtExposure', 'GarageFinish', 'LandSlope',
            'LotShape', 'PavedDrive', 'Street', 'CentralAir', 'MSSubClass', 'OverallCond',
            'YrSold', 'MoSold', 'KitchenQual', 'Functional', 'OverallQual']

    # Series.map(str) is the element-wise cast; DataFrame.applymap is
    # deprecated in recent pandas releases.
    for col in col_to_str:
        df[col] = df[col].map(str)

    # Assign the encoder's array output column by column. An array is
    # written positionally, so the codes stay on the right rows whatever
    # index ``df`` carries (going through DataFrame.apply would hand back a
    # frame on a fresh positional index and mis-align on assignment).
    for col in col_to_encode:
        df[col] = LabelEncoder().fit_transform(df[col])

    return df

In [5]:
def split_num_cat(df):
    """Split a frame into its non-object and object-dtype columns.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_num, df_cat)``: the columns whose dtype is not ``"object"``
        and the columns whose dtype is ``"object"``, each in original
        column order. Both are taken from a copy of ``df``.
    """
    frame = df.copy()
    column_dtypes = frame.dtypes

    # Split by object datatype
    object_cols = column_dtypes[column_dtypes == "object"].index
    other_cols = column_dtypes[column_dtypes != "object"].index

    return (frame[other_cols], frame[object_cols])

### Numerical Features

In [56]:
def get_boxcox(df, threshold=0.75, l=0.15):
    """Apply ``boxcox1p`` to the strongly skewed columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Every column is passed to ``scipy.stats.skew`` (after dropping
        missing values), so all columns must be numeric.
    threshold : float
        A column is transformed when the absolute value of its skewness
        is strictly greater than this.
    l : float
        Lambda handed to ``scipy.special.boxcox1p``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the skewed columns transformed; other
        columns are left as they were.
    """
    transformed = df.copy()

    # Check Skew
    skew_by_col = transformed.apply(lambda col: stats.skew(col.dropna()))
    skew_by_col = skew_by_col.sort_values(ascending=False)
    too_skewed = skew_by_col[skew_by_col.abs() > threshold].index

    for name in too_skewed:
        transformed[name] = boxcox1p(transformed[name], l)

    return transformed

### Categorical Data

In [57]:
def get_dummies(df):
    """Return ``pd.get_dummies`` applied to a copy of ``df``.

    The copy keeps the caller's frame untouched; the indicator-encoded
    frame produced by pandas is returned as-is.
    """
    encoded = pd.get_dummies(df.copy())
    return encoded

### SalePrice

In [None]:
def get_log_salePrice(sale_price):
    """Return ``log(1 + sale_price)``, computed element-wise.

    NOTE(review): the original cell held only an unfinished ``def`` line
    (a SyntaxError). This body is a minimal completion inferred from the
    function name; the choice of ``np.log1p`` over a plain ``np.log`` is
    an assumption -- confirm against the intended target transform.

    Parameters
    ----------
    sale_price : scalar, array-like or pandas.Series
        Values to transform.

    Returns
    -------
    Same kind of object NumPy returns for ``np.log1p(sale_price)``.
    """
    return np.log1p(sale_price)

In [1]:
def feature_engineer(df, boxcox=True, threshold=0.75, l=0.15):
    """Run the full feature pipeline and return a single frame.

    Steps, in order: ``encode`` the input, split it with
    ``split_num_cat``, optionally pass the non-object part through
    ``get_boxcox``, pass the object part through ``get_dummies``, and
    concatenate the two parts column-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied before any processing.
    boxcox : bool
        When true, ``get_boxcox`` is applied to the non-object columns.
    threshold, l : float
        Forwarded unchanged to ``get_boxcox``.

    Returns
    -------
    pandas.DataFrame
        Non-object columns first, followed by the dummy columns.
    """
    numeric_part, categorical_part = split_num_cat(encode(df.copy()))

    if boxcox:
        numeric_part = get_boxcox(numeric_part, threshold=threshold, l=l)

    pieces = [numeric_part, get_dummies(categorical_part)]
    return pd.concat(pieces, axis=1)

In [63]:
# Build two feature sets from the same preprocessed frame: one with the
# Box-Cox step enabled (the default) and one with it switched off.
# The path uses the same 'Data/Raw/' spelling as the other PRE calls in this
# notebook so the cell also runs on case-sensitive filesystems.
df_pre = PRE.preprocess(filepath='Data/Raw/', frac=0.2, outliers=False)
df_boxcox = feature_engineer(df_pre)
df_skew = feature_engineer(df_pre, boxcox=False)

In [86]:
from sklearn.ensemble import RandomForestRegressor

# presumably the row counts of the train and test files -- confirm against
# PRE.get_lengths; only ntrain is used below.
ntrain, ntest = PRE.get_lengths('Data/Raw/')

y = PRE.get_SalePrice('Data/Raw/')

# Box-Cox feature set: the first ntrain rows are fitted on, the rest predicted.
X1 = df_boxcox.iloc[:ntrain, :]
test1 = df_boxcox.iloc[ntrain:, :]

# Feature set without Box-Cox. These slices must come from df_skew; taking
# them from df_boxcox again makes the two experiments identical.
X2 = df_skew.iloc[:ntrain, :]
test2 = df_skew.iloc[ntrain:, :]

In [87]:
# Two separate, identically configured forests: rf and rf_.
rf, rf_ = (
    RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
    for _ in range(2)
)

In [88]:
# Fit each forest on its own training slice against the same target y.
# The second call is the cell's last expression, so the notebook echoes
# the repr of the fitted rf_ estimator.
rf.fit(X1, y)
rf_.fit(X2, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [89]:
# Predictions of rf (fitted on X1) for the rows in test1.
rf.predict(test1)

array([137027.74288987, 137429.55294609, 154077.24296625, ...,
       137429.55294609, 137027.74288987, 245950.77219986])

In [90]:
# test2 pairs with rf_, the forest fitted on X2; predicting it with rf
# would score the wrong model.
rf_.predict(test2)

array([137027.74288987, 137429.55294609, 154077.24296625, ...,
       137429.55294609, 137027.74288987, 245950.77219986])