In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from catboost import CatBoostRegressor
from scipy.stats import chisquare, chi2_contingency
from sklearn.metrics import mean_squared_error
import os
import gc
import shap

%matplotlib inline

In [2]:
datapath= 'train.csv'
def get_type(datapath):
    extension = datapath.split('.')[1]
    assert datapath.endswith(tuple(['xls', 'xlsx', 'csv'])), 'Our system currently only accepts csv, xls or xlsx extensions, your input was {}'.format(extension)
    if 'csv' in datapath:
        seplist = [',', '|', ';', '\t']
        return seplist
    elif 'xls'in datapath or 'xlsx' in datapath:
        xl = pd.ExcelFile(datapath)
        return xl.sheet_names
    else:
        print('Our system currently only accepts csv, xls or xlsx extensions')

def read_data(datapath, select):
    extension = datapath.split('.')[1]
    assert datapath.endswith(tuple(['xls', 'xlsx', 'csv'])), 'Our system currently only accepts csv, xls or xlsx extensions, your input was {}'.format(extension)
    if 'csv' in datapath:
        return pd.read_csv(datapath, sep=select)
    elif 'xls'in datapath or 'xlsx' in datapath:
        return pd.read_excel(datapath, sheet=select)
    else:
        print('Our system currently only accepts csv, xls or xlsx extensions')
    

In [3]:
data = read_data(datapath, get_type(datapath)[0])

In [4]:
columns = list(data)
target = 'SalePrice'

In [5]:
## Remove IDs and single values
a = data.shape[1]
for col in list(data):
    if data[col].nunique() ==1:
        data.drop(columns=[col], inplace=True)
    elif data[col].nunique() == len(data):
        data.drop(columns=[col], inplace=True)
    else:
        None

b = data.shape[1]
print("data was dropped from {} to {}".format(a, b))

data was dropped from 81 to 80


In [6]:
addition = [np.nan] * (data.shape[1]-1)
additional = pd.concat([pd.DataFrame(addition), pd.DataFrame([0])], ignore_index=True).T
additional.columns = data.columns.values

data = pd.concat([data, additional], ignore_index=True)


In [7]:
data = data.dropna(axis=1, thresh=int(np.ceil(0.2*len(data))))
y = data[target]
x = data.drop([target], axis=1)

In [9]:
initial_use = list(x)

In [10]:
def get_num_ob(x):
    x_num = x.select_dtypes(exclude='object')
    x_ob = x.select_dtypes(include='object')
    return x_num, x_ob
def imput_fit_transform(x):
    imput = SimpleImputer(strategy='median')
    x_numeric_imp = pd.DataFrame(imput.fit_transform(x), columns = x.columns, index = x.index)
    return imput, x_numeric_imp

def imput_transform(x, imput):
    x_numeric_imp = pd.DataFrame(imput.transform(x), columns = x.columns, index = x.index)
    return x_numeric_imp
    

In [11]:

xnum, xobj = get_num_ob(x)

In [12]:
categories = list(xobj)

In [13]:
from collections import defaultdict
d = defaultdict(LabelEncoder)

xobj.fillna('Unknown', inplace=True)

## get labeled
def le_fit_transform(df):
    le = LabelEncoder()
    fit = df.apply(lambda x: d[x.name].fit_transform(x))
    return fit, d
def le_transform(df, le):
    x_transformed = df.apply(lambda x: le[x.name].transform(x))
    return x_transformed

x_obj_le, le = le_fit_transform(xobj)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
x_con = pd.concat([xnum, x_obj_le], axis=1)

In [15]:
droppage = len(x_con)-1
x_con.drop(droppage, inplace=True)
y.drop(droppage, inplace=True)

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x_con, y, test_size=0.2, random_state=42)

In [17]:
x_train_obj = x_train[categories]
x_train_num = x_train[set(list(x_train)) - set(categories)]

In [18]:
def multicol_filter(df, min_v, max_v):
    columnss = np.full((df.shape[0],), True, dtype=bool)
    for i in range(df.shape[0]):
        for j in range(i+1, df.shape[0]):
            if df.iloc[i,j] >=max_v or df.iloc[i,j] <=min_v:
                if columnss[j]:
                    columnss[j] = False

    ss = df.head(1)
    selected_columnss = ss.columns[columnss]
    ts = list(ss[selected_columnss])
    return ts

In [19]:

ze = x_train_num.corr()
t = multicol_filter(ze, -0.8, 0.8)

xnum_clean = x_train_num[t]
print(x_train_num.shape, xnum_clean.shape)

(1168, 36) (1168, 32)


In [20]:
def cramers_V(var1,var2) :
    crosstab =np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building
    stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test
    obs = np.sum(crosstab) # Number of observations
    mini = min(crosstab.shape)-1 # Take the minimum value between the columns and the rows of the cross table
    return (stat/(obs*mini))

def chi_test(data, categories):
    drop_cols = []
    for j in range(len(categories)-1):
            for k in range(j+1, len(categories)):

                pvalue = chi2_contingency(pd.crosstab(data[categories[j]],data[categories[k]]))[1]
                if pvalue < 0.05:
                    if categories[k] in drop_cols:
                        None
                    else:
                        drop_cols.append(categories[k])
                else:
                    None
    return set(categories) - set(np.unique(drop_cols))

def cramer_test(data, max_v):
    rows= []
    for var1 in data:
        col = []
        for var2 in data :
            cramers =cramers_V(data[var1], data[var2]) # Cramer's V test
            col.append(round(cramers,2)) # Keeping of the rounded value of the Cramer's V  
        rows.append(col)

    cramers_results = np.array(rows)
    df = pd.DataFrame(cramers_results, columns = data.columns, index =data.columns)
    return multicol_filter(df, -max_v, max_v)
def categorical_filter(data, max_v, method = 'chi2'):
    '''
    filter the categorical features using either cramers v, chi squared or intersection of both
    data = dataframe to add
    max_v = used for the common benchmark of colinearity value
    method = string input accepting either 'chi2', 'cramer', or 'both'
    '''
    assert method in ['chi2', 'cramer', 'both'], 'method not understandable, please use either chi2, cramer or both'
    categories = list(data)
    
    if method == 'chi2':
        keep_cols = chi_test(data, categories)
        
    elif method == 'cramer':
        keep_cols = cramer_test(data, max_v)
    
    elif method == 'both':
        keep_cols_chi = chi_test(data, categories)
        keep_cols_cv = cramer_test(data, max_v)
        del_chi = set(categories) - set(keep_cols_chi)
        del_cv = set(categories) - set(keep_cols_cv)
        del_both = set(del_chi).intersection(set(del_cv))
        keep_cols = set(categories) -  set(del_both)
    else:
        print("error with method")
    
    return keep_cols


In [21]:
filterer = categorical_filter(x_train_obj, 0.8, method = 'both')

In [22]:
x_obj_clean = x_train_obj[filterer]

In [23]:
# Concat back to x_train
x_train_clean = pd.concat([xnum_clean, x_obj_clean], axis=1)
x_test = x_test[x_train_clean.columns]

In [24]:
imputer, x_train_imputed = imput_fit_transform(x_train_clean)
x_test_imputed = imput_transform(x_test, imputer)

### Machine Learning

In [25]:
dt = DecisionTreeRegressor(random_state=108)
rf = RandomForestRegressor(random_state=108)
gb = GradientBoostingRegressor(random_state=108)
cb = CatBoostRegressor(random_state=108, verbose=False)
dt_param = {'max_depth':[1, 3, 5, 10], 'min_samples_split':[2,4,8,16], 'min_samples_leaf':[1,2,4,6,8,10]}


n_estimators = [10, 25, 50, 100]
max_features = ['auto', 'sqrt']
max_depth = [3, 5, 10, 12, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
random_strength = [0.0001, 0.001, 0.1, 1]
border_count = [1, 5, 10, 25, 50, 100, 255]
l2_leaf_reg = [1, 2, 3, 4, 5, 6, 10, 15, 30]
bagging_temperature = [0, 1, 2, 3, 4, 5]

rf_param = {'n_estimators': n_estimators, 'max_features':max_features, 'max_depth':max_depth, 'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf}

learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
gb_param = {'learning_rate':learning_rates, 'n_estimators': n_estimators, 'max_depth':max_depth, 'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf, 'max_features':max_features}
cb_param = {'learning_rate':learning_rates, 'iterations': n_estimators, 'depth':max_depth, 'random_strength':random_strength,'border_count':border_count, 'l2_leaf_reg':l2_leaf_reg, 'bagging_temperature':bagging_temperature}

In [29]:

name = []
k = []
tr_auc = []
te_auc = []
method = []
features = []
trans = dict()
for data_used in [[x_train_imputed, y_train, 'normal']]:
    x_use = data_used[0]
    y_use = data_used[1]
    gdt = RandomizedSearchCV(dt, dt_param, n_jobs=-1, scoring='neg_root_mean_squared_error', n_iter=10, random_state=108)
    grf = RandomizedSearchCV(rf, rf_param, n_jobs=-1, scoring='neg_root_mean_squared_error', n_iter=10, random_state=108)
    ggb = RandomizedSearchCV(gb, gb_param, n_jobs=-1, scoring='neg_root_mean_squared_error', n_iter=10, random_state=108)
    gcb = RandomizedSearchCV(cb, cb_param, n_jobs=-1, scoring='neg_root_mean_squared_error', n_iter=20, random_state=108)
    new_dt = DecisionTreeRegressor(**gdt.fit(x_use, y_use).best_params_, random_state=108)
    
    new_rf = RandomForestRegressor(**grf.fit(x_use, y_use).best_params_, random_state=108)
    
    new_gb = GradientBoostingRegressor(**ggb.fit(x_use, y_use).best_params_, random_state=108)
    
    new_cb = CatBoostRegressor(**gcb.fit(x_use, y_use).best_params_, random_state=108, verbose=False)


    for algo in [[new_dt, 'dt'], [new_rf, 'rf'], [new_gb, 'gb'], [new_cb, 'cb']]:
        algo[0].fit(x_use, y_use)
        current = np.inf
        num = x_train_imputed.shape[1]
        used_feature = list(x_use)
        usee = pd.DataFrame({'params':x_use.columns, 'importances':algo[0].feature_importances_}).sort_values('importances', ascending=False)
        for kbest in [5, 10, 15, 25, 50]:
            uses = usee.head(kbest)['params']
            

            x_tr_try= x_use[uses]
            
            hold = np.mean(-cross_val_score(estimator=algo[0], X=x_tr_try, y=y_use, cv = 5, scoring = 'neg_root_mean_squared_error'))
            if hold < current:
                current = hold
                num = kbest       
                sampling = data_used[2]
                used_feature = list(uses)
            else:
                None

        x_tr_fin = x_use[usee.head(num)['params']]
        x_te_fin = x_test_imputed[usee.head(num)['params']]
        
        y_pred = algo[0].fit(x_tr_fin, y_use).predict(x_te_fin)
        store = mean_squared_error(y_test, y_pred)**0.5
        name.append(algo[1])
        k.append(num)
        tr_auc.append(current)
        te_auc.append(store)
        method.append(sampling)
        features.append(used_feature)

result = pd.DataFrame({'algo':name, 'n_features':k, 'train_RMSE':tr_auc, 'test_RMSE':te_auc, 'method':method, 'features':features}).sort_values('test_RMSE', ascending=True)
result.sort_values('test_RMSE', ascending=True).head(1)



Unnamed: 0,algo,n_features,train_RMSE,test_RMSE,method,features
2,gb,15,29685.056366,29492.666642,normal,"[OverallQual, GrLivArea, 2ndFlrSF, TotalBsmtSF..."


In [30]:
algo_used = result['algo'].iloc[0]
features_used = result['features'].iloc[0]

In [31]:
if algo_used == 'dt':
    do_train = new_dt
elif algo_used == 'gb':
    do_train = new_gb
elif algo_used == 'rf':
    do_train = new_rf
elif algo_used == 'cb':
    do_train = new_cb

## Prepare to retrain using all dataset

In [32]:
# we happened to already do our part in the x_con, so we will reuse x_con as our main retraining dataset.
# Since we already do label encoding, we no longer need to label encode it again
imputer, x_imputed = imput_fit_transform(x_con)


In [33]:
# use only the best features from train_test_split
x_imputed_use = x_imputed[features_used]

In [34]:
x_imputed_use.shape

(1460, 15)

In [35]:
do_train.fit(x_imputed_use, y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls',
                          max_depth=None, max_features='auto',
                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                          min_impurity_split=None, min_samples_leaf=2,
                          min_samples_split=10, min_weight_fraction_leaf=0.0,
                          n_estimators=50, n_iter_no_change=None,
                          presort='deprecated', random_state=108, subsample=1.0,
                          tol=0.0001, validation_fraction=0.1, verbose=0,
                          warm_start=False)

In [55]:
mean_squared_error(do_train.predict(x_imputed_use), y)**0.5

1747.462100379974

# Prepare data to predict

In [40]:
pred_datapath= 'train.csv'
pred_data = read_data(datapath, get_type(datapath)[0])
pred_data = pred_data[list(x_con)]

In [41]:
pred_data_obj = pred_data[categories]
pred_data_obj.fillna('Unknown', inplace=True)
pred_data_num = pred_data[set(pred_data) - set(categories)]

In [42]:
pred_data_obj_le = le_transform(pred_data_obj, le)

In [75]:
pred_data_con = pd.concat([pred_data_num, pred_data_obj_le], axis=1)

In [78]:
pred_data_con = pred_data_con[list(x_con)]
pred_con_imputed = imput_transform(pred_data_con, imputer)

In [80]:
pred_data['prediction_result'] = do_train.predict(pred_con_use)