# Data processing

In [176]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from imblearn.under_sampling import RandomUnderSampler

import pandas as pd

import numpy as np

In [177]:
def remove_low_variance(df, threshold=0.0):
    """
    Remove the columns of ``df`` whose variance is <= ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed as-is to ``VarianceThreshold.fit``.
    threshold : float, default 0.0
        Variance cut-off handed to ``VarianceThreshold``. The default only
        drops constant columns.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` that the selector keeps, in their original order.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(df)
    # Select with the boolean support mask so the result does not depend on
    # looking column labels up a second time.
    return df.loc[:, selector.get_support()]


def remove_low_variance_wmeta(df, cols_meta, threshold=0.0):
    """
    Remove low-variance feature columns while keeping the meta columns.

    The variance filter is fitted on ``df`` without ``cols_meta``; the meta
    columns are then appended unchanged after the surviving features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both feature and meta columns.
    cols_meta : str or list of str
        Columns excluded from the variance check and always kept.
    threshold : float, default 0.0
        Feature columns with variance <= ``threshold`` are removed.

    Returns
    -------
    pandas.DataFrame
        Surviving feature columns followed by the meta columns.
    """
    features = df.drop(columns=cols_meta)
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(features)
    # The support mask refers to positions in ``features`` (meta columns
    # removed), so it must be applied to ``features`` and not to ``df``.
    kept = features.loc[:, selector.get_support()]
    # Build a new frame instead of assigning into a slice of ``df``.
    return pd.concat([kept, df[cols_meta]], axis=1)


def remove_zeros_rows(df, cols):
    """
    Keep the rows of ``df`` that have at least one non-zero value in ``cols``.
    """
    has_nonzero = df[cols].ne(0).any(axis=1)
    return df.loc[has_nonzero]


def std_scaler(df):
    """
    Fit a ``StandardScaler`` on ``df`` and return the scaled values.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and the same row index as
        ``df``, so the result stays aligned when concatenated with other
        frames derived from the same rows.
    """
    sc = StandardScaler()
    return pd.DataFrame(sc.fit_transform(df.values), columns=df.columns, index=df.index)


def rs_scaler(df):
    """
    Fit a ``RobustScaler`` on ``df`` and return the scaled values.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and the same row index as
        ``df``, so the result stays aligned when concatenated with other
        frames derived from the same rows.
    """
    rsc = RobustScaler()
    return pd.DataFrame(rsc.fit_transform(df.values), columns=df.columns, index=df.index)


def text_tfidf(df):
    """
    Turn a text column into a dense TF-IDF feature frame.

    Parameters
    ----------
    df : pandas.Series
        One text document per row (its ``.values`` are fed to the vectorizer).

    Returns
    -------
    pandas.DataFrame
        One column per vocabulary term, indexed like ``df`` so it can be
        concatenated column-wise with the frame the text came from.
    """
    vec = TfidfVectorizer()
    X = vec.fit_transform(df.values)
    try:
        feature_names = vec.get_feature_names_out()
    except AttributeError:
        # Older scikit-learn releases only provide get_feature_names().
        feature_names = vec.get_feature_names()
    return pd.DataFrame(X.toarray(), columns=feature_names, index=df.index)
  

def _column_extreme(column, how):
    """Return the ``how`` ('min' or 'max') of the non-null values, or NaN."""
    values = column.dropna()
    if values.empty:
        return np.nan
    try:
        return getattr(values, how)()
    except TypeError:
        # Values that cannot be ordered against each other (e.g. str vs int).
        return np.nan


def cols_eda(df):
    """
    Build a per-column summary table for a quick exploratory look at ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; numeric and non-numeric columns may be mixed.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with:

        * ``null_sum`` - number of null values
        * ``null_%``   - fraction of null values (0 to 1)
        * ``dtypes``   - column dtype
        * ``count``    - number of non-null values
        * ``mean`` / ``median`` - numeric columns only, NaN elsewhere
        * ``min`` / ``max`` - computed on the non-null values of every
          column; NaN when the column is empty or its values cannot be
          compared with each other
    """
    eda_df = {
        'null_sum': df.isnull().sum(),
        'null_%': df.isnull().mean(),
        'dtypes': df.dtypes,
        'count': df.count(),
        # numeric_only=True: without it, text columns make these raise.
        'mean': df.mean(numeric_only=True),
        'median': df.median(numeric_only=True),
        # Per-column extremes so one text column with nulls cannot break
        # the summary of the others.
        'min': pd.Series([_column_extreme(col, 'min') for _, col in df.items()],
                         index=df.columns),
        'max': pd.Series([_column_extreme(col, 'max') for _, col in df.items()],
                         index=df.columns),
    }
    return pd.DataFrame(eda_df)

In [178]:
# Toy dataset: an identifier, three numeric features (f3 is constant), an
# ordinal feature (f4), a nominal feature (f5), a free-text feature (f6) and
# a binary label.
dict_ = dict(
    id=list('abcde'),
    f1=[2, np.nan, 3, 4, np.nan],
    f2=[4, np.nan, 6, 7, 8],
    f3=[1, 1, 1, 1, 1],
    f4=['low', 'medium', np.nan, 'high', 'low'],
    f5=['blue', 'green', 'red', 'purple', np.nan],
    f6=['una mattina', 'mi son svegliato', 'o bella ciao', 'bella ciao', 'bella ciao'],
    label=[1, 0, 1, 1, 1],
)
df = pd.DataFrame(dict_)
df

Unnamed: 0,f1,f2,f3,f4,f5,f6,id,label
0,2.0,4.0,1,low,blue,una mattina,a,1
1,,,1,medium,green,mi son svegliato,b,0
2,3.0,6.0,1,,red,o bella ciao,c,1
3,4.0,7.0,1,high,purple,bella ciao,d,1
4,,8.0,1,low,,bella ciao,e,1


In [179]:
num_vars = ['f1', 'f2', 'f3']
cat_vars = ['f4', 'f5', 'f6']
meta_vars = ['id', 'label']

# ---- numerical variables ----
# Median-impute, drop constant columns, then scale.
df_num = df[num_vars].copy()
df_num = df_num.fillna(df_num.median())
df_num = remove_low_variance(df_num)
# Scaling: YOU DO NOT NEED THIS FOR RANDOM FOREST
df_num = std_scaler(df_num)  # standard scaler
df_num = rs_scaler(df_num)  # robust scaler

# ---- categorical variables ----
df_cat = df[cat_vars].copy()
df_cat = df_cat.fillna('missing')

## ordinal: encode f4 as integer codes that follow the declared category order
cat = pd.Categorical(
    df_cat['f4'],
    categories=['missing', 'low', 'medium', 'high'],
    ordered=True,
)
labels, unique = pd.factorize(cat, sort=True)
df_cat['f4'] = labels

## text: replace f6 with its TF-IDF columns
df_cat_text = text_tfidf(df_cat['f6'])
df_cat = pd.concat([df_cat.drop(columns='f6'), df_cat_text], axis=1)

## cardinal: one-hot encode the remaining string columns
df_cat = pd.get_dummies(df_cat, drop_first=True)

# ---- put df back together ----
df_final = pd.concat([df_num, df_cat, df[meta_vars]], axis=1)
df_final


# EDA: per-column summary of the raw frame
cols_eda(df)


# under sampling
# Randomly drop majority-class rows so both classes end up equally frequent.
rus = RandomUnderSampler(random_state=42)
# fit_resample replaces the fit_sample alias, which no longer exists in
# current imbalanced-learn releases.
X, y = rus.fit_resample(df_final.drop('label', axis=1), df_final['label'])
df_balanced = pd.concat([X, y], axis=1)
df_balanced


# PCA
# X still carries the string identifier column; PCA and the scaler need
# purely numeric input, so it is left out here.
X_numeric = X.drop(columns=['id'], errors='ignore')
sc = StandardScaler()
X_std = sc.fit_transform(X_numeric)
# The number of components cannot exceed the number of samples or features,
# so the requested 16 is capped by the shape of the data.
X_pca = PCA(n_components=min(16, *X_std.shape)).fit_transform(X_std)


# # remove low variance with meta
# cols_meta = ['id','label']
# df = remove_low_variance_wmeta(df, cols_meta)

Unnamed: 0,f1,f2,f4,bella,ciao,mattina,mi,son,svegliato,una,f5_green,f5_missing,f5_purple,f5_red,id,label
0,-1.581139,-2.5,1,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0,0,0,0,a,1
1,0.0,0.0,2,0.0,0.0,0.0,0.57735,0.57735,0.57735,0.0,1,0,0,0,b,0
2,0.0,-0.5,0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0,0,0,1,c,1
3,1.581139,0.5,3,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0,0,1,0,d,1
4,0.0,1.5,1,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0,1,0,0,e,1


# Model

In [180]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, auc, roc_curve
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm_notebook as tqdm

In [None]:
# Model on the processed frame: the raw ``df`` still contains NaNs and string
# columns, which the random forest cannot be fitted on.
X = df_final.drop(['label', 'id'], axis=1)
y = df_final['label']
# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline random forest with out-of-bag scoring enabled.
forest = RandomForestClassifier(oob_score=True).fit(X_train, y_train)

# Accuracy on the training rows and the out-of-bag estimate.
print(f'training accuracy: {forest.score(X_train, y_train)}')
print(f'oob score: {forest.oob_score_}')

# Held-out performance, hard predictions and class-1 probabilities, which the
# plotting cells further down consume.
print(f'test accuracy: {forest.score(X_test, y_test)}')
y_pred = forest.predict(X_test)
pred_probs = forest.predict_proba(X_test)[:, 1]

## grid search with oob

In [None]:
def grid_search(params, x, y):
    """
    Exhaustive random-forest hyperparameter search scored by the OOB score.

    Every combination of the candidate values in ``params`` is fitted on
    ``(x, y)``; the combination with the highest out-of-bag score is kept
    (the first one encountered wins ties) and a forest with those settings
    is fitted and returned.

    Parameters
    ----------
    params : dict
        Must contain lists of candidate values under ``'n_estimators'``,
        ``'max_depth'``, ``'min_samples_leaf'``, ``'max_features'``,
        ``'min_samples_split'`` and ``'criterion'``, plus the scalars
        ``'random_state'`` and ``'oob_score'``. The search reads
        ``forest.oob_score_``, so ``'oob_score'`` is expected to be True.
    x, y
        Training features and labels, passed straight to ``forest.fit``.

    Returns
    -------
    RandomForestClassifier
        Forest fitted on ``(x, y)`` with the best parameters found.

    Raises
    ------
    ValueError
        If the grid is empty, i.e. one of the candidate lists has no entries.
    """
    from itertools import product

    best_score = float('-inf')
    best_params = {}
    print('hyperparameters tuning starts...')
    for n in tqdm(params['n_estimators']):
        # Same visiting order as nested loops: max_depth outermost,
        # criterion innermost.
        inner_grid = product(params['max_depth'], params['min_samples_leaf'],
                             params['max_features'], params['min_samples_split'],
                             params['criterion'])
        for max_, min_, max_f, min_split, criterion in inner_grid:
            candidate = {
                'n_estimators': n,
                'max_depth': max_,
                'min_samples_leaf': min_,
                'max_features': max_f,
                'min_samples_split': min_split,
                'criterion': criterion,
            }
            forest = RandomForestClassifier(random_state=params['random_state'], n_jobs=-1,
                                            oob_score=params['oob_score'], **candidate)
            forest.fit(x, y)
            if forest.oob_score_ > best_score:
                print(f'updated OOB score: {forest.oob_score_}')
                best_score = forest.oob_score_
                best_params = candidate

    if not best_params:
        raise ValueError('empty hyperparameter grid: every candidate list in params needs at least one value')

    print(f'best OOB score is {best_score}')
    print(f'best parameters {best_params}')

    # random_state / oob_score come from ``params``; this is a plain function,
    # so there is no ``self`` to read them from.
    forest = RandomForestClassifier(random_state=params['random_state'], n_jobs=-1,
                                    oob_score=params['oob_score'], **best_params)
    return forest.fit(x, y)

In [None]:
# Candidate values for the OOB grid search. List-valued entries are searched
# over; 'random_state' and 'oob_score' are passed through unchanged.
params = {
    'max_depth': [None],
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' is no longer
    # accepted by current scikit-learn releases.
    'max_features': ['sqrt'],
    'min_samples_leaf': [2],
    'n_estimators': [100, 1000],
    'min_samples_split': [2],
    'criterion': ['entropy'],
    'random_state': 42,
    'oob_score': True
}

forest = grid_search(params, X_train, y_train)

print(f'The training accuracy is {forest.score(X_train, y_train)}')
print(f'The test accuracy is {forest.score(X_test, y_test)}')

# Plots

## feature importance

In [None]:
def plot_feature_importance(df, forest, n):
    """
    Bar plot of the ``n`` most important features with their cumulative sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns name the features, in the order the forest was
        trained on.
    forest
        Fitted estimator exposing ``feature_importances_``.
    n : int
        Number of top features to show. Values larger than the number of
        features are reduced to the number of features.
    """
    features = df.columns
    importances = forest.feature_importances_
    # Asking for more bars than there are features would make the x positions
    # and the bar heights differ in length.
    n = min(n, len(importances))
    top = np.argsort(importances)[::-1][:n]  # most important first
    positions = np.arange(0, n, 1)

    plt.figure(figsize=(8, 6))
    plt.bar(positions, importances[top], align='center', width=.5, alpha=.5, linewidth=1.0, edgecolor='k',
            label='individual feature importance')
    plt.step(positions, np.cumsum(importances[top]), where='mid', label='cumulative feature importance')
    plt.xticks(positions, features[top], rotation=90)
    plt.ylabel('Importance')
    plt.legend(loc='best')

In [None]:
# Plot the 20 highest feature importances of the fitted forest, labelled with
# the training frame's column names.
plot_feature_importance(X_train, forest, 20)

## confusion matrix

In [None]:
def plot_confmat(y_true, y_pred):
    """
    Draw the confusion matrix as a heatmap with the classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Notes
    -----
    The report uses the hard-coded names 'Failure [0]' and 'Success [1]',
    i.e. it assumes a binary 0/1 problem.
    """
    # Use the argument, not the notebook-level ``y_test``, so the plot
    # matches whatever labels the caller passes in.
    confmat = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(5, 5))
    ax = sns.heatmap(confmat, annot=True, annot_kws={'size': 20}, fmt=".0f",
                     linewidths=.5, square=True, cmap='Blues', cbar_kws={"shrink": .5})
    plt.ylabel('True label', size=20)
    plt.xlabel('Predicted label', size=20)
    plt.tick_params(axis='both', labelsize=20)
    # Enlarge the colour-bar tick labels to match the rest of the figure.
    cbar = ax.collections[0].colorbar
    cbar.ax.tick_params(labelsize=20)
    # Print the text report to the right of the heatmap.
    plt.text(2, 0.5, classification_report(y_true, y_pred, target_names=['Failure [0]', 'Success [1]']), size=15)

In [None]:
# Confusion matrix and classification report for the test-set predictions.
plot_confmat(y_test, y_pred)

## ROC curve

In [None]:
def plot_ROC_curve(y_true, pred_probs):
    """
    Plot the ROC curve with its area, plus two reference curves.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; label 1 is treated as the positive class.
    pred_probs : array-like
        Scores for the positive class, aligned with ``y_true``.
    """
    plt.figure(figsize=(8, 8))
    fpr, tpr, thresholds = roc_curve(y_true, pred_probs, pos_label=1)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, linewidth=2, label=f'ROC (area= {roc_auc:.2f})')
    # Reference curves: the diagonal (random guessing) and the perfect classifier.
    plt.plot([0, 1], [0, 1], linestyle='--', linewidth=3, color=(0.6, 0.6, 0.6), label='random guessing')
    plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', linewidth=3, color='k', label='perfect performance')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('false positive rate', size=20)
    plt.ylabel('true positive rate', size=20)
    plt.legend(loc="lower right", fontsize=20)

In [None]:
# ROC curve of the class-1 probabilities against the test labels.
plot_ROC_curve(y_test, pred_probs)

## Prediction Probability Distributions

In [None]:
def plot_probs(probs0, probs1):
    """
    Overlay the density of predicted probabilities for the two true classes.

    ``probs0`` is drawn with the label 'Failure' and ``probs1`` with the label
    'Success'; both are shaded KDE curves on a shared axis limited to [0, 1].
    """
    plt.figure(figsize=(10, 6))
    for class_probs, class_name in ((probs0, 'Failure'), (probs1, 'Success')):
        sns.distplot(class_probs, hist=False, kde_kws={"shade": True}, label=class_name)
    plt.xlabel('Probability of success', size=20)
    plt.legend(loc='best', fontsize=15)
    plt.xlim([0.0, 1])

In [None]:
# Pair each test label with its predicted class-1 probability, then split the
# probabilities by true class for the density plot.
df_probs = pd.DataFrame({'y_true': y_test.values, 'probs': pred_probs})
probs0 = df_probs.loc[df_probs['y_true'] == 0, 'probs']
probs1 = df_probs.loc[df_probs['y_true'] == 1, 'probs']
plot_probs(probs0, probs1)

## Correlation matrix

In [None]:
# Correlate numeric (and boolean) columns only; text columns cannot be
# correlated and make DataFrame.corr raise on recent pandas.
corr = df.select_dtypes(include=['number', 'bool']).corr()
# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin ``bool`` replaces the ``np.bool`` alias, which NumPy removed.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(250, 9, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-0.7, vmax=0.7, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

## Correlation with target

In [None]:
# Correlation of every numeric column with the target. The frame is expected
# to hold features only, with the label as its LAST column: iloc[:-1] drops
# the label's correlation with itself.
# Non-numeric columns are excluded up front because they cannot be correlated
# and make corrwith raise on recent pandas.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlations = numeric_df.corrwith(df['Target']).iloc[:-1].to_frame()
# Rank by absolute strength but keep the signed value for display.
correlations['abs'] = correlations[0].abs()
sorted_correlations = correlations.sort_values('abs', ascending=False)[0]
fig, ax = plt.subplots(figsize=(6, 10))
# Show the 10 strongest correlations.
sns.heatmap(sorted_correlations[:10].to_frame(), cmap='coolwarm', annot=True, vmin=-0.75, vmax=0.75, ax=ax);

## Most correlated features distribution

In [None]:
# 3 x 3 grid: one panel for each of the 9 most correlated features, with the
# feature's distribution drawn separately for Target == 0 and Target == 1.
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
for i, col in enumerate(sorted_correlations.index[:9]):
    grid_row, grid_col = divmod(i, 3)
    panel = axes[grid_row][grid_col]
    for target_value in (0, 1):
        sns.distplot(df.loc[df['Target'] == target_value, col], label=str(target_value), ax=panel)
plt.legend()
plt.tight_layout()

## Count plots

In [None]:
# Number of rows per class. The column is passed by keyword: recent seaborn
# versions take ``data`` as the first positional argument, so a positional
# column name would collide with ``data=df``.
sns.countplot(x='label', data=df)

## Dist plots with cumulative

In [None]:
# Histogram of one feature together with its cumulative step histogram.
# FEATURE is a placeholder column name.
plt.figure(figsize=(10, 6))
kwargs = dict(cumulative=True, histtype="step")
sns.distplot(df['FEATURE'], hist_kws=kwargs, kde=False, bins=75, label='cumulative FEATURE')
sns.distplot(df['FEATURE'], kde=False, bins=75, label='individual FEATURE')
plt.xlabel('x label')
plt.ylabel('y label')
plt.xlim(0, 1400)
plt.legend(loc='best');

## Bar plots with hue

In [None]:
# Grouped bar plot: one group per 'date', one bar per 'game_name'.
fig = plt.figure(figsize=(8, 6))
# x and y are passed by keyword; recent seaborn versions no longer accept
# them positionally.
splot = sns.barplot(x='date', y='count', hue='game_name', data=df)
# Tilt the x tick labels so they do not overlap.
for item in splot.get_xticklabels():
    item.set_rotation(45)