## Implementing hyperparameter tuning

In [None]:
import warnings

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Silence every warning for the rest of the session. The second filter is
# already covered by the first and is kept only to preserve the original setup.
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"



# Load the three CSV files and place them side by side in a single frame.
path="../../Data Processing/Data/"  # data folder, relative to the working directory
numerical, categorical, targets = (
    pd.read_csv(path + file_name)
    for file_name in ('numerical.csv', 'categorical.csv', 'target.csv')
)
data = pd.concat([numerical, categorical, targets], axis=1)

In [None]:
# Seed shared by the train/test split and the random forest so runs are repeatable.
RAND_STATE = 42
# Fraction of the rows held out as the test set (passed as `test_size`).
TT_RATIO = 0.25

In [None]:
# Class balance of the binary label TARGET_B.
data['TARGET_B'].value_counts()

In [None]:
# Separate the binary label from the remaining columns.
y = data['TARGET_B']
X = data.drop(columns=['TARGET_B'])

# Partition the remaining columns by dtype: numeric vs. object (string) columns.
numericalX = X.select_dtypes(include=np.number)
categoricalX = X.select_dtypes(include=object)

In [None]:
# One-hot encode the categorical columns (SMOTE and scikit-learn's random forest
# both need purely numeric input).
from sklearn.preprocessing import OneHotEncoder

# drop='if_binary' keeps a single indicator column for features with exactly two
# categories; features with more categories keep one column per category.
encoder = OneHotEncoder(drop='if_binary').fit(categoricalX)
encoded_categorical = pd.DataFrame(
    encoder.transform(categoricalX).toarray(),
    # Readable "<feature>_<category>" labels instead of anonymous integers, so
    # the feature-importance table can be interpreted.
    columns=encoder.get_feature_names_out(),
    # Reuse the source row labels so the concat below aligns row-for-row with
    # numericalX even if the index is not a plain 0..n-1 range.
    index=categoricalX.index,
)
X = pd.concat([numericalX, encoded_categorical], axis=1)  # rejoin numeric + encoded
X.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle and split features and binary labels together; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TT_RATIO,
    random_state=RAND_STATE,
)

# TARGET_D is still among the feature columns here: keep a copy of it per split
# under the *_regression names, then remove it from the feature frames.
y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

X_train = X_train.drop(columns=['TARGET_D'])
X_test = X_test.drop(columns=['TARGET_D'])

In [None]:
# Remove training rows that contain any missing feature value, together with the
# labels carrying the same row index (both come from the same shuffled split).
has_na = X_train.isna().any(axis=1)
na_idx = X_train.index[has_na]
X_train = X_train.drop(index=na_idx)
y_train = pd.DataFrame(y_train).drop(index=na_idx)

In [None]:
# Hold both label sets as single-column DataFrames.
y_train, y_test = (pd.DataFrame(labels) for labels in (y_train, y_test))

In [None]:
# Normalise the column labels of all four split frames to strings.
for frame in (X_train, y_train, X_test, y_test):
    frame.columns = frame.columns.astype(str)

In [None]:
from imblearn.over_sampling import SMOTE

# Seed SMOTE with the notebook-wide RAND_STATE: without it the synthetic minority
# samples differ on every run, so the grid search, CV scores and feature
# importances below would not be reproducible.
smote = SMOTE(random_state=RAND_STATE)

# NOTE(review): the training set is resampled *before* cross-validation, so the
# validation folds used later also contain synthetic samples and the CV scores
# are likely optimistic. Consider an imblearn Pipeline (SMOTE + classifier) so
# resampling happens inside each fold - confirm whether that is wanted here.
X_train, y_train = smote.fit_resample(X_train, y_train)
y_train.value_counts()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter grid: 2 forest sizes x 2 depths, every other setting fixed to a
# single value (4 candidate combinations in total).
# 'max_samples' and 'bootstrap' were considered but are not part of the search.
param_grid = {
    'n_estimators': [50, 100],
    'min_samples_split': [80],
    'min_samples_leaf': [50],
    'max_features': ['sqrt'],
    'max_depth': [3, 5],
}

# Base estimator whose hyperparameters are tuned; seeded for repeatable forests.
clf = RandomForestClassifier(random_state=RAND_STATE)

In [None]:
# 5-fold search over every combination in param_grid, run on all available cores;
# training scores are recorded alongside the validation scores in cv_results_.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
# Flatten the single-column label frame to a 1-D array before fitting.
grid_search.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Best-scoring hyperparameter combination found by the grid search.
best_params = grid_search.best_params_
best_params

In [None]:
# Tabulate the per-candidate results of the grid search (one row per combination).
pd.DataFrame.from_dict(grid_search.cv_results_)

<b>Please also look at randomized search (scikit-learn's `RandomizedSearchCV`), an alternative algorithm comparable to `GridSearchCV`.</b>

In [None]:
from sklearn.model_selection import cross_val_score

# Rebuild the forest with the tuned hyperparameters and the shared seed.
clf = RandomForestClassifier(random_state=RAND_STATE, **best_params)

# Pass the labels as a flat 1-D array, as the grid-search fit does. y_train is a
# single-column DataFrame; handing it over as-is makes scikit-learn treat it as a
# column vector and emit a DataConversionWarning (hidden here by the global
# warnings filter).
cross_val_scores = cross_val_score(clf, X_train, y_train.values.ravel(), cv=5)
print(np.mean(cross_val_scores))  # mean score over the 5 folds

## Feature Importance

<b>The higher the score, the more important the feature is.</b>

In [None]:
# Fit the tuned forest on the full (resampled) training set. The labels are
# flattened to 1-D because y_train is a single-column DataFrame, which
# scikit-learn would otherwise flag as a column vector (DataConversionWarning).
clf.fit(X_train, y_train.values.ravel())

In [None]:
# Number of feature columns the model was trained on.
X_train.shape[1]

In [None]:
# Column labels of the training features, as a plain Python list.
feature_names = X_train.columns.tolist()

In [None]:
# Pair each feature name with the importance the fitted forest assigned to it,
# then show the table with the most important features first.
name_score_pairs = list(zip(feature_names, clf.feature_importances_))
df = pd.DataFrame(
    name_score_pairs,
    columns=['columns_name', 'score_feature_importance'],
)
df.sort_values(by='score_feature_importance', ascending=False)