In [46]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV
import pickle

In [4]:
X_train = np.load('Data/X_train.npy')
y_train = np.load('Data/y_train.npy')
X_test = np.load('Data/X_test.npy')
y_test = np.load('Data/y_test.npy')

In [49]:
model = RandomForestClassifier(
    n_estimators=150,  # The number of trees in the forest.
    criterion='entropy',  #  function to measure the quality of a split (Gini impurity and “entropy”)
    max_depth=None,  # The maximum depth of the tree
    min_samples_split=2,  # The minimum number of samples required to split
    min_samples_leaf=1,  # The minimum number of samples required to be at a leaf node
    min_weight_fraction_leaf=0.0,  # The minimum weighted fraction of the sum total of weights
    max_features='log2',  # The number of features to consider when looking for the best split.
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,  # Threshold for early stopping in tree growth.
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,  # The number of jobs to run in parallel. -1 means using all processors.
    random_state=0,  #
    verbose=0,
    warm_start=False,
    class_weight='balanced')

param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 200, num = 10)],
    'max_features': ['log2'],
    'max_depth': [int(x) for x in np.linspace(start = 25, stop = 75, num = 10)]
}

cv = ShuffleSplit(n_splits=2, test_size=0.01, train_size=0.02, random_state=0)
clf = GridSearchCV(model, param_grid, cv=cv)
clf.fit(X_train, y_train)

# getting best result
print(clf.best_params_)
print(clf.best_score_)

{'max_depth': 58, 'max_features': 'log2', 'n_estimators': 166}
0.5184804928131417


In [50]:
best_model = RandomForestClassifier(**clf.best_params_)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -2       0.53      0.70      0.60       473
          -1       0.67      0.68      0.67      2746
           0       0.80      0.72      0.76      4174
           1       0.52      0.71      0.60       522
           2       0.48      0.75      0.59       105

    accuracy                           0.70      8020
   macro avg       0.60      0.71      0.64      8020
weighted avg       0.72      0.70      0.71      8020



In [51]:
with open("Trained_models/Random_Forest.pkl", "wb") as f:
    pickle.dump(best_model, f)