In [8]:
import pickle
from pathlib import Path

import numpy as np
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV

In [9]:
# Load the train/test feature matrices and label vectors from disk.
# The features come from the 'doc_embeds_*' files; 'Data/X_train.npy' and
# 'Data/X_test.npy' are the alternative feature files that were swapped out.
dataset_paths = (
    'Data/doc_embeds_X_train.npy',
    'Data/y_train.npy',
    'Data/doc_embeds_X_test.npy',
    'Data/y_test.npy',
)
X_train, y_train, X_test, y_test = map(np.load, dataset_paths)

In [16]:
# Base estimator for the hyper-parameter search. Every constructor argument is
# written out explicitly; the grid below overrides n_estimators, max_features
# and max_depth for each candidate.
base_forest_kwargs = dict(
    n_estimators=150,              # number of trees in the forest
    criterion='entropy',           # split-quality measure ('gini' or 'entropy')
    max_depth=None,                # maximum tree depth (None = unlimited)
    min_samples_split=2,           # minimum samples required to split a node
    min_samples_leaf=1,            # minimum samples required at a leaf
    min_weight_fraction_leaf=0.0,  # minimum weighted fraction of total sample weight at a leaf
    max_features='log2',           # features considered when searching for the best split
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,     # a split must reduce impurity by at least this much
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,                     # -1 = use all processors
    random_state=0,                # fixed seed
    verbose=0,
    warm_start=False,
    class_weight='balanced',
)
model = RandomForestClassifier(**base_forest_kwargs)

# Ten evenly spaced values per numeric axis, truncated to integers.
tree_counts = np.linspace(100, 200, num=10).astype(int).tolist()
depth_limits = np.linspace(10, 50, num=10).astype(int).tolist()
param_grid = {
    'n_estimators': tree_counts,
    'max_features': ['log2', 'sqrt'],
    'max_depth': depth_limits,
}

# Three random splits; each fits on 2% of the rows and scores on another 1%.
cv = ShuffleSplit(n_splits=3, train_size=0.02, test_size=0.01, random_state=0)
clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
clf.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated score.
print(clf.best_params_)
print(clf.best_score_)

{'max_depth': 10, 'max_features': 'log2', 'n_estimators': 200}
0.5140314852840521


In [24]:
# Fit the final forest on the full training set and evaluate it on the
# held-out test set.
#
# NOTE(review): these hyper-parameters are hand-picked and differ from the
# grid-search result recorded in this notebook
# ({'max_depth': 10, 'max_features': 'log2', 'n_estimators': 200}). The
# search estimator's criterion='entropy' and class_weight='balanced' are also
# not carried over here -- confirm that this is intentional.
params = {'max_depth': None, 'max_features': 'log2', 'n_estimators': 180}
best_model = RandomForestClassifier(
    random_state=0,  # fixed seed so the fit and the report below are reproducible
    n_jobs=-1,       # use all processors; does not affect the fitted model
    **params,
)
best_model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the test set.
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -2       0.55      0.72      0.62       473
          -1       0.68      0.70      0.69      2746
           0       0.81      0.72      0.77      4174
           1       0.55      0.74      0.63       522
           2       0.49      0.79      0.61       105

    accuracy                           0.72      8020
   macro avg       0.62      0.73      0.66      8020
weighted avg       0.73      0.72      0.72      8020



In [51]:
# Persist the fitted forest. The output directory is created first so a fresh
# checkout does not fail with FileNotFoundError after an expensive fit.
# NOTE: pickle files must only ever be loaded from trusted sources.
model_path = Path("Trained_models/Random_Forest.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)