In [1]:
import os
import pickle
from time import time

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

### tf-idf as input

In [5]:
# Load the TF-IDF feature matrices and their labels from the 'unbalanced'
# data directory. Features and labels are read pairwise per split.
X_train, y_train = (
    np.load('Data/unbalanced/tfidf_X_train.npy'),
    np.load('Data/unbalanced/y_train.npy'),
)
X_test, y_test = (
    np.load('Data/unbalanced/tfidf_X_test.npy'),
    np.load('Data/unbalanced/y_test.npy'),
)

# Print the training matrix shape as a quick sanity check.
print(X_train.shape)

(18715, 5353)


In [4]:
# Fit a random forest on the currently loaded features and print per-class
# precision/recall/F1 on the test split.
# class_weight='balanced' makes scikit-learn weight classes inversely to their
# frequency in y_train.
params = {'max_depth': None, 'max_features': 'log2', 'n_estimators': 200}
# random_state pins the forest's bootstrap and feature sub-sampling so the
# report is reproducible after Restart & Run All (42 is an arbitrary seed).
best_model = RandomForestClassifier(
    **params, class_weight='balanced', random_state=42
)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -2       0.55      0.71      0.62       473
          -1       0.75      0.70      0.72      2746
           0       0.82      0.78      0.80      4174
           1       0.56      0.73      0.64       522
           2       0.49      0.79      0.61       105

    accuracy                           0.75      8020
   macro avg       0.63      0.74      0.68      8020
weighted avg       0.76      0.75      0.75      8020



### doc2vec as input

In [10]:
# Load the doc2vec feature matrices and their labels from the 'unbalanced'
# data directory. This rebinds X_train / X_test (and reloads the labels), so
# every later cell works on doc2vec features.
X_train, y_train = (
    np.load('Data/unbalanced/doc2vec_X_train.npy'),
    np.load('Data/unbalanced/y_train.npy'),
)
X_test, y_test = (
    np.load('Data/unbalanced/doc2vec_X_test.npy'),
    np.load('Data/unbalanced/y_test.npy'),
)

# Print the training matrix shape as a quick sanity check.
print(X_train.shape)

(18715, 300)


In [7]:
# Sweep n_estimators over 10, 15, ..., 45 with max_depth fixed at 65 and record
# the macro-averaged F1 of each forest, then print the total wall-clock time.
# NOTE(review): the scores are computed on the test split, so choosing
# n_estimators from this sweep leaks test information into model selection.
# A validation split or cross-validation on the training data would avoid it.
start = time()

scores = []
for n in range(10, 50, 5):
    params = {'max_depth': 65, 'max_features': 'log2', 'n_estimators': n}
    # Fixed random_state (arbitrary seed 42): score differences between
    # iterations then reflect n_estimators rather than the forest's random
    # bootstrap/feature sampling, and the sweep is reproducible on re-run.
    model = RandomForestClassifier(
        **params, class_weight='balanced', random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(f1_score(y_test, y_pred, average='macro'))
    print(f'n: {n}, f1_score: {scores[-1]}')

end = time()
print('time:', end-start)

n: 10, f1_score: 0.6789060714605956
n: 15, f1_score: 0.6790925152549938
n: 20, f1_score: 0.6808008642247632
n: 25, f1_score: 0.6814552430934377
n: 30, f1_score: 0.6858414097886532
n: 35, f1_score: 0.6854434568654
n: 40, f1_score: 0.6807267699787517
n: 45, f1_score: 0.688187273810885
time: 33.86594295501709


In [8]:
# Sweep max_depth over 10, 15, ..., 95 with n_estimators fixed at 50 and record
# the macro-averaged F1 of each forest, then print the total wall-clock time.
# NOTE(review): the scores are computed on the test split, so choosing
# max_depth from this sweep leaks test information into model selection.
# A validation split or cross-validation on the training data would avoid it.
start = time()

scores = []
for d in range(10, 100, 5):
    params = {'max_depth': d, 'max_features': 'log2', 'n_estimators': 50}
    # Fixed random_state (arbitrary seed 42): score differences between
    # iterations then reflect max_depth rather than the forest's random
    # bootstrap/feature sampling, and the sweep is reproducible on re-run.
    model = RandomForestClassifier(
        **params, class_weight='balanced', random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(f1_score(y_test, y_pred, average='macro'))
    print(f'depth: {d}, f1_score: {scores[-1]}')

end = time()
print('time:', end-start)

depth: 10, f1_score: 0.5861791015155366
depth: 15, f1_score: 0.681367723362298
depth: 20, f1_score: 0.6855034786496043
depth: 25, f1_score: 0.6866213026219827
depth: 30, f1_score: 0.6838212052483372
depth: 35, f1_score: 0.6856153002999927
depth: 40, f1_score: 0.6871252741575271
depth: 45, f1_score: 0.6858415693422069
depth: 50, f1_score: 0.685083450792518
depth: 55, f1_score: 0.6877499041451134
depth: 60, f1_score: 0.685061715580349
depth: 65, f1_score: 0.6843359696145977
depth: 70, f1_score: 0.686316016323796
depth: 75, f1_score: 0.6858839911876651
depth: 80, f1_score: 0.6840021298762914
depth: 85, f1_score: 0.6849612578733382
depth: 90, f1_score: 0.6866510591947701
depth: 95, f1_score: 0.6841370215500052
time: 133.34087300300598


In [12]:
# Fit the final random forest on the currently loaded features and print its
# classification report on the test split. `report` and `best_model` are
# reused by the later cells that write the report to disk and pickle the model.
params = {'max_depth': 30, 'max_features': 'log2', 'n_estimators': 30}
# random_state pins the forest's bootstrap and feature sub-sampling so the
# saved model and report can be regenerated exactly (42 is an arbitrary seed).
best_model = RandomForestClassifier(
    **params, class_weight='balanced', random_state=42
)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

          -2       0.60      0.70      0.64       473
          -1       0.74      0.73      0.73      2746
           0       0.82      0.78      0.80      4174
           1       0.59      0.70      0.64       522
           2       0.51      0.78      0.62       105

    accuracy                           0.75      8020
   macro avg       0.65      0.74      0.69      8020
weighted avg       0.76      0.75      0.76      8020



In [13]:
# Save the classification report text to a file in the working directory,
# overwriting any previous copy.
with open('report_rf.txt', 'w') as report_file:
    report_file.write(report)

In [13]:
# Serialize the fitted forest to disk with pickle.
model_path = "Trained_models/Random_Forest_doc2vec.pkl"
# Create the output directory first: on a fresh checkout it may not exist, and
# open() would then raise FileNotFoundError after the model has been trained.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.