In [2]:
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [4]:
X_train = np.load('../Data/unbalanced/doc2vec_X_train.npy')
y_train = np.load('../Data/unbalanced/y_train.npy')

X_test = np.load('../Data/unbalanced/doc2vec_X_test.npy')
y_test = np.load('../Data/unbalanced/y_test.npy')

print(X_test.shape)

(8020, 300)


# Voting

In [5]:
rf = RandomForestClassifier(max_depth=30, max_features='log2', n_estimators=30, class_weight='balanced')
knn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
tree = DecisionTreeClassifier(class_weight='balanced')
xgboost = xgb.XGBClassifier()

estimators = [('rf', rf), ('tree', tree), ('knn', knn), ('xgboost', xgboost)]
weights = [0.35, 0.15, 0.15, 0.35]  # according to solo f-scores: [0.68, 0.66, 0.66, 0.68]

ensemble = VotingClassifier(estimators, weights=weights, voting='soft')
ensemble.fit(X_train, y_train + 2)

In [6]:
y_pred = ensemble.predict(X_test) - 2
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

          -2       0.61      0.64      0.63       473
          -1       0.74      0.72      0.73      2746
           0       0.82      0.80      0.81      4174
           1       0.61      0.68      0.64       522
           2       0.53      0.75      0.62       105

    accuracy                           0.76      8020
   macro avg       0.66      0.72      0.69      8020
weighted avg       0.76      0.76      0.76      8020



In [8]:
with open('report.txt', 'w') as f:
    f.write(report)

# Stack

In [40]:
rf = RandomForestClassifier(max_depth=30, max_features='log2', n_estimators=30, class_weight='balanced')
knn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
tree = DecisionTreeClassifier(class_weight='balanced')
xgboost = xgb.XGBClassifier()

estimators = [('rf', rf), ('tree', tree), ('knn', knn), ('xgboost', xgboost)]
weights = [0.35, 0.15, 0.15, 0.35]  # according to solo f-scores: [0.68, 0.66, 0.66, 0.68]

ensemble = VotingClassifier(estimators, weights=weights, voting='soft')
ensemble.fit(X_train, y_train)

StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression(solver='sag', max_iter=100))

In [43]:
y_pred = ensemble.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -2       0.61      0.64      0.63       473
          -1       0.74      0.72      0.73      2746
           0       0.82      0.80      0.81      4174
           1       0.61      0.68      0.64       522
           2       0.52      0.76      0.62       105

    accuracy                           0.76      8020
   macro avg       0.66      0.72      0.69      8020
weighted avg       0.76      0.76      0.76      8020

              precision    recall  f1-score   support

          -2       0.61      0.64      0.63       473
          -1       0.74      0.72      0.73      2746
           0       0.82      0.80      0.81      4174
           1       0.61      0.68      0.64       522
           2       0.52      0.76      0.62       105

    accuracy                           0.76      8020
   macro avg       0.66      0.72      0.69      8020
weighted avg       0.76      0.76      0.76      8020

