# Imports & Downloads

In [78]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

# models
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# evaluation & tuning
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold, RepeatedStratifiedKFold, cross_val_score, train_test_split

# Loading the data

In [79]:
dt = pd.read_csv("./data/twitter_training_cleaned_preprocessed.csv", index_col=0)

In [80]:
dt.head()

Unnamed: 0,tweet,sentiment,no_sw,no_sw_lem
0,booo,Negative,booo,booo
1,ok hear me out microsoft is going to pull a mi...,Neutral,hear microsoft pull mix ass prove choose xbox ...,hear microsoft pull mix ass prove choose xbox ...
2,hopping on the uzi is pretty good fortunate...,Positive,hopping uzi pretty fortunate time twitchtvjoke65,hopping uzi pretty fortunate time twitchtvjoke65
3,mr christoph sandrock best pubg teammate rescu...,Positive,mr christoph sandrock pubg teammate rescuer cr...,mr christoph sandrock pubg teammate rescuer cr...
4,eamaddennfl what is up with these player ratin...,Negative,eamaddennfl player ratings algorithm wrong,eamaddennfl player ratings algorithm wrong


In [4]:
dt.isna().sum()

tweet          4
sentiment      0
no_sw        734
no_sw_lem    734
dtype: int64

In [5]:
dt[dt.no_sw.isna()]

Unnamed: 0,tweet,sentiment,no_sw,no_sw_lem
9,is,Negative,,
63,,Positive,,
64,,Neutral,,
68,yo ill take that off you thank you very much,Positive,,
103,you,Neutral,,
...,...,...,...,...
56888,so so what,Positive,,
56946,good good,Positive,,
57000,so thats all right,Positive,,
57139,am i good,Irrelevant,,


In [6]:
dt.dropna(inplace=True)
dt.isna().sum()

tweet        0
sentiment    0
no_sw        0
no_sw_lem    0
dtype: int64

# Preparing the data for classification

## Tokenization

In [7]:
dt.head()

Unnamed: 0,tweet,sentiment,no_sw,no_sw_lem
0,booo,Negative,booo,booo
1,ok hear me out microsoft is going to pull a mi...,Neutral,hear microsoft pull mix ass prove choose xbox ...,hear microsoft pull mix ass prove choose xbox ...
2,hopping on the uzi is pretty good fortunate...,Positive,hopping uzi pretty fortunate time twitchtvjoke65,hopping uzi pretty fortunate time twitchtvjoke65
3,mr christoph sandrock best pubg teammate rescu...,Positive,mr christoph sandrock pubg teammate rescuer cr...,mr christoph sandrock pubg teammate rescuer cr...
4,eamaddennfl what is up with these player ratin...,Negative,eamaddennfl player ratings algorithm wrong,eamaddennfl player ratings algorithm wrong


In [8]:
df = dt.drop(columns=['tweet', 'no_sw'])
df.columns = ['sentiment', 'tweet']
df.head()

Unnamed: 0,sentiment,tweet
0,Negative,booo
1,Neutral,hear microsoft pull mix ass prove choose xbox ...
2,Positive,hopping uzi pretty fortunate time twitchtvjoke65
3,Positive,mr christoph sandrock pubg teammate rescuer cr...
4,Negative,eamaddennfl player ratings algorithm wrong


In [9]:
tokenized_tweet = df['tweet'].apply(lambda x: x.split())
tokenized_tweet.head(5)

0                                               [booo]
1    [hear, microsoft, pull, mix, ass, prove, choos...
2    [hopping, uzi, pretty, fortunate, time, twitch...
3    [mr, christoph, sandrock, pubg, teammate, resc...
4     [eamaddennfl, player, ratings, algorithm, wrong]
Name: tweet, dtype: object

In [10]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize)
text_counts = cv.fit_transform(df['tweet'])



## TF-IDF

In [11]:
tfidf = TfidfVectorizer()
text_count_2 = tfidf.fit_transform(df['tweet'])

# Preparation for model evaluation

In [12]:
evaluation = pd.DataFrame(columns=["model", "target", "variant", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc", "test_prec", "test_rec", "test_f1"])
evaluation

Unnamed: 0,model,target,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1


In [13]:
def add_to_eval_df(model, model_name, target, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    train_acc = model.score(x_data_train, y_data_train)
    train_precision = precision_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_recall = recall_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_f1 = f1_score(y_data_train, model.predict(x_data_train), average="weighted")

    test_acc = model.score(x_data_test, y_data_test)
    test_precision = precision_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_recall = recall_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_f1 = f1_score(y_data_test, model.predict(x_data_test), average="weighted")

    evaluation.loc[len(evaluation.index)] = [model_name, target, variant, train_acc, train_precision, train_recall, train_f1, test_acc, test_precision, test_recall, test_f1]

In [14]:
def evaluate_model(model, x_test, y_test, model_name):
    pred = model.predict(x_test)
    accscore = metrics.accuracy_score(pred, y_test)

    print(f'{model_name} model accuracy for multi-class classification is =', str('{:04.2f}'.format(accscore * 100)) + '%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

# Prepare target & split data

## binary target

### target prep

In [81]:
nb_bin = df.copy()

In [82]:
nb_bin["sentiment_label"] = df["sentiment"].map({"Positive": 0, "Negative": 1, "Neutral": 0, "Irrelevant": 0})
nb_bin.drop(columns=['sentiment'], axis=1, inplace=True)
nb_bin.head()

Unnamed: 0,tweet,sentiment_label
0,booo,1
1,hear microsoft pull mix ass prove choose xbox ...,0
2,hopping uzi pretty fortunate time twitchtvjoke65,0
3,mr christoph sandrock pubg teammate rescuer cr...,0
4,eamaddennfl player ratings algorithm wrong,1


### data split

#### tokenization

In [17]:
X_bin = text_counts
y_bin = nb_bin['sentiment_label']
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X_bin, y_bin, test_size=0.25, random_state=30)

In [18]:
print(f"X-Train: {X_train_binary.shape}, ")
print(f"X-Test: {X_test_binary.shape}")

X-Train: (40812, 34495), 
X-Test: (13605, 34495)


#### tf-idf

In [19]:
X_bin_tf = text_count_2
y_bin = nb_bin['sentiment_label']
X_train_binary_tf, X_test_binary_tf, y_train_binary_tf, y_test_binary_tf = train_test_split(X_bin_tf, y_bin, test_size=0.25, random_state=30)

In [20]:
print(f"X-Train: {X_train_binary.shape}, ")
print(f"X-Test: {X_test_binary.shape}")

X-Train: (40812, 34495), 
X-Test: (13605, 34495)


## multi-class

### target prep

In [21]:
nb_mul = df.copy()

In [22]:
nb_mul["sentiment_label"] = df["sentiment"].map({"Positive": 1, "Negative": 2, "Neutral": 3, "Irrelevant": 4})
nb_mul.drop(columns=['sentiment'], axis=1, inplace=True)
nb_mul.head()

Unnamed: 0,tweet,sentiment_label
0,booo,2
1,hear microsoft pull mix ass prove choose xbox ...,3
2,hopping uzi pretty fortunate time twitchtvjoke65,1
3,mr christoph sandrock pubg teammate rescuer cr...,1
4,eamaddennfl player ratings algorithm wrong,2


### data split

#### Tokenizer

In [23]:
X_multi = text_counts
y_multi = nb_mul['sentiment_label']
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y_multi, test_size=0.25, random_state=30)

In [24]:
print(f"X-Train: {X_train_multi.shape}")
print(f"X-Test: {X_test_multi.shape}")

X-Train: (40812, 34495)
X-Test: (13605, 34495)


#### TF-IDF

In [25]:
X_multi_tf = text_count_2
y_multi = nb_mul['sentiment_label']
X_train_multi_tf, X_test_multi_tf, y_train_multi_tf, y_test_multi_tf = train_test_split(X_multi_tf, y_multi, test_size=0.25, random_state=30)

In [26]:
print(f"X-Train: {X_train_multi_tf.shape}")
print(f"X-Test: {X_test_multi_tf.shape}")

X-Train: (40812, 34641)
X-Test: (13605, 34641)


# Model: Naive-Bayes - binary-class target

## binary target

### BernoulliNB

In [27]:
BNB_bin = BernoulliNB()
BNB_bin.fit(X_train_binary, y_train_binary)

In [28]:
predicted = BNB_bin.predict(X_test_binary)
acc_score = metrics.accuracy_score(predicted, y_test_binary)

In [29]:
print('BernoulliNB model accuracy for binary classification is = ' + str('{:4.2f}'.format(acc_score * 100)) + '%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test_binary, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test_binary, predicted))

BernoulliNB model accuracy for binary classification is = 85.23%
------------------------------------------------
Confusion Matrix:
      0     1
0  8844   580
1  1429  2752
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      9424
           1       0.83      0.66      0.73      4181

    accuracy                           0.85     13605
   macro avg       0.84      0.80      0.82     13605
weighted avg       0.85      0.85      0.85     13605



In [30]:
evaluate_model(BNB_bin, X_test_binary, y_test_binary, "BernoulliNB")

BernoulliNB model accuracy for multi-class classification is = 85.23%
------------------------------------------------
Confusion Matrix:
      0     1
0  8844   580
1  1429  2752
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      9424
           1       0.83      0.66      0.73      4181

    accuracy                           0.85     13605
   macro avg       0.84      0.80      0.82     13605
weighted avg       0.85      0.85      0.85     13605



In [31]:
add_to_eval_df(BNB_bin, "BernoulliNB", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

### MultinomialNB

In [32]:
MNB_bin = MultinomialNB()
MNB_bin.fit(X_train_binary, y_train_binary)

In [33]:
predicted = MNB_bin.predict(X_test_binary)
acc_score = metrics.accuracy_score(predicted, y_test_binary)

In [34]:
print('MultinominalNB model accuracy for binary classification is = ', str('{:04.2f}'.format(acc_score * 100)) + '%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test_binary, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test_binary, predicted))

MultinominalNB model accuracy for binary classification is =  86.32%
------------------------------------------------
Confusion Matrix:
      0     1
0  8555   869
1   992  3189
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90      9424
           1       0.79      0.76      0.77      4181

    accuracy                           0.86     13605
   macro avg       0.84      0.84      0.84     13605
weighted avg       0.86      0.86      0.86     13605



In [35]:
evaluate_model(MNB_bin, X_test_binary, y_test_binary, "ComplementNB")

ComplementNB model accuracy for multi-class classification is = 86.32%
------------------------------------------------
Confusion Matrix:
      0     1
0  8555   869
1   992  3189
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90      9424
           1       0.79      0.76      0.77      4181

    accuracy                           0.86     13605
   macro avg       0.84      0.84      0.84     13605
weighted avg       0.86      0.86      0.86     13605



In [36]:
add_to_eval_df(MNB_bin, "MultinominalNB", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

### ComplementNB

In [37]:
CNB_bin = ComplementNB()
CNB_bin.fit(X_train_binary, y_train_binary)

In [38]:
predicted = CNB_bin.predict(X_test_binary)
acc_score = metrics.accuracy_score(predicted, y_test_binary)

In [39]:
print('ComplementNB model accuracy for binary classification is = ', str('{:04.2f}'.format(acc_score * 100)) + '%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test_binary, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test_binary, predicted))

ComplementNB model accuracy for binary classification is =  83.84%
------------------------------------------------
Confusion Matrix:
      0     1
0  7819  1605
1   594  3587
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.83      0.88      9424
           1       0.69      0.86      0.77      4181

    accuracy                           0.84     13605
   macro avg       0.81      0.84      0.82     13605
weighted avg       0.86      0.84      0.84     13605



In [40]:
evaluate_model(CNB_bin, X_test_binary, y_test_binary, "ComplementNB")

ComplementNB model accuracy for multi-class classification is = 83.84%
------------------------------------------------
Confusion Matrix:
      0     1
0  7819  1605
1   594  3587
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.83      0.88      9424
           1       0.69      0.86      0.77      4181

    accuracy                           0.84     13605
   macro avg       0.81      0.84      0.82     13605
weighted avg       0.86      0.84      0.84     13605



In [41]:
add_to_eval_df(CNB_bin, "ComplementNB", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

## multi-class target

### BernoulliNB

In [42]:
BNB_multi = BernoulliNB()
BNB_multi.fit(X_train_multi, y_train_multi)

In [43]:
predicted = BNB_multi.predict(X_test_multi)
acc_score = metrics.accuracy_score(predicted,y_test_multi)

In [44]:
print('BernoulliNB model accuracy for multi-class classification is = ' + str('{:4.2f}'.format(acc_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test_multi, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test_multi, predicted))

BernoulliNB model accuracy for multi-class classification is = 71.90%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  3157   369   100    43
1   487  3530   115    49
2   854   533  1897    83
3   661   450    79  1198
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.61      0.86      0.72      3669
           2       0.72      0.84      0.78      4181
           3       0.87      0.56      0.68      3367
           4       0.87      0.50      0.64      2388

    accuracy                           0.72     13605
   macro avg       0.77      0.69      0.70     13605
weighted avg       0.75      0.72      0.71     13605



In [45]:
evaluate_model(BNB_multi, X_test_multi, y_test_multi, "BernoulliNB")

BernoulliNB model accuracy for multi-class classification is = 71.90%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  3157   369   100    43
1   487  3530   115    49
2   854   533  1897    83
3   661   450    79  1198
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.61      0.86      0.72      3669
           2       0.72      0.84      0.78      4181
           3       0.87      0.56      0.68      3367
           4       0.87      0.50      0.64      2388

    accuracy                           0.72     13605
   macro avg       0.77      0.69      0.70     13605
weighted avg       0.75      0.72      0.71     13605



In [46]:
add_to_eval_df(BNB_multi, "BernoulliNB", "multi-class", "tokenizer", X_train_multi, y_train_multi, X_test_multi, y_test_multi)

### MultinomialNB

In [47]:
MNB_multi = MultinomialNB()
MNB_multi.fit(X_train_multi, y_train_multi)

In [48]:
predicted = MNB_multi.predict(X_test_multi)
acc_score = metrics.accuracy_score(predicted, y_test_multi)

In [49]:
print('MultinominalNB model accuracy for multi-class classification is =',str('{:04.2f}'.format(acc_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test_multi, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test_multi, predicted))

MultinominalNB model accuracy for multi-class classification is = 74.30%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  2910   417   217   125
1   375  3506   196   104
2   543   477  2210   137
3   389   382   134  1483
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.79      0.74      3669
           2       0.73      0.84      0.78      4181
           3       0.80      0.66      0.72      3367
           4       0.80      0.62      0.70      2388

    accuracy                           0.74     13605
   macro avg       0.76      0.73      0.74     13605
weighted avg       0.75      0.74      0.74     13605



In [50]:
evaluate_model(MNB_multi, X_test_multi, y_test_multi, "MultinominalNB")

MultinominalNB model accuracy for multi-class classification is = 74.30%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  2910   417   217   125
1   375  3506   196   104
2   543   477  2210   137
3   389   382   134  1483
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.79      0.74      3669
           2       0.73      0.84      0.78      4181
           3       0.80      0.66      0.72      3367
           4       0.80      0.62      0.70      2388

    accuracy                           0.74     13605
   macro avg       0.76      0.73      0.74     13605
weighted avg       0.75      0.74      0.74     13605



In [51]:
add_to_eval_df(MNB_multi, "MultinominalNB", "multi-class", "tokenizer", X_train_multi, y_train_multi, X_test_multi, y_test_multi)

### ComplementNB

In [52]:
CNB_multi = ComplementNB()
CNB_multi.fit(X_train_multi, y_train_multi)

In [53]:
predicted = CNB_multi.predict(X_test_multi)
acc_score = metrics.accuracy_score(predicted, y_test_multi)

In [54]:
print('ComplementNB model accuracy for multi-class classification is =',str('{:04.2f}'.format(acc_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test_multi, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test_multi, predicted))

ComplementNB model accuracy for multi-class classification is = 76.38%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  2824   419   223   203
1   303  3511   208   159
2   400   401  2391   175
3   315   295   112  1666
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.74      0.77      0.75      3669
           2       0.76      0.84      0.80      4181
           3       0.81      0.71      0.76      3367
           4       0.76      0.70      0.73      2388

    accuracy                           0.76     13605
   macro avg       0.77      0.75      0.76     13605
weighted avg       0.77      0.76      0.76     13605



In [55]:
evaluate_model(CNB_multi, X_test_multi, y_test_multi, "ComplementNB")

ComplementNB model accuracy for multi-class classification is = 76.38%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  2824   419   223   203
1   303  3511   208   159
2   400   401  2391   175
3   315   295   112  1666
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.74      0.77      0.75      3669
           2       0.76      0.84      0.80      4181
           3       0.81      0.71      0.76      3367
           4       0.76      0.70      0.73      2388

    accuracy                           0.76     13605
   macro avg       0.77      0.75      0.76     13605
weighted avg       0.77      0.76      0.76     13605



In [56]:
add_to_eval_df(CNB_multi, "ComplementNB", "multi-class", "tokenizer", X_train_multi, y_train_multi, X_test_multi, y_test_multi)

# Model: RandomForest - binary-class target

## binary target

### base model

In [57]:
RF_bin = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
RF_bin.fit(X_train_binary, y_train_binary)

In [58]:
evaluate_model(RF_bin, X_test_binary, y_test_binary, "RandomForest")

RandomForest model accuracy for multi-class classification is = 93.49%
------------------------------------------------
Confusion Matrix:
      0     1
0  9038   386
1   500  3681
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      9424
           1       0.91      0.88      0.89      4181

    accuracy                           0.93     13605
   macro avg       0.93      0.92      0.92     13605
weighted avg       0.93      0.93      0.93     13605



In [59]:
add_to_eval_df(RF_bin, "RandomForest", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

### tuned model

In [75]:
RF_bin.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': 10,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [65]:
rf = RandomForestClassifier(n_jobs=-1)

params = {
    "n_estimators": np.arange(start=50, stop=250, step=50),
    'max_features': [None, "sqrt"],
    'bootstrap': [True, False],
    'max_samples': [None, .3, .5],
    'max_depth': np.arange(start=2, stop=7, step=1),
    'min_samples_leaf': [1, 3, 5],
}

rand = RandomizedSearchCV(rf, params, scoring="accuracy")

rand.fit(X_train_binary, y_train_binary)

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\admin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\admin\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 397, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

 0.7374547  0.69695188 0.72405183 0.69695188]


In [66]:
rand.best_params_

{'n_estimators': 200,
 'min_samples_leaf': 5,
 'max_samples': 0.3,
 'max_features': None,
 'max_depth': 6,
 'bootstrap': True}

{'n_estimators': 200,
 'min_samples_leaf': 5,
 'max_samples': 0.3,
 'max_features': None,
 'max_depth': 6,
 'bootstrap': True}

In [70]:
RF_bin_tuned = RandomForestClassifier(**rand.best_params_, n_jobs=-1, random_state=42)
RF_bin_tuned.fit(X_train_binary, y_train_binary)

In [71]:
evaluate_model(RF_bin_tuned, X_test_binary, y_test_binary, "RandomForest-tuned")

RandomForest-tuned model accuracy for multi-class classification is = 73.25%
------------------------------------------------
Confusion Matrix:
      0    1
0  9249  175
1  3464  717
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.98      0.84      9424
           1       0.80      0.17      0.28      4181

    accuracy                           0.73     13605
   macro avg       0.77      0.58      0.56     13605
weighted avg       0.75      0.73      0.67     13605



In [72]:
add_to_eval_df(RF_bin_tuned, "RandomForest-tuned", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

## multi-class target

In [60]:
RF_multi = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
RF_multi.fit(X_train_multi, y_train_multi)

In [61]:
evaluate_model(RF_multi, X_test_multi, y_test_multi, "RandomForest")

RandomForest model accuracy for multi-class classification is = 86.98%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  3301   204   102    62
1   215  3844    76    46
2   282   255  2778    52
3   218   174    85  1911
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.82      0.90      0.86      3669
           2       0.86      0.92      0.89      4181
           3       0.91      0.83      0.87      3367
           4       0.92      0.80      0.86      2388

    accuracy                           0.87     13605
   macro avg       0.88      0.86      0.87     13605
weighted avg       0.87      0.87      0.87     13605



In [62]:
add_to_eval_df(RF_multi, "RandomForest", "multi-class", "tokenizer", X_train_multi, y_train_multi, X_test_multi, y_test_multi)

# Model comparison

In [73]:
evaluation.sort_values(by=["train_f1"], ascending=False)

Unnamed: 0,model,target,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
6,RandomForest,binary,tokenizer,0.995148,0.99515,0.995148,0.995143,0.934877,0.934521,0.934877,0.934623
7,RandomForest,multi-class,tokenizer,0.990836,0.990843,0.990836,0.990837,0.869827,0.873572,0.869827,0.869586
1,MultinominalNB,binary,tokenizer,0.900176,0.899257,0.900176,0.899537,0.863212,0.862215,0.863212,0.862634
2,ComplementNB,binary,tokenizer,0.885426,0.89482,0.885426,0.887694,0.838368,0.856093,0.838368,0.842505
0,BernoulliNB,binary,tokenizer,0.886136,0.886121,0.886136,0.88239,0.852334,0.850152,0.852334,0.847173
5,ComplementNB,multi-class,tokenizer,0.838038,0.840647,0.838038,0.83788,0.763837,0.765885,0.763837,0.763028
4,MultinominalNB,multi-class,tokenizer,0.813413,0.821357,0.813413,0.812622,0.743036,0.750569,0.743036,0.74094
3,BernoulliNB,multi-class,tokenizer,0.783373,0.817397,0.783373,0.780211,0.719,0.754661,0.719,0.713033
8,RandomForest-tuned,binary,tokenizer,0.7377,0.755797,0.7377,0.672488,0.732525,0.750968,0.732525,0.665688
