# Imports & Downloads

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

# models
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# evaluation & tuning
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold, RepeatedStratifiedKFold, cross_val_score, train_test_split

# Loading the data

In [2]:
dt = pd.read_csv("./data/twitter_training_cleaned_preprocessed.csv", index_col=0)

In [3]:
dt.head()

Unnamed: 0,tweet,sentiment,no_sw,no_sw_lem
0,booo,Negative,booo,booo
1,ok hear me out microsoft is going to pull a mi...,Neutral,hear microsoft pull mix ass prove choose xbox ...,hear microsoft pull mix ass prove choose xbox ...
2,hopping on the uzi is pretty good fortunate...,Positive,hopping uzi pretty fortunate time twitchtvjoke65,hopping uzi pretty fortunate time twitchtvjoke65
3,mr christoph sandrock best pubg teammate rescu...,Positive,mr christoph sandrock pubg teammate rescuer cr...,mr christoph sandrock pubg teammate rescuer cr...
4,eamaddennfl what is up with these player ratin...,Negative,eamaddennfl player ratings algorithm wrong,eamaddennfl player ratings algorithm wrong


In [4]:
dt.isna().sum()

tweet          4
sentiment      0
no_sw        734
no_sw_lem    734
dtype: int64

In [5]:
dt[dt.no_sw.isna()]

Unnamed: 0,tweet,sentiment,no_sw,no_sw_lem
9,is,Negative,,
63,,Positive,,
64,,Neutral,,
68,yo ill take that off you thank you very much,Positive,,
103,you,Neutral,,
...,...,...,...,...
56888,so so what,Positive,,
56946,good good,Positive,,
57000,so thats all right,Positive,,
57139,am i good,Irrelevant,,


In [6]:
dt.dropna(inplace=True)
dt.isna().sum()

tweet        0
sentiment    0
no_sw        0
no_sw_lem    0
dtype: int64

# Preparing the data for classification

In [7]:
dt.head()

Unnamed: 0,tweet,sentiment,no_sw,no_sw_lem
0,booo,Negative,booo,booo
1,ok hear me out microsoft is going to pull a mi...,Neutral,hear microsoft pull mix ass prove choose xbox ...,hear microsoft pull mix ass prove choose xbox ...
2,hopping on the uzi is pretty good fortunate...,Positive,hopping uzi pretty fortunate time twitchtvjoke65,hopping uzi pretty fortunate time twitchtvjoke65
3,mr christoph sandrock best pubg teammate rescu...,Positive,mr christoph sandrock pubg teammate rescuer cr...,mr christoph sandrock pubg teammate rescuer cr...
4,eamaddennfl what is up with these player ratin...,Negative,eamaddennfl player ratings algorithm wrong,eamaddennfl player ratings algorithm wrong


In [8]:
df = dt.drop(columns=['tweet', 'no_sw'])
df.columns = ['sentiment', 'tweet']
df.head()

Unnamed: 0,sentiment,tweet
0,Negative,booo
1,Neutral,hear microsoft pull mix ass prove choose xbox ...
2,Positive,hopping uzi pretty fortunate time twitchtvjoke65
3,Positive,mr christoph sandrock pubg teammate rescuer cr...
4,Negative,eamaddennfl player ratings algorithm wrong


## Tokenization

In [9]:
tokenized_tweet = df['tweet'].apply(lambda x: x.split())
tokenized_tweet.head(5)

0                                               [booo]
1    [hear, microsoft, pull, mix, ass, prove, choos...
2    [hopping, uzi, pretty, fortunate, time, twitch...
3    [mr, christoph, sandrock, pubg, teammate, resc...
4     [eamaddennfl, player, ratings, algorithm, wrong]
Name: tweet, dtype: object

In [10]:
max_features = 20000
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize, max_features=max_features)
text_counts = cv.fit_transform(df['tweet'])



## TF-IDF

In [11]:
max_features = 20000
tfidf = TfidfVectorizer(max_features=max_features)
text_count_2 = tfidf.fit_transform(df['tweet'])

# Preparation for model evaluation

In [12]:
evaluation = pd.DataFrame(columns=["model", "target", "variant", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc", "test_prec", "test_rec", "test_f1"])
evaluation

Unnamed: 0,model,target,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1


In [13]:
def add_to_eval_df(model, model_name, target, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    train_acc = model.score(x_data_train, y_data_train)
    train_precision = precision_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_recall = recall_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_f1 = f1_score(y_data_train, model.predict(x_data_train), average="weighted")

    test_acc = model.score(x_data_test, y_data_test)
    test_precision = precision_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_recall = recall_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_f1 = f1_score(y_data_test, model.predict(x_data_test), average="weighted")

    evaluation.loc[len(evaluation.index)] = [model_name, target, variant, train_acc, train_precision, train_recall, train_f1, test_acc, test_precision, test_recall, test_f1]

In [14]:
def evaluate_model(model, x_test, y_test, model_name):
    pred = model.predict(x_test)
    accscore = metrics.accuracy_score(pred, y_test)

    print(f'{model_name} model accuracy for multi-class classification is =', str('{:04.2f}'.format(accscore * 100)) + '%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

# Prepare target & split data

## binary target

### target prep

In [15]:
nb_bin = df.copy()

In [16]:
nb_bin["sentiment_label"] = df["sentiment"].map({"Positive": 0, "Negative": 1, "Neutral": 0, "Irrelevant": 0})
nb_bin.drop(columns=['sentiment'], axis=1, inplace=True)
nb_bin.head()

Unnamed: 0,tweet,sentiment_label
0,booo,1
1,hear microsoft pull mix ass prove choose xbox ...,0
2,hopping uzi pretty fortunate time twitchtvjoke65,0
3,mr christoph sandrock pubg teammate rescuer cr...,0
4,eamaddennfl player ratings algorithm wrong,1


### data split

#### tokenization

In [17]:
X_bin = text_counts
y_bin = nb_bin['sentiment_label']
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X_bin, y_bin, test_size=0.25, random_state=30)

In [18]:
print(f"X-Train: {X_train_binary.shape}, ")
print(f"X-Test: {X_test_binary.shape}")

X-Train: (40812, 20000), 
X-Test: (13605, 20000)


#### tf-idf

In [19]:
X_bin_tf = text_count_2
y_bin = nb_bin['sentiment_label']
X_train_binary_tf, X_test_binary_tf, y_train_binary_tf, y_test_binary_tf = train_test_split(X_bin_tf, y_bin, test_size=0.25, random_state=30)

In [20]:
print(f"X-Train: {X_train_binary.shape}, ")
print(f"X-Test: {X_test_binary.shape}")

X-Train: (40812, 20000), 
X-Test: (13605, 20000)


## multi-class

### target prep

In [21]:
nb_mul = df.copy()

In [22]:
nb_mul["sentiment_label"] = df["sentiment"].map({"Positive": 1, "Negative": 2, "Neutral": 3, "Irrelevant": 4})
nb_mul.drop(columns=['sentiment'], axis=1, inplace=True)
nb_mul.head()

Unnamed: 0,tweet,sentiment_label
0,booo,2
1,hear microsoft pull mix ass prove choose xbox ...,3
2,hopping uzi pretty fortunate time twitchtvjoke65,1
3,mr christoph sandrock pubg teammate rescuer cr...,1
4,eamaddennfl player ratings algorithm wrong,2


### data split

#### Tokenizer

In [23]:
X_multi = text_counts
y_multi = nb_mul['sentiment_label']
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y_multi, test_size=0.25, random_state=30)

In [24]:
print(f"X-Train: {X_train_multi.shape}")
print(f"X-Test: {X_test_multi.shape}")

X-Train: (40812, 20000)
X-Test: (13605, 20000)


#### TF-IDF

In [25]:
X_multi_tf = text_count_2
y_multi = nb_mul['sentiment_label']
X_train_multi_tf, X_test_multi_tf, y_train_multi_tf, y_test_multi_tf = train_test_split(X_multi_tf, y_multi, test_size=0.25, random_state=30)

In [26]:
print(f"X-Train: {X_train_multi_tf.shape}")
print(f"X-Test: {X_test_multi_tf.shape}")

X-Train: (40812, 20000)
X-Test: (13605, 20000)


# Model: Naive-Bayes

## binary target

### BernoulliNB

#### tokenization

In [27]:
BNB_bin = BernoulliNB()
BNB_bin.fit(X_train_binary, y_train_binary)

In [28]:
predicted = BNB_bin.predict(X_test_binary)
acc_score = metrics.accuracy_score(predicted, y_test_binary)

In [29]:
evaluate_model(BNB_bin, X_test_binary, y_test_binary, "BernoulliNB")

BernoulliNB model accuracy for multi-class classification is = 85.55%
------------------------------------------------
Confusion Matrix:
      0     1
0  8729   695
1  1271  2910
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      9424
           1       0.81      0.70      0.75      4181

    accuracy                           0.86     13605
   macro avg       0.84      0.81      0.82     13605
weighted avg       0.85      0.86      0.85     13605



In [30]:
add_to_eval_df(BNB_bin, "BernoulliNB", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

#### tfidf

In [31]:
BNB_bin_tf = BernoulliNB()
BNB_bin_tf.fit(X_train_binary_tf, y_train_binary_tf)

In [32]:
predicted = BNB_bin_tf.predict(X_test_binary_tf)
acc_score = metrics.accuracy_score(predicted, y_test_binary_tf)

In [33]:
evaluate_model(BNB_bin_tf, X_test_binary_tf, y_test_binary_tf, "BernoulliNB")

BernoulliNB model accuracy for multi-class classification is = 85.58%
------------------------------------------------
Confusion Matrix:
      0     1
0  8728   696
1  1266  2915
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      9424
           1       0.81      0.70      0.75      4181

    accuracy                           0.86     13605
   macro avg       0.84      0.81      0.82     13605
weighted avg       0.85      0.86      0.85     13605



In [34]:
add_to_eval_df(BNB_bin_tf, "BernoulliNB", "binary", "tf-idf", X_train_binary_tf, y_train_binary_tf, X_test_binary_tf, y_test_binary_tf)

### MultinomialNB

#### tokenization

In [35]:
MNB_bin = MultinomialNB()
MNB_bin.fit(X_train_binary, y_train_binary)

In [36]:
predicted = MNB_bin.predict(X_test_binary)
acc_score = metrics.accuracy_score(predicted, y_test_binary)

In [37]:
evaluate_model(MNB_bin, X_test_binary, y_test_binary, "ComplementNB")

ComplementNB model accuracy for multi-class classification is = 86.38%
------------------------------------------------
Confusion Matrix:
      0     1
0  8524   900
1   953  3228
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      9424
           1       0.78      0.77      0.78      4181

    accuracy                           0.86     13605
   macro avg       0.84      0.84      0.84     13605
weighted avg       0.86      0.86      0.86     13605



In [38]:
add_to_eval_df(MNB_bin, "MultinominalNB", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

#### tf-idf

In [39]:
MNB_bin = MultinomialNB()
MNB_bin.fit(X_train_binary, y_train_binary)

In [40]:
predicted = MNB_bin.predict(X_test_binary)
acc_score = metrics.accuracy_score(predicted, y_test_binary)

In [41]:
evaluate_model(MNB_bin, X_test_binary, y_test_binary, "ComplementNB")

ComplementNB model accuracy for multi-class classification is = 86.38%
------------------------------------------------
Confusion Matrix:
      0     1
0  8524   900
1   953  3228
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      9424
           1       0.78      0.77      0.78      4181

    accuracy                           0.86     13605
   macro avg       0.84      0.84      0.84     13605
weighted avg       0.86      0.86      0.86     13605



In [42]:
add_to_eval_df(MNB_bin, "MultinominalNB", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

### ComplementNB

In [43]:
CNB_bin = ComplementNB()
CNB_bin.fit(X_train_binary, y_train_binary)

In [44]:
predicted = CNB_bin.predict(X_test_binary)
acc_score = metrics.accuracy_score(predicted, y_test_binary)

In [45]:
evaluate_model(CNB_bin, X_test_binary, y_test_binary, "ComplementNB")

ComplementNB model accuracy for multi-class classification is = 83.95%
------------------------------------------------
Confusion Matrix:
      0     1
0  7831  1593
1   591  3590
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.83      0.88      9424
           1       0.69      0.86      0.77      4181

    accuracy                           0.84     13605
   macro avg       0.81      0.84      0.82     13605
weighted avg       0.86      0.84      0.84     13605



In [46]:
add_to_eval_df(CNB_bin, "ComplementNB", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

## multi-class target

### BernoulliNB

In [47]:
BNB_multi = BernoulliNB()
BNB_multi.fit(X_train_multi, y_train_multi)

In [48]:
predicted = BNB_multi.predict(X_test_multi)
acc_score = metrics.accuracy_score(predicted,y_test_multi)

In [49]:
evaluate_model(BNB_multi, X_test_multi, y_test_multi, "BernoulliNB")

BernoulliNB model accuracy for multi-class classification is = 72.93%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  3116   339   140    74
1   515  3427   146    93
2   772   458  2021   116
3   568   365    97  1358
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.63      0.85      0.72      3669
           2       0.75      0.82      0.78      4181
           3       0.84      0.60      0.70      3367
           4       0.83      0.57      0.67      2388

    accuracy                           0.73     13605
   macro avg       0.76      0.71      0.72     13605
weighted avg       0.75      0.73      0.73     13605



In [50]:
add_to_eval_df(BNB_multi, "BernoulliNB", "multi-class", "tokenizer", X_train_multi, y_train_multi, X_test_multi, y_test_multi)

### MultinomialNB

In [51]:
MNB_multi = MultinomialNB()
MNB_multi.fit(X_train_multi, y_train_multi)

In [52]:
predicted = MNB_multi.predict(X_test_multi)
acc_score = metrics.accuracy_score(predicted, y_test_multi)

In [53]:
evaluate_model(MNB_multi, X_test_multi, y_test_multi, "MultinominalNB")

MultinominalNB model accuracy for multi-class classification is = 73.78%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  2871   416   240   142
1   398  3429   226   128
2   526   476  2219   146
3   364   355   150  1519
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.78      0.73      3669
           2       0.73      0.82      0.77      4181
           3       0.78      0.66      0.72      3367
           4       0.79      0.64      0.70      2388

    accuracy                           0.74     13605
   macro avg       0.75      0.72      0.73     13605
weighted avg       0.74      0.74      0.74     13605



In [54]:
add_to_eval_df(MNB_multi, "MultinominalNB", "multi-class", "tokenizer", X_train_multi, y_train_multi, X_test_multi, y_test_multi)

### ComplementNB

In [55]:
CNB_multi = ComplementNB()
CNB_multi.fit(X_train_multi, y_train_multi)

In [56]:
predicted = CNB_multi.predict(X_test_multi)
acc_score = metrics.accuracy_score(predicted, y_test_multi)

In [57]:
evaluate_model(CNB_multi, X_test_multi, y_test_multi, "ComplementNB")

ComplementNB model accuracy for multi-class classification is = 75.40%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  2797   413   253   206
1   330  3461   227   163
2   430   398  2369   170
3   329   285   143  1631
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.76      0.74      3669
           2       0.76      0.83      0.79      4181
           3       0.79      0.70      0.75      3367
           4       0.75      0.68      0.72      2388

    accuracy                           0.75     13605
   macro avg       0.76      0.74      0.75     13605
weighted avg       0.76      0.75      0.75     13605



In [58]:
add_to_eval_df(CNB_multi, "ComplementNB", "multi-class", "tokenizer", X_train_multi, y_train_multi, X_test_multi, y_test_multi)

# Model: RandomForest

## binary target

### tokenization

#### base model

In [59]:
RF_bin = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
RF_bin.fit(X_train_binary, y_train_binary)

In [60]:
evaluate_model(RF_bin, X_test_binary, y_test_binary, "RandomForest")

RandomForest model accuracy for multi-class classification is = 93.38%
------------------------------------------------
Confusion Matrix:
      0     1
0  9021   403
1   498  3683
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      9424
           1       0.90      0.88      0.89      4181

    accuracy                           0.93     13605
   macro avg       0.92      0.92      0.92     13605
weighted avg       0.93      0.93      0.93     13605



In [61]:
add_to_eval_df(RF_bin, "RandomForest", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

#### tuned model

Suche dauert ein paar Minuten

In [62]:
RF_bin.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [63]:
# rf = RandomForestClassifier(n_jobs=-1)
#
# params = {
#     "n_estimators": np.arange(start=50, stop=250, step=50),
#     'max_features': [None, "sqrt"],
#     'bootstrap': [True, False],
#     'max_samples': [None, .3, .5],
#     'max_depth': np.arange(start=2, stop=7, step=1),
#     'min_samples_leaf': [1, 3, 5],
# }
#
# rand = RandomizedSearchCV(rf, params, scoring="accuracy")
#
# rand.fit(X_train_binary, y_train_binary)

In [64]:
# rand.best_params_

{'n_estimators': 200,
 'min_samples_leaf': 5,
 'max_samples': 0.3,
 'max_features': None,
 'max_depth': 6,
 'bootstrap': True}

In [65]:
# RF_bin_tuned = RandomForestClassifier(**rand.best_params_, n_jobs=-1, random_state=42)
# RF_bin_tuned.fit(X_train_binary, y_train_binary)

In [66]:
# evaluate_model(RF_bin_tuned, X_test_binary, y_test_binary, "RandomForest-tuned")

In [67]:
# add_to_eval_df(RF_bin_tuned, "RandomForest-tuned", "binary", "tokenizer", X_train_binary, y_train_binary, X_test_binary, y_test_binary)

### tf-idf

In [68]:
RF_bin_tf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
RF_bin_tf.fit(X_train_binary_tf, y_train_binary_tf)

In [69]:
evaluate_model(RF_bin_tf, X_test_binary_tf, y_test_binary_tf, "RandomForest")

RandomForest model accuracy for multi-class classification is = 93.76%
------------------------------------------------
Confusion Matrix:
      0     1
0  9218   206
1   643  3538
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      9424
           1       0.94      0.85      0.89      4181

    accuracy                           0.94     13605
   macro avg       0.94      0.91      0.92     13605
weighted avg       0.94      0.94      0.94     13605



In [70]:
add_to_eval_df(RF_bin_tf, "RandomForest", "binary", "tf-idf", X_train_binary_tf, y_train_binary_tf, X_test_binary_tf, y_test_binary_tf)

## multi-class target

In [71]:
RF_multi = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
RF_multi.fit(X_train_multi, y_train_multi)

In [72]:
evaluate_model(RF_multi, X_test_multi, y_test_multi, "RandomForest")

RandomForest model accuracy for multi-class classification is = 87.16%
------------------------------------------------
Confusion Matrix:
      0     1     2     3
0  3281   167   149    72
1   223  3785   113    60
2   274   192  2842    59
3   200   120   118  1950
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.82      0.89      0.86      3669
           2       0.89      0.91      0.90      4181
           3       0.88      0.84      0.86      3367
           4       0.91      0.82      0.86      2388

    accuracy                           0.87     13605
   macro avg       0.88      0.87      0.87     13605
weighted avg       0.87      0.87      0.87     13605



In [73]:
add_to_eval_df(RF_multi, "RandomForest", "multi-class", "tokenizer", X_train_multi, y_train_multi, X_test_multi, y_test_multi)

# Model comparison

In [74]:
evaluation.sort_values(by=["test_f1"], ascending=False)

Unnamed: 0,model,target,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
9,RandomForest,binary,tf-idf,0.994046,0.994048,0.994046,0.994038,0.937596,0.937924,0.937596,0.936583
8,RandomForest,binary,tokenizer,0.993923,0.993929,0.993923,0.993914,0.933774,0.933451,0.933774,0.93356
10,RandomForest,multi-class,tokenizer,0.987381,0.987417,0.987381,0.987383,0.871591,0.87338,0.871591,0.871526
2,MultinominalNB,binary,tokenizer,0.890106,0.889563,0.890106,0.8898,0.8638,0.863342,0.8638,0.863557
3,MultinominalNB,binary,tokenizer,0.890106,0.889563,0.890106,0.8898,0.8638,0.863342,0.8638,0.863557
1,BernoulliNB,binary,tf-idf,0.881897,0.880283,0.881897,0.879306,0.855788,0.85302,0.855788,0.85263
0,BernoulliNB,binary,tokenizer,0.881848,0.880224,0.881848,0.87927,0.855494,0.852713,0.855494,0.852292
4,ComplementNB,binary,tokenizer,0.870969,0.884263,0.870969,0.873996,0.839471,0.856939,0.839471,0.843553
7,ComplementNB,multi-class,tokenizer,0.806037,0.808439,0.806037,0.805611,0.753988,0.755385,0.753988,0.753138
6,MultinominalNB,multi-class,tokenizer,0.794154,0.800321,0.794154,0.793192,0.737817,0.743019,0.737817,0.736212


# Model evaluation

## Evaluation with new validation dataset

In [75]:
# todo

## Evaluation with original validation dataset

In [76]:
# todo