## Using Naive Bayes and Count Vectorizer

In [1]:
# Load the pre-cleaned comment dataset; rows are keyed by the `id` column.
import pandas as pd

train_data = pd.read_csv(
    '../../data/cleaned_data.csv',
    index_col='id',
)

In [2]:
# Build bag-of-words token counts from the comment text.
from sklearn.feature_extraction.text import CountVectorizer

# Default CountVectorizer settings are used; the vocabulary is learned and
# the counts are produced in a single fit_transform pass.
# NOTE(review): the vectorizer is fit on every row of `train_data`, i.e.
# before the train/test split performed in a later cell, so the learned
# vocabulary also contains tokens that occur only in the held-out rows.
# Confirm this is acceptable for the reported metrics.
count_vectorizer = CountVectorizer()
count_tranformed_train_data = count_vectorizer.fit_transform(
    train_data.comment_text
)

In [3]:
# Select the target/feature matrices and make a stratified 80/20 split.
from sklearn.model_selection import train_test_split
y = train_data.target
X = count_tranformed_train_data

# test_size=.2 holds out 20% of the rows; stratify=y asks the splitter to
# preserve the distribution of `target` in both partitions, and the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                    test_size=.2, random_state=1, stratify=y)

# Sanity-check the resulting shapes.
print("Number of features: ", X_train.shape[1])
print("Number of train samples: ", y_train.shape[0])
# Fixed label: this line reports the size of the test partition (it was
# previously mislabelled as "train samples").
print("Number of test samples: ", y_test.shape[0])

Number of features:  500866
Number of train samples:  1443538
Number of train samples:  360885


In [4]:
# Fit a multinomial Naive Bayes classifier on the training partition.
from sklearn.naive_bayes import MultinomialNB

# The estimator is created with its default hyperparameters; `fit` returns
# the fitted estimator, which is what gets bound to the name below.
trained_count_Multi_NB = (
    MultinomialNB()
    .fit(X_train, y_train)
)

In [5]:
# Predict a class label for every row of the held-out test partition.
count_Multi_NB_predictions = trained_count_Multi_NB.predict(
    X_test
)

In [6]:
# Find misclassified samples (true label differs from the predicted label).
import numpy as np

# With a single argument, np.where returns a tuple holding one array of the
# positions (within the test split, not the `id` index values) at which the
# condition is true.
count_Multi_NB_misclassified = np.where(
    y_test != count_Multi_NB_predictions
)

# NOTE(review): because the input is a 1-tuple containing one array, pandas
# should lay this out as a single row with one column per misclassified
# sample — confirm that this wide layout is what later cells expect.
count_Multi_NB_miss_df = pd.DataFrame(
    count_Multi_NB_misclassified
)

In [7]:
# Print per-class precision, recall, F1 and support for the test split.
from sklearn import metrics

# target_names only relabels the report rows; presumably the sorted labels
# are 0 -> 'Non-Toxic' and 1 -> 'Toxic' — TODO confirm against `target`.
print(
    metrics.classification_report(
        y_test,
        count_Multi_NB_predictions,
        target_names=['Non-Toxic', 'Toxic'],
    )
)

              precision    recall  f1-score   support

   Non-Toxic       0.95      0.96      0.95    332018
       Toxic       0.48      0.41      0.44     28867

    accuracy                           0.92    360885
   macro avg       0.71      0.69      0.70    360885
weighted avg       0.91      0.92      0.91    360885

