# Multinomial Naïve Bayes Classifier - the YouTube Dataset

### Introducing the database

The database for this example is taken from https://archive.ics.uci.edu/ml/machine-learning-databases/00380/ 

We usually modify the databases slightly so that they fit the purpose of the course. Therefore, we suggest that you use the database provided in the resources in order to obtain the same results as the ones in the lectures.

### Importing the necessary libraries

In [None]:
import pandas as pd
import glob

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

### Reading the database

In [None]:
# Collect the path of every CSV file in the dataset folder.
# A forward slash is used as the separator: the previous backslash pattern only
# matched on Windows, whereas glob accepts '/' on every platform.
# Sorting makes the file order - and therefore the row order of the combined
# DataFrame and the seeded train-test split - independent of the order in which
# the file system happens to list the files.
files = sorted(glob.glob('youtube-dataset/*.csv'))
files

In [None]:
# Read every CSV file into its own DataFrame, discarding the columns that are
# not used in this notebook, and keep the resulting DataFrames in a list.
all_df = [
    pd.read_csv(csv_path).drop(columns=['COMMENT_ID', 'AUTHOR', 'DATE'])
    for csv_path in files
]

In [None]:
# Preview the first DataFrame in the list.
all_df[0]

In [None]:
# Number of DataFrames in the list (one was appended per file read).
len(all_df)

In [None]:
# Stack all DataFrames vertically into a single one. ignore_index=True discards
# the per-file row labels and assigns a fresh 0..n-1 index.
data = pd.concat(all_df, axis=0, ignore_index=True)
data

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Number of comments per class label.
data['CLASS'].value_counts()

### Sample 1

In [None]:
# A one-message corpus used to illustrate how CountVectorizer works.
message_sample = ['This is a dog']

vectorizer_sample = CountVectorizer()

# Learn the vocabulary from the sample message.
vectorizer_sample.fit(message_sample)

In [None]:
# Encode the sample message as token counts and show them as a dense array.
vectorizer_sample.transform(message_sample).toarray()

In [None]:
# The tokens (features) the vectorizer learned from the sample message.
vectorizer_sample.get_feature_names_out()

In [None]:
# Encode a new message with the already-fitted vectorizer. 'cat' was not part
# of the message the vectorizer was fitted on, so it has no feature to count it.
vectorizer_sample.transform(['This is a cat']).toarray()

### Sample 2

In [None]:
# A two-message corpus; the first message repeats some of its words.
message_sample2 = ['This is a dog and that is a dog', 'This is a cat']

vectorizer_sample2 = CountVectorizer()

# Fit the vectorizer and encode both messages in one step; show the counts as a
# dense array.
vectorizer_sample2.fit_transform(message_sample2).toarray()

In [None]:
# The tokens (features) learned from the two sample messages.
vectorizer_sample2.get_feature_names_out()

In [None]:
# Encode a message whose words do not occur in either fitted message; every
# count is therefore expected to be zero.
vectorizer_sample2.transform(['Those are birds.']).toarray()

### Defining the inputs and the target. Creating the train-test split.

In [None]:
# The comment text is the model input; the class label is the target.
inputs = data['CONTENT']
target = data['CLASS']

In [None]:
# Hold out 20% of the comments for testing. The fixed random_state makes the
# split reproducible, and stratifying on the target keeps the class proportions
# similar in the training and test sets.
x_train, x_test, y_train, y_test = train_test_split(inputs, target, 
                                                    test_size=0.2, 
                                                    random_state=365, 
                                                    stratify = target)

In [None]:
# Class counts in the training set.
y_train.value_counts()

In [None]:
# Class counts in the test set.
y_test.value_counts()

### Tokenizing the YouTube comments

In [None]:
# Vectorizer that will turn the YouTube comments into token-count features.
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the training comments only, then encode the test
# comments with that same vocabulary so no information from the test set is
# used during fitting.
x_train_transf = vectorizer.fit_transform(x_train)
x_test_transf = vectorizer.transform(x_test)

In [None]:
# Display the encoded training data as a dense array (for inspection only).
x_train_transf.toarray()

In [None]:
# (number of training comments, number of features)
x_train_transf.shape

In [None]:
# (number of test comments, number of features)
x_test_transf.shape

### Performing the classification

In [None]:
# Multinomial Naive Bayes classifier with its default settings.
clf = MultinomialNB()
# Alternative: supply the class prior probabilities manually instead of using
# the defaults.
# clf = MultinomialNB(class_prior = np.array([0.6, 0.4]))

# Train the classifier on the encoded training comments.
clf.fit(x_train_transf, y_train)

In [None]:
# The classifier stores the class priors as logarithms; exponentiate them to
# obtain the prior probabilities.
np.exp(clf.class_log_prior_)

In [None]:
# Show the parameters the classifier was created with.
clf.get_params()

### Performing the evaluation on the test dataset

In [None]:
# Predict the class of every comment in the test set.
y_test_pred = clf.predict(x_test_transf)

In [None]:
# Restore the original matplotlib settings so that a seaborn theme set elsewhere
# in the notebook does not restyle the confusion matrix.
sns.reset_orig()

# Plot the confusion matrix of true versus predicted test labels. The trailing
# semicolon suppresses the textual output of the call in the notebook.
ConfusionMatrixDisplay.from_predictions(
    y_test, y_test_pred,
    labels = clf.classes_,
    cmap = 'magma'
);

In [None]:
# Per-class metrics on the test set. The display names assume the first class
# is ham and the second is spam (the plots below treat label 0 as ham and
# label 1 as spam).
print(classification_report(y_test, y_test_pred, target_names = ['Ham', 'Spam']))

### Creating probability-distribution figures

In [None]:
# Predicted probability of the second class (spam) for every test comment,
# rounded to three decimals.
spam_proba = clf.predict_proba(x_test_transf)[:, 1].round(3)

# One row per test comment: true label, predicted label and spam probability,
# indexed 0..n-1.
df_scatter = pd.DataFrame({
    'True class': np.asarray(y_test),
    'Predicted class': y_test_pred,
    'Predicted probability (spam)': spam_proba,
})

# Single-colour palettes: black for true ham, red for true spam.
palette_0 = sns.color_palette(['#000000'])
palette_1 = sns.color_palette(['#FF0000'])

# Split the comments by their true class.
ham_rows = df_scatter['True class'] == 0
spam_rows = df_scatter['True class'] == 1
df_scatter_0 = df_scatter[ham_rows].reset_index(drop=True)
df_scatter_1 = df_scatter[spam_rows].reset_index(drop=True)

sns.set()

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 5))
fig.tight_layout(pad=3)

# One panel per true class; only the data, colour, marker and title differ.
panels = [
    (ax1, df_scatter_0, palette_0, 'o',
     "Probability distribution of comments belonging to the true 'ham' class"),
    (ax2, df_scatter_1, palette_1, 'X',
     "Probability distribution of comments belonging to the true 'spam' class"),
]

for axis, subset, colours, marker, title in panels:
    # All points share y = 0, so each panel is a one-dimensional strip of
    # predicted probabilities; the y tick labels carry no information.
    strip = sns.scatterplot(
        data=subset,
        x='Predicted probability (spam)',
        y=np.zeros(subset.shape[0]),
        hue='True class',
        style='True class',
        palette=colours,
        markers=[marker],
        s=50,
        legend=False,
        ax=axis,
    )
    strip.set(yticklabels=[])

    axis.set_title(title)
    # Dashed line at probability 0.5.
    axis.vlines(0.5, -1, 1, linestyles='dashed', colors='red')

### Making predictions

In [None]:
# Encode two new, unseen comments with the vectorizer fitted on the training
# comments.
predict_data = vectorizer.transform(['This song is amazing!',
                                     'You can win 1m dollars right now, just click here!!!'])

In [None]:
# Predict the class of each new comment.
clf.predict(predict_data)