# YouTube Comments Spam Classifier

### Import modules

In [23]:
import pandas as pd
import zipfile
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix, classification_report

In [24]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import dataset files from Google Drive

In [25]:
z = zipfile.ZipFile("/content/drive/MyDrive/YouTube-Spam-Collection-v1.zip")
Psy = pd.read_csv(z.open("Youtube01-Psy.csv"))
Katy = pd.read_csv(z.open("Youtube02-KatyPerry.csv"))
LMFAO = pd.read_csv(z.open("Youtube03-LMFAO.csv"))
Eminem = pd.read_csv(z.open("Youtube04-Eminem.csv"))
Shakira = pd.read_csv(z.open("Youtube05-Shakira.csv"))

In [39]:
data = pd.concat([Psy, Katy, LMFAO, Eminem, Shakira])
data.drop(["COMMENT_ID", "DATE", "AUTHOR"], axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956 entries, 0 to 369
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   CONTENT  1956 non-null   object
 1   CLASS    1956 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 45.8+ KB


### Splitting dataset into train/test sets

In [27]:
X_train, X_test, y_train, y_test = train_test_split(data["CONTENT"], data["CLASS"])

### Tokenizing comments in training set and applying TF-IDF vectorizer on training set

In [28]:
tfidf_vect = TfidfVectorizer(use_idf=True, lowercase=True)
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_train_tfidf.shape

(1467, 3646)

### Training the multinomial Naive Bayes model

In [29]:
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Generate predictions on test set

In [34]:
X_test_tfidf = tfidf_vect.transform(X_test)
predictions = model.predict(X_test_tfidf)

### Generate model performance metrics

In [35]:
confusion_matrix(y_test, predictions)

array([[217,  26],
       [ 10, 236]])

In [36]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92       243
           1       0.90      0.96      0.93       246

    accuracy                           0.93       489
   macro avg       0.93      0.93      0.93       489
weighted avg       0.93      0.93      0.93       489



In [37]:
model.score(X_test_tfidf, y_test)

0.9263803680981595

### Exporting the model and TF-IDF vectorizer

In [38]:
with open("model.pkl", "wb") as model_file:
  pickle.dump(model, model_file)

with open("tfidf-vect.pkl", "wb") as tfidf_vect_file:
  pickle.dump(tfidf_vect, tfidf_vect_file)