# Youtube comments spam classifier

In order to understand NLP in practice, we will work through a simple example of a spam classifier using sklearn.

In [1]:
#import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#load data and view data
comments = pd.read_csv('youtube_data.csv', sep=',', encoding='Latin-1')
print(comments.shape)
comments.head()

(350, 5)


Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z12pgdhovmrktzm3i23es5d5junftft3f,lekanaVEVO1,2014-07-22T15:27:50,i love this so much. AND also I Generate Free ...,1
1,z13yx345uxepetggz04ci5rjcxeohzlrtf4,Pyunghee,2014-07-27T01:57:16,http://www.billboard.com/articles/columns/pop-...,1
2,z12lsjvi3wa5x1vwh04cibeaqnzrevxajw00k,Erica Ross,2014-07-27T02:51:43,Hey guys! Please join me in my fight to help a...,1
3,z13jcjuovxbwfr0ge04cev2ipsjdfdurwck,Aviel Haimov,2014-08-01T12:27:48,http://psnboss.com/?ref=2tGgp3pV6L this is the...,1
4,z13qybua2yfydzxzj04cgfpqdt2syfx53ms0k,John Bello,2014-08-01T21:04:03,Hey everyone. Watch this trailer!!!!!!!! http...,1


In [2]:
#drop meaningless columns
comments = comments.drop(['COMMENT_ID', 'AUTHOR', 'DATE'], axis=1)

In [3]:
comments.head()

Unnamed: 0,CONTENT,CLASS
0,i love this so much. AND also I Generate Free ...,1
1,http://www.billboard.com/articles/columns/pop-...,1
2,Hey guys! Please join me in my fight to help a...,1
3,http://psnboss.com/?ref=2tGgp3pV6L this is the...,1
4,Hey everyone. Watch this trailer!!!!!!!! http...,1


In [4]:
#give meaningful names for exist columns
comments.columns = ['Text', 'label']

In [5]:
comments.head()

Unnamed: 0,Text,label
0,i love this so much. AND also I Generate Free ...,1
1,http://www.billboard.com/articles/columns/pop-...,1
2,Hey guys! Please join me in my fight to help a...,1
3,http://psnboss.com/?ref=2tGgp3pV6L this is the...,1
4,Hey everyone. Watch this trailer!!!!!!!! http...,1


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split( comments['Text'], comments['label'], test_size=0.25)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

In [9]:
count_vect = CountVectorizer()

In [10]:
X_train_counts = count_vect.fit_transform(X_train)

In [11]:
list(count_vect.vocabulary_.items())[0:10]

[('check', 311),
 ('out', 984),
 ('our', 983),
 ('vids', 1381),
 ('songs', 1205),
 ('are', 179),
 ('awesome', 200),
 ('and', 160),
 ('that', 1301),
 ('guarantee', 621)]

In [12]:
lab_bin= LabelBinarizer()
y_trainbin=lab_bin.fit_transform(y_train)
y_testbin=lab_bin.fit_transform(y_test)

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
clf = MultinomialNB().fit(X_train_counts, y_trainbin.ravel())

In [15]:
import collections
importantwords = collections.Counter()

In [16]:
for word,imp in zip(count_vect.vocabulary_.keys(),clf.coef_[0]):
    importantwords[word]=imp
    

In [17]:
X_test_counts = count_vect.transform(X_test)
pred = clf.predict(X_test_counts)

In [18]:
from sklearn.metrics import average_precision_score

In [19]:
average_precision_score(y_testbin, pred)  

0.9016146455170845