In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Importing data

In [2]:
psy = pd.read_csv('data/Youtube01-Psy.csv')
katy = pd.read_csv('data/Youtube02-KatyPerry.csv')
lmfao = pd.read_csv('data/Youtube03-LMFAO.csv')
eminem = pd.read_csv('data/Youtube04-Eminem.csv')
shakira = pd.read_csv('data/Youtube05-Shakira.csv')
df = pd.concat([psy, katy, lmfao, eminem,shakira])

print(df.columns)
print(df.shape)

Index(['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS'], dtype='object')
(1956, 5)


# Removing unnecessary data columns

In [3]:
df.drop(['COMMENT_ID', 'AUTHOR', 'DATE'], axis=1, inplace=True)
print(df.head)

<bound method NDFrame.head of                                                CONTENT  CLASS
0    Huh, anyway check out this you[tube] channel: ...      1
1    Hey guys check out my new channel and our firs...      1
2               just for test I have to say murdev.com      1
3     me shaking my sexy ass on my channel enjoy ^_^ ﻿      1
4              watch?v=vtaRGgvGtWQ   Check this out .﻿      1
..                                                 ...    ...
365  I love this song because we sing it at Camp al...      0
366  I love this song for two reasons: 1.it is abou...      0
367                                                wow      0
368                            Shakira u are so wiredo      0
369                         Shakira is the best dancer      0

[1956 rows x 2 columns]>


# Removing Stopwords

In [4]:
# Import stopwords with nltk.
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')

#removing stopwords
df['CONTENT'] = df['CONTENT'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
#removing words with digits because they don't add meaning
df['CONTENT'] = df['CONTENT'].apply(lambda x: re.sub(r'\w*\d\w*', '', x).strip())

#simplifying complex words using porter stemer
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

df['CONTENT'] = df['CONTENT'].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))

# Separating comments and labels

In [5]:

comments = df['CONTENT']
labels = df['CLASS']

labels_map = {
    1: 'Spam',
    0: 'Ham'
}

# Transforming words into feature vectors

In [6]:
count = CountVectorizer()
bag = count.fit_transform(comments.values)

print(bag.shape)

print(type(bag))

print(count.get_feature_names_out()[100:200])

print(bag.toarray()[0][900:1000])

(1956, 3713)
<class 'scipy.sparse._csr.csr_matrix'>
['ain' 'air' 'airlin' 'airplan' 'aka' 'al' 'album' 'alcohol' 'alcoholic'
 'alert' 'alex' 'alfr' 'ali' 'aliv' 'alive' 'all' 'allot' 'allow' 'allway'
 'almond' 'almost' 'alo' 'aloidia' 'alone' 'alot' 'alreadi' 'alright'
 'also' 'altern' 'alvar' 'alway' 'am' 'amaz' 'amazed' 'amazing' 'amazon'
 'ambit' 'ambiti' 'ambition' 'amend' 'america' 'american' 'ami' 'amiable'
 'amount' 'amp' 'an' 'ana' 'anaconda' 'analyst' 'anand' 'ancestor' 'and'
 'anderson' 'andrew' 'andrijamatf' 'android' 'angel' 'angels' 'angri'
 'ani' 'anim' 'animal' 'animals' 'animator' 'animes' 'annoy' 'annoying'
 'anoth' 'answer' 'anthem' 'antrobofficial' 'anxiou' 'anybodi' 'anymor'
 'anymore' 'anyon' 'anyone' 'anyth' 'anything' 'anyway' 'anywher' 'anywon'
 'aplica' 'apocalyps' 'apologies' 'apostles' 'app' 'apparel' 'apparently'
 'appl' 'applaus' 'appli' 'applocker' 'appoint' 'appreci' 'appreciate'
 'appreciated' 'apprecit' 'approv']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

# Assessing word relevancy

In [7]:
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(bag)

print(train_tfidf.shape)

print(type(train_tfidf))

np.set_printoptions(precision=2)
print(train_tfidf.toarray()[0][900:1000])

(1956, 3713)
<class 'scipy.sparse._csr.csr_matrix'>
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


# Splitting Test and Training data

In [8]:
df = df.sample(frac=1).reset_index(drop=True)


train = df.sample(frac=0.75, random_state=1)
test = df.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)


X_train = train['CONTENT']
y_train = train['CLASS']
X_test = test['CONTENT']
y_test = test['CLASS']

# Pipeline

In [9]:
pipe_nlp = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
pipe_nlp.fit(X_train, y_train)

# Cross validation

In [10]:
from sklearn.model_selection import StratifiedKFold

kfolds = StratifiedKFold(n_splits=5).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfolds):
    pipe_nlp = pipe_nlp.fit(X_train[train], y_train[train])
    score = pipe_nlp.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold:  1, Class dist.: [570 603], Acc: 0.881
Fold:  2, Class dist.: [570 603], Acc: 0.898
Fold:  3, Class dist.: [571 603], Acc: 0.908
Fold:  4, Class dist.: [571 603], Acc: 0.874
Fold:  5, Class dist.: [570 604], Acc: 0.894

CV accuracy: 0.891 +/- 0.012


# Testing the model with our testing data

In [11]:
from sklearn.metrics import confusion_matrix
y_pred = pipe_nlp.predict(X_test)

print('Model Accuracy: %.3f' % pipe_nlp.score(X_test, y_test))
print('Confussion Matrix: \n')
print(confusion_matrix(y_test, y_pred))

Model Accuracy: 0.910
Confussion Matrix: 

[[206  32]
 [ 12 239]]


# Making new prediction

In [12]:
input_data = ['I literally cant stop rewatching this concert over and over. Its already 2 months and I just cant FU*** stop. <3',
                'This was such a great concert <3',
                'So amazing! I wish I was there! Post never disappoints!',
                'Subscribe to my new youtube channel!',
                'What a show! Pure class & cool. Huge respect for Eminem taking the knee. Just brilliant. Even if you don’t love this genre of music, this is the best collaboration and flawless production ever!',
                'Get a free iphone 13 pro max by visiting our new page www.fakepage.com',
                'Buy my product in www.ssssssssssss.com']

predictions = pipe_nlp.predict(input_data)

for prediction in predictions:
    print(labels_map[prediction])

Spam
Ham
Ham
Spam
Ham
Spam
Spam
