In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import re

In [None]:
# Read the five per-video comment CSVs; the unpacking order below matches
# the order of the paths in the tuple.
psy, perry, lmf, emin, shak = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/YouTube-Spam-Collection-v1/Youtube01-Psy.csv",
        "./data/YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv",
        "./data/YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv",
        "./data/YouTube-Spam-Collection-v1/Youtube04-Eminem.csv",
        "./data/YouTube-Spam-Collection-v1/Youtube05-Shakira.csv",
    )
)

In [None]:
## Combine all datasets into one and drop the first three columns, which are
## not used as features (the code below only relies on CONTENT and CLASS).
## `shak` is concatenated FIRST on purpose: the train/test split at the bottom
## of this cell relies on its rows occupying the first len(shak) positions.
nbdf = pd.concat([shak, perry, psy, lmf, emin], ignore_index=True).iloc[:, 3:]

## Clean all comments of punctuation, lowercase them and tokenise on whitespace.
## regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
## a literal string by default, which would leave punctuation untouched.
nbdf['CONTENT'] = (
    nbdf['CONTENT']
    .str.replace(r'\W', ' ', regex=True)
    .str.lower()
    .str.split()
)

## Gather the unique words that occur anywhere in the corpus
dictionary = list({word for line in nbdf['CONTENT'] for word in line})

## Per-word count vectors: one column per unique word, one row per comment
word_count_per_line = {unique_word: [0] * len(nbdf['CONTENT']) for unique_word in dictionary}
for index, line in enumerate(nbdf['CONTENT']):
    for word in line:
        word_count_per_line[word][index] += 1
word_counts = pd.DataFrame(word_count_per_line)

## Combine the comments with their word counts; drop commonly-used English
## words that mean little. errors='ignore' keeps the cell working even when a
## stop word never occurs in the corpus.
stop_words = ['i','and','the','but','a','in','of','be','that','have','it','for','on','with', 'this', 'is', 'to']
clean = pd.concat([nbdf, word_counts], axis=1).drop(columns=stop_words, errors='ignore')

## Split based on seam made during concatenation; grab shak observations for testing set. THIS IS NOT RANDOM.
## .copy() makes both halves independent frames so later column assignments
## do not operate on a slice of `clean`.
train = clean.iloc[len(shak):, :].copy()
test = clean.iloc[:len(shak), :].copy()

In [None]:
## Separate the word-count features (X) from the labels (y) for both splits
non_feature_cols = ['CONTENT', 'CLASS']
Xtrain, ytrain = train.drop(columns=non_feature_cols), train['CLASS']
Xtest, ytest = test.drop(columns=non_feature_cols), test[['CLASS']]

# Fit a multinomial Naive Bayes model on the training counts, then predict
# labels for the held-out rows
mnb = MultinomialNB()
mnb.fit(Xtrain, ytrain)
y_hat = mnb.predict(Xtest)

# Score the predictions: overall accuracy first, then the confusion matrix
acc = accuracy_score(ytest, y_hat)
conf = confusion_matrix(ytest, y_hat)
print(acc, "\n")
print(conf)

In [None]:
## Play with the results
# Attach the predictions of the fitted sklearn model (`mnb`) to the test rows.
# `assign` builds a new frame, so re-running this cell is idempotent and never
# writes into a slice of another frame.
test = test.assign(predictions=mnb.predict(Xtest))

# Rows where the model disagrees with the true label
incorrect1 = test.loc[test['predictions'] != test['CLASS'], ['CONTENT', 'CLASS', 'predictions']]

# Comment / label / prediction triple, renamed so it can sit side by side with
# the results of another model
results1 = test[['CONTENT', 'CLASS', 'predictions']].set_axis(
    ['NBComments', 'NBClass', 'NBPredictions'], axis=1
)

In [None]:
############ Focus on creating own version of NB ###############

## Training corpus: every dataset except `shak`. Drop the first three columns,
## which are not used below (only CONTENT and CLASS are referenced).
df = pd.concat([perry, psy, lmf, emin], axis=0, ignore_index=True).iloc[:, 3:]

# Clean: strip punctuation, lowercase, tokenise on whitespace.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would leave punctuation untouched.
df['CONTENT'] = (
    df['CONTENT']
    .str.replace(r'\W', ' ', regex=True)
    .str.lower()
    .str.split()
)

## Split comments into singular unique words
dictionary = list({word for line in df['CONTENT'] for word in line})

## Per-word count vectors: key = unique word, value = list holding that word's
## count in each comment (one entry per row of df)
word_count_per_line = {unique_word: [0] * len(df['CONTENT']) for unique_word in dictionary}
for index, line in enumerate(df['CONTENT']):
    for word in line:
        word_count_per_line[word][index] += 1
word_counts = pd.DataFrame(word_count_per_line)
clean_training = pd.concat([df, word_counts], axis=1)

## Remove common words that carry little meaning (i, and, but, ...) from both
## the vocabulary and the count columns. errors='ignore' keeps the cell working
## even when a stop word never occurs in the corpus.
stop_words = ['i','and','the','but','a','in','of','be','that','have','it','for','on','with', 'this', 'is', 'to']
dictionary = [x for x in dictionary if x not in stop_words]
clean_training = clean_training.drop(columns=stop_words, errors='ignore')

In [None]:
## Partition the training rows by label (1 = spam, 0 = ham)
is_spam_row = clean_training['CLASS'] == 1
is_ham_row = clean_training['CLASS'] == 0
spam = clean_training[is_spam_row]
ham = clean_training[is_ham_row]

## Class priors: P(y == 1) and P(y == 0)
n_training_rows = len(clean_training)
pspam = len(spam) / n_training_rows
pham = len(ham) / n_training_rows

# Token count of every individual comment, per class
n_words_in_spam = spam['CONTENT'].apply(len)
n_words_in_ham = ham['CONTENT'].apply(len)

# Total token count per class.
# NOTE(review): these totals come from the full token lists in CONTENT, so they
# still include the stop words that were removed from `dictionary` -- confirm
# that this is intended.
n_spam = n_words_in_spam.sum()
n_ham = n_words_in_ham.sum()

# Vocabulary size (number of unique words kept in the dictionary)
n_dict = len(dictionary)

# Laplace (add-one) smoothing constant
alpha = 1

In [None]:
# Smoothed per-word likelihoods

## Denominators are the same for every word, so compute them once:
## total words in the class + alpha * vocabulary size
spam_denominator = n_spam + alpha * n_dict
ham_denominator = n_ham + alpha * n_dict

## P(word | y == 1): (occurrences of the word in spam + alpha) / denominator
params_spam = {
    term: (spam[term].sum() + alpha) / spam_denominator
    for term in dictionary
}

## P(word | y == 0): (occurrences of the word in ham + alpha) / denominator
params_ham = {
    term: (ham[term].sum() + alpha) / ham_denominator
    for term in dictionary
}

In [None]:
def classify(message, prior_spam=None, prior_ham=None,
             spam_likelihoods=None, ham_likelihoods=None):
    """Classify one comment as spam (1) or ham (0), printing the details.

    The comment is stripped of non-word characters, lowercased and split on
    whitespace. Each token is echoed, and tokens that appear in a likelihood
    table contribute to that class's score; unknown tokens are skipped.

    Scores are accumulated as sums of logs rather than products of
    probabilities: multiplying hundreds of small likelihoods underflows to
    0.0 for long comments, which makes both classes tie and the comment be
    labelled spam regardless of its content.

    Parameters
    ----------
    message : str
        Raw comment text.
    prior_spam, prior_ham : float, optional
        Class priors. Default to the notebook globals `pspam` / `pham`.
    spam_likelihoods, ham_likelihoods : dict, optional
        Mapping word -> P(word | class). Default to the notebook globals
        `params_spam` / `params_ham`.

    Returns
    -------
    int
        0 when ham is strictly more probable than spam, otherwise 1
        (ties are labelled spam).
    """
    # Fall back to the parameters fitted earlier in the notebook.
    if prior_spam is None:
        prior_spam = pspam
    if prior_ham is None:
        prior_ham = pham
    if spam_likelihoods is None:
        spam_likelihoods = params_spam
    if ham_likelihoods is None:
        ham_likelihoods = params_ham

    tokens = re.sub(r'\W', ' ', message).lower().split()

    log_spam = np.log(prior_spam)
    log_ham = np.log(prior_ham)

    for word in tokens:
        print(word)
        if word in spam_likelihoods:
            log_spam += np.log(spam_likelihoods[word])
        if word in ham_likelihoods:
            log_ham += np.log(ham_likelihoods[word])

    # Normalise the two joint scores into posteriors. Subtracting the larger
    # log score first keeps the exponentials inside floating-point range.
    shift = max(log_spam, log_ham)
    spam_weight = np.exp(log_spam - shift)
    ham_weight = np.exp(log_ham - shift)
    total_weight = spam_weight + ham_weight
    p_spam_given_message = float(spam_weight / total_weight)
    p_ham_given_message = float(ham_weight / total_weight)

    print('P(spam|message): ', p_spam_given_message)
    print('P(ham|message): ', p_ham_given_message)

    # Decide on the log scores, which stay distinct even when one of the
    # normalised posteriors rounds to exactly 0 or 1.
    if log_ham > log_spam:
        print('Label: ham')
        return 0
    print('Label: spam')
    return 1

In [None]:
## Running the function on a single held-out comment.
## The hold-out set is `shak` without its first three columns; `shak` is not
## part of the frame the custom model was fitted on.
## .copy() makes test_set an independent frame, so adding a column to it later
## does not operate on a slice of `shak`.
test_set = shak.iloc[:, 3:].copy()
classify(test_set['CONTENT'].iloc[4])

In [None]:
def classify_test(message, prior_spam=None, prior_ham=None,
                  spam_likelihoods=None, ham_likelihoods=None):
    """Silently classify one comment as spam (1) or ham (0).

    The comment is stripped of non-word characters, lowercased and split on
    whitespace. Tokens that appear in a likelihood table contribute to that
    class's score; unknown tokens are skipped.

    Scores are accumulated as sums of logs rather than products of
    probabilities: multiplying hundreds of small likelihoods underflows to
    0.0 for long comments, which makes both classes tie and the comment be
    labelled spam regardless of its content.

    Parameters
    ----------
    message : str
        Raw comment text.
    prior_spam, prior_ham : float, optional
        Class priors. Default to the notebook globals `pspam` / `pham`.
    spam_likelihoods, ham_likelihoods : dict, optional
        Mapping word -> P(word | class). Default to the notebook globals
        `params_spam` / `params_ham`.

    Returns
    -------
    int
        0 when ham scores strictly higher than spam, otherwise 1
        (ties are labelled spam).
    """
    # Fall back to the parameters fitted earlier in the notebook.
    if prior_spam is None:
        prior_spam = pspam
    if prior_ham is None:
        prior_ham = pham
    if spam_likelihoods is None:
        spam_likelihoods = params_spam
    if ham_likelihoods is None:
        ham_likelihoods = params_ham

    tokens = re.sub(r'\W', ' ', message).lower().split()

    log_ham = np.log(prior_ham)
    log_spam = np.log(prior_spam)

    for word in tokens:
        if word in spam_likelihoods:
            log_spam += np.log(spam_likelihoods[word])
        if word in ham_likelihoods:
            log_ham += np.log(ham_likelihoods[word])

    return 0 if log_ham > log_spam else 1


In [None]:
## Running the function en masse, adding the resultant column as part of the dataframe.
## `assign` builds a new frame, so re-running this cell is idempotent and never
## writes into a slice of another frame.
test_set = test_set.assign(predictions=test_set['CONTENT'].apply(classify_test))

# Check accuracy with a vectorised comparison of label and prediction
is_correct = test_set['CLASS'] == test_set['predictions']
correct = int(is_correct.sum())
accuracy = correct / len(test_set)
print('Correct: ', correct)
print('Incorrect: ', len(test_set) - correct)
print('Accuracy: ', accuracy)

# Comment / label / prediction triple, renamed so it can sit side by side with
# the results of another model
results2 = test_set[['CONTENT', 'CLASS', 'predictions']].set_axis(
    ['HMComments', 'HMClass', 'HMPredictions'], axis=1
)

# Rows where the hand-made classifier disagrees with the true label
incorrect2 = test_set[~is_correct]

In [None]:
# Place both result frames side by side (aligned on the row index) and show
# the first 20 rows.
comparisonSet = pd.concat([results1, results2], axis="columns")
comparisonSet.head(n=20)

In [None]:


# Walk the ham likelihoods and echo every value that sets a new running
# maximum; the last number printed is the largest P(word | ham).
maxVal = 0
for likelihood in params_ham.values():
    if likelihood > maxVal:
        maxVal = likelihood
        print(maxVal)

# To inspect the spam side instead, run the same loop over params_spam.


In [None]:
keys = list(params_spam.keys())
vals = list(params_spam.values())

# Rank words by P(word | spam) instead of looking them up through float
# constants copied from an earlier run: `vals.index(<float>)` raises ValueError
# as soon as the data or preprocessing changes the probabilities.
# (Original notes listed: out, to, youtube, channel, amp.)
TOP_N_SPAM_WORDS = 5
top_spam_words = sorted(params_spam.items(), key=lambda item: item[1], reverse=True)[:TOP_N_SPAM_WORDS]
top_spam_words

In [None]:
keys = list(params_ham.keys())
vals = list(params_ham.values())

# Rank words by P(word | ham) instead of looking them up through float
# constants copied from an earlier run: `vals.index(<float>)` raises ValueError
# as soon as the data or preprocessing changes the probabilities.
# (Original notes listed: song, to, like.)
TOP_N_HAM_WORDS = 5
top_ham_words = sorted(params_ham.items(), key=lambda item: item[1], reverse=True)[:TOP_N_HAM_WORDS]
top_ham_words