In [1]:
import math

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

This notebook explores the performance of n-gram text classification using a Naive Bayes classifier. Performance is assessed on a data set of Biggie Smalls and 2Pac lyrics.

In [2]:
# Read only CSV column 1 (the lyrics text); the file is parsed without a header row.
biggie_df = pd.read_csv('./biggie_lyrics.csv', usecols=[1], encoding='latin-1', header=None)
biggie_df.columns = ["lyrics"]
# Strip punctuation (anything that is neither a word character nor whitespace).
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would silently leave all punctuation in place.
# The raw string avoids the invalid-escape warning for `\w` / `\s`.
biggie_df["lyrics"] = biggie_df["lyrics"].str.replace(r'[^\w\s]', '', regex=True)
biggie_df["lyrics"] = biggie_df["lyrics"].str.lower()

biggie_df.tail()

Unnamed: 0,lyrics
11,relax and take notes while i take tokes of the...
12,good evenin ladies and gentlemen\nhows everybo...
13,who shot ya\nseperate the weak from the obsole...
14,when i die fuck it i wanna go to hell\ncause i...
15,when the lala hits ya lyrics just splits ya\nh...


In [3]:
# Read only CSV column 1 (the lyrics text); the file is parsed without a header row.
pac_df = pd.read_csv('./2pac_lyrics.csv', usecols=[1], encoding='latin-1', header=None)
pac_df.columns = ["lyrics"]
# Strip punctuation (anything that is neither a word character nor whitespace).
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would silently leave all punctuation in place.
# The raw string avoids the invalid-escape warning for `\w` / `\s`.
pac_df["lyrics"] = pac_df["lyrics"].str.replace(r'[^\w\s]', '', regex=True)
pac_df["lyrics"] = pac_df["lyrics"].str.lower()

pac_df.head()

Unnamed: 0,lyrics
0,little something for my godson elijah\nand a l...
1,yo mo bee mayn drop that shit\nyou know what t...
2,rest in peace to my motherfucker biggy smallz\...
3,makaveli in this killuminati\nall through your...
4,its just me against the world\nnothin to lose\...


In [4]:
# Flatten each artist's songs into one list of individual lyric lines
# (songs are newline-separated strings).
biggie_lyrics = [
    line
    for song in biggie_df["lyrics"].values
    for line in song.split('\n')
]
pac_lyrics = [
    line
    for song in pac_df["lyrics"].values
    for line in song.split('\n')
]

# Keep only lines with more than three words and tag them with the artist:
# 0 for Biggie, 1 for 2Pac. Each row is a two-element array [label, line];
# NumPy stores both elements as strings, so labels are '0' / '1' at this point.
rap_lines = [
    np.array([0, str(line)])
    for line in biggie_lyrics
    if len(line.split()) > 3
]
rap_lines += [
    np.array([1, str(line)])
    for line in pac_lyrics
    if len(line.split()) > 3
]

rap_lines = np.array(rap_lines)

In [5]:
# Wrap the [label, line] rows in a DataFrame with named columns.
rap_lines = pd.DataFrame(rap_lines)
rap_lines.columns = ["label", "line"]
# The labels were stored as the strings '0' / '1'; cast them to integers so that
# comparisons such as `label == 1` behave as intended. `astype(int)` is an explicit
# conversion and does not rely on the deprecated silent downcasting of `replace`.
rap_lines['label'] = rap_lines['label'].astype(int)
# Displayed last so the preview is actually rendered as the cell output.
rap_lines.head()

In [6]:
class ngrams_bayes():
    
    def __init__(self, data, n=2, split=0.75):
        
        # split into training and testing data
        self.train_data, self.test_data = train_test_split(data,
                                                          train_size=split)
        # convert into n grams
        self.train_data = [[item[0], self.ngrams(n, item[1])] for item in self.train_data]
        self.test_data = [[item[0], self.ngrams(n, item[1])] for item in self.test_data]
        
        # count unique n grams in training data
        flattened = [gram for message in self.train_data for gram in message[1]]
        self.unique = len(set(flattened))
        
        # init dicts
        self.trainPositive = {}
        self.trainNegative = {}
        # counters
        self.posGramCount = 0
        self.negGramCount = 0
        self.spamCount = 0
        # priors
        self.pA = 0
        self.pNotA = 0
        
    def ngrams(self, n, text):
        text = text.split(' ')
        grams = []
        for i in range(len(text)-n+1):
            gram = ' '.join(text[i:i+n])
            grams.append(gram)
        return grams 
    
    def train(self):
        
        for item in self.train_data:
            label = item[0]
            grams = item[1]
            if label == 1:
                self.spamCount += 1   
            for gram in grams:
                if label == 1:
                    self.trainPositive[gram] = self.trainPositive.get(gram, 0) + 1
                    self.posGramCount += 1
                else:
                    self.trainNegative[gram] = self.trainNegative.get(gram, 0) + 1
                    self.negGramCount += 1
                    
        self.pA = self.spamCount/float(len(self.train_data))
        self.pNotA = 1.0 - self.pA
        
    def classify(self, text, alpha=1.0):
        
        self.alpha = alpha
        isSpam = self.pA * self.conditionalText(text, 1)
        notSpam = self.pNotA * self.conditionalText(text, 0)
        if (isSpam > notSpam):
            return 1
        else:
            return 0
        
    def conditionalText(self, grams, label):
        result = 1.0
        for ngram in grams:
            result *= self.conditionalNgram(ngram, label)
        return result
    
    def conditionalNgram(self, ngram, label):
        alpha = self.alpha
        if label == 1:
            return ((self.trainPositive.get(ngram,0)+alpha) /
                    float(self.posGramCount+alpha*self.unique))
        else:
            return ((self.trainNegative.get(ngram,0)+alpha) /
                    float(self.negGramCount+alpha*self.unique))
            
    def evaluate_test_data(self):
        results = []
        for test in self.test_data:
            label = test[0]
            text = test[1]
            ruling = self.classify(text)
            if ruling == label:
                results.append(1) 
            else:
                results.append(0) 
                
        print("Evaluated {} test cases. {:.2f}% Accuracy".format(len(results), 100.0*sum(results)/float(len(results))))
        return sum(results)/float(len(results))

In [7]:
# Unigram model (n=1): 90% of the labelled lines are used for training, the rest for testing.
bayes_biggie_vs_pac = ngrams_bayes(data=rap_lines.values, n=1, split=0.9)



In [8]:
# Count per-class n-gram frequencies on the training split and compute the class priors.
bayes_biggie_vs_pac.train()

In [9]:
# Classify the held-out test split; prints the accuracy and returns it as a fraction.
bayes_biggie_vs_pac.evaluate_test_data()

Evaluated 197 test cases. 75.63% Accuracy


0.7563451776649747

In [10]:
# The data set is small, so a single train/test split gives a noisy estimate.
# Repeat the experiment over 10 fresh 90/10 splits and average the accuracies
# to get a better idea of how well unigram classification performs.

results = []

for _ in range(10):
    unigram = ngrams_bayes(rap_lines.values, n=1, split=0.9)
    unigram.train()
    accuracy = unigram.evaluate_test_data()
    results.append(accuracy)

mean_accuracy = sum(results) / float(len(results))
print("Average Accuracy: {:.2f}".format(mean_accuracy))

Evaluated 197 test cases. 77.16% Accuracy
Evaluated 197 test cases. 75.63% Accuracy
Evaluated 197 test cases. 69.54% Accuracy
Evaluated 197 test cases. 72.08% Accuracy
Evaluated 197 test cases. 75.13% Accuracy
Evaluated 197 test cases. 72.59% Accuracy
Evaluated 197 test cases. 75.13% Accuracy
Evaluated 197 test cases. 70.05% Accuracy
Evaluated 197 test cases. 68.53% Accuracy
Evaluated 197 test cases. 70.05% Accuracy
Average Accuracy: 0.73


In [12]:
# How do bigrams compete?
# Same protocol as before: 10 fresh 90/10 splits, averaged, with n=2.

results = []

for _ in range(10):
    bigram_net = ngrams_bayes(rap_lines.values, n=2, split=0.9)
    bigram_net.train()
    accuracy = bigram_net.evaluate_test_data()
    results.append(accuracy)

mean_accuracy = sum(results) / float(len(results))
print("Average Accuracy: {:.2f}".format(mean_accuracy))

Evaluated 197 test cases. 75.13% Accuracy
Evaluated 197 test cases. 69.54% Accuracy
Evaluated 197 test cases. 78.68% Accuracy
Evaluated 197 test cases. 72.08% Accuracy
Evaluated 197 test cases. 77.16% Accuracy
Evaluated 197 test cases. 71.07% Accuracy
Evaluated 197 test cases. 73.10% Accuracy
Evaluated 197 test cases. 73.60% Accuracy
Evaluated 197 test cases. 75.13% Accuracy
Evaluated 197 test cases. 71.57% Accuracy
Average Accuracy: 0.74


In [15]:
# How do trigrams compete?
# Same protocol as before: 10 fresh 90/10 splits, averaged, with n=3.

results = []

for _ in range(10):
    trigram_net = ngrams_bayes(rap_lines.values, n=3, split=0.9)
    trigram_net.train()
    accuracy = trigram_net.evaluate_test_data()
    results.append(accuracy)

mean_accuracy = sum(results) / float(len(results))
print("Average Accuracy: {:.2f}".format(mean_accuracy))

Evaluated 197 test cases. 60.91% Accuracy
Evaluated 197 test cases. 62.94% Accuracy
Evaluated 197 test cases. 54.31% Accuracy
Evaluated 197 test cases. 66.50% Accuracy
Evaluated 197 test cases. 54.82% Accuracy
Evaluated 197 test cases. 61.42% Accuracy
Evaluated 197 test cases. 58.38% Accuracy
Evaluated 197 test cases. 61.42% Accuracy
Evaluated 197 test cases. 59.39% Accuracy
Evaluated 197 test cases. 60.91% Accuracy
Average Accuracy: 0.60


In [None]:
# Unigrams and bigrams perform about the same in the runs above (roughly 0.73-0.74 average accuracy),
# while trigrams trail at about 0.60. Even so, trigrams don't yield horrible results on this rap
# data like they did when classifying SMS messages.
# Potentially, while these raps contain highly colloquial words, there are phrases unique to
# Biggie and Pac that they use repeatedly, whereas with SMS messages the sequence of words is more arbitrary.