In [27]:
from nltk.lm import KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline, flatten, pad_both_ends
from nltk.util import everygrams
from nltk import word_tokenize, sent_tokenize
import numpy as np
import pandas as pd
import string

## Read in lyrics data

The song `The Art of Peer Pressure` was chosen as an example of narrative-based rap.

The song `Money Trees` was chosen as an example of non-narrative-based rap.

In [18]:
# Load the song table; later cells read its 'song_title' and 'lyrics' columns.
df = pd.read_csv('songs_with_transcription.csv')

In [30]:
# Pull the lyrics of the first row matching each title (narrative song first,
# non-narrative song second).
lyrics = df.loc[df['song_title'] == 'The Art of Peer Pressure', 'lyrics'].values[0]
lyrics_x = df.loc[df['song_title'] == 'Money Trees', 'lyrics'].values[0]

## Define functions for training language model

`KneserNeyInterpolated` implements interpolated Kneser-Ney smoothing, as described by Chen & Goodman (1998): https://dash.harvard.edu/bitstream/handle/1/25104739/tr-10-98.pdf?sequence=1

Bigrams are used here, but the same code works for any n-gram order n >= 2.

In [31]:
def text_by_line(lyrics):
    """Tokenize lyrics line by line.

    Args:
        lyrics: Lyrics as a single string; it is split with ``str.splitlines``.

    Returns:
        A list with one entry per line, each entry being the list of
        lower-cased tokens produced by ``word_tokenize`` for that line.
    """
    return [
        [token.lower() for token in word_tokenize(line)]
        for line in lyrics.splitlines()
    ]

In [32]:
def make_training_data(lines):
    """Split a sequence of lines into a held-out line and its preceding context.

    Args:
        lines: Non-empty sequence of lines.

    Returns:
        A ``(test, train)`` tuple -- note the order: ``test`` is the last
        element of ``lines`` and ``train`` is everything before it.
    """
    return lines[-1], lines[:-1]

def perplexity_by_line(text, n):
    """Score lyric lines against a model trained on the lines preceding them.

    The lyrics are tokenized with ``text_by_line``. For each prefix of the
    resulting lines, a fresh ``KneserNeyInterpolated`` model of order ``n`` is
    fitted on every line of the prefix except the last, and the perplexity of
    that held-out last line is recorded. Perplexity is evaluated over all
    everygrams of the padded line up to length ``n`` (so for ``n = 2`` both
    unigrams and bigrams are scored).

    Args:
        text: Lyrics as a single string, one lyric line per text line.
        n: Order of the language model (2 for bigrams).

    Returns:
        A ``(perplexity_scores, model)`` tuple: the per-line perplexities in
        line order and the model fitted in the last iteration. ``model`` is
        ``None`` when the lyrics have fewer than three lines, since no line
        is scored in that case.
    """
    perplexity_scores = []
    # Bound up front so that the return below cannot hit an unbound local
    # when the loop body never runs (fewer than three lines).
    model = None
    lines = text_by_line(text)

    # NOTE(review): the range stops at len(lines) - 1, so the final line of
    # the lyrics is never used as the held-out line -- confirm this is intended.
    for i in range(2, len(lines)):
        model = KneserNeyInterpolated(n)

        test_text, train_text = make_training_data(lines[:i])

        train_data, padded_sents_train = padded_everygram_pipeline(n, train_text)
        model.fit(train_data, padded_sents_train)

        # The held-out line is a single sentence, so it is padded and expanded
        # into everygrams directly instead of going through the pipeline
        # helper, which expects a list of sentences.
        padded_test = list(pad_both_ends(test_text, n=n))
        test_ngrams = list(everygrams(padded_test, max_len=n))

        perplexity_scores.append(model.perplexity(test_ngrams))

    return perplexity_scores, model

## Scores for narrative-based rap

In [33]:
# Model order: 2 = bigrams
n = 2

# Per-line perplexities for `lyrics`; the cell output is their mean.
perplexity_scores, model = perplexity_by_line(lyrics, n)
np.mean(perplexity_scores)

261.1545566044926

In [35]:
# Model order: 2 = bigrams
n = 2

# Per-line perplexities for `lyrics_x`; the cell output is their mean.
perplexity_scores_x, model_x = perplexity_by_line(lyrics_x, n)
np.mean(perplexity_scores_x)

191.35127459625022