# DMA Final Project - Rap Language Model

This notebook: load rap lyrics dataset, tokenize the lyrics, split into train, validation, test sets

**Tokenization Rules**
1. split on whitespace
2. keep track of newline tokens - want to be able to predict these
3. you'll -> [you, 'll],<br>i'm -> [i, 'm],<br>should've -> [should, 've]
4. couldn't -> [could, n't]



- replace all \\n tokens with $<BR>$ token which makes it easier to read/write data to tsv files
- lowercase all tokens
- train:val:test - 80:10:10 split

In [323]:
import os, json, re
from nltk import word_tokenize, regexp_tokenize
from collections import Counter
import numpy as np

### Load and Tokenize Data

In [538]:
def read_lyrics_json(file):
    r"""
    Load one artist's lyrics file and tokenize every song in it.

    Expected json structure:
        {'name': ARTIST, ..., 'songs': [{'lyrics': LYRICS}, {'lyrics': LYRICS}], ...}

    Lyrics are lowercased, every "\n" becomes a standalone <BR> token, and
    "n't" is split off its host word (couldn't -> could n't) before the regex
    tokenizer runs. Tokenizer alternatives, tried left to right:
        n't                         - keep n't as one token
        [\w]+[^\s?!,\.’:();"'-]*    - starts with word characters and runs until
                                      whitespace or one of ? ! , . ’ : ( ) ; " ' -
        <BR>                        - the line-break marker
        '[\w]+                      - 's, 'em, 've
        [^\w\s']{1}                 - any other single non-word, non-space character

    Args:
        file: path to the json file.

    Returns:
        A 2-tuple (artist, songs). `songs` is a list with one token list per
        song that has lyrics. Both early exits also return a 2-tuple
        ((None, []) when the file is missing or has no 'songs' key) so callers
        can always unpack the result.
    """
    if not os.path.exists(file):
        print("file {} does not exist".format(file))
        return None, []

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(file, encoding='utf-8') as json_file:
        data = json.load(json_file)

    if 'songs' not in data:
        print('no songs in data')
        return None, []

    # Raw string: same pattern as before, without invalid "\w"/"\s" escapes.
    token_pattern = r'''n't|[\w]+[^\s?!,\.’:();"'-]*|<BR>|'[\w]+|[^\w\s']{1}'''

    songs = []
    for song in data['songs']:
        lyrics = song.get('lyrics')
        if lyrics is None:
            continue

        lyrics = lyrics.lower()
        lyrics = lyrics.replace("\n", " <BR> ")
        lyrics = lyrics.replace("n't", " n't")  # couldn't -> could n't
        songs.append(regexp_tokenize(lyrics, pattern=token_pattern))

    return data.get('name'), songs

# Build the corpus: one (artist, tokens) pair per song, from every .json file
# found in the lyrics directory.
print("Reading data")
datadir = 'data/lyrics'
rap_corpus = []
for filename in os.listdir(datadir):
    if not filename.endswith(".json"):
        continue
    path = os.path.join(datadir, filename)
    artist, songs = read_lyrics_json(path)
    rap_corpus.extend((artist, s) for s in songs)
    print("\t{} ({})".format(path, len(songs)))

Reading data
	data/lyrics/Lyrics_2Pac.json (592)
	data/lyrics/Lyrics_50Cent.json (381)
	data/lyrics/Lyrics_BigSean.json (223)
	data/lyrics/Lyrics_BustaRhymes.json (320)
	data/lyrics/Lyrics_DMX.json (286)
	data/lyrics/Lyrics_Eminem.json (467)
	data/lyrics/Lyrics_IceCube.json (228)
	data/lyrics/Lyrics_J.Cole.json (261)
	data/lyrics/Lyrics_JAYZ.json (359)
	data/lyrics/Lyrics_KanyeWest.json (448)
	data/lyrics/Lyrics_KendrickLamar.json (307)
	data/lyrics/Lyrics_LilWayne.json (1121)
	data/lyrics/Lyrics_Ludacris.json (220)
	data/lyrics/Lyrics_MethodMan.json (186)
	data/lyrics/Lyrics_MFDOOM.json (186)
	data/lyrics/Lyrics_Nas.json (378)
	data/lyrics/Lyrics_Rakim.json (85)
	data/lyrics/Lyrics_SnoopDogg.json (672)
	data/lyrics/Lyrics_TheNotoriousB.I.G..json (130)
	data/lyrics/Lyrics_TravisScott.json (192)


In [539]:
# Corpus size, plus a frequency table over every token of every song.
print("loaded", len(rap_corpus), "songs")
rap_vocab = Counter(tok for _, song_tokens in rap_corpus for tok in song_tokens)
print("Vocab size = {}".format(len(rap_vocab)))

loaded 7042 songs
Vocab size = 55088


In [540]:
rap_vocab.most_common(10)

[('<BR>', 476707),
 (',', 275576),
 ('i', 176362),
 ('the', 144912),
 ('you', 105233),
 ('a', 82686),
 ('and', 77331),
 ('to', 72723),
 ('it', 69813),
 ('my', 58745)]

### Limit to 100 songs per artist

In [541]:
# Keep at most `max_songs_per_artist` randomly chosen songs per artist.
max_songs_per_artist = 100

# Shuffle indices instead of the corpus itself: the corpus is a list of
# (str, list-of-tokens) tuples with differing token counts, and converting
# such a ragged structure to an ndarray is rejected by recent NumPy releases.
# A dedicated seeded generator makes the selection reproducible, and
# `rap_corpus` is left untouched so the cell can be re-run safely.
shuffle_rng = np.random.RandomState(0)
shuffled_order = shuffle_rng.permutation(len(rap_corpus))

inventory = Counter()       # artist -> number of songs kept so far
rap_corpus_small = []
for song_idx in shuffled_order:
    artist, song = rap_corpus[song_idx]
    if inventory[artist] >= max_songs_per_artist:
        continue
    inventory[artist] += 1
    rap_corpus_small.append((artist, song))

In [542]:
inventory

Counter({'Busta Rhymes': 100,
         'MF DOOM': 100,
         'DMX': 100,
         'Lil Wayne': 100,
         'Snoop Dogg': 100,
         'Eminem': 100,
         'Kanye West': 100,
         'JAY-Z': 100,
         '50 Cent': 100,
         'Ludacris': 100,
         'Big Sean': 100,
         '2Pac': 100,
         'Travis Scott': 100,
         'Method Man': 100,
         'Kendrick Lamar': 100,
         'J. Cole': 100,
         'Nas': 100,
         'The Notorious B.I.G.': 100,
         'Rakim': 85,
         'Ice Cube': 100})

In [543]:
len(rap_corpus_small)

1985

In [544]:
rap_corpus = rap_corpus_small

### Preprocess

Convert all lyrics into samples of set size (seq_length parameter).

In [545]:
# Turn every song into (artist_id, context_window, next_token) samples using a
# sliding window of `seq_length` tokens with stride 1.
seq_length = 10
artist2id = {}
samples = []

# The same padding run is put on both sides of each song.
padding = ['<PAD>'] * (seq_length - 1)

for artist, song in rap_corpus:
    # Artists get consecutive integer ids in order of first appearance.
    artist_id = artist2id.setdefault(artist, len(artist2id))

    padded = padding + song + padding
    # One sample per original token position.
    # NOTE(review): the first window is (seq_length-1) pads plus the first
    # token, and its target is the second token, so a song's first token is
    # never a prediction target. The lookup of the target also needs
    # seq_length >= 2 to stay in range. Confirm both are intended.
    for start in range(len(song)):
        stop = start + seq_length
        samples.append((artist_id, padded[start:stop], padded[stop]))

print("{} samples".format(len(samples)))
    

1368732 samples


In [546]:
samples[500000]

(8,
 ['gotta', 'make', 'it', 'to', 'heaven', '<BR>', 'i', 'gotta', 'make', 'it'],
 'to')

In [547]:
artist2id

{'Busta Rhymes': 0,
 'MF DOOM': 1,
 'DMX': 2,
 'Lil Wayne': 3,
 'Snoop Dogg': 4,
 'Eminem': 5,
 'Kanye West': 6,
 'JAY-Z': 7,
 '50 Cent': 8,
 'Ludacris': 9,
 'Big Sean': 10,
 '2Pac': 11,
 'Travis Scott': 12,
 'Method Man': 13,
 'Kendrick Lamar': 14,
 'J. Cole': 15,
 'Nas': 16,
 'The Notorious B.I.G.': 17,
 'Rakim': 18,
 'Ice Cube': 19}

### Train/Val/Test Split

In [548]:
# Shuffle the samples and cut them 80/10/10 into train/val/test.
N = len(samples)

# Permute indices, not the samples: each sample is an (int, list, str) tuple,
# and turning a list of those into an ndarray is ragged (rejected by recent
# NumPy releases) and memory-hungry. A seeded generator keeps the split
# reproducible across kernel restarts.
split_rng = np.random.RandomState(0)
perm = [samples[j] for j in split_rng.permutation(N)]

ntrain = int(N*0.8) # 80% in train
nval = int(N*0.1) # 10% in val, remainder (about 10%) in test

train = perm[:ntrain]
val = perm[ntrain:ntrain+nval]
test = perm[ntrain+nval:]

# NOTE(review): samples are overlapping stride-1 windows and the shuffle is
# done per sample, so windows from the same song land in different splits.
# Consider splitting by song if held-out scores must be leak-free.
assert len(train) + len(val) + len(test) == N

print(len(train), 'train samples')
print(len(val), 'validation samples')
print(len(test), 'test samples')

1094985 train samples
136873 validation samples
136874 test samples


Save the split

In [549]:
# Output directory, named after the per-artist song cap and the window length.
_dir = 'data/rap_max{}_{}'.format(max_songs_per_artist, seq_length)
# makedirs with exist_ok replaces the check-then-mkdir pair and also creates
# missing parent directories.
os.makedirs(_dir, exist_ok=True)

def write_data(file, data):
    """
    Write samples to a tab-separated file, one sample per line.

    Each line is `artist<TAB>space-joined context tokens<TAB>target`.

    Args:
        file: destination path; overwritten if it exists.
        data: iterable of (artist, tokens, target) triples, where `tokens`
            is an iterable of strings.
    """
    # Lyrics contain non-ASCII tokens, so the encoding is pinned rather than
    # left to the platform default.
    with open(file, 'w', encoding='utf-8') as f:
        for a, tok, target in data:
            f.write("{}\t{}\t{}\n".format(a, ' '.join(tok), target))



# Persist the three splits next to each other in the output directory.
write_data(os.path.join(_dir, 'train.tsv'), train)
write_data(os.path.join(_dir, 'val.tsv'), val)
write_data(os.path.join(_dir, 'test.tsv'), test)

# Save the artist-name -> id mapping so the ids in the split files can be
# decoded later. The encoding is pinned so the file does not depend on the
# platform default.
with open(os.path.join(_dir, 'artist2id.tsv'), 'w', encoding='utf-8') as f:
    for artist_name, artist_id in artist2id.items():
        f.write("{}\t{}\n".format(artist_name, artist_id))

In [423]:
def read_data(file):
    """
    Read a split file written as `artist_id<TAB>tokens<TAB>target` lines.

    Args:
        file: path to the tab-separated split file.

    Returns:
        List of (artist_id, tokens, target) tuples. `artist_id` is converted
        to int and `target` has its line terminator removed, so the result
        can be used like the in-memory samples (for example as a key into an
        int-keyed id -> artist mapping).
    """
    data = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Drop the line terminator; otherwise it stays glued to the target.
            fields = line.rstrip('\n').split('\t')
            data.append((int(fields[0]), fields[1].split(), fields[2]))
    return data
# Reload the validation split from disk; this rebinds `val`, replacing the
# in-memory split with whatever read_data returns.
val = read_data(os.path.join(_dir, 'val.tsv'))

### Check Distribution of Artists in Train

In [560]:
id2artist = {v:k for k,v in artist2id.items()}


def artist_distribution(split_samples, id2artist):
    """
    Return the share of samples contributed by each artist.

    Args:
        split_samples: iterable of (artist_id, tokens, target) triples.
        id2artist: mapping from int artist id to artist name.

    Returns:
        Counter mapping artist name -> fraction of the samples (empty for an
        empty split).
    """
    # Count first and divide once: exact fractions instead of accumulating
    # 1/n a million times. int() lets ids loaded back from a file as strings
    # resolve as well.
    counts = Counter(id2artist[int(artist_id)] for artist_id, _, _ in split_samples)
    total = sum(counts.values())
    return Counter({name: count / total for name, count in counts.items()})


ntrain = len(train)
nval = len(val)
a_train = artist_distribution(train, id2artist)
a_val = artist_distribution(val, id2artist)

print(a_train.most_common())
print(a_val.most_common())

[('Eminem', 0.06601551619422613), ('The Notorious B.I.G.', 0.058414498828708764), ('DMX', 0.05649027155619343), ('Busta Rhymes', 0.055851906647086544), ('Ludacris', 0.055740489595711665), ('Kendrick Lamar', 0.054884770110971814), ('JAY-Z', 0.054541386411652515), ('2Pac', 0.053480184660032765), ('Lil Wayne', 0.05322721315814062), ('Snoop Dogg', 0.05181532167104584), ('Nas', 0.0514189692095975), ('J. Cole', 0.05118517605261415), ('50 Cent', 0.050861883952723105), ('Big Sean', 0.04895409526155817), ('Method Man', 0.04866733334244577), ('Ice Cube', 0.0482691543719585), ('Kanye West', 0.04555404868558542), ('Rakim', 0.03753110773207466), ('Travis Scott', 0.03692196696759069), ('MF DOOM', 0.020174705589577027)]
[('Eminem', 0.0673909390456893), ('The Notorious B.I.G.', 0.05803189818299463), ('Busta Rhymes', 0.05656338357456172), ('DMX', 0.05610310287639618), ('Ludacris', 0.05539441672207781), ('Kendrick Lamar', 0.05475879099603968), ('JAY-Z', 0.05415969548414168), ('Lil Wayne', 0.052764241303

In [564]:
a_train.most_common()

[('Eminem', 0.06601551619422613),
 ('The Notorious B.I.G.', 0.058414498828708764),
 ('DMX', 0.05649027155619343),
 ('Busta Rhymes', 0.055851906647086544),
 ('Ludacris', 0.055740489595711665),
 ('Kendrick Lamar', 0.054884770110971814),
 ('JAY-Z', 0.054541386411652515),
 ('2Pac', 0.053480184660032765),
 ('Lil Wayne', 0.05322721315814062),
 ('Snoop Dogg', 0.05181532167104584),
 ('Nas', 0.0514189692095975),
 ('J. Cole', 0.05118517605261415),
 ('50 Cent', 0.050861883952723105),
 ('Big Sean', 0.04895409526155817),
 ('Method Man', 0.04866733334244577),
 ('Ice Cube', 0.0482691543719585),
 ('Kanye West', 0.04555404868558542),
 ('Rakim', 0.03753110773207466),
 ('Travis Scott', 0.03692196696759069),
 ('MF DOOM', 0.020174705589577027)]

### Use pretrained Glove embeddings

***Selecting Vocabulary Size***

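The rap corpus contains 55,088 unique tokens (printed in the vocabulary cell above). Here I load pretrained GloVe embeddings and check how much coverage we get when using the first 50k, 100k, 200k, ... GloVe tokens. *Token coverage* is the share of unique corpus tokens found in the GloVe vocabulary; *corpus coverage* additionally weights each token by its frequency in our corpus.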

| vocab size | token coverage   | corpus coverage |
|------|------|------|
| 50k  | 45.36% | 95.34% | 
| 100k | 56.42% | 96.55% | 
| 200k | % | %|
| 300k | % | %|
| 400k | % | %|

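It looks like using all 400k tokens does not give us a huge advantage in terms of corpus coverage (87% vs 85%). Limiting the vocabulary size will make the model a little easier to train. I selected a vocab size of **100k**. (Note: the percentages in parentheses do not match the table above, and the cells below set `vocab_size = 50000` — TODO reconcile.)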

Examining the tokens for which we don't have coverage: most are unusual spellings or words in other languages. These will be replaced with the `<UNK>` token during model training.
    
**Unknown tokens**: {'irreputable',
 '77777777',
 'heini',
 'wallabees',
 'motherfuckerss',
 'jip',
 'gyrate',
 'lights…',
 'pigsties',
 'hassans',
 'muthaphukka',
 'jiggy',
 "'months",
 'allergenic',
 'девки',
 'nestlé',
 'westbank',
 'boggles',
 'tieing',...}
 
 **Most frequent unknown tokens**: ('nigga', 21525),
 ('niggas', 18183),
 ("'all", 6122),
 ('!)', 4568),
 ("',", 4442),
 ('bitches', 4082),
 ('hoes', 3003),
 ("'ma", 2211),
 ('pussy', 2117),
 ("'bout", 2092),
 ('..', 1893),
 ('motherfucker', 1848),
 ('?)', 1811),
 ('tryna', 1776),
 ('ooh', 1727),
 ("'mma", 1501),
 ('motherfuckers', 1373)


In [99]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Convert the raw GloVe text file to the word2vec text layout, whose first
# line is a "<count> <dim>" header. Guarded so a full notebook re-run does not
# redo the conversion when the output already exists.
glove_file="data/glove/glove.6B/glove.6B.100d.txt"
glove_in_w2v_format="data/glove/glove.6B/glove.6B.100d.w2v"
if not os.path.exists(glove_in_w2v_format):
    glove2word2vec(glove_file, glove_in_w2v_format)

In [318]:
# Load the first `vocab_size` vectors of the converted GloVe file with gensim.
# NOTE(review): `glove` is not referenced by the later cells visible here.
vocab_size, vocab_dim = 50000, 100
glove_in_w2v_format = "data/glove/glove.6B/glove.6B.100d.w2v"
glove = KeyedVectors.load_word2vec_format(
    glove_in_w2v_format, binary=False, limit=vocab_size
)

In [336]:
def load_embeddings(filename, max_vocab_size, emb_dim):
    """
    Load word vectors from a word2vec-format text file.

    The first line of the file is a header `<num_words> <dim>`; every other
    line is `<word> <v1> ... <vdim>`, separated by single spaces.

    Three special rows come first:
        0 = <PAD>  (all zeros)
        1 = <UNK>  (uniform random in [-1, 1))
        2 = <BR>   (uniform random in [-1, 1))

    Args:
        filename: path to the word2vec text file.
        max_vocab_size: maximum number of rows, special tokens included.
        emb_dim: expected vector dimension; must match the file header.

    Returns:
        (embeddings, vocab, size): a float array with one row per token, a
        dict mapping token -> row index, and the dimension from the header.

    Raises:
        ValueError: if `emb_dim` differs from the dimension in the header
            (the rows could not be stacked into one matrix anyway).
    """
    vocab = {"<PAD>": 0, "<UNK>": 1, "<BR>": 2}
    n_special = len(vocab)

    with open(filename, encoding='utf-8') as file:
        header = file.readline().split(" ")
        size = int(header[1])
        if size != emb_dim:
            raise ValueError(
                "emb_dim={} does not match the dimension {} declared in {}".format(
                    emb_dim, size, filename))

        embeddings = [
            np.zeros(size),                  # 0 = <PAD>
            np.random.uniform(-1, 1, size),  # 1 = <UNK>
            np.random.uniform(-1, 1, size),  # 2 = <BR>
        ]

        for idx, line in enumerate(file):
            if idx + n_special >= max_vocab_size:
                break

            cols = line.rstrip().split(" ")
            vocab[cols[0]] = idx + n_special
            # Parse to floats: leaving the fields as strings would turn the
            # whole stacked matrix into a string array.
            embeddings.append(np.asarray(cols[1:], dtype=float))

    return np.array(embeddings), vocab, size

In [370]:
# Embedding matrix and token -> row-index map for the first 50k GloVe rows
# (the three special tokens count towards the limit).
vocab_size, vocab_dim = 50000, 100
emb, tok2id, size = load_embeddings(
    'data/glove/glove.6B/glove.6B.100d.w2v', vocab_size, vocab_dim
)

In [371]:
def check_glove_coverage(vocab, tokens):
    """
    Report how much of a token-frequency table is covered by `vocab`.

    A token counts as covered when
        * it is in `vocab` or is the '<BR>' marker, or
        * it ends in 'n' and token + 'g' is in `vocab`
          ('growin' -> 'growing'), or
        * it ends in an apostrophe and replacing that apostrophe with 'g'
          gives a word in `vocab` ("growin'" -> 'growing').

    Prints the share of unique tokens covered and the frequency-weighted
    share of the corpus covered (0.00% for an empty table).

    Args:
        vocab: container of known tokens (only membership is tested).
        tokens: mapping token -> frequency, e.g. a Counter.

    Returns:
        (in_vocab, out_vocab, all_vocab, out_counter): the covered tokens,
        the uncovered tokens, all tokens, and a Counter holding the
        frequencies of the uncovered tokens.
    """
    def _is_covered(tok):
        # endswith() is safe on an empty token, unlike tok[-1].
        if tok in vocab or tok == '<BR>':
            return True
        if tok.endswith('n') and tok + 'g' in vocab:
            return True
        return tok.endswith('\'') and tok[:-1] + 'g' in vocab

    in_vocab = set()
    out_vocab = set()
    out_counter = Counter()
    in_count = 0
    all_count = 0

    for tok, freq in tokens.items():
        all_count += freq
        if _is_covered(tok):
            in_vocab.add(tok)
            in_count += freq
        else:
            out_vocab.add(tok)
            out_counter[tok] += freq

    all_vocab = in_vocab | out_vocab

    # Guard both ratios so an empty table reports 0% instead of raising.
    type_coverage = len(in_vocab)/len(all_vocab) if all_vocab else 0.0
    corpus_coverage = in_count/all_count if all_count else 0.0
    print("{:.2%} of unique tokens covered".format(type_coverage))
    print("{:.2%} of corpus covered".format(corpus_coverage))

    return in_vocab, out_vocab, all_vocab, out_counter

# Measure how much of the rap vocabulary (token -> frequency) is covered by
# the loaded embedding vocabulary; out_counter holds the uncovered tokens.
in_, out_, all_, out_counter = check_glove_coverage(tok2id, rap_vocab)
# print("{:.2%} of rap_vocab tokens covered".format(len(in_)/len(all_)))

45.37% of unique tokens covered
95.34% of corpus covered


In [516]:
# Domain-specific vocabulary: number of uncovered tokens seen more than 20 times.
sum(1 for freq in out_counter.values() if freq > 20)

988

In [506]:
out_counter.most_common()

[('nigga', 21525),
 ('niggas', 18183),
 ("'all", 6122),
 ('!)', 4568),
 ("',", 4442),
 ('bitches', 4082),
 ('hoes', 3003),
 ("'ma", 2211),
 ('pussy', 2117),
 ("'bout", 2092),
 ('..', 1893),
 ('motherfucker', 1848),
 ('?)', 1811),
 ('tryna', 1776),
 ('ooh', 1727),
 ("'mma", 1501),
 ('motherfuckers', 1373),
 ('uhh', 1323),
 ('motherfuckin', 1312),
 ('homie', 1305),
 ("'mon", 1133),
 ('),', 1108),
 ('fucked', 1067),
 ('?"', 1045),
 ('cuz', 1037),
 ('!"', 1006),
 ('weezy', 1003),
 ('yea', 999),
 ('homies', 953),
 ('holla', 876),
 ('thang', 750),
 ('outta', 715),
 ('bullshit', 714),
 ('ayy', 657),
 ('motherfucking', 616),
 ('biggie', 588),
 ('haters', 585),
 ('haha', 582),
 ("'?", 567),
 ('."', 565),
 ('woah', 542),
 ('ballin', 536),
 ('ayo', 506),
 ('shawty', 463),
 (',"', 461),
 ('busta', 454),
 ('jigga', 452),
 ('gat', 448),
 ('pimpin', 444),
 ('2pac', 442),
 ('thats', 433),
 ('.,', 432),
 ("'round", 429),
 ('dawg', 423),
 ('("', 390),
 ('doggy', 378),
 ('hov', 353),
 ("'fore", 352),
 ("

In [345]:
def get_tok_id(tok, vocab):
    """
    Return the id for a token, falling back to the <UNK> id.

        get_tok_id('the', tok2id) -> 3
        get_tok_id('<UNK>', tok2id) -> 1

    Lookup order:
        1. the token itself;
        2. token + 'g' when the token ends in 'n'  ('growin' -> 'growing');
        3. the token with a trailing apostrophe replaced by 'g'
           ("growin'" -> 'growing');
        4. the id stored under '<UNK>' (None if `vocab` has no such entry).

    Args:
        tok: token string (may be empty).
        vocab: dict mapping token -> id.
    """
    if tok in vocab:
        return vocab[tok]

    # endswith() is safe on an empty token, unlike tok[-1].
    if tok.endswith('n') and tok + 'g' in vocab:
        return vocab[tok + 'g']

    if tok.endswith('\'') and tok[:-1] + 'g' in vocab:
        return vocab[tok[:-1] + 'g']

    # Unknown tokens previously fell through and returned None implicitly.
    return vocab.get('<UNK>')

## Poetry Dataset

Using [Gutenberg,dammit](https://github.com/aparrish/gutenberg-dammit/) to extract metadata for gutenberg document id
1. run pip install https://github.com/aparrish/gutenberg-dammit/archive/master.zip


In [469]:
from collections import defaultdict
import gzip, json
import pandas as pd

In [470]:
poetry_pd = pd.read_csv('data/kaggle_poem_dataset.csv')
poetry_pd.head()

Unnamed: 0.1,Unnamed: 0,Author,Title,Poetry Foundation ID,Content
0,0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [488]:
# Ten authors with the most poems, printed in ascending order of poem count.
# The group sizes are computed once and reused for both the counts and names.
poems_per_author = poetry_pd.groupby(['Author']).size()
num_poems = poems_per_author.values
authors = poems_per_author.keys()
idx = np.argsort(num_poems)
# A plain loop: a list comprehension used only for printing also displays a
# useless list of None values as the cell output.
for i in idx[-10:]:
    print(authors[i], num_poems[i])

Percy sshe Shelley 43
Yusef Komunyakaa 43
John Ashbery 46
William Butler Yeats 47
Emily Dickinson 57
William Wordsworth 59
Rae Armantrout 62
Alfred, Lord Tennyson 78
Anonymous 82
William Shakespeare 85


[None, None, None, None, None, None, None, None, None, None]

In [499]:
poetry_pd[poetry_pd['Author']=='Emily Dickinson']

Unnamed: 0.1,Unnamed: 0,Author,Title,Poetry Foundation ID,Content
204,204,Emily Dickinson,"After great pain, a formal feeling comes – (372)",47651,"After great pain, a formal feeling comes –\nTh..."
347,347,Emily Dickinson,"All overgrown by cunning moss, (146)",52200,"All overgrown by cunning moss,\nAll interspers..."
1205,1205,Emily Dickinson,Banish Air from Air - (963),56454,Banish Air from Air -\nDivide Light if you dar...
1315,1315,Emily Dickinson,Because I could not stop for Death – (479)\n \...,47652,Because I could not stop for Death –\nHe kindl...
1353,1353,Emily Dickinson,Before I got my eye put out – (336),52135,Before I got my eye put out –\nI liked as well...
1488,1488,Emily Dickinson,"A Bird, came down the Walk - (359)",56593,"A Bird, came down the Walk -\nHe did not know ..."
1861,1861,Emily Dickinson,The Bustle in a House (1108),44084,The Bustle in a House\nThe Morning after Death...
2424,2424,Emily Dickinson,Come slowly – Eden! (205),52136,Come slowly – Eden!\nLips unused to Thee –\nBa...
2710,2710,Emily Dickinson,Crumbling is not an instant's Act (1010),56823,Crumbling is not an instant's Act\nA fundament...
4126,4126,Emily Dickinson,“Faith” is fine invention (202),48184,“Faith” is a fine invention\nFor Gentlemen who...


In [489]:
from gutenbergdammit.ziputils import loadmetadata
metadata = loadmetadata("data/gutenberg-dammit-files-v002.zip")


In [497]:
metadata[25]

{'Author': ['John Milton'],
 'Author Birth': [1608],
 'Author Death': [1674],
 'Author Given': ['John'],
 'Author Surname': ['Milton'],
 'Copyright Status': ['Not copyrighted in the United States.'],
 'Language': ['English'],
 'LoC Class': ['PR: Language and Literatures: English literature'],
 'Num': '26',
 'Subject': ['Religion', 'Poetry'],
 'Title': ['Paradise Lost'],
 'charset': 'us-ascii',
 'gd-num-padded': '00026',
 'gd-path': '000/00026.txt',
 'href': '/2/26/26.zip'}

In [447]:

# Parse the newline-delimited JSON file: one record per line.
poetry_corpus = []
# The context manager closes the gzip handle, which the bare
# `for line in gzip.open(...)` form left open.
with gzip.open("data/gutenberg-poetry-v001.ndjson.gz") as poetry_file:
    for line in poetry_file:
        poetry_corpus.append(json.loads(line.strip()))

In [455]:
# Group the poetry records by their 'gid' field, keeping the original order
# within each group.
poetry_dict = defaultdict(list)
for record in poetry_corpus:
    same_gid = poetry_dict[record['gid']]
    same_gid.append(record)

In [467]:
poetry_dict['109']

[{'s': 'All I could see from where I stood', 'gid': '109'},
 {'s': 'The room is full of you!--As I came in', 'gid': '109'},
 {'s': '"Curse thee, Life, I will live with thee no more!', 'gid': '109'},
 {'s': "God's World", 'gid': '109'},
 {'s': 'O world, I cannot hold thee close enough!', 'gid': '109'},
 {'s': 'I will be the gladdest thing', 'gid': '109'},
 {'s': 'Sorrow like a ceaseless rain', 'gid': '109'},
 {'s': "I'll keep a little tavern", 'gid': '109'},
 {'s': 'Love has gone and left me and the days are all alike;', 'gid': '109'},
 {'s': 'I knew her for a little ghost', 'gid': '109'},
 {'s': 'The first rose on my rose-tree', 'gid': '109'},
 {'s': 'Let the little birds sing;', 'gid': '109'},
 {'s': 'All the dog-wood blossoms are underneath the tree!', 'gid': '109'},
 {'s': 'Death, I say, my heart is bowed', 'gid': '109'},
 {'s': 'Love, if I weep it will not matter,', 'gid': '109'},
 {'s': 'I said,--for Love was laggard, O, Love was slow to come,--',
  'gid': '109'},
 {'s': 'She is n

In [463]:
# Unfinished cell: the counter is created but the loop never updates it.
gutenberg_authors = Counter()
for key in poetry_dict.keys():
    # NOTE(review): `gid` is computed and then never used. Presumably it was
    # meant as a zero-based index into `metadata` (metadata[25] has
    # 'Num': '26'), to look up the author. TODO confirm and finish the loop.
    gid = int(key)-1
    
    
    

dict_keys(['19', '20', '26', '58', '109', '136', '151', '163', '207', '213', '214', '228', '230', '232', '246', '257', '258', '259', '261', '262', '263', '264', '266', '301', '304', '309', '312', '313', '315', '317', '323', '328', '348', '353', '390', '391', '392', '397', '400', '409', '413', '424', '438', '441', '442', '454', '458', '487', '574', '579', '591', '592', '594', '595', '596', '602', '610', '615', '617', '618', '651', '658', '679', '680', '691', '692', '703', '715', '772', '785', '791', '795', '835', '841', '845', '937', '941', '962', '981', '982', '995', '1001', '1002', '1003', '1004', '1005', '1006', '1007', '1008', '1019', '1020', '1021', '1030', '1031', '1034', '1035', '1040', '1041', '1042', '1045', '1054', '1057', '1062', '1141', '1151', '1165', '1166', '1186', '1199', '1211', '1229', '1238', '1246', '1247', '1279', '1280', '1287', '1304', '1317', '1321', '1322', '1333', '1358', '1365', '1381', '1382', '1383', '1393', '1418', '1459', '1469', '1471', '1505', '1506', '1