**Text generation using an n-gram markov model**  
ANLP 2020/2021 final project  
Friederike Schreiber, Peng Chen, Anton Rabe



In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk import FreqDist
from nltk import ngrams
import itertools
import re
import random
import dill
import pickle

In [None]:
# Path of the dill session file that the dump_session/load_session cells read and write.
filename = '../resources/n-gram_markov.pkl'

In [None]:
# Save the current interpreter session to `filename` so it can be restored later.
dill.dump_session(filename)

In [None]:
# Restore a previously saved session from `filename`.
# NOTE(review): loading a session unpickles arbitrary objects — only load files you created yourself.
dill.load_session(filename)

**some data preprocessing**  
Transforming our list of song texts (with song name and artist tags) into a list of lists of lines for each song.
Also removing punctuation and adding tokens for the beginning and ending of lines, verses and songs.

In [None]:
# Read the raw lyrics dump into a DataFrame; header=0 turns the first line of the file into the column name.
# NOTE(review): the '\n\n\n' separator presumably never matches inside a single line, so every
# text line ends up as one row of a single column — confirm against the input file.
data = pd.read_csv('../resources/output.txt', sep='\n\n\n', engine='python',encoding='utf8', header = 0)

In [None]:
# Select the text column by its hard-coded name.
# NOTE(review): presumably this name is the first line of output.txt (read_csv uses header=0) — confirm.
linelist= data["Artist: 40 Thevz f/ Malika"]

In [None]:
# Materialise the column values as a plain Python list of lines.
lines = list(linelist)

In [None]:
# Drop metadata lines (album/artist/"typed by" credits and '*' annotations).
# startswith() with a tuple is safe on empty strings, unlike indexing x[0].
lines = [x for x in lines if not x.startswith(('Album', 'Artist', '*', 'Typed'))]

In [None]:
def replace_verse(x):
    """Replace structural header lines by marker tokens.

    Args:
        x: One raw line of the lyrics file (a string, possibly empty).

    Returns:
        "NXTVRSE" if the line starts with 'Verse', 'Chorus', 'Intro' or '[',
        "NXTSNG" if it starts with 'Song', otherwise the line unchanged.
    """
    # startswith() accepts a tuple of prefixes and, unlike x[0], does not
    # raise IndexError when the line is empty.
    if x.startswith(('Verse', 'Chorus', 'Intro', '[')):
        return "NXTVRSE"
    if x.startswith('Song'):
        return "NXTSNG"
    return x

In [None]:
# Swap verse/song header lines for their marker tokens; other lines pass through unchanged.
lines = list(map(replace_verse, lines))

In [None]:
# Lower-case every line and strip everything except letters, digits, spaces,
# hyphens and apostrophes. The hyphen is escaped so it is a literal character:
# written unescaped between ' ' and "'" it forms a character range that keeps
# !"#$%& and removes '-' instead.
mini_lines = [re.sub(r"[^a-z0-9 '\-]", "", x.lower()) for x in lines]

MemoryError: 

In [None]:
#We chose either TreebankWordTokenizer or a simple split to tokenize our lines into lists of tokens
from nltk.tokenize import TreebankWordTokenizer

In [None]:
# Tokenize on whitespace. split() without an argument collapses runs of spaces
# and returns [] for an empty line, whereas split(" ") yields '' tokens and
# [''] for empty lines, which would slip past a later length-based filter.
word_lines = [x.split() for x in mini_lines]

In [None]:
# Keep only non-empty token lists, then terminate every ordinary line with an
# 'endline' token. Lines that start with a structural marker ('nxtsng' or
# 'nxtvrse') are left without one. The token lists are extended in place.
all_lines = [tokens for tokens in word_lines if tokens]
for tokens in all_lines:
    if tokens[0] not in ('nxtsng', 'nxtvrse'):
        tokens.append('endline')

In [None]:
# Flatten the per-line token lists into one long token stream.
long = list(itertools.chain.from_iterable(all_lines))

In [None]:
# Split the flat token stream into one token list per song. A song ends right
# after each 'nxtsng' marker, so the cut positions are (marker index + 1).
size = len(long)
idx_list = [idx + 1 for idx, val in enumerate(long) if val == 'nxtsng']

# Build the boundary list once: start at 0, cut after every marker, and add the
# end of the stream unless the last marker already sits there. This also works
# when there is no marker at all (idx_list empty), where idx_list[-1] would
# raise IndexError.
bounds = [0] + idx_list
if bounds[-1] != size:
    bounds.append(size)

listlist = [long[i:j] for i, j in zip(bounds, bounds[1:])]

In [None]:
#pickle the pre processed data 
# Use a context manager so the file is flushed and closed even if pickling fails.
with open("../resources/OHHLAdata_list.p", "wb") as out_file:
    pickle.dump(listlist, out_file)

**Plain n-gram markov model for generation**
For each n-gram, learn the frequency distribution of follow-up words from the corpus with the train function. Then generate text using the Markov chain by providing a start sequence and the required number of lines to the generate_lines function.

In [None]:
class ngram_markov_generator(object):
    """Markov-chain text generator over token n-grams.

    The model maps every context of ``order`` consecutive tokens to a
    frequency distribution over the tokens that followed it in the training
    texts, and generates new text by repeatedly sampling from those
    distributions.

    Attributes are plain instance fields:
      end       -- token that terminates generation when it is sampled
      endline   -- token that marks the end of a line
      meta_list -- tokens removed by nice_format()
      order     -- number of context tokens used to predict the next one
      freq_dict -- dict: context tuple -> FreqDist of follower tokens
    """

    def __init__(self, order, end='nxtsng', endline='endline', meta_list=None):
        """Create an untrained generator.

        Args:
            order: Number of preceding tokens that form the context.
            end: Token that stops generation.
            endline: Token that marks a line break.
            meta_list: Tokens to drop when formatting; defaults to
                ['nxtvrse', 'nxtsng'].
        """
        self.end = end
        self.endline = endline
        # A fresh list per instance: a list literal as default argument would
        # be shared (and mutable) across all instances.
        self.meta_list = ['nxtvrse', 'nxtsng'] if meta_list is None else meta_list
        self.order = order
        self.freq_dict = dict()

    def train(self, tknzd_txt_list):
        """Count, for every context in every text, which token follows it.

        Args:
            tknzd_txt_list: Iterable of token sequences (one per text).
        """
        for text in tknzd_txt_list:
            for gram in ngrams(text, self.order + 1):
                self.add_to_dict(gram)

    def add_to_dict(self, gram):
        """Record one (order+1)-gram: its last token follows the preceding ones.

        Args:
            gram: Tuple of order+1 tokens.
        """
        context, follower = gram[:-1], gram[-1]
        try:
            self.freq_dict[context][follower] += 1
        except KeyError:
            # First time this context is seen.
            self.freq_dict[context] = FreqDist([follower])

    def _context(self, tokens):
        """Return the lookup key (last ``order`` tokens) for *tokens*."""
        # tokens[-0:] would be the whole sequence, so order 0 needs the
        # explicit empty context.
        return tuple(tokens[-self.order:]) if self.order else ()

    def _sample_next(self, tokens, temp):
        """Sample a follower for the current context, or None if it is unseen."""
        followers = self.freq_dict.get(self._context(tokens))
        if not followers:
            return None
        exponent = 1 / temp
        words = list(followers)
        # Temperature scaling: counts are raised to 1/temp; random.choices
        # normalises the weights itself.
        weights = [followers[word] ** exponent for word in words]
        return random.choices(words, weights=weights)[0]

    def generate_text(self, start, max_len=20, temp=1):
        """Generate up to *max_len* tokens after *start*.

        Generation stops early when the ``end`` token is sampled or when the
        current context never occurred in training.

        Args:
            start: Sequence of start tokens; it is not modified.
            max_len: Maximum number of tokens to generate.
            temp: Sampling temperature (non-zero); counts are raised to 1/temp.

        Returns:
            A new list containing the start tokens followed by the generated ones.
        """
        # Copy so the caller's list is not extended as a side effect.
        res_sent = list(start)
        for _ in range(max_len):
            nextword = self._sample_next(res_sent, temp)
            if nextword is None:
                break
            res_sent.append(nextword)
            if nextword == self.end:
                break
        return res_sent

    def generate_lines(self, start, num_lines, max_len=200, temp=1):
        """Generate tokens after *start* until *num_lines* lines are complete.

        A line is complete when the ``endline`` token is sampled. Generation
        also stops after *max_len* tokens, when the ``end`` token is sampled,
        or when the current context never occurred in training.

        Args:
            start: Sequence of start tokens; it is not modified.
            num_lines: Number of ``endline`` tokens to generate.
            max_len: Maximum number of tokens to generate.
            temp: Sampling temperature (non-zero); counts are raised to 1/temp.

        Returns:
            A new list containing the start tokens followed by the generated ones.
        """
        res_sent = list(start)
        linecount = 0
        for _ in range(max_len):
            nextword = self._sample_next(res_sent, temp)
            if nextword is None:
                break
            res_sent.append(nextword)
            if nextword == self.endline:
                linecount += 1
            if linecount >= num_lines or nextword == self.end:
                break
        return res_sent

    def nice_format(self, output_list):
        """Turn a generated token list into printable text.

        Tokens in ``meta_list`` are dropped, ``endline`` tokens become newline
        characters, and everything is joined with single spaces.
        """
        no_meta = [x for x in output_list if x not in self.meta_list]
        with_linebreaks = ["\n" if x == self.endline else x for x in no_meta]
        return " ".join(with_linebreaks)
        
        
        

In [None]:
# Order 2: each token is predicted from the two preceding tokens, i.e. the model counts trigrams.
trigram_mc = ngram_markov_generator(2)

In [None]:
# Collect follower frequencies from every per-song token list.
trigram_mc.train(listlist)

In [None]:
# Generate at most 20 tokens after the given start sequence and print them as formatted text.
text= trigram_mc.generate_text(['nxtvrse','i','was','so'],max_len=20,temp=1)
print(trigram_mc.nice_format(text))

i was so young 
 cause fire then you're next 
 you better check your boy now 
 don't be wrong yeah your


In [None]:
# Generate four lines after the given start sequence and print them.
# Note: this rebinds `lines`, which held the preprocessed text lines earlier in the notebook.
lines= trigram_mc.generate_lines(['nxtvrse','i','was','so'],4)
print(trigram_mc.nice_format(lines))

i was so devastating i feel like 
 drum tap young cat wit 9 lives from the '80s this world '' 
 i promise 
 i do n't admit it 



**Generating 100 4-liners for evaluation and classification**

In [None]:
# Build 100 formatted 4-line samples, each seeded with the first two tokens
# of a randomly chosen training song.
mc_list = [
    trigram_mc.nice_format(
        trigram_mc.generate_lines(random.choice(listlist)[0:2], 4)
    )
    for _ in range(100)
]

In [None]:
# Use a context manager so the file is flushed and closed even if pickling fails.
with open("../resources/mc_list.p", "wb") as out_file:
    pickle.dump(mc_list, out_file)