# Generating lyrics using a word prediction dictionary
Using code adapted from [The Clever Programmer's Next Word Prediction](https://thecleverprogrammer.com/2021/01/19/next-word-prediction-with-python/), we'll use our scraped lyrics to generate a new song!

In [3]:
import numpy as np
import re
from collections import Counter
import random

First, let's start by reading our lyrics in and removing punctuation

In [28]:
# Read the raw lyrics: one list entry per line of the file, newlines kept.
# The context manager closes the file handle as soon as the lines are loaded
# instead of leaving it open until garbage collection.
with open('playlist.txt') as lyrics_file:
    data = lyrics_file.readlines()

In [30]:
# Show the first five raw lines (each still carries its trailing newline).
data[:5]

['hull\n',
 "i've travelled the land with a guitar in my hand\n",
 'and an eye ever open for some fun\n',
 "i've made some mistakes, had my share of the breaks\n",
 'seen the boys on the make and on the bum\n']

In [38]:
def clean_line(line):
    """Return `line` with punctuation deleted and outer whitespace stripped.

    The characters . , … ? " ! ; : ( ) - — are removed wherever they occur.
    Apostrophes are kept, so contractions such as "i've" survive intact.
    Deleted characters are not replaced by anything, so a dash surrounded by
    spaces leaves two consecutive spaces behind.
    """
    unwanted = '.,…?"!;:()-—'
    deletion_table = str.maketrans('', '', unwanted)
    return line.translate(deletion_table).strip()

In [39]:
# Replace every raw line with its punctuation-free, stripped version.
data = list(map(clean_line, data))

Next, we are going to create three dictionaries:
- `lexicon`: for each key, what is the occurrence of words that come immediately after?
- `backward_lexicon`: for each key, what is the occurrence of words that come immediately before?
- `lineend_lexicon`: for each key (created only from words at the end of each line), what is the occurrence of words that end the line immediately after?

In [154]:
# Word-pair count tables, filled while reading the corpus.
# lexicon[word][following]          -> times `following` came right after `word`
lexicon = {}
# backward_lexicon[word][preceding] -> times `preceding` came right before `word`
backward_lexicon = {}
# lineend_lexicon[end][next_end]    -> times a line ending in `next_end` directly
#                                      followed a line ending in `end`
lineend_lexicon = {}

def update_lexicon(current : str, next_word : str) -> None:
    """Record one occurrence of the word pair (`current`, `next_word`).

    Both directions are always updated together:
    `lexicon[current][next_word]` and `backward_lexicon[next_word][current]`
    are each incremented by one, creating the inner tables on first sight.

    Every call must update both tables. Returning early after creating a
    new entry would leave the pair out of the other table, so forward and
    backward counts would disagree.
    """
    # Forward direction: what follows `current`.
    followers = lexicon.setdefault(current, {})
    followers[next_word] = followers.get(next_word, 0) + 1

    # Backward direction: what precedes `next_word`.
    predecessors = backward_lexicon.setdefault(next_word, {})
    predecessors[current] = predecessors.get(current, 0) + 1
    
def update_lineend(current : str, next_word : str) -> None:
    """Add one to the count stored at `lineend_lexicon[current][next_word]`.

    The inner table for `current` is created on first use, and a
    `next_word` not seen before under `current` starts at a count of 1.
    """
    try:
        tally = lineend_lexicon[current]
    except KeyError:
        # First time `current` is seen: start its table with this pair.
        lineend_lexicon[current] = {next_word: 1}
        return

    tally[next_word] = tally[next_word] + 1 if next_word in tally else 1


# Fill the three count tables from the lyrics file, one line at a time.
with open('playlist.txt', 'r') as dataset:
    # Last word of the previous lyric line; "" means there is no previous
    # line to pair the current line's ending with.
    end = ""
    for line in dataset:
        # split() with no argument collapses runs of whitespace. clean_line
        # deletes dashes without replacing them, so "a - b" becomes "a  b";
        # split(' ') would turn that into an empty-string "word".
        words = clean_line(line).split()
        if not words:
            # Blank or punctuation-only line: nothing to count. Reset `end`
            # so the next line is not paired with the line before the gap.
            end = ""
            continue

        # Every adjacent pair of words inside the line.
        for word, following in zip(words, words[1:]):
            update_lexicon(word, following)

        # Sentinel tokens mark where lines begin and end.
        update_lexicon(words[-1], "{{ENDLINE}}")
        update_lexicon("{{STARTLINE}}", words[0])

        # Pair the previous line's last word with this line's last word.
        if end != "":
            update_lineend(end, words[-1])
        end = words[-1]
        

Once we've created our dictionaries, we can then convert them from counts (i.e. `sun` follows `the` fifteen times) to percentages (when we see `the`, `sun` follows 3% of the time)

In [154]:
# Turn every inner table from raw counts into relative frequencies, so the
# values stored under each key sum to 1.
for table in (lineend_lexicon, backward_lexicon, lexicon):
    for word, transition in table.items():
        # Sum once per word instead of once per entry of the inner table.
        total = sum(transition.values())
        # Re-assigning an existing key does not resize the dict, so it is
        # safe to do while iterating over items().
        table[word] = {key: value / total for key, value in transition.items()}

In [95]:
def flatten_deep(arr: list):
    """Lazily yield the non-list items of an arbitrarily nested list, in order.

    Only `list` instances are descended into; strings and every other type
    are yielded as-is. The input list and its sub-lists are never modified.
    An empty or falsy `arr` yields nothing.
    """
    if not arr:
        return

    # A stack of iterators replaces recursion: the top iterator is the list
    # currently being walked, and the ones below resume where they paused.
    pending = [iter(arr)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the outer iterator keeps its position.
                pending.append(iter(item))
                break
            yield item
        else:
            # Current list is exhausted: go back to its parent.
            pending.pop()

In [95]:
# vocabulary of the cleaned lyrics: every distinct whitespace-separated word, once
all_words = list({word for lyric in data for word in lyric.split()})

In [96]:
# Number of distinct words in the corpus.
len(all_words)

2449

In [65]:
# frequency of words starting a line
# Lines that are empty after cleaning have no first word; they are skipped
# instead of raising IndexError on split()[0].
start_words = [tokens[0] for tokens in (x.split() for x in data) if tokens]
start_words_dict = Counter(start_words)

In [75]:
# how often each line length occurs (key: words in a line, value: number of such lines)
line_length_dict = Counter(len(lyric.split()) for lyric in data)

In [1]:
# generate a lyrical line
def predict_line():
    """Generate one line by walking `lexicon` forward from a start word.

    The first word is drawn from `start_words_dict`, weighted by its counts.
    Up to 15 further words are then drawn, each from the `lexicon` entry of
    the word before it, so a line holds at most 16 words. The walk stops
    early when "{{ENDLINE}}" is drawn. The values in `lexicon` are passed
    to numpy as probabilities.

    If the current word has no `lexicon` entry, a random word from
    `all_words` that does have one is used to pick the next word; the
    replacement word itself is not added to the line.
    """
    openers = list(start_words_dict.keys())
    word = random.choices(openers, weights=start_words_dict.values(), k=1)[0]

    pieces = [word]
    for _ in range(15):
        # Dead end: jump to a random word that has known followers.
        while word not in lexicon:
            word = np.random.choice(all_words)
        followers = lexicon[word]
        word = np.random.choice(list(followers.keys()), p=list(followers.values()))
        if word == "{{ENDLINE}}":
            break
        pieces.append(word)
    return " ".join(pieces)

# generate a lyrical line given the word we want the line to end with
def predict_line_backward(end_word):
    """Generate one line that ends with `end_word`, built right to left.

    Starting from `end_word`, up to 15 preceding words are drawn, each from
    the `backward_lexicon` entry of the word that follows it. The walk stops
    early when "{{STARTLINE}}" is drawn. The collected words are then put
    back into reading order and joined with single spaces.

    If the current word has no `backward_lexicon` entry, a random word from
    `all_words` that does have one is used to pick the predecessor; the
    replacement word itself is not added to the line.
    """
    collected = [end_word]
    word = end_word
    for _ in range(15):
        # Dead end: jump to a random word that has known predecessors.
        while word not in backward_lexicon:
            word = np.random.choice(all_words)
        # Always read the backward table here. The forward table holds
        # followers rather than predecessors, and may not contain `word`.
        options = backward_lexicon[word]
        predicted = np.random.choice(list(options.keys()), p=list(options.values()))
        if predicted == "{{STARTLINE}}":
            break
        collected.append(predicted)
        word = predicted

    # Words were collected last-to-first; re-split so stray whitespace in
    # `end_word` is normalised, then restore reading order.
    ordered = " ".join(collected).split()
    ordered.reverse()
    return ' '.join(ordered)

# given a line, decide what word the line after that should end with
def select_line_end(line):
    """Choose the word that the line following `line` should end with.

    If the last word of `line` has an entry in `lineend_lexicon`, the
    result is drawn from that entry, using its values as probabilities.
    Otherwise a word is drawn uniformly from `all_words`. This includes
    the case where `line` is empty or whitespace-only and so has no last
    word. The result is returned as a plain `str`.
    """
    tokens = line.split()
    if tokens:
        # Use the learned line-end pairs when this line's last word has
        # ended a line in the corpus.
        options = lineend_lexicon.get(tokens[-1])
        if options:
            predicted = np.random.choice(list(options.keys()), p=list(options.values()))
            return str(predicted)

    # No usable last word: fall back to a uniformly random word.
    return str(np.random.choice(all_words))


In [2]:
# open the song with one forward-generated line
l1 = predict_line()
lines = [l1]

# write 20 more lines: pick an end word from the newest line,
# then build the next line backward from that word
for i in range(20):
    next_end = select_line_end(lines[-1])
    lines.append(predict_line_backward(next_end))

NameError: name 'random' is not defined

In [184]:
# Display the generated lines.
lines

["i'm alone",
 'in my lonesome',
 'put upon',
 "i'm back on",
 "finally through me don't believe in my spine i've got a time gone",
 'for the united federation of a line drop you near to have you had to free',
 "everybody's feeling",
 "i'm falling in the bee",
 'hear me',
 'dance',
 'but me',
 'there’s this land forever and met in the streets you baby',
 'for a little too hard to make sure that',
 'lying there soon',
 'how far',
 'drove up at the same things',
 'and gone by her social status',
 'and adobe slats',
 'boys i alone love you see my girls',
 'i only lied that old city walls and said i keep things',
 "the man she's too bad that he's such a feeling warm and there's a social status"]

In [None]:
#TODO: 
# include some notion of repeating lines
# add some randomness when selecting next line ends so we don't get 20 rhyming lines in a row 