### 03_Tri-Gram_Model

Source: https://nlpforhackers.io/language-models/

In [None]:
import nltk
nltk.download('reuters')
nltk.download('punkt')

In [2]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
 
first_sentence = reuters.sents()[0]
print (first_sentence) # [u'ASIAN', u'EXPORTERS', u'FEAR', u'DAMAGE', u'FROM' ...
 
# Get the bigrams
print (list(bigrams(first_sentence))) # [(u'ASIAN', u'EXPORTERS'), (u'EXPORTERS', u'FEAR'), (u'FEAR', u'DAMAGE'), (u'DAMAGE', u'FROM'), ...
 
# Get the padded bigrams
print (list(bigrams(first_sentence, pad_left=True, pad_right=True))) # [(None, u'ASIAN'), (u'ASIAN', u'EXPORTERS'), (u'EXPORTERS', u'FEAR'), (u'FEAR', u'DAMAGE'), (u'DAMAGE', u'FROM'),
 
# Get the trigrams
print (list(trigrams(first_sentence))) # [(u'ASIAN', u'EXPORTERS', u'FEAR'), (u'EXPORTERS', u'FEAR', u'DAMAGE'), (u'FEAR', u'DAMAGE', u'FROM'), ...
 
# Get the padded trigrams
print (list(trigrams(first_sentence, pad_left=True, pad_right=True))) # [(None, None, u'ASIAN'), (None, u'ASIAN', u'EXPORTERS'), (u'ASIAN', u'EXPORTERS', u'FEAR'), (u'EXPORTERS', u'FEAR', u'DAMAGE'), (u'FEAR', u'DAMAGE', u'FROM') ...
 

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']
[('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), 

In [3]:
model = defaultdict(lambda: defaultdict(lambda: 0))
 
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
 
 
print (model["what", "the"]["economists"]) # "economists" follows "what the" 2 times
print (model["what", "the"]["nonexistingword"]) # 0 times
print (model[None, None]["The"]) # 8839 sentences start with "The"
 
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count
 
print (model["what", "the"]["economists"]) # 0.0434782608696
print (model["what", "the"]["nonexistingword"]) # 0.0
print (model[None, None]["The"]) # 0.161543241465

2
0
8839
0.043478260869565216
0.0
0.16154324146501936


In [12]:
import random
 
 
text = [None, None]
prob = 1.0  # <- Init probability
 
sentence_finished = False
 
while not sentence_finished:
    r = random.random()
    accumulator = .0
 
    for word in model[tuple(text[-2:])].keys():
        accumulator += model[tuple(text[-2:])][word]
 
        if accumulator >= r:
            prob *= model[tuple(text[-2:])][word]  # <- Update the probability with the conditional probability of the new word
            text.append(word)
            break
 
    if text[-2:] == [None, None]:
        sentence_finished = True
 
print ("Probability of text=", prob)  # <- Print the probability of the text
print (' '.join([t for t in text if t]))
 

Probability of text= 2.0078033839670153e-52
Ortner said that given a more balanced economic growth but plant density is reported to the plan , the bank earned 3 . 2 mln vs 70 . 8 mln Sales 40 . 5 mln vs 64 . 3 billion dlrs , or 47 cts Oper net profit 577 , 000 barrels per day .


### Incorporate Manowar

In [20]:
# Convert Manowar lyrics into format below (list of lists where each list contains words from a sentence/line)
reuters.sents()

[['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.'], ['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.'], ...]

In [21]:
# Load dependencies:
import pandas as pd
import numpy as np
import os
import re

# View warnings once (or ignore)
import warnings
warnings.filterwarnings(action='once')
#warnings.filterwarnings('ignore')

# Show multiple results per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# See wider columns (for text)
pd.set_option('max_colwidth', 500)

In [22]:
# Load lyric directory pre-downloaded with LyricsGenius package:
files = pd.DataFrame(os.listdir("LyricsGenius/Manowar/"), columns = ["json_file_name"])
# Exclude non json files:
files = files[files['json_file_name'].str.contains(".json")]
# Quick preview:
files.head()

Unnamed: 0,json_file_name
2,lyrics_manowar_achillesagonyandecstasyineightparts.json
3,lyrics_manowar_allmenplayon10.json
4,lyrics_manowar_allmenplayonten.json
5,lyrics_manowar_anamericantrilogy.json
6,lyrics_manowar_animals.json


In [24]:
# Create empty archive:
archive = pd.DataFrame([])
# Loop through json files in directory and append to archive:
for i in range(0, len(files)):
    # Load raw json file:
    raw_json = pd.read_json("LyricsGenius/Manowar/"+files['json_file_name'].iloc[i])
    # Load json dictionary as df:
    lyrics = pd.DataFrame(raw_json['songs'][0], index = [0])
    # Add artist:
    lyrics['artist'] = raw_json['artist'][0]
    # Drop undesired columns:
    lyrics = lyrics.drop(['image', 'year'], axis=1)
    # Append to archive:
    archive = archive.append(lyrics)
    # Reset index:
    archive = archive.reset_index()
    archive = archive.drop(['index'], axis=1)

In [26]:
# Determine indeces for non-english Father tracks (which standardised as "Father  - * version"):
father = archive[archive['title'].str.contains("Father -")].index.tolist()
# See Father index:
father
# Exclude Father index:
archive = archive.drop(father, axis=0)

[6, 13, 40, 65, 85, 92, 94, 98, 99, 100, 115, 116, 151, 158, 159]

In [31]:
# Find Nessun Dorma Index (non-English cover):
archive[archive['title'].str.contains("Nessun")]['title']
# Remove Nessun Dorma
archive = archive.drop(88, axis=0)

88    Nessun Dorma
Name: title, dtype: object

In [33]:
# Some tracks are instrumental with lyrics entered as: [Instrumental]
# (normally the LyricsGenius package picks this up but apaprently it's not fail-safe)
archive[archive['lyrics']==("[Instrumental]")]

Unnamed: 0,album,lyrics,title,artist
36,Fighting the World,[Instrumental],Drums Of Doom,Manowar
80,The Final Battle I (EP),[Instrumental],March of the Heroes into Valhalla,Manowar
87,Louder than Hell,[Instrumental],My Spirit Lives On,Manowar
96,Gods of War,[Instrumental],Overture To Odin,Manowar
97,Gods of War,[Instrumental],Overture To The Hymn Of The Immortal Warriors,Manowar
112,Kings of Metal,[Instrumental],Sting of the Bumblebee,Manowar
130,Kings of Metal MMXIV,[Instrumental],The Heart of Steel MMXIV (Orchestral Intro Version),Manowar
133,Warriors of the World,[Instrumental],The March,Manowar
139,Kings of Metal MMXIV,[Instrumental],The Sting of the Bumblebee MMXIV,Manowar
144,Sign of the Hammer,[Instrumental],Thunderpick,Manowar


In [34]:
# Remove instrumental tracks:
instrumentals = archive[archive['lyrics']==("[Instrumental]")].index.tolist()
archive = archive.drop(instrumentals, axis=0)
# Reset index:
archive = archive.reset_index()
archive = archive.drop(['index'], axis=1)

In [35]:
# remove these paranthesis and square brackets:
for i in range(0, len(archive)):
    archive['lyrics'][i] = re.sub("[\(\[].*?[\)\]]", "", archive['lyrics'][i])
# Quick preview
archive.head()

  archive['lyrics'][i] = re.sub("[\(\[].*?[\)\]]", "", archive['lyrics'][i])


Unnamed: 0,album,lyrics,title,artist
0,The Triumph Of Steel,"\n\nI HECTOR STORMS THE WALL\nSee my chariot, run to your ships\nI'll drive you back to the sea\nYou came here for gold, the wall will not hold\nThis day was promised to me\nThe Gods are my shield, my fate has been sealed\nLightning and javelins fly\nSoon many will fall, we are storming the wall\nStones fall like snow from the sky\n\nWe will pay with our glory in the fire of battle\nZeus, today is mine\nKilling all in my way like sheep and like cattle\nSmashing skulls of all who defy\nI spar...","Achilles, Agony And Ecstasy In Eight Parts",Manowar
1,Sign of the Hammer,"I made a rock'n'roll sin\nWhen I tried givin' in to\nMake money had to turn down low\nThey said, ""Why be proud, don't play so loud\nBe like us and get a sound that's real thin\nWear a polyester suit, act happy look cute\nGet a haircut and buy small gear""\nThat's when I turned to them and said\n""Hold it, right there!""\n\nWell it's more to me than just a job\nAnd while I'm playin' you won't get robbed\nNobody tells a man how to play\nIt just ain't that way hey, hey, hey\nCan you hear me say......",All Men Play On 10,Manowar
2,Sign of the Hammer,"I made a rock n' roll sin, when I tried giving in\nTo make money had to turn down low\nThey said, ""why be proud, don't play so loud, be like us\nAnd get A sound that's real thin\nWear A polyester suit, act happy, look cute, get A haircut\nAnd buy small gear""\nThat's when I turned to them and said, hold it, right there\nWell it's more to me than just A job, and while I'm playin'\nYou won't get robbed\nNobody tells A man how to play, it just ain't that way\nHey, hey, hey, can you hear me say\n...",All Men Play On Ten,Manowar
3,Warriors of the World,"Oh, I wish, I was in the land of cotton\nOld times they are not forgotten\nLook away, look away, look away Dixie land\n\nOh, I wish, I was in Dixie away, away\nIn Dixie land I'll take my stand\nTo live and die in Dixie\n\nDixie land where I was born\nEarly Lord, one frosty morning\nLook away, look away, look away Dixie land\n\nGlory, glory, Hallelujah\nGlory, glory, Hallelujah\nGlory, glory, Hallelujah\nThis truth is marching on\n\nSo hush little baby, don't you cry\nYou know your daddy's bo...",An American Trilogy,Manowar
4,Sign of the Hammer,"Hear the call of the wild in us all\nIt waits for the night to fall\nI'm getting hot, I'm ready for the night\nNo holdin' back, let's ball\nI'm gonna give all you can take all night\nAnd leave you in the morning feeling right\n\nI'm an animal, there's an animal in me\nGonna set it free\nI'm an animal, there's an animal in me\nGonna set it free\n\nI've been looking, you've been watching the side\nThere's somethings you just can't hide\nOh, your getting wet, your working up a sweat\nYour hairs...",Animals,Manowar


In [105]:
# Concatenate the lyrics
manowar = archive['lyrics'].str.cat(sep=' ').lower().replace("\'","")

In [106]:
# Split lyrics into sentences
sentences = manowar.split("\n")  

In [107]:
# Create list of words
for i in range(0,len(sentences)):
     sentences[i] = sentences[i].split()

In [108]:
# Exclude emtpy lists
to_exclude = filter(None, sentences)
sentences = list(to_exclude)

In [111]:
# Quick preview
sentences[:5]

[['i', 'hector', 'storms', 'the', 'wall'],
 ['see', 'my', 'chariot,', 'run', 'to', 'your', 'ships'],
 ['ill', 'drive', 'you', 'back', 'to', 'the', 'sea'],
 ['you', 'came', 'here', 'for', 'gold,', 'the', 'wall', 'will', 'not', 'hold'],
 ['this', 'day', 'was', 'promised', 'to', 'me']]

In [124]:
# Incorporate new corpus into model:
model = defaultdict(lambda: defaultdict(lambda: 0))
 
for sentence in sentences:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
 
 
print (model["hail", "and"]["kill"]) # "kill" follows "hail and" 45 times
print (model["hail", "and"]["nonexistingword"]) # 0 times
print (model[None, None]["i"]) # 295 sentences start with "I"
 
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count
 
print (model["hail", "and"]["kill"])
print (model["hail", "and"]["nonexistingword"])
print (model[None, None]["i"])

45
0
295
0.8035714285714286
0.0
0.05412844036697248


In [303]:
def manowar_sentence_generator():
    text = [None, None]
    prob = 1.0  # <- Init probability

    sentence_finished = False

    while not sentence_finished:
        r = random.random()
        accumulator = .0

        for word in model[tuple(text[-2:])].keys():
            accumulator += model[tuple(text[-2:])][word]

            if accumulator >= r:
                prob *= model[tuple(text[-2:])][word]  # <- Update the probability with the conditional probability of the new word
                text.append(word)
                break

        if text[-2:] == [None, None]:
            sentence_finished = True

    print ("Probability of text=", prob)  # <- Print the probability of the text
    print ('\033[1m' + ' '.join([t for t in text if t]) + '\033[0m')
    print ('')

In [308]:
# Create AI ManOwaR verse:
manowar_sentence_generator()
manowar_sentence_generator()
manowar_sentence_generator()
manowar_sentence_generator()

Probability of text= 0.0004892966360856268
[1mwoman be my guide[0m

Probability of text= 6.7093112735667565e-06
[1mthere is blood in the sky[0m

Probability of text= 0.0003949023379698135
[1minto the wind[0m

Probability of text= 7.485633632422623e-05
[1mi was born in a flash of lightning strike[0m



### Incorporate Norse Edda

In [323]:
# Import source files

# File 1 = Mythologic Edda
file1 = open("edda_mythologic_voluspa.txt", 'r')
mythologic = file1.read()

# File 2 = Heroic Edda
file2 = open("edda_heroic_volundarkvida.txt", 'r')
heroic = file2.read()

In [325]:
# Mythologic preview:
mythologic

'For silence I pray all sacred children, great and small, sons of Heimdall they will that I Valfather’s deeds recount,  men’s ancient saws, those that I best remember.\nThe Jötuns I remember early born, those who me of old have reared.\nI nine worlds remember, nine trees, the great central tree, beneath the earth.\nThere was in times of old, where Ymir dwelt,  nor sand nor sea, nor gelid waves; earth existed not, nor heaven above, ‘twas a chaotic chasm, and grass nowhere.\nBefore Bur’s sons raised up heaven’s vault, they who the noble mid-earth shaped.\nThe sun shone from the south over the structure’s rocks: then was the earth begrown with herbage green.\nThe sun from the south, the moon’s companion, her right hand cast about the heavenly horses.\nThe sun knew not where she a dwelling had, the moon knew not what power he possessed, the stars knew not where they had a station.\nThen went the powers all to their judgement-seats, the all-holy gods, and thereon held council: to night and 

In [330]:
# Heroic preview
heroic

'Maids flew from the south, through the murky wood, Alvit the young, fate to fulfil.\nOne of them, of maidens fairest, to his comely breast Egil clasped.\nSvanhvit was the second, she a swan’s plumage bore; but the third, their sister, the white neck clasped of Völund.\nThere they stayed seven winters through; but all the eighth were with longing seized; and in the ninth fate parted them.\nThe maidens yearned for the murky wood, the young Alvit, fate to fulfil.\nFrom the chase came the ardent hunters, Slagfid and Egil, found their house deserted,went out and in, and looked around.\nEgil went east after Ölrún, and Slagfid west after Svanhvit; But Völund alone remained in Ulfdal.\nHe the red gold set with the hard gem, well fastened all the rings on linden bast, and so awaited his bright consort, if to him she would return.\nIt was told to Nidud, the Niarars’ lord, that Völund alone remained in Ulfdal.\nIn the night went men, in studded corslets, their shields glistened in the waning moo

In [341]:
# Merge the 2 sources:
edda = mythologic + heroic
# Lower strings
edda = edda.lower()

In [342]:
# Split lyrics into sentences
edda_sentences = edda.split("\n")  
# Create list of words
for i in range(0,len(edda_sentences)):
     edda_sentences[i] = edda_sentences[i].split()

In [357]:
# Quick preview
edda_sentences[0]

['for',
 'silence',
 'i',
 'pray',
 'all',
 'sacred',
 'children,',
 'great',
 'and',
 'small,',
 'sons',
 'of',
 'heimdall',
 'they',
 'will',
 'that',
 'i',
 'valfather’s',
 'deeds',
 'recount,',
 'men’s',
 'ancient',
 'saws,',
 'those',
 'that',
 'i',
 'best',
 'remember.']

In [366]:
# Merge ManOwaR corpus & Edda
joint_sentences = sentences + edda_sentences

In [367]:
# Incorporate new corpus into model:
model = defaultdict(lambda: defaultdict(lambda: 0))
 
for sentence in joint_sentences:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
 
 
print (model["sons", "of"]["odin"]) # "odin" follows "sons of" 12 times
print (model["sons", "of"]["nonexistingword"]) # 0 times
print (model[None, None]["i"]) # 301 sentences start with "I"
 
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count
 
print (model["sons", "of"]["odin"])
print (model["sons", "of"]["nonexistingword"])
print (model[None, None]["i"])

12
0
301
0.3333333333333333
0.0
0.05356825057839473


In [387]:
# Generate lyrics
def manowar_sentence_generator():
    text = [None, None]
    prob = 1.0  # <- Init probability

    sentence_finished = False

    while not sentence_finished:
        r = random.random()
        accumulator = .0

        for word in model[tuple(text[-2:])].keys():
            accumulator += model[tuple(text[-2:])][word]

            if accumulator >= r:
                prob *= model[tuple(text[-2:])][word]  # <- Update the probability with the conditional probability of the new word
                text.append(word)
                break

        if text[-2:] == [None, None]:
            sentence_finished = True

    #print ("Probability of text=", prob)  # <- Print the probability of the text
    print (' '.join([t for t in text if t]))
    print ('')

In [388]:
# Test AI ManOwaR + Edda verse:
manowar_sentence_generator()
manowar_sentence_generator()
manowar_sentence_generator()
manowar_sentence_generator()

to a rainbow in the wind in the night he rides across land, sea and air. the bringer of the world tree deep into the house of death

youre watching on the draw

and in the sky

lusting for blood and our pride



In [389]:
# Generate sample of 45 verses
for i in range(0, 45):
    print("Track " + str(i+1) + ":")
    manowar_sentence_generator()
    manowar_sentence_generator()
    manowar_sentence_generator()
    manowar_sentence_generator()
    print("\n")

Track 1:
let valhallas gates

the world

the dwarfs i have nothing to sell but the third, their sister, the white man came to trade and borrow

armed with magic sons of thunder



Track 2:
who were brave and not afraid to die and live again

where i stand

you taught me wrong from right

in battle



Track 3:
hail, hail, hail and fire on satans throne

gonna keep on burning

im gambler, so bet on your command, by your side

so mister d.j., play just one for the grand final stand



Track 4:
metal makes us strong

to all our dreams

yes, i will crucify, slay them with their brothers

this was my gift of the final orgy



Track 5:
so hard and so wild and free

brothers everywhere

there, the road

in righteous glory



Track 6:
long before your birth

greatness waits for those who stand before me the trail of tears would end

and only i know the power to kill, the power of the slain

if you can take thee, or so skilful that he from thy steel



Track 7:
reborn in the streets the fight

d