In [145]:
import numpy as np
import pandas as pd
import matplotlib as matplot
import nltk
import sklearn
from random import random, randrange

In [146]:
# Code Tokens
START = "<"
STOP = ">"
SPACE = " "
PUNCTUATION = ","

# Question 1

In [147]:
en_df_raw = pd.read_csv('data/CONcreTEXT_trial_EN.tsv', sep='\t') # load data files
it_df_raw = pd.read_csv('data/CONcreTEXT_trial_IT.tsv', sep='\t')

In [148]:
en_df = pd.DataFrame()
it_df = pd.DataFrame()

In [149]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
en_df['SENTENCES'] = en_df_raw['TEXT'].apply(lambda sent: sent.strip().lower())
it_df['SENTENCES'] = it_df_raw['TEXT'].apply(lambda sent: sent.strip().lower())
en_df

Unnamed: 0,SENTENCES
0,"bring up academic achievements , awards , and ..."
1,"please list people you have helped , your pers..."
2,add activated carbon straight to your vodka .
3,"place sensors around your garden , and when a ..."
4,look for a partner that shares your level of a...
...,...
95,rinse your face with warm water and pat it dry .
96,staying mentally strong means winning half the...
97,the person who has the highest score wins the ...
98,"for the most part , men and women wear the sam..."


In [150]:
en_train = []
en_test = []

it_train = []
it_test = []

state = 4111
en_train, en_test = sklearn.model_selection.train_test_split(en_df['SENTENCES'], train_size=0.8, test_size=0.2, random_state=state)
it_train, it_test = sklearn.model_selection.train_test_split(it_df['SENTENCES'], train_size=0.8, test_size=0.2, random_state=state)
en_train[0] # show first element of training data

'bring up academic achievements , awards , and other milestones in your life .'

### Laplace (Add-1)

I tried using the `nltk.lm.Laplace` model but I couldn't find a way to construct the model using only bigrams, as when i filtered out the unigrams from the everygrams used from the pipeline, it caused a ValueError. So I decided to make my own implementation based on the slides.

In [151]:
padded_train = [START + sent + STOP for sent in en_train]
padded_train[0]

'<the fertilizer will inspire leafy growth rather than flower growth .>'

In [152]:
bigrams = [gram for sent in padded_train for gram in list(nltk.bigrams(sent))]
bigrams[:5]

[('<', 't'), ('t', 'h'), ('h', 'e'), ('e', ' '), (' ', 'f')]

In [153]:
vocab_tokens = nltk.lm.Vocabulary([char for sent in padded_train for char in sent])
len(vocab_tokens.counts)

37

In [154]:
conditionals = nltk.ConditionalFreqDist( (pre, suc) for (pre, suc) in bigrams )
conditionals.conditions()[:5]

['<', 't', 'h', 'e', ' ']

In [407]:
class Laplace:
    def __init__(self, frequency_distribution, vocabulary, seed=None):
        self.freqdist = frequency_distribution
        self.vocab = vocabulary
        self.seed = seed
        
    def smooth(self, pre, suc):
        """
            P_add-1(w_i | w_i-1) = c(w_i-1, w_i)+1 / c(w_i-1) + V
            V is the vocabulary size
            
        """
        numerator = self.freqdist[pre][suc] + 1
        V = len(self.vocab.counts)
        denominator = self.freqdist[pre].N() + V
        return numerator/denominator
    
    def letter(self):
        if self.seed == None:
            raise ValueError("nuh uh uh pls set seed first")
        distribution = self.freqdist[self.seed]
        x = random()
        for letter in self.freqdist[self.seed]:
            x = x - self.smooth(self.seed, letter)
            if x <= 0:
                self.seed = letter
                return letter
    
    def generate_sentence(self, max_len=100, start="<"):
        self.seed = start
        sentence = ""
        
        size = 0
        while ">" not in sentence and size <= max_len:
            letter = self.letter()
            if letter == None:
                letter = ""
            sentence += letter
            size += 1
        return sentence[:-1] #trim off the stop token

In [408]:
en_laplace = Laplace(conditionals, vocab_tokens) #
for _ in range(5):
    s = en_laplace.generate_sentence()
    print("Length: " + str(len(s)) + "\t" + s)

Length: 35	is ce pl llkigbe we t sis tasng t .
Length: 13	choifiluted .
Length: 7	cacan .
Length: 91	yokittt yentofleacangorigh t ans epredesstureffouthovesoufe parvizemurokeshens micke sta r 
Length: 93	e ark tafeme ierse angoroug ce thererecandotwil t barcato , ly erwhesiowtearithi tiveriuraver


**It works!** :O

In [409]:
# proof that it actually smooths over all vocabulary counts, not just ones that exist as a bigram pair
prob = 0
for i in en_laplace.vocab.counts: # all the tokens
    prob += en_laplace.smooth("<",i)
print(str(prob) + " and it's basically 1")

0.9999999999999997 and it's basically 1


### Linear Interpolation
(Equally weighted lambdas)

In [410]:
ngrams_generator, vocab_generator = nltk.lm.preprocessing.padded_everygram_pipeline(2, en_train)

# listify because i hate generators
ngrams = [list(generator) for generator in list(ngrams_generator)]
vocabulary = list(vocab_generator)

In [411]:
kneserney = nltk.lm.KneserNeyInterpolated(2)
kneserney.fit(ngrams, vocabulary)

**Lots easier than Add-1, since linear interpolation uses everygrams less than or equal to the max length**

# Question 2

In [463]:
# laplace sentence creation
possible_starts=['a','r','u','m','p','q','h']


en_laplace = Laplace(conditionals, vocab_tokens)
for _ in range(5):
    s = en_laplace.generate_sentence(100, possible_starts[randrange(len(possible_starts))])
    print("Length: " + str(len(s)) + "\t" + s)

Length: 64	 ierelf y izactisilf yo bes mismill wtint tyoes yocha s the fo .
Length: 47	isstste pareler res souterbed ord w lon dotak .
Length: 13	uplfoowicon .
Length: 90	e iprs h veway onten f when y edravor wou or s , tssony te tsadklve iminspan welinyoad ort
Length: 49	allstoth , plor pate ll oteny ipr bultthawike t .


In [468]:
# kneser ney letter and sentence generation
possible_starts=['a','r','u','m','p','q','h']

possible_starts[randrange(len(possible_starts))]

for _ in range(5):
    source = kneserney.generate(100, text_seed=possible_starts[randrange(len(possible_starts))])
    s = "".join([token for token in source if token != '</s>' and token != '<s>'])
    print("Length: " + str(len(s)) + "\t" + s)

Length: 98	 t wanourely foule a .you . wheg thadserith fing o co toopioul qufoude thaphourterine in cen cl rv
Length: 98	 tan ateal vizamomus ’r f maicer a .zalestspalll y bo at .0 y rs s h bl yown itidvilarsurs , me ti
Length: 96	in by plalad 'llou e .g y an hapas tsmoaponyoiresowaf te the t w r rnd .d breshat ., h .ifoingem
Length: 98	else y ctomperrs mavelitisplden whooil .wisid .qu ftty d an ant usoufoppastene somifsovo intave ch
Length: 99	ngizeth e ftply the watulouin , thimima dy ory tiresof wins 'le yof an .’l , akitithodindesharsocan


# Question 3

In [404]:
ngrams_generator, vocab_generator = nltk.lm.preprocessing.padded_everygram_pipeline(2, en_train)

# listify because i hate generators
ngrams = [list(generator) for generator in list(ngrams_generator)]
vocabulary = list(vocab_generator)

In [405]:
kneserney = nltk.lm.KneserNeyInterpolated(3)
kneserney.fit(ngrams, vocabulary)

In [470]:
possible_starts=['a','r','u','m','p','q','h']

possible_starts[randrange(len(possible_starts))]

for _ in range(5):
    source = kneserney.generate(100, text_seed=possible_starts[randrange(len(possible_starts))])
    s = "".join([token for token in source if token != '</s>' and token != '<s>'])
    print("Length: " + str(len(s)) + "\t" + s)

Length: 99	use meffor win he , rowithal at as .jorilene in s , cr al haddel inddrt ieavin mang bitlltimatonsil
Length: 99	er .g w g pre yomplyoter mavou crideaveris choges f fll bimoldeerier , wir arins tedircoveesoce ss 
Length: 99	are areadephe tegalilpy sthe toom , y ghennstopalllouteng cank igan fe .s f as or ouss iest wsinshe
Length: 98	explyecerored s o s shene , s ncals iof yom bll .plusestamicouthe re y the g uputhr .tenco wisp da
Length: 99	ofe t ct te a t greanutou qurear cansanemat s the pader al .it e acar chisshesaste ju y y ooutlyour


# Extra Credit