In [145]:
import numpy as np
import pandas as pd
import matplotlib as matplot
import nltk
import sklearn
from random import random, randrange

In [146]:
# Code Tokens
START = "<"
STOP = ">"
SPACE = " "
PUNCTUATION = ","

# Question 1

In [147]:
en_df_raw = pd.read_csv('data/CONcreTEXT_trial_EN.tsv', sep='\t') # load data files
it_df_raw = pd.read_csv('data/CONcreTEXT_trial_IT.tsv', sep='\t')

In [148]:
en_df = pd.DataFrame()
it_df = pd.DataFrame()

In [149]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
en_df['SENTENCES'] = en_df_raw['TEXT'].apply(lambda sent: sent.strip().lower())
it_df['SENTENCES'] = it_df_raw['TEXT'].apply(lambda sent: sent.strip().lower())
en_df

Unnamed: 0,SENTENCES
0,"bring up academic achievements , awards , and ..."
1,"please list people you have helped , your pers..."
2,add activated carbon straight to your vodka .
3,"place sensors around your garden , and when a ..."
4,look for a partner that shares your level of a...
...,...
95,rinse your face with warm water and pat it dry .
96,staying mentally strong means winning half the...
97,the person who has the highest score wins the ...
98,"for the most part , men and women wear the sam..."


In [150]:
en_train = []
en_test = []

it_train = []
it_test = []

state = 4111
en_train, en_test = sklearn.model_selection.train_test_split(en_df['SENTENCES'], train_size=0.8, test_size=0.2, random_state=state)
it_train, it_test = sklearn.model_selection.train_test_split(it_df['SENTENCES'], train_size=0.8, test_size=0.2, random_state=state)
en_train[0] # show first element of training data

'bring up academic achievements , awards , and other milestones in your life .'

### Laplace (Add-1)

I tried using the `nltk.lm.Laplace` model but I couldn't find a way to construct the model using only bigrams, as when i filtered out the unigrams from the everygrams used from the pipeline, it caused a ValueError. So I decided to make my own implementation based on the slides.

In [151]:
padded_train = [START + sent + STOP for sent in en_train]
padded_train[0]

'<the fertilizer will inspire leafy growth rather than flower growth .>'

In [152]:
bigrams = [gram for sent in padded_train for gram in list(nltk.bigrams(sent))]
bigrams[:5]

[('<', 't'), ('t', 'h'), ('h', 'e'), ('e', ' '), (' ', 'f')]

In [153]:
vocab_tokens = nltk.lm.Vocabulary([char for sent in padded_train for char in sent])
len(vocab_tokens.counts)

37

In [154]:
conditionals = nltk.ConditionalFreqDist( (pre, suc) for (pre, suc) in bigrams )
conditionals.conditions()[:5]

['<', 't', 'h', 'e', ' ']

In [157]:
class Laplace:
    def __init__(self, frequency_distribution, vocabulary, seed=None):
        self.freqdist = frequency_distribution
        self.vocab = vocabulary
        self.seed = seed
        
    def smooth(self, pre, suc):
        """
            P_add-1(w_i | w_i-1) = c(w_i-1, w_i)+1 / c(w_i-1) + V
            V is the vocabulary size
            
        """
        numerator = self.freqdist[pre][suc] + 1
        V = len(self.vocab.counts)
        denominator = self.freqdist[pre].N() + V
        return numerator/denominator
    
    def letter(self):
        if self.seed == None:
            raise ValueError("nuh uh uh pls set seed first")
        distribution = self.freqdist[self.seed]
        x = random()
        for letter in self.freqdist[self.seed]:
            x = x - self.smooth(self.seed, letter)
            if x <= 0:
                self.seed = letter
                return letter
    
    def generate_sentence(self, max_len=100, start="<"):
        self.seed = start
        sentence = ""
        
        size = 0
        while ">" not in sentence and size <= max_len:
            letter = self.letter()
            if letter == None:
                letter = ""
            sentence += letter
            size += 1
        return sentence[:-1] #trim off the stop token

In [163]:
en_laplace = Laplace(conditionals, vocab_tokens) #
for _ in range(5):
    s = en_laplace.generate_sentence()
    print("Length: " + str(len(s)) + "\t" + s)

Length: 28	miourtodletmeriene ie alke .
Length: 34	des pons you wthas ath techeanes .
Length: 92	tecl malisteplling f othotttonsmatr chers a ayonghyscksit tok cusunyounspardeeoof ads a tudi
Length: 92	omathereveskin laveve , se asin sin pis atreve a toyon oficemperes e ckicrt lyin inort tay t
Length: 16	iola patescexp .


**It works!** :O thanks erfan for all the help with my questions

In [164]:
# proof that it actually smooths over all vocabulary counts, not just ones that exist as a bigram pair
prob = 0
for i in en_laplace.vocab.counts: # all the tokens
    prob += en_laplace.smooth("<",i)
print(str(prob) + " and it's basically 1")

0.9999999999999997 and it's basically 1


### Linear Interpolation
(Equally weighted lambdas)

In [166]:
ngrams_generator, vocab_generator = nltk.lm.preprocessing.padded_everygram_pipeline(2, en_train)

# listify because i hate generators
ngrams = [list(generator) for generator in list(ngrams_generator)]
vocabulary = list(vocab_generator)

In [168]:
kneserney = nltk.lm.KneserNeyInterpolated(2)
kneserney.fit(ngrams, vocabulary)

**Lots easier than Add-1, since linear interpolation uses everygrams less than or equal to the max length**

# Question 2

In [172]:
# laplace sentence creation
possible_starts=['a','r','u','m','p','q','h']


en_laplace = Laplace(conditionals, vocab_tokens)
for _ in range(5):
    s = en_laplace.generate_sentence(100, possible_starts[randrange(len(possible_starts))])
    print("Length: " + str(len(s)) + "\t" + s)

Length: 96	orthinyor wicacanghace cothe intern 't yommunes ant fattrous ddelyo cacunuf matur oourinistho ar
Length: 81	ulyor wher canss metcoun oth dd bowhr wedkenu brnt jotusn yo wh rspan itllthan f 
Length: 7	 teak .
Length: 69	d isse lis henghoang coreny wite ancontrevesneyoudexif l ofith grin .
Length: 22	thexiresott ar en or .


In [173]:
# kneser ney letter and sentence generation
possible_starts=['a','r','u','m','p','q','h']

possible_starts[randrange(len(possible_starts))]

for _ in range(5):
    source = kneserney.generate(100, text_seed=possible_starts[randrange(len(possible_starts))])
    s = "".join([token for token in source if token != '</s>' and token != '<s>'])
    print("Length: " + str(len(s)) + "\t" + s)

Length: 99	 chale veingameratowanysongin chits nn .hel tedomers s mit ber he bee pathinsoourouennod sece wil h
Length: 97	esusit fed y prs mind ly e , , matherg torutoowin caves hokes tir , smurt puly ...onghive foute s
Length: 100	 upak ntomenore g tisaspa trowad ome s pl donyofl whasubedy winesilene aveneryol jourowhateaircomepy
Length: 99	ees arissen pasou d ong conye ho 'splonndran thoint ar omews ther y silyier moutrtak y ste .emoug m
Length: 99	iddil pa an are proucell an caveathandsus p , an cakintlinghamean .tin cooro pamesot dr f baursthec


# Question 3

In [11]:
ngrams_generator, vocab_generator = nltk.lm.preprocessing.padded_everygram_pipeline(3, en_train)

# listify because i hate generators
ngrams = [list(generator) for generator in list(ngrams_generator)]
vocabulary = list(vocab_generator)

In [12]:
kneserney = nltk.lm.KneserNeyInterpolated(3)
kneserney.fit(ngrams, vocabulary)

In [75]:
possible_starts=['a','r','u','m','p','q','h']

possible_starts[randrange(len(possible_starts))]

for _ in range(5):
    source = kneserney.generate(100, text_seed=possible_starts[randrange(len(possible_starts))])
    s = "".join([token for token in source if token != '</s>' and token != '<s>'])
    print("Length: " + str(len(s)) + "\t" + s)

Length: 37	 th a othe miessimposs ing unt inve .
Length: 100	diour oven ponturserges spy ant parovence can phe ore our a must , ance foreall , ant by thent it ac
Length: 19	r the arepess whe .
Length: 21	t care an st sou 's .
Length: 100	rou 've evelace : re afeal for ing to ma ned ove ter of wist exis in artal , bods owitchin dooke a l


# Extra Credit

### Bigram Model

In [26]:
ngrams_generator, vocab_generator = nltk.lm.preprocessing.padded_everygram_pipeline(2, it_train)

# listify because i hate generators againnn
ngrams = [list(generator) for generator in list(ngrams_generator)]
vocabulary = list(vocab_generator)

In [27]:
it_bikneserney = nltk.lm.KneserNeyInterpolated(2)
it_bikneserney.fit(ngrams, vocabulary)

In [28]:
# kneser ney letter and sentence generation
possible_starts=['a','r','u','m','p','q','h']

possible_starts[randrange(len(possible_starts))]
print("EC ITALIAN BIGRAM SENTENCES:")
for _ in range(5):
    source = it_bikneserney.generate(100, text_seed=possible_starts[randrange(len(possible_starts))])
    s = "".join([token for token in source if token != '</s>' and token != '<s>'])
    print("Length: " + str(len(s)) + "\t" + s)

EC ITALIAN BIGRAM SENTENCES:
Length: 99	unnorcorta e i io .g pa due gi val e del' ca iu pl' uò diotile , azzi meme utolistusunariaden sssan
Length: 99	ma nacin eg , arl ffen tta ce è domen derelivi glle de , 125 ata dimantrmidi sttomia ate o .ka roio
Length: 99	tistataronsi erce ssomea tttreze stiba onttà ri cisagai desunnalersco udifi mobio .morbendolitti ch
Length: 96	mustei io ari.25 to stricimene coza din nn .' d’ di latolampaseli de le muttra tai le .pe sticor
Length: 99	 pil ia mativemeltrantinsi cobarinisorcinffforti comaltevica cana ca iegle pri trma l .hi pee uontà


In [22]:
ngrams_generator, vocab_generator = nltk.lm.preprocessing.padded_everygram_pipeline(3, it_train)

# listify because i hate generators
ngrams = [list(generator) for generator in list(ngrams_generator)]
vocabulary = list(vocab_generator)

In [23]:
it_trikneserney = nltk.lm.KneserNeyInterpolated(3)
it_trikneserney.fit(ngrams, vocabulary)

In [35]:
# kneser ney letter and sentence generation
possible_starts=['a','r','u','m','p','q','h']

possible_starts[randrange(len(possible_starts))]
print("EC ITALIAN TRIGRAM SENTENCES:")
for _ in range(5):
    source = it_trikneserney.generate(100, text_seed=possible_starts[randrange(len(possible_starts))])
    s = "".join([token for token in source if token != '</s>' and token != '<s>'])
    print("Length: " + str(len(s)) + "\t" + s)

EC ITALIAN TRIGRAM SENTENCES:
Length: 100	ertempettà e sta strovamperitorgendelle colpazi aveccola persiclupere ci rissu i , perchichima dellu
Length: 17	dee lorza della .
Length: 100	asimaspiù peririclesso sque orta ché suppa masamin sto , sti soppa vuo esatane lavo con unquiniattic
Length: 57	ri è comun unquellaveri hai percata più può e sità a lo .
Length: 35	ua pio la un tire di so la , l' e .


In [186]:
ngrams_generator, vocab_generator = nltk.lm.preprocessing.padded_everygram_pipeline(6, en_train)

# listify because i hate generators
ngrams = [list(generator) for generator in list(ngrams_generator)]
vocabulary = list(vocab_generator)

In [187]:
kn = nltk.lm.KneserNeyInterpolated(6)
kn.fit(ngrams, vocabulary)

In [193]:
# kneser ney letter and sentence generation
source = kn.generate(200)
s = "".join([token for token in source if token != '</s>' and token != '<s>'])
s

"n the hole in you 've imagined any other are relaxing ways to not insult or offend anyone ."