###  Luis Ricardo Cruz García
#### Procesamiento de Lenguaje Natural

#### Tarea 4

In [1]:
import math
import nltk
import glob
import numpy as np
from decimal import Decimal
import random
from nltk.corpus import stopwords
from typing import Optional
from sklearn.preprocessing import normalize
from copy import deepcopy

In [2]:
def get_texts_from_file(path_corpus: str, path_label: str) -> tuple[list, list]:
    """Given the corpus and label paths, returns the list of docs and labels."""
    docs, labels = [], []

    with open(path_corpus, "r") as f_corpus:
        for doc in f_corpus:
            docs.append(doc)

    with open(path_label, "r") as f_labels:
        for label in f_labels:
            labels.append(label)

    return docs, labels

In [3]:
class Vocabulary:
    """Vocabulary class to store the ranking of the words using the 
    given corpus frequency distribution."""
    def __init__(self, 
                 corpus_freqdist: nltk.FreqDist, 
                 n_words: Optional[int] = None):
        vocabulary_freq_desc = self._sort_FreqDist(corpus_freqdist)
        
        if n_words is not None: # restrict the max num of words
            vocabulary_freq_desc = vocabulary_freq_desc[:n_words]

        self.vocabulary = [word for word, freq in vocabulary_freq_desc]
        
        # dictionary of the rank (frequency) of words in the vocabulary, word: freq_ranking
        self.word_to_index = {word:rank for rank, word in enumerate(self.vocabulary)}
    
    @staticmethod
    def _sort_FreqDist(fd: nltk.FreqDist) -> list:
        """Return the list of items (pairs of <word, freq>) sorted by frequency (desc)."""
        aux = list(fd.items())
        aux.sort(key=lambda x: x[1], reverse=True)
        return aux

    def __len__(self):
        return len(self.vocabulary)
    
    def __getitem__(self, key: str | int) -> int | str | None:
        """Depending of the type of the key, returns the word at 
        index key (if key is an integer) or the rank of the 
        word key (if key is a string).
        """
        if not isinstance(key, int) and not isinstance(key, np.int64) and not isinstance(key, str):
            raise ValueError(f"Key must be an integer or a string, key = {key}")
        
        if isinstance(key, int) or isinstance(key, np.int64):
            return self.vocabulary[key]

        if isinstance(key, str):
            return self.word_to_index[key]

    def __contains__(self, key: str) -> bool:
        return key in self.word_to_index

In [4]:
class Corpus_Ngram:
    """Corpus class to preprocess and store the frequecy and vocabulary for a given list of documents."""
    def __init__(self, 
                 docs: list[str], 
                 n: int = 1, 
                 start_end_token: bool = False, 
                 n_words: Optional[int] = None):
        self.n = n
        self.docs = deepcopy(docs)
        self.docs = [doc.lower() for doc in self.docs] # set to lowercase

        # add start(<s>) and end(</s>) special tokens to each doc
        if start_end_token:
            self.docs = ["<s>" + doc + "</s>" for doc in self.docs]

        tokenizer = nltk.TweetTokenizer()

        self.tokens = []
        for doc in self.docs:
            self.tokens += tokenizer.tokenize(doc)

        if n != 1:
            self.tokens = nltk.ngrams(self.tokens, n)

        self.freq = nltk.FreqDist(self.tokens)

        self.vocabulary = Vocabulary(self.freq, n_words=n_words)
        
    def __len__(self):
        return len(self.docs)
    
    def __getitem__(self, key: int) -> str:
        return self.docs[key]

In [5]:
class Corpus(Corpus_Ngram):
    """Unigram corpus."""
    def __init__(self, 
                 docs: list[str], 
                 start_end_token: bool = False, 
                 n_words: Optional[int] = None):
        super(Corpus, self).__init__(docs, n=1, start_end_token=start_end_token, n_words=n_words)

### 2 Modelo de Lenguaje y Evaluación

2.1. Preprocese todos los tuits de agresividad (positivos y negativos) según su intuición para construir un buen corpus para un modelo de lenguaje (e.g., solo palabras en minúscula, etc.).

In [6]:
tokenizer = nltk.TweetTokenizer()

In [7]:
# get training docs and labels
train_docs, train_labels = get_texts_from_file("../../Data/mex_train.txt", "../../Data/mex_train_labels.txt")
train_labels = list(map(int, train_labels))  # cast to integer

In [8]:
corpus_full_train = Corpus(train_docs, start_end_token=True)

Agregue tokens especiales de "< s >" y "</ s >" según usted considere (e.g., al inicio y final de cada tuit). Defina su vocabulario y enmascare con <unk> toda palabra que no esté en su vocabulario.

In [9]:
def mask_OOV_words(corpus: Corpus, 
                   n_terms: Optional[int]) -> list[str]:
        """Mask the words out of vocabulary (with an optional restriction on the 
        number of terms n_terms) with the "<unk>" token. 
        Returns the docs masked.
        """
        docs = deepcopy(corpus.docs)
        tokenizer = nltk.TweetTokenizer()

        if n_terms is None: n_terms = len(corpus.vocabulary)

        for i, doc in enumerate(docs):
            doc_masked_words = []
            for word in tokenizer.tokenize(doc):
                if word in corpus.vocabulary and corpus.vocabulary[word] < n_terms:
                    doc_masked_words.append(word)
                else:
                    doc_masked_words.append("<unk>")
            docs[i] = " ".join(doc_masked_words)

        return docs

In [10]:
# mask out of vocabulary words (restricted to the 5k most occurring words) of the train_docs
train_docs = mask_OOV_words(corpus_full_train, n_terms=5000)

In [11]:
# Partition of train_docs

# train   -> 80% of train_docs
# heldout -> 10% of train_docs
# test    -> 10% of train_docs

train   = train_docs[:4435]
heldout = train_docs[4435:4989]
test    = train_docs[4989:]

In [12]:
# reconstruct the train corpus from "train" (now train is smaller, 
# with start/end token and masked)
corpus_train = Corpus(train)

### 2.2. Entrene tres modelos de lenguaje sobre todos los tuits: $𝑃_{𝑢𝑛𝑖𝑔𝑟𝑎𝑚𝑎𝑠}(𝑤_{1:𝑛})$, $𝑃_{𝑏𝑖𝑔𝑟𝑎𝑚𝑎𝑠}(𝑤_{1:𝑛} )$, $𝑃_{𝑡𝑟𝑖𝑔𝑟𝑎𝑚𝑎𝑠}(𝑤_{1:𝑛})$. Para cada uno proporcione una interfaz (función) sencilla. Los modelos deben tener una estrategia común para lidiar con secuencias no vistas. Puede optar por un suavizamiento Laplace o un Good-Turing discounting.

In [13]:
# choose a tweet to apply the models
test_tweet = train[19]
test_tweet_tokenized = tokenizer.tokenize(test_tweet)
test_tweet

'<s> puta madre quiero <unk> por la hdp de la <unk> </s>'

Calculamos las estimaciones de una sucesión de palabras usando un modelo de $n$-grams con Laplace smoothing de la siguiente manera.

$$P_L(w_{1:k}) = \prod_i P_L(w_i | w_{i-n:i-1})  = \prod_i \frac{C(w_{i-n:i}) + 1}{C(w_{i-n:i-1}) + V}$$

en el caso de no se quiera el "suavizado", entonces la fórmula se reduce a 

$$P(w_{1:k}) = \prod_i P(w_i | w_{i-n:i-1})  = \prod_i \frac{C(w_{i-n:i})}{C(w_{i-n:i-1})}$$


In [14]:
class Ngram_Model:
    def __init__(self, corpus: Corpus, n: int):
        if corpus.n != 1:
            raise ValueError("The corpus must be a unigram corpus.")
        
        self.n = n
        self.corpus = corpus
        
        # create n-gram corpus for n and n-1
        if self.n == 1:
            self.corpus_n = self.corpus
            self.corpus_n_minus_1 = self.corpus
        elif self.n == 2:
            self.corpus_n = Corpus_Ngram(self.corpus.docs, self.n)
            self.corpus_n_minus_1 = self.corpus
        else:
            self.corpus_n = Corpus_Ngram(self.corpus.docs, self.n)
            self.corpus_n_minus_1 = Corpus_Ngram(self.corpus.docs, self.n - 1)

    def conditional_probability(self, 
                                word: str, 
                                prev_words: tuple[str], 
                                smoothing: bool = True) -> Decimal:
        """Computes the estimated conditional probability of a word given self.n - 1 previous words.

            P(w_i | w_{i-n:i-1}) = C(w_{i-n:i} / C(w_{i-n:i-1}) 

        if smoothing is enabled, then it will use Laplace smoothing.

            P_L(w_i | w_{i-n:i-1}) = (C(w_{i-n:i} + 1) / (C(w_{i-n:i-1}) + V) 

        where C(w_{i-n:i}) is the number occurrences of word + prev_words in the corpus and 
        C(w_{i-n:i-1}) is the number of occurrences of prev_words in the corpus.
        """	
        if not isinstance(prev_words, tuple):
            raise ValueError(f"The prev_words should be a tuple of strings.")

        if len(prev_words) != self.n - 1:
            raise ValueError(f"The tuple prev_words size should be self.n - 1. {len(prev_words) = }")

        if smoothing:
            if self.n == 1:
                P_L_w_given_prev = Decimal((self.corpus.freq[word] + 1) / (len(self.corpus.tokens) + len(self.corpus.vocabulary)))
            elif self.n == 2:
                P_L_w_given_prev = Decimal((self.corpus_n.freq[(*prev_words, word)] + 1) / (self.corpus_n_minus_1.freq[prev_words[0]] + len(self.corpus.vocabulary)))
            else:
                P_L_w_given_prev = Decimal((self.corpus_n.freq[(*prev_words, word)] + 1) / (self.corpus_n_minus_1.freq[prev_words] + len(self.corpus.vocabulary)))
        else:
            if self.n == 1:
                P_L_w_given_prev = Decimal(self.corpus.freq[word] / len(self.corpus.tokens))
            elif self.n == 2:
                P_L_w_given_prev = Decimal(self.corpus_n.freq[(*prev_words, word)] / self.corpus_n_minus_1.freq[prev_words[0]])
            else:
                P_L_w_given_prev = Decimal(self.corpus_n.freq[(*prev_words, word)] / self.corpus_n_minus_1.freq[prev_words])

        return P_L_w_given_prev

    def probability(self, 
                    seq_words: list[str], 
                    smoothing: bool = True) -> Decimal:
        """Computes the estimate probability of sequence of words (i.e. tokenized doc) 
        using an n-gram model.

            P(w_{1:k}) = \prod_i P(w_i | w_{i-n:i-1})

        If smoothing smoothing is enabled, then the probabilities will be computed with Laplace smoothing.
            
            P_L(w_{1:k}) = \prod_i P_L(w_i | w_{i-n:i-1})
        
        where k = len(seq_words) - self.n and P_L(w_i | w_{i-n:i-1}) is estimated 
        with n-grams (see conditional_probability function).
        """
        log_prob = Decimal(0.)
        for ngram in list(nltk.ngrams(seq_words, self.n)):
            P_L_w_given_prev = self.conditional_probability(ngram[-1], ngram[:-1], smoothing=smoothing)

            # log_prob += ln( P_L(w_i | w_{i-n:i-1}) )
            log_prob += P_L_w_given_prev.ln()

        return log_prob.exp()

    def perplexity(self, seq_words: list[str]):
        return (1 / self.probability(seq_words)) ** (1 / Decimal(len(seq_words)))

#### Modelo de unigramas

In [15]:
unigram_model_train = Ngram_Model(corpus_train, n=1)

In [16]:
print(f"P_1(test_tweet) = {unigram_model_train.probability(test_tweet_tokenized)}")

P_1(test_tweet) = 5.004098532124699262146308101E-23


#### Modelo de bigramas

In [17]:
bigram_model_train = Ngram_Model(corpus_train, n=2)

In [18]:
print(f"P_2(test_tweet) = {bigram_model_train.probability(test_tweet_tokenized)}")

P_2(test_tweet) = 1.679099977175348530204536306E-25


#### Modelo Trigramas

In [19]:
trigram_model_train = Ngram_Model(corpus_train, n=3)

In [20]:
print(f"P_3(test_tweet) = {trigram_model_train.probability(test_tweet_tokenized)}")

P_3(test_tweet) = 7.525659602589744277693384578E-31


Muestre un par de ejemplos de cómo funciona, al menos uno con una palabra
fuera del vocabulario.

In [21]:
corpus_train.freq["topología"]  # palabra OOV

0

In [22]:
# hagamos un tweet (una oración con los tokens de inicio y fin) cuya única palabra sea "topología"
tweet_oov = "<s> topología </s>"

# lo tokenizo para enviarlo a los métodos de cada modelo
tweet_oov_tokenized = tokenizer.tokenize(tweet_oov)

In [23]:
print(f"P_1(tweet_oov) = {unigram_model_train.probability(tweet_oov_tokenized)}")

P_1(tweet_oov) = 2.305041643895685496422031688E-8


In [24]:
print(f"P_2(tweet_oov) = {bigram_model_train.probability(tweet_oov_tokenized)}")

P_2(tweet_oov) = 2.195755753439991671758321113E-8


In [25]:
print(f"P_3(tweet_oov) = {trigram_model_train.probability(tweet_oov_tokenized)}")

P_3(tweet_oov) = 0.0002046663937781416267146922250


#### 2.3 Construya un modelo interpolado con valores $\lambda$ fijos:

For some $\lambda = \{ \lambda_0, \lambda_1, \lambda_2 \}$ such that $\lambda_0 + \lambda_1 + \lambda_2 = 1$, the interpolated model using unigrams, bigrams and trigrams is given by

$$\hat{P}(w_{1:k}) = \prod_i \left( \lambda_2 P_L(w_i | w_{i-2}w{i-1}) + \lambda_1 P_L(w_i | w{i-1}) + \lambda_0 P_L(w_i) \right)$$

In [26]:
class Interpolated_Model:
    def __init__(self, 
                 corpus: Corpus):
        self.corpus = corpus
        self.unigram_model = Ngram_Model(corpus, n=1)
        self.bigram_model  = Ngram_Model(corpus, n=2)
        self.trigram_model = Ngram_Model(corpus, n=3)
    
    def conditional_probability(self, 
                                word: str, 
                                prev_words: tuple[str], 
                                lambda_values: list[float]) -> Decimal:
        """Computes the estimated conditional probability of a word given 2 previous words.

            P(w_i | w_{i-2:i-1}) = \lambda_2 P_L(w_i | w_{i-2}w{i-1}) + \lambda_1 P_L(w_i | w{i-1}) + \lambda_0 P_L(w_i)
        """
        if not isinstance(prev_words, tuple):
            raise ValueError(f"The prev_words should be a tuple of strings.")

        if len(prev_words) != 2:
            raise ValueError(f"The tuple prev_words size should be 2. {len(prev_words) = }")

        P_L_w_given_prev = (
                (self.unigram_model.conditional_probability(word, ()) * Decimal(lambda_values[0]))
                + (self.bigram_model.conditional_probability(word, prev_words[:1]) * Decimal(lambda_values[1]))
                + (self.trigram_model.conditional_probability(word, prev_words[:2]) * Decimal(lambda_values[2]))
        )

        return P_L_w_given_prev

    def probability(self, 
                    seq_words: list[str], 
                    lambda_values: list[float]) -> Decimal:
        """Computes the estimate probability of sequence of words (i.e. tokenized doc) 
        using an interpolated trigram model.

            P(w_{1:k}) = \prod_i P(w_i | w_{i-2:i-1})
        """
        log_prob = Decimal(0.)
        for trigram in list(nltk.trigrams(seq_words)):
            P_L_w_given_prev = self.conditional_probability(trigram[-1], trigram[:-1], lambda_values)

            log_prob += P_L_w_given_prev.ln()

        return log_prob.exp()

    def perplexity(self, seq_words: list[str], lambda_values: list[float]) -> Decimal:
        return (1 / self.probability(seq_words, lambda_values)) ** (1 / Decimal(len(seq_words)))
    
    def generate(self, lambda_values: list[float], max_len: int = 50):
        """Generate text using the interpolated model and the lambda_values.
        """
        if len(lambda_values) != 3:
            raise ValueError('The model needs 3 lambda values.')

        generated_tokens = ["<s>"]
        prev_prev_word, prev_word = "<s>", "<s>" # initial previous words

        # generates tokens until the end of sentence token is generated or the length is 50 tokens
        while prev_word != "</s>" and len(generated_tokens) < max_len:
            # generates the probabilities of the word given prev_prev_word and prev_word
            model_weights = [
                float(self.conditional_probability(word, (prev_prev_word, prev_word), lambda_values)) for word in self.corpus.vocabulary
            ]

            # choose a word in the vocabulary (using the weights to bias the choice to the words with greater probability)
            new_word = random.choices(self.corpus.vocabulary, weights=model_weights)[0]

            if new_word == "<s>": continue

            generated_tokens.append(new_word)

            # updata previous words
            prev_prev_word = prev_word
            prev_word      = new_word

        return " ".join(generated_tokens)
    
    def Expectation_Maximization(self, seq_words: list[str], n_iterations: int = 5):
        """Expectation-maximization algorithm for interpolated language modeling. 
        Learns the lambda parameters for the interpolated language model.
        See Jacob Eisenstein's book "Natural Laguage Processing". """
        n_max = 3
        M = len(seq_words)
        
        trigrams_seq_words = list(nltk.trigrams(seq_words))

        lambda_z = np.array([1/3, 1/3, 1/3], dtype=float)
        q = np.zeros(shape=(M - 2, n_max), dtype=float)

        print(f'\tInitial lambdas = {lambda_z}')
        print("-" * 80)

        for it in range(n_iterations):
            for m, (w2, w1, w) in enumerate(trigrams_seq_words):
                q[m][0] = lambda_z[0] * float(self.unigram_model.conditional_probability(w, ()))
                q[m][1] = lambda_z[1] * float(self.bigram_model.conditional_probability(w, (w1,)))
                q[m][2] = lambda_z[2] * float(self.trigram_model.conditional_probability(w, (w2, w1)))

                q[m] =  q[m] * np.divide(1, np.sum(np.abs(q[m]))) # normalize (l1 norm) the m-th row

            lambda_z = (1 / M) * np.sum(q, axis=0)

            print(f'i = {it}: lambdas = {lambda_z}, ', end="")
            print(f"PP(heldout) = {float(self.perplexity(seq_words, lambda_z)):.2f}")

        return lambda_z

In [27]:
possible_lambdas = [
    [0.33, 0.33, 0.33], 
    [0.4, 0.4, 0.2], 
    [0.2, 0.4, 0.4], 
    [0.5, 0.4, 0.1], 
    [0.1, 0.4, 0.5]
]

In [28]:
heldout_tokenized = tokenizer.tokenize(" ".join(heldout))

Muestre cómo bajan o suben las perplejidades en el held-out, finalmente pruebe una vez en test.

In [29]:
interpolated_model_train = Interpolated_Model(corpus_train)

In [30]:
for lambda_vals in possible_lambdas:
    print(f"{lambda_vals}: PP(heldout) = {float(interpolated_model_train.perplexity(heldout_tokenized, lambda_vals)):.2f}")

[0.33, 0.33, 0.33]: PP(heldout) = 511.67
[0.4, 0.4, 0.2]: PP(heldout) = 472.52
[0.2, 0.4, 0.4]: PP(heldout) = 626.09
[0.5, 0.4, 0.1]: PP(heldout) = 434.30
[0.1, 0.4, 0.5]: PP(heldout) = 813.17


De las perplejidades calculadas para las $\lambda$ dadas, las que se desempeñaron mejor (pues la perplejidad obtenida en el Held-out fue la mínima) fueron $$\lambda = [0.5, 0.4, 0.1]$$ 

In [31]:
# al evaluar en el test usando los valores lambda = [0.5, 0.4, 0.1] obtenemos la siguiente perplejidad
test_tokenized = tokenizer.tokenize(" ".join(test))
print(f"PP(test) = {interpolated_model_train.perplexity(test_tokenized, [0.5, 0.4, 0.1])}")

PP(test) = 410.5147955904772932987539322


### 3. Generación de Texto

#### 3.1. Proponga una estrategia con base en **Expectation Maximization** para encontrar buenos valores de interpolación en $\hat{𝑃}$ usando todo el dataset de agresividad. Para ello experimente con el modelo en particiones de $80\%$, $10\%$ y $10\%$ para entrenar (train), ajustar parámetros (val) y probar (test) respectivamente. Muestre como bajan las perplejidades en $5$ iteraciones que usted elija (de todas las que sean necesarias de acuerdo a su EM) en validación, y pruebe una vez en test.

In [32]:
lambda_values_EM = interpolated_model_train.Expectation_Maximization(heldout_tokenized, n_iterations=5)

	Initial lambdas = [0.33333333 0.33333333 0.33333333]
--------------------------------------------------------------------------------
i = 0: lambdas = [0.51176631 0.33576714 0.15228981], PP(heldout) = 426.90
i = 1: lambdas = [0.59594392 0.32821656 0.07566278], PP(heldout) = 404.63
i = 2: lambdas = [0.63334677 0.32619773 0.04027876], PP(heldout) = 397.15
i = 3: lambdas = [0.64961421 0.32794002 0.02226902], PP(heldout) = 394.54
i = 4: lambdas = [0.65622659 0.33106038 0.01253629], PP(heldout) = 393.77


In [33]:
print(lambda_values_EM)

[0.65622659 0.33106038 0.01253629]


3.2. Haga una función "tuitear" con base en su modelo de lenguaje $\hat{𝑃}$ del último punto. El modelo deberá poder parar automáticamente cuando genere el símbolo de terminación de tuit al final (e.g., "<\/s>"), o $50$ palabras. Proponga algo para que en los últimos tokens sea más probable generar el token "<\/s>". Muestre al menos cinco ejemplos.

In [47]:
# tweet function
interpolated_model_train.generate(list(lambda_values_EM))

'<s> * vamos madre ptm educados contenido 😛 tus de 30 </s>'

3.3. Use la intuición que ha ganado en esta tarea y los datos de las mañaneras para entrenar un modelo de lenguaje ``` AMLO``` . Haga una función ``` dar_conferencia()```. Genere un discurso de $300$ palabras y detenga al modelo de forma abrupta.

In [27]:
conferences = []

for file in glob.glob("../../Data/estenograficas_limpias/*"):
	with open(file, "r") as f_corpus:
		for sentence in f_corpus:
			conferences.append(sentence)

In [28]:
corpus_conferences = Corpus(conferences, start_end_token=True)
conferences_masked = mask_OOV_words(corpus_conferences, n_terms=5000)
corpus_conferences = Corpus(conferences_masked) # corpus with masked conferences

In [29]:
AMLO = Interpolated_Model(corpus_conferences)

In [69]:
AMLO.generate([0.1, 0.3, 0.6], max_len=20)

'<s> asegurar abandonar social el papel 1º hoy corruptos acabó tomaron decía estarían culiacán movimientos fuera , ven fronteriza consultar'

3.4. Calcule el estimado de cada uno sus modelos de lenguaje (el de tuits y el de amlo) para las frases: "si no gano me voy a la chingada", "ya se va a acabar la corrupción".

In [36]:
sentence_1 = "si no gano me voy a la chingada"
sentence_2 = "ya se va a acabar la corrupción"

sentence_1_tokenized = tokenizer.tokenize(sentence_1)
sentence_2_tokenized = tokenizer.tokenize(sentence_2)

Con los modelos de los tweets agresivos

In [37]:
# evaluamos a sentence_1 = "si no gano me voy a la chingada"
print(f"P_unigram(sentence_1) = {unigram_model_train.probability(sentence_1_tokenized)}")
print(f"P_bigram(sentence_1)  = {bigram_model_train.probability(sentence_1_tokenized)}")
print(f"P_trigram(sentence_1) = {trigram_model_train.probability(sentence_1_tokenized)}")

P_unigram(sentence_1) = 1.422737445556451821090663411E-20
P_bigram(sentence_1)  = 8.717001740778747181481628152E-18
P_trigram(sentence_1) = 3.466901356069048238502135543E-20


In [38]:
# evaluamos a sentence_2 = "ya se va a acabar la corrupción"
print(f"P_unigram(sentence_2) = {unigram_model_train.probability(sentence_2_tokenized)}")
print(f"P_bigram(sentence_2)  = {bigram_model_train.probability(sentence_2_tokenized)}")
print(f"P_trigram(sentence_2) = {trigram_model_train.probability(sentence_2_tokenized)}")

P_unigram(sentence_2) = 1.460573663588147904551449248E-19
P_bigram(sentence_2)  = 7.270212752785908748956856244E-18
P_trigram(sentence_2) = 1.404911774707322600534679050E-17


Con los modelos de las conferencias

In [40]:
unigram_model_conferences = Ngram_Model(corpus_conferences, n=1)
bigram_model_conferences = Ngram_Model(corpus_conferences, n=2)
trigram_model_conferences = Ngram_Model(corpus_conferences, n=3)

In [41]:
# evaluamos a sentence_1 = "si no gano me voy a la chingada"
print(f"P_unigram(sentence_1) = {unigram_model_conferences.probability(sentence_1_tokenized)}")
print(f"P_bigram(sentence_1)  = {bigram_model_conferences.probability(sentence_1_tokenized)}")
print(f"P_trigram(sentence_1) = {trigram_model_conferences.probability(sentence_1_tokenized)}")

P_unigram(sentence_1) = 4.532466486548684280014766525E-28
P_bigram(sentence_1)  = 1.129882724757320311605002232E-18
P_trigram(sentence_1) = 6.558164955988236444108646296E-20


In [42]:
# evaluamos a sentence_2 = "ya se va a acabar la corrupción"
print(f"P_unigram(sentence_2) = {unigram_model_conferences.probability(sentence_2_tokenized)}")
print(f"P_bigram(sentence_2)  = {bigram_model_conferences.probability(sentence_2_tokenized)}")
print(f"P_trigram(sentence_2) = {trigram_model_conferences.probability(sentence_2_tokenized)}")

P_unigram(sentence_2) = 7.950891199318519215067285481E-18
P_bigram(sentence_2)  = 6.235694567159202024259164979E-10
P_trigram(sentence_2) = 4.095999863770872298837284021E-9
