# Práctica Modelo de Lenguaje Probabilista
## **Mateo Serrato Ascencio** 

In [89]:
# PRIMERO LEEMOS LOS DATOS 

def get_text_from_files(path_corpus, path_truth):
    tr_text = [] #aqui van los tuits
    tr_labels = [] #aqui van las etiquetas

    with open(path_corpus, 'r', encoding='utf-8') as f_corpus, open(path_truth, 'r', encoding='utf-8') as f_truth:

        for twitt in f_corpus:
            tr_text += [twitt]

        for label in f_truth:
            tr_labels += [label]

    return tr_text, tr_labels



In [90]:
tr_text, tr_labels = get_text_from_files('./mex20_train.txt','mex20_train_labels.txt' )

In [91]:
from nltk import FreqDist
class TrigramData:

    def __init__(self,vocab_max,tokenizer):
        self.vocab_max = vocab_max
        self.tokenizer = tokenizer
        self.final_vocab = set()
        self.sos = '<s>' # start of sentence
        self.eos = '</s>' # end of sentence
        self.unk = '<unk>' # unknown token
    
    def fit(self,raw_txt): #raw txt recibe los tweets
        freq_dist = FreqDist()
        tokenized_corpus = []

        for txt in raw_txt:
            tokens = self.tokenizer(txt.lower())
            tokenized_corpus.append(tokens) #recordar que es una lista de listas de tuit tokenizado
            for w in tokens: #Para cada palabra en cada tuit contar en el vocabulario 
                freq_dist[w] += 1

        self.final_vocab = {tok for tok,_ in freq_dist.most_common(self.vocab_max)}
        self.final_vocab.update([self.unk,self.sos,self.eos])

        transformed_corpus = []
        for tokens in tokenized_corpus: #Recordar que recorremos los tokens en cada tuit 
            transformed_corpus.append(self.transform(tokens)) #Recuerda que tokens es un tuit 

        return transformed_corpus
    
    def mask_oov(self,word):
        return self.unk if word not in self.final_vocab else word 

    def add_sos_eos(self, tokens):
        return [self.sos,self.sos] + tokens + [self.eos]

    def transform(self,tokens):
        transformed = [] #tokens transformados 
        for w in tokens:
            transformed.append(self.mask_oov(w)) #mask out of vocabulary word (OOV)
        transformed = self.add_sos_eos(transformed) #añade sos y eos 
        return transformed

In [92]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer().tokenize
trigram_data = TrigramData(vocab_max=10000,tokenizer=tokenizer)

transformed_corpus = trigram_data.fit(tr_text)
final_vocab = trigram_data.final_vocab
transformed_corpus[:2]  #muestra los dos primeros tuits tokenizados y transformados

[['<s>',
  '<s>',
  '@usuario',
  '@usuario',
  '@usuario',
  'q',
  'se',
  'puede',
  'esperar',
  'del',
  'maricon',
  'de',
  'closet',
  'de',
  'la',
  'yañez',
  'aun',
  'recuerdo',
  'esa',
  'ves',
  'q',
  'lo',
  'vi',
  'en',
  'zona',
  'rosa',
  'viendo',
  'quien',
  'lo',
  'levantada',
  '</s>'],
 ['<s>',
  '<s>',
  '@usuario',
  'la',
  'piel',
  'nueva',
  'siempre',
  'arde',
  'un',
  'poquito',
  'los',
  'primeros',
  'días',
  '...',
  'y',
  'más',
  'con',
  'este',
  'puto',
  'clima',
  '</s>']]

## BUILDING A TRIGRAM LANGUAGE MODEL

In [None]:
class TrigramLanguageModel: #Modelo Interpolado de Unigramas + Bigramas + Trigramas

    def __init__(self,lambda1 = .01,lambda2 = .40, lambda3 = .59):
        self.lambda1 = lambda1 #Ponderar al Modelo de Lenguaje de Unigramas
        self.lambda2 = lambda2 #Ponderar al Modelo de Lenguaje de Bigramas
        self.lambda3 = lambda3 #Ponderar al Modelo de Lenguaje de Trigramas 
    
        #Contadores

        self.unigram_counts = {} #palabras solitas
        self.bigram_counts = {}  #pares de palabras
        self.trigram_counts = {} #tripletas de palabras
        self.vocab = set()
        self.V = 0 #tamaño del vocabulario

    def train(self, transformed_corpus, final_vocab):
        self.vocab = final_vocab
        self.V = len(final_vocab)

        for tokens in transformed_corpus: #primero recorremos tuit por tuit 
            for i,w in enumerate(tokens): #luego para cada tuit recorremos palabra por palabra 
                #Unigramas
                self.unigram_counts[w] = self.unigram_counts.get(w,0) + 1
                
                #Bigramas
                if i >= 1:
                   w_prev = tokens[i-1]
                   self.bigram_counts[w_prev,w] = self.bigram_counts.get((w_prev,w),0) + 1

                #Trigramas
                if i >= 2:
                  w_prev2 = tokens[i-2]
                  self.trigram_counts[w_prev2,w_prev,w] = self.trigram_counts.get((w_prev2,w_prev,w),0) + 1
            self.total_tokens = sum(self.unigram_counts.values())

    def mask_oov(self,word):
        return "<unk>" if word not in self.vocab else word

    def unigram_prob(self,w):
        numerator = self.unigram_counts.get(self.mask_oov(w),0) + 1
        denominator = self.total_tokens + self.V
    

        return numerator / denominator
    
    def bigram_prob(self,w_prev,w): # P(w|w_prev)
        w_prev = self.mask_oov(w_prev)
        w = self.mask_oov(w)
        numerator = self.bigram_counts.get((w_prev,w),0) + 1
        denominator = self.unigram_counts.get((w_prev),0) + self.V
        
        return numerator / denominator
    
    def trigram_prob(self,w_prev2,w_prev,w): #P(w|w_prev2,w_prev)
        w_prev2 = self.mask_oov(w_prev2)
        w_prev = self.mask_oov(w_prev)
        w = self.mask_oov(w)
        numerator = self.trigram_counts.get((w_prev2,w_prev,w),0) + 1 #Cuantas veces aparecio la secuencia completa
        denominator = self.bigram_counts.get((w_prev2,w_prev),0) + self.V # Cuantas veces aparecio el contexto (la secuencia previa a ww)

        return numerator / denominator
    
    def checar_prob(self):
        print(sum(self.unigram_prob(w) for w in self.vocab))
        print(sum(self.bigram_prob('gato',w) for w in self.vocab))
        print(sum(self.trigram_prob('hola','como',w) for w in self.vocab))

    def top_next_words(self,w_prev2,w_prev,top_k = 5):
        candidates = []
        for cand in self.vocab:
            p_cand = self.probability_of_word(w_prev2,w_prev,cand)
            candidates.append((cand,p_cand))
        candidates = sorted(candidates,key=lambda x:x[1],reverse=True)
        return candidates[:top_k]

    def probability_of_word(self,w_prev2,w_prev,w):
        p3 = self.trigram_prob(w_prev2,w_prev,w)
        p2 = self.bigram_prob(w_prev,w)
        p1 = self.unigram_prob(w)

        lambda1 = self.lambda1 
        lambda2 = self.lambda2 
        lambda3 = self.lambda3 

        P_interpolada = p1*lambda1 + p2*lambda2 + p3* lambda3
        return P_interpolada
    
    def sequence_probability(self,sequence):
        import math 
        log_prob = 0
        for i in range(2,len(sequence)):
            w_prev2 = sequence[i-2]
            w_prev = sequence[i-1]
            w = sequence[i]

            p = self.probability_of_word(w_prev2,w_prev,w)
            log_prob += math.log(p)

        return math.exp(log_prob)


    def rank_permutations(self,tokens,top_k=5,bottom_k=5):
        from itertools import permutations
        perms = set(permutations(tokens))
        scores = []
        for p in perms:
            prob = self.sequence_probability(list(p))
            scores.append((" ".join(p),prob))
        scores.sort(key = lambda x: x[1],reverse=True)

        best = scores[:top_k]
        worst = scores[-bottom_k:]
        return best, worst
    
    def generate_from_tokens_as_chatgpt(self, first_word,second_word,max_length = 20,delay = 0.5):
        if first_word not in self.vocab:
            first_word = '<unk>'        
        if second_word not in self.vocab:
            second_word = '<unk>'   
        generated = [first_word,second_word]

        print(first_word,second_word,end=" ",flush=True) 

        for _ in range(max_length):
            w_prev2 = generated[-2]
            w_prev = generated[-1]

            dist = []
            total_p = 0.0
            for w in self.vocab:
                p = self.probability_of_word(w_prev2,w_prev,w)
                dist.append((w,p))
                total_p += p
            if total_p == 0.0:
                break

            import random 
            import time
            import sys

            r = random.random() * total_p
            cumulative = 0.0
            chosen = None
            for w,p in dist:
                cumulative += p
                if cumulative >= r:
                    chosen = w
                    break
            if chosen is None or chosen == '</s>':
                break

            generated.append(chosen)
            print(chosen,end= " ",flush = True)
            time.sleep(delay)

        print()
        return " ".join(generated)
                
    

# Probando el Modelo...

In [94]:
trigram_lm = TrigramLanguageModel()
trigram_lm.train(transformed_corpus,final_vocab)
trigram_lm.checar_prob()

0.9999999999999782
0.9999999999998422
1.0000000000000644


In [95]:
w_prev2,w_prev,w = 'vete',"a",'el'
p_w = trigram_lm.probability_of_word(w_prev2,w_prev,w)
print(f"{p_w * 100:.4f}%")

0.0285%


In [96]:
top_5 = (trigram_lm.top_next_words('a','la',top_k=5))
top_5

[('verga', 0.015603964747744226),
 ('madre', 0.005336759556709187),
 ('<unk>', 0.004533091183708973),
 ('chingada', 0.0024771255985542757),
 ('gente', 0.002447574091646817)]

In [97]:
# 5 ejemplos de  top_5
top_5 = (trigram_lm.top_next_words('hijo','de',top_k=5))
top_5

top_5 = (trigram_lm.top_next_words('chinga','tu',top_k=5))
top_5

top_5 = (trigram_lm.top_next_words('soy','la',top_k=5))
top_5

top_5 = (trigram_lm.top_next_words('sois','unos',top_k=5))
top_5

top_5 = (trigram_lm.top_next_words('me','vale',top_k=5))
top_5

[('verga', 0.005137069815879167),
 ('madre', 0.001970339559950493),
 ('<s>', 0.0009542736274069821),
 ('</s>', 0.0005262467033943309),
 ('que', 0.0004881400079337845)]

# Evaluación Cualitativa con los modelos de Lenguaje

In [101]:
test_tokens = ["sino","gano","me","voy","a","la","chingada",","]


top_5,bottom_5 =trigram_lm.rank_permutations(test_tokens,top_k=5,bottom_k=5)   

In [103]:
trigram_lm.sequence_probability(test_tokens)
top_5,bottom_5

([('gano sino , me voy a la chingada', 1.4556256451760978e-16),
  ('sino gano , me voy a la chingada', 1.304113035182973e-16),
  ('gano , me voy a la chingada sino', 3.7665331039448914e-17),
  ('sino , me voy a la chingada gano', 3.744980003570942e-17),
  ('gano me voy a la chingada , sino', 3.225307430667454e-17)],
 [(', a sino me chingada voy la gano', 5.631051079606779e-24),
  ('a , gano voy la sino me chingada', 5.532510787112802e-24),
  ('a , gano la sino voy me chingada', 5.529217166054525e-24),
  ('a , gano la sino me chingada voy', 5.5278547255835275e-24),
  ('a , gano me chingada voy la sino', 5.519840317620859e-24)])