# Construccion del modelo de lenguaje
## IMPORTANTE: Para ejecutar los notebooks, añada en la carpeta data los archivos que puede encontrar en el siguiente drive:
https://uniandes-my.sharepoint.com/:f:/g/personal/j_arboleda_uniandes_edu_co/EgEasT6fqDxFmBCiYZYRw0MBG87E7s4hFZuHCzTJ3DAXow?e=ybBMgw
## Integrantes
* Juan Esteban Arboleda
* Luccas Rojas

### 1. NEWS Dataset
Lo primero que se llevara a cabo es la union de todos los documentos en un par de archivos: uno con los documentos de 20news y otro con los documentos de BAC.

In [1]:
import os
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import time

# Shared whitespace tokenizer instance used by the functions defined below.
tokenizer = WhitespaceTokenizer()


* A continuacion se recorren todos los documentos de ambas carpetas y se unen en un solo archivo por cada dataset.
* Se debe modificar la ruta de los archivos para que se ajuste. La ruta de la carpeta de 20news debe estar en PATH_20NEWS y la ruta de la carpeta de BAC debe estar en PATH_BAC.

In [2]:
# Folder that holds the raw 20news documents (adjust to the local layout).
PATH_20NEWS = '../data/20news-18828'
# Single text file that load_news() builds from that folder.
PATH_FINAL_20NEWS = "../data/final_20news.txt"

def load_news(documents_path: str,final_document_path:str):
    """
    Concatenate every document stored one level below ``documents_path``
    into a single text file.

    Sub-folders and their documents are visited in sorted order so the
    resulting file is the same on every run and platform. Entries that are
    not directories (at the first level) or not regular files (at the second
    level) are skipped instead of raising.

    Params:
    -------
        documents_path: path to the folder whose sub-folders hold the documents
        final_document_path: path to the final document; it is created or
            overwritten, and every document is followed by a newline

    NOTE(review): files are read and written with the platform default
    encoding, as before — confirm that it matches the corpus encoding.
    """
    # Mode "w" already truncates, so a separate truncate-then-append is not needed.
    with open(final_document_path, "w") as final_document:
        for folder in sorted(os.listdir(documents_path)):
            folder_path = os.path.join(documents_path, folder)
            if not os.path.isdir(folder_path):
                # Stray files next to the category folders (e.g. .DS_Store).
                continue
            for document_file in sorted(os.listdir(folder_path)):
                document_path = os.path.join(folder_path, document_file)
                if not os.path.isfile(document_path):
                    continue
                with open(document_path, "r") as document:
                    final_document.write(document.read())
                final_document.write("\n")

# load_news() only needs to run when PATH_FINAL_20NEWS does not exist yet;
# an already merged file is reused as-is.
if not os.path.isfile(PATH_FINAL_20NEWS):
    load_news(PATH_20NEWS,PATH_FINAL_20NEWS)

In [3]:
import re
# Folder that holds the raw BAC blog files (adjust to the local layout).
PATH_BAC= '../data/BAC/blogs/blogs'
# Single text file that load_bac() builds from that folder.
PATH_FINAL_BAC = "../data/final_bac.txt"

def load_bac(documents_path: str,final_document_path:str):
    """
    Extract the text of every ``<post>...</post>`` element found in the files
    of ``documents_path`` and write it all to a single text file.

    Inside one blog file the posts are joined with ". " and flattened to a
    single line (newlines and non-breaking spaces become plain spaces). Each
    blog is then terminated with ". " and a newline so that the last post of
    one file does not run into the first post of the next one.

    Params:
    -------
        documents_path: path to the folder with the blog files (read as latin_1)
        final_document_path: path to the final document; it is created or
            overwritten and written as latin_1
    """
    post_pattern = re.compile(r'<post>(.*?)</post>', re.DOTALL)
    with open(final_document_path, "w", encoding = 'latin_1') as final_document:
        # Sorted so the output does not depend on the directory listing order.
        for file_name in sorted(os.listdir(documents_path)):
            # Read from the folder that was passed in, not from a module global.
            file_path = os.path.join(documents_path, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, encoding="latin_1") as blog_file:
                posts = post_pattern.findall(blog_file.read())
            if not posts:
                continue
            blog_text = ". \n".join(posts)
            blog_text = blog_text.replace('\n', ' ').replace('\xa0', ' ')
            final_document.write(blog_text)
            final_document.write(". \n")

# load_bac() only needs to run when PATH_FINAL_BAC does not exist yet;
# an already merged file is reused as-is.
if not os.path.isfile(PATH_FINAL_BAC):
    load_bac(PATH_BAC,PATH_FINAL_BAC)

In [4]:
# The merged 20news file is written with the platform default encoding, so it
# is read back the same way; bytes that cannot be decoded are dropped.
with open(PATH_FINAL_20NEWS, "r", errors='ignore') as f:
    raw_news = f.read()
# load_bac() writes its output as latin_1, so it has to be decoded as latin_1.
# Reading it with another default encoding plus errors='ignore' would silently
# delete every non-ASCII character.
with open(PATH_FINAL_BAC, "r", encoding='latin_1') as f:
    raw_bac = f.read()

# Only 20news is split into sentences here; raw_bac is tokenized in section 2.
news_sentences = sent_tokenize(raw_news)
# bac_sentences = sent_tokenize(raw_bac)

* Posteriormente se leen los archivos y se normalizan con el formato adecuado para el modelo de lenguaje. De este modo se pone todo en minusculas, se reemplazan los numeros por NUM y se agregan los tokens `<s>` y `</s>` al inicio y al final de cada frase.

In [5]:
def normalize(sentence:str)->str:
    """
    Normalize a sentence: drop punctuation, lowercase it, replace numeric
    tokens with NUM and wrap the result in <s> ... </s>.

    Every character that is not an ASCII letter, a digit or whitespace is
    replaced by a space (parentheses included), and newlines become spaces.
    The remaining whitespace is kept as-is, so consecutive spaces can appear
    in the result.

    A token becomes NUM only when the whole token parses as a number and
    contains at least one digit. Replacement is done token by token, so a
    number never rewrites part of another word ("1 10" -> "NUM NUM") and
    words that float() happens to accept ("inf", "nan", "infinity") are
    left untouched.

    Params:
    -------
        sentence: sentence to normalize
    Returns:
    --------
        sentence: normalized sentence
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", " ", sentence).lower().replace("\n", " ")

    def _mask_number(match):
        """Return "NUM" for a numeric token, the token itself otherwise."""
        token = match.group()
        if not any(char.isdigit() for char in token):
            return token
        try:
            float(token)
        except ValueError:
            return token
        return "NUM"

    # \S+ matches exactly the whitespace-delimited tokens, and substituting
    # in place preserves the original spacing between them.
    cleaned = re.sub(r"\S+", _mask_number, cleaned)
    return f"<s> {cleaned} </s>"

* Luego se extrae el vocabulario de todas las palabras junto con su frecuencia para asi poder reemplazar los tokens que no se encuentren en el vocabulario por el token UNK

In [6]:
def extract_vocabulary(sentences:list)->dict:
    """
    Count how many times each whitespace-separated token occurs in a list
    of sentences.

    Params:
    -------
        sentences: list of sentences
    Returns:
    --------
        vocabulary: dictionary that maps every token to its number of
        occurrences across all the sentences
    """
    counts = {}
    for line in sentences:
        for token in tokenizer.tokenize(line):
            # Missing tokens start from 0, so the first sighting stores 1.
            counts[token] = counts.get(token, 0) + 1
    return counts

* Con el vocabulario y las frases se cambian todos los elementos que aparecen solo una vez en el corpus por el token `<UNK>`.

In [7]:
def replace_unknowns(sentences:list, vocabulary:dict)->list:
    """
    Replace the words that appear only once in the corpus by the <UNK> token.

    Replacement is done token by token, so every rare word of a sentence is
    masked (not only the last one) and a rare word never rewrites part of a
    longer word. The whitespace between tokens is preserved.

    Both arguments are modified in place: each entry of ``sentences`` is
    overwritten with its masked version, every replaced word is removed from
    ``vocabulary``, and ``vocabulary["<UNK>"]`` is reset to the number of
    replacements made. Words that are missing from ``vocabulary`` are treated
    as unknown as well.

    Params:
    -------
        sentences: list of sentences
        vocabulary: dictionary in which the keys are the words and the values are the number of times the word appears in the corpus
    Returns:
    --------
        sentences: the same list object, now containing the <UNK> token
    """
    vocabulary["<UNK>"] = 0

    def _mask_rare(match):
        """Return "<UNK>" for a word seen at most once, the word otherwise."""
        word = match.group()
        if word == "<UNK>" or vocabulary.get(word, 0) > 1:
            return word
        vocabulary["<UNK>"] += 1
        vocabulary.pop(word, None)
        return "<UNK>"

    for i, sentence in enumerate(sentences):
        # \S+ matches exactly the whitespace-delimited tokens.
        sentences[i] = re.sub(r"\S+", _mask_rare, sentence)
    return sentences

* En este punto se normalizan los 2 sets y se extrae su vocabulario para asi poder reemplazar por el token `<UNK>` los tokens que aparecen solo una vez en el vocabulario.

In [8]:
# Normalize every 20news sentence; the BAC sentences are processed in section 2.
normalized_news_sentences = [normalize(sentence) for sentence in news_sentences]
# normalized_bac_sentences = [normalize(sentence) for sentence in bac_sentences]
# Show the first normalized sentence as a quick sanity check.
print(normalized_news_sentences[0])

<s> from  mathew  mathew mantis co uk  subject  alt atheism faq  atheist resources  archive name  atheism resources alt atheism archive name  resources last modified  NUM december NUM version  NUM NUM                                atheist resources                        addresses of atheist organizations                                       usa  freedom from religion foundation  darwin fish bumper stickers and assorted other atheist paraphernalia are available from the freedom from religion foundation in the us  </s>


In [9]:
# Token counts over all the normalized 20news sentences.
news_vocabulary = extract_vocabulary(normalized_news_sentences)
# bac_vocabulary = extract_vocabulary(normalized_bac_sentences)

In [10]:
# Mask the words counted once with <UNK>. replace_unknowns() also edits the
# list and the vocabulary it receives, so normalized_news_sentences and
# news_vocabulary change as well.
news_sentences = replace_unknowns(normalized_news_sentences, news_vocabulary)
# bac_sentences = replace_unknowns(normalized_bac_sentences, bac_vocabulary)

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentences for testing.
# NOTE(review): no random_state is passed, so the split may differ between
# runs — confirm whether a fixed seed is wanted for reproducibility.
news_train, news_test = train_test_split(news_sentences, test_size=0.20)
# bac_train, bac_test = train_test_split(bac_sentences, test_size=0.20)

In [12]:
# Recount the vocabulary on the training split only; this is the vocabulary
# handed to create_bi_grams() and create_tri_grams() further down.
news_vocabulary = extract_vocabulary(news_train)
# bac_vocabulary = extract_vocabulary(bac_train)

* Luego guardamos estos datos en 4 archivos, 2 para train y 2 para test, uno para 20news y otro para BAC

In [13]:
# Output files for the train/test splits of each dataset.
PATH_NEWS_TRAIN = "../data/20N_l.rojasb_j.arboleda_training.txt"
PATH_NEWS_TEST = "../data/20N_l.rojasb_j.arboleda_test.txt"  
PATH_BAC_TRAIN = "../data/BAC_l.rojasb_j.arboleda_training.txt"
PATH_BAC_TEST = "../data/BAC_l.rojasb_j.arboleda_test.txt" 

def save_file(sentences:list, path:str):
    """
    Write a list of sentences to a text file, one sentence per line.

    The file is created or overwritten.

    Params:
    -------
        sentences: list of sentences
        path: path to the file
    """
    with open(path, "w") as output:
        # Every sentence gets its own trailing newline, including the last one.
        output.writelines(line + "\n" for line in sentences)
            
# Persist the 20news splits; the BAC splits are saved in section 2.
save_file(news_train, PATH_NEWS_TRAIN)
save_file(news_test, PATH_NEWS_TEST)
# save_file(bac_train, PATH_BAC_TRAIN)
# save_file(bac_test, PATH_BAC_TEST)

* Luego pasamos a crear los n-gramas. Para esto construimos un diccionario por cada orden de n-grama; en este caso se construye el diccionario de los unigramas.

In [14]:
# Output files for the unigram tables of each dataset.
PATH_NEWS_UNIGRAM="../data/20N_l.rojasb_j.arboleda_unigrams.txt"
PATH_BAC_UNIGRAM="../data/BAC_l.rojasb_j.arboleda_unigrams.txt"

def create_uni_grams(path: str, save_path: str)->dict:
    """
    Count the unigrams of a sentence file and save them together with their
    relative frequency.

    Params:
    -------
        path: text file with one sentence per line
        save_path: output file; it receives one "word,count,probability"
            line per distinct word, where probability = count / total tokens
    Returns:
    --------
        mono_grams: dictionary that maps every word to its count
    """
    mono_grams = {}
    total_words = 0
    # Context managers make sure both files are closed (and the output is
    # flushed) even if an error occurs midway.
    with open(path, "r") as sentences_file:
        for sentence in sentences_file:
            # str.split() with no argument splits on runs of whitespace.
            for word in sentence.split():
                total_words += 1
                mono_grams[word] = mono_grams.get(word, 0) + 1

    with open(save_path, "w") as unigram_file:
        for word, count in mono_grams.items():
            unigram_file.write(f"{word},{count},{count / total_words}\n")

    return mono_grams

# Build and save the 20news unigram table from the training split.
create_uni_grams(PATH_NEWS_TRAIN, PATH_NEWS_UNIGRAM)
# create_uni_grams(PATH_BAC_TRAIN, PATH_BAC_UNIGRAM)

* Los diccionarios de unigramas quedan guardados en archivos para poder cargarlos posteriormente.

* Aca creamos los diccionarios para los bigramas y los guardamos en archivos

In [15]:
# Output files for the bigram tables of each dataset.
PATH_NEWS_BIGRAM="../data/20N_l.rojasb_j.arboleda_bigrams.txt"
PATH_BAC_BIGRAM="../data/BAC_l.rojasb_j.arboleda_bigrams.txt"

def create_bi_grams(sentence_path:str, vocabulary:dict, save_path: str)->dict:
    """
    Count the bigrams of a sentence file and save them with their add-one
    smoothed probability.

    Bigrams never span two lines: each line of the file is treated as one
    sentence.

    Params:
    -------
        sentence_path: text file with one sentence per line
        vocabulary: dictionary that maps every word to its count; it must
            contain the first word of every bigram found in the file
        save_path: output file; it receives one "w1 w2,count,probability"
            line per distinct bigram, where
            probability = (count + 1) / (vocabulary[w1] + len(vocabulary))
    Returns:
    --------
        bigram_count: dictionary that maps "w1 w2" to its count
    Raises:
    -------
        KeyError: if the first word of a bigram is missing from vocabulary
    """
    bigram_count = {}
    vocab_size = len(vocabulary)

    # Context managers make sure both files are closed (and the output is
    # flushed) even if an error occurs midway.
    with open(sentence_path, "r") as sentences_file:
        for sentence in sentences_file:
            # str.split() with no argument splits on runs of whitespace.
            words = sentence.split()
            for first, second in zip(words, words[1:]):
                key = f"{first} {second}"
                bigram_count[key] = bigram_count.get(key, 0) + 1

    with open(save_path, "w") as bigram_file:
        for bigram, count in bigram_count.items():
            first = bigram.split(" ")[0]
            prob = (count + 1) / (vocabulary[first] + vocab_size)
            bigram_file.write(f"{bigram},{count},{prob}\n")

    return bigram_count

# Build and save the 20news bigram table; the counts are kept because
# create_tri_grams() takes them as an argument.
news_bigrams_count = create_bi_grams(PATH_NEWS_TRAIN, news_vocabulary, PATH_NEWS_BIGRAM)
# bac_bigrams_count = create_bi_grams(PATH_BAC_TRAIN, bac_vocabulary, PATH_BAC_BIGRAM)

In [16]:
# Output files for the trigram tables of each dataset.
PATH_NEWS_TRIGRAM="../data/20N_l.rojasb_j.arboleda_trigrams.txt"
PATH_BAC_TRIGRAM="../data/BAC_l.rojasb_j.arboleda_trigrams.txt"

def create_tri_grams(sentence_path: str, vocabulary: dict, bi_grams_count: dict, save_path: str)->dict:
    """
    Count the trigrams of a sentence file and save them with their add-one
    smoothed probability.

    Trigrams never span two lines: each line of the file is treated as one
    sentence. The smoothing denominator adds the vocabulary size to the
    count of the two-word context, mirroring create_bi_grams(); the previous
    code added the bigram dictionary itself to an int, which raises
    TypeError.

    Params:
    -------
        sentence_path: text file with one sentence per line
        vocabulary: dictionary that maps every word to its count; only its
            size is used
        bi_grams_count: dictionary that maps "w1 w2" to its count; it must
            contain the first two words of every trigram found in the file
        save_path: output file; it receives one "w1 w2 w3,count,probability"
            line per distinct trigram, where
            probability = (count + 1) / (bi_grams_count["w1 w2"] + len(vocabulary))
    Returns:
    --------
        trigrams_count: dictionary that maps "w1 w2 w3" to its count
    Raises:
    -------
        KeyError: if the context of a trigram is missing from bi_grams_count
    """
    trigrams_count = {}
    vocab_size = len(vocabulary)

    # Context managers make sure both files are closed (and the output is
    # flushed) even if an error occurs midway.
    with open(sentence_path, "r") as sentences_file:
        for sentence in sentences_file:
            # str.split() with no argument splits on runs of whitespace.
            words = sentence.split()
            for first, second, third in zip(words, words[1:], words[2:]):
                key = f"{first} {second} {third}"
                trigrams_count[key] = trigrams_count.get(key, 0) + 1

    with open(save_path, "w") as trigram_file:
        for tri_gram, count in trigrams_count.items():
            # Tokens hold no spaces, so dropping the last field leaves "w1 w2".
            context = tri_gram.rsplit(" ", 1)[0]
            prob = (count + 1) / (bi_grams_count[context] + vocab_size)
            trigram_file.write(f"{tri_gram},{count},{prob}\n")

    return trigrams_count

# Build and save the 20news trigram table, then report completion.
create_tri_grams(PATH_NEWS_TRAIN, news_vocabulary, news_bigrams_count, PATH_NEWS_TRIGRAM)
print("done")
# create_tri_grams(PATH_BAC_TRAIN, bac_vocabulary, bac_bigrams_count, PATH_BAC_TRIGRAM)

done


## 2. BAC Dataset

In [17]:
# Same pipeline as section 1, applied to the BAC corpus. The numbered prints
# are progress markers, one after each step.
bac_sentences = sent_tokenize(raw_bac)
print(1)
normalized_bac_sentences = [normalize(sentence) for sentence in bac_sentences]
print(2)
bac_vocabulary = extract_vocabulary(normalized_bac_sentences)
print(3)
bac_sentences = replace_unknowns(normalized_bac_sentences, bac_vocabulary)
print(4)
bac_train, bac_test = train_test_split(bac_sentences, test_size=0.20)
print(5)
# Recount the vocabulary on the training split only.
bac_vocabulary = extract_vocabulary(bac_train)
print(6)
save_file(bac_train, PATH_BAC_TRAIN)
save_file(bac_test, PATH_BAC_TEST)
print(7)
create_uni_grams(PATH_BAC_TRAIN, PATH_BAC_UNIGRAM)
print(8)
bac_bigrams_count = create_bi_grams(PATH_BAC_TRAIN, bac_vocabulary, PATH_BAC_BIGRAM)
# NOTE(review): this marker repeats 8 (the sequence skips 9) — looks like a typo.
print(8)
create_tri_grams(PATH_BAC_TRAIN, bac_vocabulary, bac_bigrams_count, PATH_BAC_TRIGRAM)
print(10)


1
2
3
4
5
6
7
8
8
10
