# Construccion del modelo de lenguaje
## Integrantes
* Juan Esteban Arboleda
* Luccas Rojas

### 1. Preprocesamiento
Lo primero que se llevara a cabo es la union de todos los documentos en un solo un par de archivos, uno con los documentos de 20news y otro con lso documentos de BAC.

In [144]:
import os
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import time

tokenizer = WhitespaceTokenizer()


* A continucion se recorren todos los documentos de ambas carpetas y se unen en un solo archivo.
* Se debe modificar la ruta de los archivos para que se ajuste. La ruta de la carpeta de 20news debe estar en PATH_20NEWS y la ruta de la carpeta de BAC debe estar en PATH_BAC.

In [145]:
PATH_20NEWS = '../data/20news-18828'
PATH_FINAL_20NEWS = "../data/final_20news.txt"

def load_news(documents_path: str,final_document_path:str):
    """
    Creates a single file with all the documents in the folders
    Params:
    -------
        documents_path: path to the folder with the documents
        final_document_path: path to the final document

    """
    open(final_document_path, "w").close()
    with open(final_document_path, "a") as final_document:
        for folder in os.listdir(documents_path):
            for document_file in os.listdir(os.path.join(documents_path, folder)):
                with open(os.path.join(documents_path, folder, document_file), "r") as document:
                    text = document.read()
                final_document.write(text)
                final_document.write("\n")

load_news(PATH_20NEWS,PATH_FINAL_20NEWS)

In [146]:
import re
PATH_BAC= '../data/BAC/blogs/blogs'
PATH_FINAL_BAC = "../data/final_bac.txt"

def load_bac(documents_path: str,final_document_path:str):
    """
    Creates a single file with all the documents in the folders
    Params:
    -------
        documents_path: path to the folder with the documents
        final_document_path: path to the final document
    """
    pattern = r'<post>(.*?)</post>'
    documents = []
    index = []
    id = 1
    columns = ['filename', 'body']
    open(final_document_path, "w").close()
    with open(final_document_path, "a", encoding = 'latin_1') as final_document:
        for file_name in os.listdir(documents_path):
            with open(os.path.join(PATH_BAC,file_name) , encoding="latin_1") as f:
                text = f.read()
                texts = re.findall(pattern, text, re.DOTALL)
            all_text= ". \n".join(texts)
            filtered_text = all_text.replace('\n', ' ').replace('\xa0', ' ')
            final_document.write(filtered_text)


load_bac(PATH_BAC,PATH_FINAL_BAC)

In [147]:
with open(PATH_FINAL_20NEWS, "r") as f:
    raw_news = f.read()
with open(PATH_FINAL_20NEWS, "r") as f:
    raw_bac = f.read()

news_sentences = sent_tokenize(raw_news)
bac_sentences = sent_tokenize(raw_bac)

* Posteriormente se leen los archivos y se normalizan con el formato adecuado para el modelo de lenguaje. De este modo se pone todo el minusculas, se remplazan los numeros por num y se agregan caracteres al inicio y al final de cada frase

In [148]:
def normalize(sentence:str)->str:
    """
    Normalize a sentence by lowercasing it, replacing numbers with NUM and adding <s> and </s> tokens
    Params:
    -------
        sentence: sentence to normalize
    Returns:
    --------
        sentence: normalized sentence
    """
    sentence = sentence.lower().replace("\n", " ").replace(",","").replace(".","").replace("-","").replace("$","").replace("'","").replace(":","").replace("|","").replace(">","").replace("<","").replace("(","").replace(")","").replace("=","").replace("*","")
    words = tokenizer.tokenize(sentence)
    for word in words:
        try:
            word.replace(",","").replace(".","").replace("-","").replace("$","").replace("'","")
            number = float(word)
            sentence = sentence.replace(word, "NUM")
        except:
            pass
    sentence = f"<s> {sentence} </s>"
    return sentence

* Luego se extrae el vocabulario de todas las palabras junto con su frecuencia para asi poder reemplazar los tokens que no se encuentren en el vocabulario por el token UNK

In [149]:
def extract_vocabulary(sentences:list)->dict:
    """
    Extract the vocabulary from a list of sentences
    Params:
    -------
        sentences: list of sentences
    Returns:
    --------
        vocabulary: dictionary in which the keys are the words and the values are the number of times the word appears in the corpus
    """
    vocabulary = {}
    for sentence in sentences:
        words = tokenizer.tokenize(sentence)
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = 1
            else:   
                vocabulary[word] += 1
    return vocabulary

* Con el vocabulario y las frases se cambian todos los elementos que aparecen sola una vez en el corpus por el token "UNK"

In [150]:
def replace_unknowns(sentences:list, vocabulary:dict)->list:
    """
    Replace the words that appear only once in the corpus by the <UNK> token
    Params:
    -------
        sentences: list of sentences
        vocabulary: dictionary in which the keys are the words and the values are the number of times the word appears in the corpus
    Returns:
    --------
        sentences: list of sentences with the <UNK> token
    """
    vocabulary["<UNK>"] = 0
    for i, sentence in enumerate(sentences):
        words = tokenizer.tokenize(sentence)
        for j, word in enumerate(words):
            if vocabulary[word] == 1:
                sentences[i] = sentence.replace(word, "<UNK>")
                vocabulary["<UNK>"] += 1
                del vocabulary[word]
    return sentences

* En este punto se normalizan los 2 sets y se extrae su vocabulario para si poder remplazar los tokens que se encuentren en el vocabulario solo una vez por el token UNK

In [151]:
normalized_news_sentences = [normalize(sentence) for sentence in news_sentences]
normalized_bac_sentences = [normalize(sentence) for sentence in bac_sentences]

In [152]:
news_vocabulary = extract_vocabulary(normalized_news_sentences)
bac_vocabulary = extract_vocabulary(normalized_bac_sentences)

In [153]:
news_sentences = replace_unknowns(normalized_news_sentences, news_vocabulary)
bac_sentences = replace_unknowns(normalized_bac_sentences, bac_vocabulary)

In [154]:
from sklearn.model_selection import train_test_split
news_train, news_test = train_test_split(news_sentences, test_size=0.20)
bac_train, bac_test = train_test_split(bac_sentences, test_size=0.20)

* Luego guardamos estos datos en 4 archivos, 2 para train y 2 para test, uno para 20news y otro para BAC

In [155]:
PATH_NEWS_TRAIN = "../data/20N_l.rojasb_j.arboleda_training.txt"
PATH_NEWS_TEST = "../data/20N_l.rojasb_j.arboleda_test.txt"  
PATH_BAC_TRAIN = "../data/BAC_l.rojasb_j.arboleda_training.txt"
PATH_BAC_TEST = "../data/BAC_l.rojasb_j.arboleda_test.txt" 

def save_file(sentences:list, path:str):
    """
    Save a list of sentences in a file
    Params:
    -------
        sentences: list of sentences
        path: path to the file
    """
    with open(path, "w") as f:
        for sentence in sentences:
            f.write(sentence)
            f.write("\n")
            
save_file(news_train, PATH_NEWS_TRAIN)
save_file(news_test, PATH_NEWS_TEST)
save_file(bac_train, PATH_BAC_TRAIN)
save_file(bac_test, PATH_BAC_TEST)

* Luego pasamos a crear los n-gramas, para esto construimos diccionarios para cada combinacion posible de n-gramas, en este caso se construyen diccionario para los monogramas

In [156]:
mono_grams = {}

def create_uni_grams(path:str)->dict:
    """
    Create a dictionary with the monograms and their counts
    Params:
    -------
        path: path to the file of sentences 
    Returns:
    --------
        mono_grams: dictionary with the monograms and their probabilities
    """
    total_words = 0
    with open(path, "r") as f:
        sentence = f.readline()
        while len(sentence) != 0:
            words = tokenizer.tokenize(sentence)
            for word in words:
                total_words += 1
                if word not in mono_grams:
                    mono_grams[word] = 1
                else:
                    mono_grams[word] += 1
            sentence = f.readline()
            
    for word in mono_grams:
        mono_grams[word] /= total_words
    return mono_grams

news_monogram = create_uni_grams(PATH_NEWS_TRAIN)
bac_monogram = create_uni_grams(PATH_BAC_TRAIN)

* Posteriormente podemos guardar los diccionarios del unigrama en archivos para poder cargarlos posterioremtne

In [157]:
PATH_NEWS_UNIGRAM="../data/20N_l.rojasb_j.arboleda_unigrams.txt"
PATH_BAC_UNIGRAM="../data/BAC_l.rojasb_j.arboleda_unigrams.txt"

def save_unigram(unigram:dict,path:str):
    """
    Save a dictionary of monograms in a file
    Params:
    -------
        unigram: dictionary of monograms
        path: path to the file
    """
    with open(path, "w") as f:
        for word in unigram:
            f.write(f"{word},{unigram[word]}\n")

save_unigram(news_monogram,PATH_NEWS_UNIGRAM)
save_unigram(bac_monogram,PATH_BAC_UNIGRAM)

In [158]:
def create_bi_grams(path:str,vocabulary:dict)->dict:
    """ 
    Create a dictionary with the bigrams and their counts
    Params:
    -------
        path: path to the file of sentences
    Returns:
    --------
        bi_grams: dictionary of dictionaries with the bigrams and their probabilities
    """
    bi_grams = {word:{} for word in vocabulary}
    vocab_size = len(vocabulary)
    
    with open(path, "r") as f:
        sentence = f.readline()
        while len(sentence) != 0:
            words = tokenizer.tokenize(sentence)
            for i in range(len(words)-1):
                if words[i+1] not in bi_grams[words[i]]:
                    bi_grams[words[i]][words[i+1]] = 2
                else:
                    bi_grams[words[i]][words[i+1]] += 1
            sentence = f.readline()

    for bi_gram in bi_grams:
        for word in bi_grams[bi_gram]:
            bi_grams[bi_gram][word] /= (vocabulary[bi_gram] + vocab_size)
    return bi_grams

# news_bigram = create_bi_grams(PATH_NEWS_TRAIN, news_vocabulary)
bac_bigram = create_bi_grams(PATH_BAC_TRAIN, bac_vocabulary)

KeyError: '#601821'