# Construccion del modelo de lenguaje
## Integrantes
* Juan Esteban Arboleda
* Luccas Rojas

### 1. Preprocesamiento
Lo primero que se llevara a cabo es la union de todos los documentos en un solo un par de archivos, uno con los documentos de 20news y otro con lso documentos de BAC.

In [20]:
import os
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import time

* A continucion se recorren todos los documentos de ambas carpetas y se unen en un solo archivo.
* Se debe modificar la ruta de los archivos para que se ajuste. La ruta de la carpeta de 20news debe estar en PATH_20NEWS y la ruta de la carpeta de BAC debe estar en PATH_BAC.

In [21]:
PATH_20NEWS = '../data/20news-18828'
PATH_FINAL_20NEWS = "../data/final_20news.txt"

def load_news(documents_path: str,final_document_path:str):
    """
    Creates a single file with all the documents in the folders
    Params:
    -------
        documents_path: path to the folder with the documents
        final_document_path: path to the final document

    """
    open(final_document_path, "w").close()
    with open(final_document_path, "a") as final_document:
        for folder in os.listdir(documents_path):
            for document_file in os.listdir(os.path.join(documents_path, folder)):
                with open(os.path.join(documents_path, folder, document_file), "r") as document:
                    text = document.read()
                final_document.write(text)
                final_document.write("\n")

load_news(PATH_20NEWS,PATH_FINAL_20NEWS)

In [22]:
import re
PATH_BAC= '../data/BAC/blogs/blogs'
PATH_FINAL_BAC = "../data/final_bac.txt"

def load_bac(documents_path: str,final_document_path:str):
    """
    Creates a single file with all the documents in the folders
    Params:
    -------
        documents_path: path to the folder with the documents
        final_document_path: path to the final document
    """
    pattern = r'<post>(.*?)</post>'
    documents = []
    index = []
    id = 1
    columns = ['filename', 'body']
    open(final_document_path, "w").close()
    with open(final_document_path, "a", encoding = 'latin_1') as final_document:
        for file_name in os.listdir(documents_path):
            with open(os.path.join(PATH_BAC,file_name) , encoding="latin_1") as f:
                text = f.read()
                texts = re.findall(pattern, text, re.DOTALL)
            all_text= ". \n".join(texts)
            filtered_text = all_text.replace('\n', ' ').replace('\xa0', ' ')
            final_document.write(filtered_text)


load_bac(PATH_BAC,PATH_FINAL_BAC)

In [23]:
with open(PATH_FINAL_20NEWS, "r") as f:
    raw_news = f.read()
with open(PATH_FINAL_20NEWS, "r") as f:
    raw_bac = f.read()

news_sentences = sent_tokenize(raw_news)
bac_sentences = sent_tokenize(raw_bac)

* Posteriormente se leen los archivos y se normalizan con el formato adecuado para el modelo de lenguaje. De este modo se pone todo el minusculas, se remplazan los numeros por num y se agregan caracteres al inicio y al final de cada frase

In [24]:
def normalize(sentence:str)->str:
    """
    Normalize a sentence by lowercasing it, replacing numbers with NUM and adding <s> and </s> tokens
    Params:
    -------
        sentence: sentence to normalize
    Returns:
    --------
        sentence: normalized sentence
    """
    sentence = sentence.lower().replace ("\n", " ")
    words = word_tokenize(sentence)
    for word in words:
        try:
            word.replace(",","").replace(".","").replace("-","").replace("$","").replace("'","")
            number = float(word)
            sentence = sentence.replace(word, "NUM")
        except:
            pass
    sentence = f"<s> {sentence} </s>"
    return sentence

* Luego se extrae el vocabulario de todas las palabras junto con su frecuencia para asi poder reemplazar los tokens que no se encuentren en el vocabulario por el token UNK

In [25]:
def extract_vocabulary(sentences:list)->dict:
    """
    Extract the vocabulary from a list of sentences
    Params:
    -------
        sentences: list of sentences
    Returns:
    --------
        vocabulary: dictionary in which the keys are the words and the values are the number of times the word appears in the corpus
    """
    vocabulary = {}
    for sentence in sentences:
        words = word_tokenize(sentence)
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = 1
            else:   
                vocabulary[word] += 1

* Con el vocabulario y las frases se cambian todos los elementos que aparecen sola una vez en el corpus por el token "UNK"

In [26]:
def replace_unknowns(sentences:list, vocabulary:dict)->list:
    """
    Replace the words that appear only once in the corpus by the <UNK> token
    Params:
    -------
        sentences: list of sentences
        vocabulary: dictionary in which the keys are the words and the values are the number of times the word appears in the corpus
    Returns:
    --------
        sentences: list of sentences with the <UNK> token
    """
    for i, sentence in enumerate(sentences):
        words = word_tokenize(sentence)
        for j, word in enumerate(words):
            try:
                if vocabulary[word] == 1:
                    words[j] = "<UNK>"
            except:
                words[j] = "<UNK>"
            sentences[i] = " ".join(words)
    return sentences

* En este punto se normalizan los 2 sets y se extrae su vocabulario para si poder remplazar los tokens que se encuentren en el vocabulario solo una vez por el token UNK

In [27]:
normalized_news_sentences = [normalize(sentence) for sentence in news_sentences]
normalized_bac_sentences = [normalize(sentence) for sentence in bac_sentences]

In [28]:
news_vocabulary = extract_vocabulary(normalized_news_sentences)
bac_vocabulary = extract_vocabulary(normalized_bac_sentences)

In [29]:
news_sentences = replace_unknowns(normalized_news_sentences, news_vocabulary)
bac_sentences = replace_unknowns(normalized_bac_sentences, bac_vocabulary)

TypeError: 'NoneType' object is not subscriptable

In [None]:
print(normalized_news_sentences[0])

<s> from: mathew <mathew@mantis.co.uk> subject: alt.atheism faq: atheist resources  archive-name: atheism/resources alt-atheism-archive-name: resources last-modified: NUM december NUM version: NUM                                atheist resources                        addresses of atheist organizations                                       usa  freedom from religion foundation  darwin fish bumper stickers and assorted other atheist paraphernalia are available from the freedom from religion foundation in the us. </s>


In [None]:
nltk.download('punkt')
nltk.download('stopwords')

documents['tokens'] = documents['body'].apply(word_tokenize)
queries['tokens'] = queries['body'].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\luccas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luccas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'documents' is not defined

Removemos todos los signos de puntuacion, contracciones del ingles y dejamos el texto todo en minusculas (normalizar) 

In [None]:
def remove_punctuation(token_list):
    return [token.lower() for token in token_list if (token not in string.punctuation and (len(token)>1 or token.isnumeric()))]

documents['tokens']=documents['tokens'].apply(lambda x: remove_punctuation(x))
queries['tokens']=queries['tokens'].apply(lambda x: remove_punctuation(x))

Luego de tokenizar, dejar todo en minusculas, quitaremos las stop words para que reduzcan el vocabulario y no afecten el resultado final. Para esto usaremos la libreria nltk y su metodo stopwords.words('english').

In [None]:
stop_words = set(stopwords.words('english'))

#TODO no se si normalizar cuente como poner todo en minusculas
def remove_stop_words(token_list):
    return [token for token in token_list if token not in stop_words]

documents['tokens']=documents['tokens'].apply(lambda x: remove_stop_words(x))
queries['tokens']=queries['tokens'].apply(lambda x: remove_stop_words(x))

Luego de eliminar las stop words se hace stemming a las palabras restantes.

In [None]:
stemmer = PorterStemmer()
def stemming(token_list):
    return [stemmer.stem(token) for token in token_list]

documents['tokens']=documents['tokens'].apply(lambda x: stemming(x))
queries['tokens']=queries['tokens'].apply(lambda x: stemming(x))

En este punto el texto de cada documento y query esta en un formato mas facil de procesar, por lo que se procede a realizar la representacion vectorial de los documentos y queries.

## 2. Representación de los datos

A continuación se hace la implementación para transformar el anterior dataframe en una estructura de indice inertido para así poder realizar busquedas binarias 

In [None]:
def create_inverted_index(documents: pd.DataFrame) -> dict:
    """
    Creates the inverted index for a document set.

    Params
    ------
        documents: pd.DataFrame
            A Pandas DataFrame that represents the document set. The
            DataFrame should have the following columns: "filename", "body".
            DataFrame's index should correspond to the document id
        
    Returns
    -------
        inverted_index: dict
            A python dictionary that represents the inverted index.
            Keys are the terms in the vocabulary.
            Each value has a "df" (document frecuency) and "postings".
            "postings" are a numpy array
    """
    inverted_index = {}

    for id, document in documents.iterrows():
        for token in document['tokens']:
            if token not in inverted_index:
                inverted_index[token] = {"df": 0, "postings": []}
            if id not in inverted_index[token]["postings"]:
                inverted_index[token]["df"] += 1
                inverted_index[token]["postings"].append(id)
    
    return inverted_index

inverted_index = create_inverted_index(documents)

Como se pudo observar en el anterior código, se crea un diccionario que almacenara el índice invertido haciendo un recorrido por cada uno de los documentos y sus tokens. Agregando así todos los tokens del vocabulario y añadiendo a cada token el listado de documentos que contienen ese token. El vocabulario final cuenta con 14682 tokens.

## 3. Modelamiento

In [None]:
def and_intersect(postings1: list, postings2: list) -> list:
    """
    Returns the intersection of two postings lists
    """

    i = 0
    j = 0

    intersection = []

    # Merge algorith taken from the book
    while(i < len(postings1) and j < len(postings2)):
        docId1 = postings1[i]
        docId2 = postings2[j]
        if docId1 == docId2:
            intersection.append(docId1)
            i += 1
            j += 1
        elif docId1 < docId2:
            i += 1
        else:
            j += 1

    return intersection


def and_search(terms: list, inverted_index: dict) -> list:
    """
    Returns a list with the ids if the documents that
    contain all of the terms in terms list.

        Params
        ------
            terms: list[str]
                list of terms to look for in the documents
            
            inverted_index: dict
                Inverted index created from the document base
    """
    term_df_list = []

    for term in terms:
        if term in inverted_index:
            term_df_list.append({
                "term": term,
                "df": inverted_index[term]["df"]
            })
        else:
            # If a term that is not in the inverted index
            # is found. That means that there is no document
            # in the document base that meets the query.
            # Hence, an empty array is returned
            return []
        
    # If there is only one term to match, the function
    # returns the postings of that term
    if len(term_df_list) == 1:
        return inverted_index[term_df_list[0]["term"]]["postings"]

    # Sort term_df_list based on df
    term_df_list.sort(key=lambda elem: elem["df"])

    # Initialize intersection as the smallest postings list
    intersection = inverted_index[term_df_list[0]["term"]]["postings"]

    for i in range(1, len(term_df_list)):
        # If there are no items in the current intersection
        # there is no point in calculating the intersection
        # for the rest of the postings.
        # Hence, the function returns current (empty) intersection
        if len(intersection) == 0:
            return intersection
        
        postings_i = inverted_index[term_df_list[i]["term"]]["postings"]

        # calculate the intersection of current intersection with the next
        # smallest posting list
        intersection = and_intersect(intersection, postings_i)

    return intersection

# query processing

# Clear output file contents
open(QUERIES_RESULTS_FILE_PATH, "w").close()

# Loop through queries
for i, query in queries.iterrows():
    # Open output file
    file = open(QUERIES_RESULTS_FILE_PATH, "a")
    query_str = query['filename'].replace('.naf', '').replace('wes2015.', '')
    file.write(query_str + " ")

    # Perform AND query with all the terms in the query
    res = and_search(query["tokens"], inverted_index)

    # Write output file
    for docId in res:
        if docId < 10:
            doc_str = "d00" + str(docId)
        elif docId < 100:
            doc_str = "d0" + str(docId)
        else:
            doc_str = "d" + str(docId)
        file.write(doc_str)
        if docId != res[len(res) - 1]:
            file.write(",")
    if query_str != "q46":
        file.write("\n")

file.close()