### Importación de librerias

In [7]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os, re, json
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords

__paths to change__

In [None]:
# input variables
documents_path = './input/docs-raw-texts/'
queries_path = './input/queries-raw-texts/'

### Read documents methods

In [2]:
def get_documents(path: str) -> list:
    """
    read raw text from naf documents located in the directory path
    """
    data = []
    for file in sorted(os.listdir(path)):
        if file.endswith(".naf"):
            tree = ET.parse(path + file)
            text = tree.find('raw').text
            header = tree.find('nafHeader')
            if header:
                desc = header.find('fileDesc')
                if desc:
                    title = desc.attrib.get('title')
                    text = title + ' ' + text if title else text
            data.append(text)
    return data

In [5]:
def remove_stopwords(document: str) -> list:
    """
    remove the english stop words from data
    """
    lower = document.lower()
    words = lower.split(' ')
    stop_words = stopwords.words('english')
    return [word for word in words if word not in stop_words]

In [4]:
def remove_nonlatin(document: str) -> str:
    """
    replace problematic characters
    """
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    return document

In [3]:
def preprocessing(document: str) -> list:
    """
    clean data by removing non-latin characters
    stem data sentences
    remove stop words from a document
    """
    porter = PorterStemmer()
    document = remove_nonlatin(document)
    document = porter.stem_sentence(document)
    document = remove_stopwords(document)
    return document

### indexes and doc-term matrix

In [8]:
def get_words_index(documents: pd.Series) -> pd.Index:
    """
    return a sorted index of every word in the texts
    """
    # get all words in all documents
    words = set()
    for document in documents:
        words.update(set(document))
    # sort the words
    sorted_words = sorted(list(words))
    # get index of sorted words
    words_frame = pd.DataFrame(sorted_words, columns=['data'])
    words_index = words_frame.set_index('data').index
    return words_index

In [9]:
def get_index_word(word: str, words_index: pd.Index) -> int:
    """
    return the provided word index
    """
    try: return words_index.get_loc(word)
    except: return -1

In [None]:
def get_doc_term(documents: pd.DataFrame, words_index: pd.Index) -> list:
    """
    return the document term matrix that indicate how many terms repeats in each document
    """
    doc_term = [[0]*len(documents) for _ in range(len(words_index))]
    for doc_index, document in documents.iterrows():
        for word in document.filtered:
            word_index = get_index_word(word, words_index)
            if word_index != -1:
                doc_term[word_index][doc_index] += 1
    return doc_term

## Representación vectorial ponderada tf.idf

In [13]:
def get_tf(doc_term: list) -> list:
    """
    return the ft score from each word in all the documents
    """
    return [[1 + np.log10(doc) if doc > 0 else 0 for doc in word] for word in doc_term]

In [14]:
def get_idf(doc_term: list) -> list:
    """
    return the idf score from each word in the entire collection
    """
    word_num = len(doc_term)
    return [np.log10(word_num/sum([1 if doc > 0 else 0 for doc in word])) for word in doc_term]

In [12]:
def get_tfidf(doc_term: list) -> list:
    """
    ponderate the tf-idf scores multiping them
    """
    tf = get_tf(doc_term)
    idf = get_idf(doc_term)
    return [[tf_scr * idf[i] for tf_scr in words] for i, words in enumerate(tf)]

In [10]:
doc_term = [
    [157, 73, 0, 0, 0, 0],
    [4, 157, 0, 1, 0, 0],
    [232, 227, 0, 2, 1, 1],
    [0, 1, 0, 0, 0, 0, 0],
    [57, 0, 0, 0, 0, 0],
    [2, 0, 3, 5, 5, 1],
    [2, 0, 1, 1, 1, 0]
]

In [15]:
tfidf = get_tfidf(doc_term)
tfidf

[[1.7387868738260175, 1.5578424688491743, 0.0, 0.0, 0.0, 0.0],
 [0.5895208854579495, 1.1760168802176616, 0.0, 0.36797678529459443, 0.0, 0.0],
 [0.49179214833081875,
  0.4904094661970059,
  0.0,
  0.1901169576248441,
  0.146128035678238,
  0.146128035678238],
 [0.0, 0.8450980400142568, 0.0, 0.0, 0.0, 0.0, 0.0],
 [2.3289844390533956, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.1901169576248441,
  0.0,
  0.21584882741075853,
  0.24826714940986994,
  0.24826714940986994,
  0.146128035678238],
 [0.3161997914285121,
  0.0,
  0.24303804868629444,
  0.24303804868629444,
  0.24303804868629444,
  0.0]]

In [None]:
# Step 1: obtain the documents and convet it to dataframe
data = get_documents(documents_path)
documents = pd.DataFrame.from_dict(data)
documents.head()

In [None]:
# Step 2: apply the preprocessing method
documents['filtered'] = documents.data.apply(preprocessing)
documents.head()

In [None]:
# Step 3: get word-index, doc-term, and the tfidf index
words_index = get_words_index(documents.filtered)
doc_term = get_doc_term(documents, words_index)
tfidf = get_tfidf(doc_term)

## similitud del coseno

In [2]:
def similitud_coseno(vector1,vector2):
    producto_punto = np.dot(vector1,vector2)
    norma_1 = np.linalg.norm(vector1)
    norma_2 = np.linalg.norm(vector2)
    return producto_punto/(norma_1*norma_2)



## Procesamiento

In [3]:
def get_documents(path):
    documents = []
    for filename in sorted(os.listdir(path)):
        if filename.endswith(".naf"):
            tree = ET.parse(path+filename)
            texto = tree.find('raw').text
            documents.append(texto)
    return documents

In [4]:
def remove_stopwords(data):
    filtered_words = [word.lower() for word in data.split(' ') if word.lower() not in stopwords.words('english')]    
    return filtered_words

In [5]:
def preprocesar(documentos):
# 2. Preprocess the data
#remover espacios dobles y triples
    import re
    documentos = re.sub('\n', ' ',documentos)
    documentos = re.sub('[^a-zA-Z]|[0-9]', ' ',documentos)
    documentos = re.sub('\s+', ' ',documentos)
    p=PorterStemmer()
    documentos = p.stem_sentence(documentos)

    filtrada= remove_stopwords(documentos)
    
    return filtrada

In [6]:
datos = get_documents('datos/docs-raw-texts/')
documentos = pd.DataFrame(datos,columns=['Documento'])
documentos['filtrada']=documentos['Documento'].apply(preprocesar)
doc_proc= documentos
doc_proc.filtrada = doc_proc.filtrada.apply(np.unique)
doc_proc.head()

Unnamed: 0,Documento,filtrada
0,William Beaumont and the Human Digestion.\n\nW...,"[accid, acid, activ, affect, ag, alexi, also, ..."
1,Selma Lagerlöf and the wonderful Adventures of...,"[abl, academi, accept, acclaim, accomplish, ac..."
2,Ferdinand de Lesseps and the Suez Canal.\n\nFe...,"[abandon, act, adopt, affair, africa, afterwar..."
3,Walt Disney’s ‘Steamboat Willie’ and the Rise ...,"[aboard, accident, accompani, ad, along, also,..."
4,Eugene Wigner and the Structure of the Atomic ...,"[accept, achiev, ad, administr, albert, along,..."


In [7]:
dictionary = {}
for i in range(len(doc_proc)):
    for j in range(len(doc_proc.iloc[i]['filtrada'])):
        if doc_proc.iloc[i]['filtrada'][j] not in dictionary:
            dictionary[doc_proc.iloc[i]['filtrada'][j]] = len(dictionary)
dictionary_size = len(dictionary)
dictionary

{'accid': 0,
 'acid': 1,
 'activ': 2,
 'affect': 3,
 'ag': 4,
 'alexi': 5,
 'also': 6,
 'american': 7,
 'anoth': 8,
 'armi': 9,
 'around': 10,
 'back': 11,
 'basi': 12,
 'beaumont': 13,
 'becam': 14,
 'becaus': 15,
 'best': 16,
 'better': 17,
 'bit': 18,
 'book': 19,
 'born': 20,
 'break': 21,
 'broken': 22,
 'canadian': 23,
 'caus': 24,
 'chemic': 25,
 'children': 26,
 'close': 27,
 'compani': 28,
 'complet': 29,
 'connecticut': 30,
 'consid': 31,
 'di': 32,
 'differ': 33,
 'digest': 34,
 'discov': 35,
 'dure': 36,
 'earli': 37,
 'eight': 38,
 'emot': 39,
 'even': 40,
 'examin': 41,
 'exist': 42,
 'expect': 43,
 'experi': 44,
 'famou': 45,
 'father': 46,
 'find': 47,
 'fistula': 48,
 'follow': 49,
 'food': 50,
 'fort': 51,
 'french': 52,
 'fur': 53,
 'fuse': 54,
 'gain': 55,
 'gastric': 56,
 'gave': 57,
 'heal': 58,
 'hi': 59,
 'hole': 60,
 'human': 61,
 'hydrochlor': 62,
 'imag': 63,
 'import': 64,
 'inform': 65,
 'insert': 66,
 'insight': 67,
 'interest': 68,
 'island': 69,
 'juic':

In [8]:
def doc_to_vector(doc):
    vector = np.zeros(dictionary_size)
    for token in doc:
        if token in dictionary:
            vector[dictionary[token]] = 1
    return vector

In [9]:
doc_proc['doc_vector'] = doc_proc.filtrada.apply(doc_to_vector)
doc_proc

Unnamed: 0,Documento,filtrada,doc_vector
0,William Beaumont and the Human Digestion.\n\nW...,"[accid, acid, activ, affect, ag, alexi, also, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,Selma Lagerlöf and the wonderful Adventures of...,"[abl, academi, accept, acclaim, accomplish, ac...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,Ferdinand de Lesseps and the Suez Canal.\n\nFe...,"[abandon, act, adopt, affair, africa, afterwar...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ..."
3,Walt Disney’s ‘Steamboat Willie’ and the Rise ...,"[aboard, accident, accompani, ad, along, also,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
4,Eugene Wigner and the Structure of the Atomic ...,"[accept, achiev, ad, administr, albert, along,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ..."
...,...,...,...
326,James Parkinson and Parkinson’s Disease.\n\nWo...,"[abnorm, activist, addit, advanc, advoc, ag, a...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, ..."
327,Juan de la Cierva and the Autogiro.\n\nDemonst...,"[abil, acceler, accept, accid, accomplish, ach...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ..."
328,Squire Whipple – The Father of the Iron Bridge...,"[academi, across, ag, also, america, american,...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, ..."
329,William Playfair and the Beginnings of Infogra...,"[accept, account, achiev, actual, adapt, after...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ..."


In [10]:
datos_querry = get_documents('datos/queries-raw-texts/')
queries = pd.DataFrame(datos_querry,columns=['Query'])
queries

Unnamed: 0,Query
0,Fabrication of music instruments
1,famous German poetry
2,Romanticism
3,University of Edinburgh research
4,bridge construction
5,Walk of Fame stars
6,Scientists who worked on the atomic bomb
7,Invention of the Internet
8,early telecommunication methods
9,Who explored the South Pole


In [11]:
queries['filtrada'] = queries.Query.apply(preprocesar)
quer_proc = queries
quer_proc.filtrada = quer_proc.filtrada.apply(np.unique)
quer_proc.head()

Unnamed: 0,Query,filtrada
0,Fabrication of music instruments,"[fabric, instrument, music]"
1,famous German poetry,"[famou, german, poetri]"
2,Romanticism,[romantic]
3,University of Edinburgh research,"[edinburgh, research, univers]"
4,bridge construction,"[bridg, construct]"


In [12]:
quer_proc['query_vector'] = quer_proc.filtrada.apply(doc_to_vector)
quer_proc

Unnamed: 0,Query,filtrada,query_vector
0,Fabrication of music instruments,"[fabric, instrument, music]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,famous German poetry,"[famou, german, poetri]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Romanticism,[romantic],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,University of Edinburgh research,"[edinburgh, research, univers]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,bridge construction,"[bridg, construct]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,Walk of Fame stars,"[fame, star, walk]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,Scientists who worked on the atomic bomb,"[atom, bomb, scientist, work]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,Invention of the Internet,"[internet, invent]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,early telecommunication methods,"[earli, method, telecommun]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,Who explored the South Pole,"[explor, pole, south]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
def similitud_coseno_docs(query_vector):
    similitud = doc_proc['doc_vector'].apply(lambda x: similitud_coseno(x,query_vector))
    similitud = similitud[similitud>0]
    similitud = similitud.sort_values(ascending=False)
    output = ''
    for index, value in similitud.items():
        if output == '':
            output += f'd{index}:{value}'
        else:
            output += f',d{index}:{value}'
    return output

    


In [16]:
q = quer_proc.query_vector.apply(similitud_coseno_docs)
f = open("salida/RRDV-consultas_resultads.txt", "w")
for i in range(len(q)):
    f.write(f'q{i+1} {q[i]}\n')
f.close()