In [10]:
import os
import sys
sys.path.append(os.path.abspath("../../shared"))  # Agrega la carpeta al PYTHONPATH
from tokenizer_v2 import tokenizer, loadFileStopWords
from collections import Counter
import math
import pandas as pd

In [11]:
# Directory containing the text documents
DIR = './collection'
# Stop-word file, loaded once here and passed to every tokenization call below
STOP_WORDS_FILE = '../../shared/stop-words.txt'
stop_words_list = loadFileStopWords(STOP_WORDS_FILE)
# Words shared across the collection: vida salud mundo importante recursos personas necesario sociedad cuidado humanos

In [12]:
def calculate_tf(text, stop_words_list):
    """Return the raw term frequencies of ``text``.

    The text is split by the shared ``tokenizer`` helper, which also receives
    ``stop_words_list`` (presumably to drop those words -- confirm against
    ``tokenizer_v2``).

    Args:
        text: The string to analyse.
        stop_words_list: Stop-word collection forwarded to ``tokenizer``.

    Returns:
        A plain ``dict`` mapping each token to its number of occurrences.
    """
    term_counts = Counter(tokenizer(text, stop_words_list))
    # Hand back a regular dict rather than the Counter instance itself.
    return dict(term_counts)

In [13]:
def build_index(directory, stop_words_list):
    """Build a per-document term-frequency index for ``directory``.

    Every file found while walking ``directory`` recursively is read as UTF-8
    text and passed through ``calculate_tf``.

    Args:
        directory: Root folder to walk.
        stop_words_list: Stop-word collection forwarded to ``calculate_tf``.

    Returns:
        A dict keyed by file path (as produced by ``os.path.join``) whose
        values are the term-frequency dicts of the corresponding documents.
    """
    doc_term_counts = {}
    for current_dir, _subdirs, filenames in os.walk(directory):
        for name in filenames:
            path = os.path.join(current_dir, name)
            with open(path, 'r', encoding='utf-8') as handle:
                contents = handle.read()
            doc_term_counts[path] = calculate_tf(contents, stop_words_list)
    return doc_term_counts

In [14]:
def calculate_idf(documents):
    """Compute the inverse document frequency of every term in ``documents``.

    The weight of a term is ``log(N / (1 + df))`` where ``N`` is the number of
    documents and ``df`` the number of documents containing the term.

    Note: because of the ``1 +`` in the denominator, a term present in every
    document gets a slightly negative weight, and a term present in all but
    one document gets exactly 0.

    Args:
        documents: Any iterable (list, dict view, generator, ...) of
            per-document term mappings; only the keys are used.

    Returns:
        A dict mapping each term to its IDF weight. Empty input yields ``{}``.
    """
    # Materialise once so one-shot iterables can be both counted and traversed.
    documents = list(documents)
    total_documents = len(documents)

    # Single pass over all documents: iterating a mapping yields each key once,
    # so every document contributes at most 1 to a term's document frequency.
    doc_counts = Counter(word for doc in documents for word in doc)

    return {
        word: math.log(total_documents / (1 + doc_count))
        for word, doc_count in doc_counts.items()
    }

def calculate_tf_idf(tf, idf):
    """Weight each term frequency by its inverse document frequency.

    Args:
        tf: Mapping of term -> term frequency.
        idf: Mapping of term -> IDF weight.

    Returns:
        A dict with the same keys as ``tf``; each value is ``tf * idf``.
        Terms missing from ``idf`` are weighted with 0.
    """
    weighted = {}
    for term, frequency in tf.items():
        weighted[term] = frequency * idf.get(term, 0)
    return weighted


In [15]:
def search(query, index, idf, stop_words_list):
    """Rank every indexed document against ``query``.

    The query is turned into a tf-idf vector; each document is scored with
    the dot product between that vector and the document's raw term
    frequencies (no length normalisation is applied).

    Args:
        query: Free-text query string.
        index: Mapping of file path -> term-frequency dict.
        idf: Mapping of term -> IDF weight.
        stop_words_list: Stop-word collection forwarded to ``calculate_tf``.

    Returns:
        A list of ``(filepath, score)`` tuples sorted by descending score.
        Documents that share no term with the query are included with
        score 0.
    """
    query_weights = calculate_tf_idf(calculate_tf(query, stop_words_list), idf)

    scores = {
        filepath: sum(
            weight * doc_tf.get(term, 0)
            for term, weight in query_weights.items()
        )
        for filepath, doc_tf in index.items()
    }

    # Highest score first.
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

In [16]:
# Build the index: file path -> term frequencies for every document under DIR
index = build_index(DIR, stop_words_list)

In [17]:
# Compute the IDF weights from the per-document term frequencies
idf = calculate_idf(index.values())

In [19]:
# Example query (read interactively from the user)
query = input("Ingrese su consulta: ")

# Search for documents relevant to the query
results = search(query, index, idf, stop_words_list)

# search() returns (filepath, score) pairs already sorted by descending score
table = pd.DataFrame(results, columns=["filepath", "score"])
# Print the query, then leave the table as the last expression so the notebook renders it
print(f"Query: '{query}':")
table

Query: 'mundo':


Unnamed: 0,filepath,score
0,./collection\tecnologia.txt,1.021651
1,./collection\cultura.txt,0.510826
2,./collection\educacion.txt,0.0
3,./collection\naturaleza.txt,0.0
4,./collection\salud.txt,0.0
