In [19]:
import os
import re
from collections import Counter
import math

In [20]:
# Root directory holding the text documents to index. This is a relative
# path, so it is resolved against the kernel's current working directory.
DIR = '../wiki-small'

In [21]:
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased first; each maximal run of regex "word"
    characters then becomes one token and everything else acts as a
    separator.

    Args:
        text: the string to tokenize.

    Returns:
        A list of tokens in order of appearance (empty for empty input).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]

def calculate_tf(text):
    """Return the normalised term frequency of each token in ``text``.

    The text is split with ``tokenize``; each distinct token is mapped to
    its number of occurrences divided by the total number of tokens.

    Args:
        text: the string to analyse.

    Returns:
        A dict ``{token: count / total_tokens}``. Text that produces no
        tokens yields an empty dict (no division takes place).
    """
    tokens = tokenize(text)
    n_tokens = len(tokens)
    frequencies = {}
    for term, occurrences in Counter(tokens).items():
        # Relative frequency of the term within this text.
        frequencies[term] = occurrences / n_tokens
    return frequencies

def calculate_idf(documents):
    """Compute the inverse document frequency (IDF) of every term in a corpus.

    Document frequencies are gathered in a single pass over the corpus
    instead of re-scanning every document for every vocabulary term, so
    the cost is proportional to the total number of (document, term) pairs.
    Because the corpus is consumed exactly once, any iterable works,
    including a generator or a ``dict.values()`` view.

    Args:
        documents: iterable of mappings, one per document, whose keys are
            the terms present in that document. The values are ignored.

    Returns:
        A dict mapping each term to ``log(N / (1 + df))``, where ``N`` is
        the number of documents and ``df`` the number of documents that
        contain the term. The ``1 +`` smoothing means a term present in
        every document gets a slightly negative weight, and a term present
        in ``N - 1`` documents gets exactly 0. An empty corpus yields ``{}``.
    """
    total_documents = 0
    document_frequency = Counter()
    for doc in documents:
        total_documents += 1
        # Keys of a mapping are unique, so each document contributes at
        # most 1 to the count of any given term.
        document_frequency.update(doc.keys())

    return {
        word: math.log(total_documents / (1 + doc_count))
        for word, doc_count in document_frequency.items()
    }

def calculate_tf_idf(tf, idf):
    """Weight a term-frequency mapping by inverse document frequency.

    Args:
        tf: dict mapping each term to its term frequency.
        idf: dict mapping terms to their IDF weight.

    Returns:
        A dict with the same keys as ``tf`` whose values are
        ``tf[term] * idf[term]``; terms missing from ``idf`` are
        multiplied by 0.
    """
    weighted = {}
    for term, frequency in tf.items():
        weighted[term] = frequency * idf.get(term, 0)
    return weighted

def build_index(directory):
    """Build a per-document term-frequency index for a directory tree.

    Every file found under ``directory`` (recursively) is read as UTF-8
    text and passed to ``calculate_tf``. Note that the result maps
    documents to their terms (document path -> ``{term: tf}``); it is not
    an inverted term -> documents index.

    Args:
        directory: root directory to walk.

    Returns:
        A dict mapping each file path (``os.path.join(root, filename)``)
        to the term-frequency dict of that file's contents.
    """
    index = {}
    for root, _, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            # errors='replace' keeps one badly-encoded byte in one file from
            # aborting the whole build with UnicodeDecodeError; undecodable
            # bytes become U+FFFD instead.
            with open(filepath, 'r', encoding='utf-8', errors='replace') as file:
                text = file.read()
            # The file handle is already closed while the text is analysed.
            index[filepath] = calculate_tf(text)
    return index

def search(query, index, idf):
    """Rank every indexed document against a free-text query.

    The query is turned into a tf-idf vector (via ``calculate_tf`` and
    ``calculate_tf_idf``). Each document's score is the dot product of
    that vector with the document's raw term-frequency vector, taken over
    the query's terms.

    Args:
        query: the query string.
        index: dict mapping document name -> ``{term: tf}``.
        idf: dict mapping term -> IDF weight.

    Returns:
        A list of ``(document name, score)`` tuples for *all* documents in
        ``index``, sorted by descending score.
    """
    query_weights = calculate_tf_idf(calculate_tf(query), idf)
    scores = {
        doc_name: sum(
            weight * doc_tf.get(term, 0)
            for term, weight in query_weights.items()
        )
        for doc_name, doc_tf in index.items()
    }
    # Highest-scoring documents first.
    return sorted(scores.items(), key=lambda entry: entry[1], reverse=True)


In [22]:
# Build the index for every document under DIR
# (document path -> term-frequency dict).
index = build_index(DIR)

# Compute the IDF of every term across the indexed documents
idf = calculate_idf(index.values())

# Read the query interactively from the user
query = input("Ingrese su consulta: ")

# Rank the indexed documents against the query
results = search(query, index, idf)

# Print the results. `results` holds one entry per indexed document, so this
# emits one line per document, including those that scored 0.
print("Documentos relevantes para la consulta '{}':".format(query))
for doc_name, score in results:
    print("- {} (Score: {})".format(doc_name, score))

Documentos relevantes para la consulta 'popular Greek artist':
- ../wiki-small\en\articles\m\a\r\Martin_Litchfield_West_2f84.html (Score: 0.007372685445996883)
- ../wiki-small\en\articles\n\i\k\Nikos_Barlos_1005.html (Score: 0.006755932365003466)
- ../wiki-small\en\articles\e\l\e\Eleftherios_Papasymeon_2795.html (Score: 0.006514893088018569)
- ../wiki-small\en\articles\i\o\n\Ion_of_Chios_97ee.html (Score: 0.005802971107301979)
- ../wiki-small\en\articles\b\i\r\Birtha_(Mesopotamia)_f6a0.html (Score: 0.005357269305083054)
- ../wiki-small\en\articles\h\i\s\History_of_Ptolemaic_Egypt_5a77.html (Score: 0.005110200391058326)
- ../wiki-small\en\articles\m\a\r\Maria_Spiridaki_6e42.html (Score: 0.004873368939627389)
- ../wiki-small\en\articles\g\i\o\Giorgos_Bartzokas_30b2.html (Score: 0.004804073018145129)
- ../wiki-small\en\articles\k\u\r\Kurt_Jackson_29f9.html (Score: 0.004546556837592711)
- ../wiki-small\en\articles\l\a\v\Lavrentis_Dianellos_068d.html (Score: 0.004460626231102546)
- ../wiki-