In [1]:
import os
import sys
sys.path.append(os.path.abspath("../../shared"))  # Agrega la carpeta al PYTHONPATH
from tokenizer_v2 import tokenizer, loadFileStopWords
from collections import Counter
import math
import pandas as pd
import scipy
import pyterrier as pt

if not pt.started():
  pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by pitta on 2024-04-03 16:14) and terrier-helper 0.0.8



In [2]:
# Directorio donde se encuentran los documentos de texto
DIR = '../wiki-small'
STOP_WORDS_FILE = '../../shared/stop-words.en.txt'
stop_words_list = loadFileStopWords(STOP_WORDS_FILE)

In [3]:
# Obtén la relación filepath-docno del índice
filepath_docno_dict = {}
files = pt.io.find_files("../wiki-small")
indexer = pt.FilesIndexer(os.path.abspath("./custom_index"), verbose=True, overwrite=True, meta={"docno": 20, "filename":512})
indexref = indexer.index(files)
index_tr = pt.IndexFactory.of(indexref)
br_TF_IDF_test = pt.BatchRetrieve(index_tr, wmodel="TF_IDF", metadata=["docno", "filename"])

# Obtener todos los documentos indexados
documentos = index_tr.getCollectionStatistics().getNumberOfDocuments()

# Iterar sobre los documentos indexados
for doc_id in range(documentos):
    docno, filepath = index_tr.getMetaIndex().getAllItems(doc_id)
    filepath_docno_dict[filepath] = docno

21:33:46.479 [main] ERROR org.terrier.structures.indexing.Indexer - Could not rename index
java.io.IOException: Rename of index structure file 'f:\UNLu\Recuperacion de la Informacion\TPs code\TP2\Ejercicio 7\custom_index/data_1.direct.bf' (exists) to 'f:\UNLu\Recuperacion de la Informacion\TPs code\TP2\Ejercicio 7\custom_index/data.direct.bf' (exists) failed - likely that source file is still open. Possible indexing bug?
	at org.terrier.structures.IndexUtil.renameIndex(IndexUtil.java:379)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:388)


In [4]:
def calculate_tf(text, stop_words_list):
    # Calcula el term frequency (frecuencia de término)
    tokens = tokenizer(text, stop_words_list)
    tf = Counter(tokens)
    return {word: count for word, count in tf.items()}

In [5]:
def build_index(directory, stop_words_list):
    # Construye el índice invertido a partir de los documentos en el directorio
    index = {}
    for root, _, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                text = file.read()
                tf = calculate_tf(text, stop_words_list)
                docid = filepath_docno_dict[filepath]
                index[docid] = {'filepath': filepath, 'terms': tf}  # Agrega el docid al índice
    return index

In [6]:
def calculate_idf(documents):
    # Calcula el inverse document frequency (frecuencia inversa de documento)
    all_words = set(word for doc in documents for word in doc.keys())
    idf = {}
    total_documents = len(documents)
    for word in all_words:
        # Calcula el número de documentos que contienen el término
        doc_count = sum(1 for doc in documents if word in doc)
        idf[word] = math.log(total_documents / (1 + doc_count))
    return idf

def calculate_tf_idf(tf, idf):
    # Calcula el tf-idf para un término
    tf_idf = {word: tf_value * idf.get(word, 0) for word, tf_value in tf.items()}
    return tf_idf

In [7]:
def search(query, index, idf, stop_words_list):
    # Busca documentos relevantes para la consulta utilizando el modelo vectorial
    query_tf = calculate_tf(query, stop_words_list)
    query_tf_idf = calculate_tf_idf(query_tf, idf)
    scores = {}
    for docid, doc_info in index.items():
        doc_tf = doc_info['terms']
        score = sum(query_tf_idf.get(term, 0) * doc_tf.get(term, 0) for term in query_tf_idf.keys())
        scores[docid] = score
    # Ordena los documentos por relevancia (puntuación)
    ranked_documents = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    # Obtiene el filepath para cada docid
    results = []
    for docid, score in ranked_documents:
        doc_info = index[docid]
        filepath = doc_info['filepath']
        results.append({'docid': docid, 'filepath': filepath, 'score': score})
    return results

In [8]:
# Construye el índice
index = build_index(DIR, stop_words_list)

In [9]:
# Calcula la IDF para los documentos
idf = calculate_idf([doc_info['terms'] for doc_info in index.values()])

In [10]:
# Consulta de ejemplo
query1 = "software"
query2 = "tecnology in the high school"
query3 = "ideas for design clothes"
query4 = "why the sky is blue"
query5 = "Alter ego of batman"

In [11]:
# Busca documentos relevantes para la consulta
results_q1 = search(query1, index, idf, stop_words_list)
results_q2 = search(query2, index, idf, stop_words_list)
results_q3 = search(query3, index, idf, stop_words_list)
results_q4 = search(query4, index, idf, stop_words_list)
results_q5 = search(query5, index, idf, stop_words_list)

vector_q1_My_Soft = [item['docid'] for item in results_q1]
vector_q2_My_Soft = [item['docid'] for item in results_q2]
vector_q3_My_Soft = [item['docid'] for item in results_q3]
vector_q4_My_Soft = [item['docid'] for item in results_q4]
vector_q5_My_Soft = [item['docid'] for item in results_q5]

In [12]:
br_TF_IDF = pt.BatchRetrieve(index_tr, wmodel="TF_IDF", num_results=50, metadata=["docno", "filename"])
vector_q1_TF_IDF = br_TF_IDF.search(query1)["docno"]
vector_q2_TF_IDF = br_TF_IDF.search(query2)["docno"]
vector_q3_TF_IDF = br_TF_IDF.search(query3)["docno"]
vector_q4_TF_IDF = br_TF_IDF.search(query4)["docno"]
vector_q5_TF_IDF = br_TF_IDF.search(query5)["docno"]

In [13]:
coef_spearman10_q1, _ = scipy.stats.spearmanr(vector_q1_My_Soft[:10], vector_q1_TF_IDF[:10])
coef_spearman25_q1, _ = scipy.stats.spearmanr(vector_q1_My_Soft[:25], vector_q1_TF_IDF[:25])
coef_spearman50_q1, _ = scipy.stats.spearmanr(vector_q1_My_Soft[:50], vector_q1_TF_IDF[:50])

coef_spearman10_q2, _ = scipy.stats.spearmanr(vector_q2_My_Soft[:10], vector_q2_TF_IDF[:10])
coef_spearman25_q2, _ = scipy.stats.spearmanr(vector_q2_My_Soft[:25], vector_q2_TF_IDF[:25])
coef_spearman50_q2, _ = scipy.stats.spearmanr(vector_q2_My_Soft[:50], vector_q2_TF_IDF[:50])

coef_spearman10_q3, _ = scipy.stats.spearmanr(vector_q3_My_Soft[:10], vector_q3_TF_IDF[:10])
coef_spearman25_q3, _ = scipy.stats.spearmanr(vector_q3_My_Soft[:25], vector_q3_TF_IDF[:25])
coef_spearman50_q3, _ = scipy.stats.spearmanr(vector_q3_My_Soft[:50], vector_q3_TF_IDF[:50])

coef_spearman10_q4, _ = scipy.stats.spearmanr(vector_q4_My_Soft[:10], vector_q4_TF_IDF[:10])
coef_spearman25_q4, _ = scipy.stats.spearmanr(vector_q4_My_Soft[:25], vector_q4_TF_IDF[:25])
coef_spearman50_q4, _ = scipy.stats.spearmanr(vector_q4_My_Soft[:50], vector_q4_TF_IDF[:50])

coef_spearman10_q5, _ = scipy.stats.spearmanr(vector_q5_My_Soft[:10], vector_q5_TF_IDF[:10])
coef_spearman25_q5, _ = scipy.stats.spearmanr(vector_q5_My_Soft[:25], vector_q5_TF_IDF[:25])
coef_spearman50_q5, _ = scipy.stats.spearmanr(vector_q5_My_Soft[:50], vector_q5_TF_IDF[:50])

dataframes = [
    pd.DataFrame([coef_spearman10_q1, coef_spearman25_q1, coef_spearman50_q1], index=["@10", "@25", "@50"], columns=["Coef. Corr. q1"]),
    pd.DataFrame([coef_spearman10_q2, coef_spearman25_q2, coef_spearman50_q2], index=["@10", "@25", "@50"], columns=["Coef. Corr. q2"]),
    pd.DataFrame([coef_spearman10_q3, coef_spearman25_q3, coef_spearman50_q3], index=["@10", "@25", "@50"], columns=["Coef. Corr. q3"]),
    pd.DataFrame([coef_spearman10_q4, coef_spearman25_q4, coef_spearman50_q4], index=["@10", "@25", "@50"], columns=["Coef. Corr. q4"]),
    pd.DataFrame([coef_spearman10_q5, coef_spearman25_q5, coef_spearman50_q5], index=["@10", "@25", "@50"], columns=["Coef. Corr. q5"])
]

df_merged = pd.concat(dataframes, axis=1)
df_merged

Unnamed: 0,Coef. Corr. q1,Coef. Corr. q2,Coef. Corr. q3,Coef. Corr. q4,Coef. Corr. q5
@10,-0.333333,-0.054545,0.163636,-0.284848,0.406061
@25,-0.194615,-0.051538,0.023846,-0.153077,0.196923
@50,-0.041537,0.03952,0.263673,0.083986,0.143529


No se porque da negativo, no entiendo esos numeros.