# Docs

https://pyterrier.readthedocs.io

# Usage

In [1]:
import pandas as pd
import scipy
import numpy as np
import matplotlib.pyplot as plt
import pyterrier as pt
import os

if not pt.started():
  pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by pitta on 2024-04-03 16:14) and terrier-helper 0.0.8



### Indexing

In [2]:
files = pt.io.find_files("../wiki-small")
files[:10]

['../wiki-small\\en\\articles\\(\\1\\5\\(15810)_1994_JR1_9064.html',
 '../wiki-small\\en\\articles\\0\\_\\(\\0_(number).html',
 '../wiki-small\\en\\articles\\1\\,\\5\\1,5-anhydro-D-fructose_reductase_0c7b.html',
 '../wiki-small\\en\\articles\\1\\0\\0\\100th_Anniversary_deb0.html',
 '../wiki-small\\en\\articles\\1\\0\\4\\10440_van_Swinden_fef9.html',
 '../wiki-small\\en\\articles\\1\\0\\7\\10769_Minas_Gerais_900d.html',
 '../wiki-small\\en\\articles\\1\\0\\9\\109P_Swift-Tuttle_8861.html',
 '../wiki-small\\en\\articles\\1\\1\\1\\1110s_BC_4367.html',
 '../wiki-small\\en\\articles\\1\\1\\2\\1127_Mimi_6c39.html',
 '../wiki-small\\en\\articles\\1\\1\\5\\1150.html']

In [3]:
indexer = pt.FilesIndexer(os.path.abspath("./custom_index"), verbose=True, overwrite=True, meta={"docno": 20, "filename":512})
indexref = indexer.index(files)

21:23:46.117 [main] ERROR org.terrier.structures.indexing.Indexer - Could not rename index
java.io.IOException: Rename of index structure file 'f:\UNLu\Recuperacion de la Informacion\TPs code\TP2\Ejercicio 5\custom_index/data_1.direct.bf' (exists) to 'f:\UNLu\Recuperacion de la Informacion\TPs code\TP2\Ejercicio 5\custom_index/data.direct.bf' (exists) failed - likely that source file is still open. Possible indexing bug?
	at org.terrier.structures.IndexUtil.renameIndex(IndexUtil.java:379)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:388)


In [4]:
# load the index, print the statistics
index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

Number of documents: 6043
Number of terms: 164940
Number of postings: 1689407
Number of fields: 0
Number of tokens: 3128731
Field names: []
Positions:   false



### Retrieval

In [5]:
query1 = "software"
query2 = "tecnology in the high school"
query3 = "ideas for design clothes"
query4 = "why the sky is blue"
query5 = "Alter ego of batman"

In [6]:
# ad-hoc retrieval
br_TF_IDF = pt.BatchRetrieve(index, wmodel="TF_IDF", num_results=50, metadata=["docno", "filename"]) # wmodel="BM25", properties={"termpipelines" : "Stopwords,PorterStemmer"}
vector_q1_TF_IDF = br_TF_IDF.search(query1)["docno"]
vector_q2_TF_IDF = br_TF_IDF.search(query2)["docno"]
vector_q3_TF_IDF = br_TF_IDF.search(query3)["docno"]
vector_q4_TF_IDF = br_TF_IDF.search(query4)["docno"]
vector_q5_TF_IDF = br_TF_IDF.search(query5)["docno"]

In [7]:
# ad-hoc retrieval
br_BM25 = pt.BatchRetrieve(index, wmodel="BM25", num_results=50, metadata=["docno", "filename"]) # wmodel="BM25", properties={"termpipelines" : "Stopwords,PorterStemmer"}
vector_q1_BM25 = br_BM25.search(query1)["docno"]
vector_q2_BM25 = br_BM25.search(query2)["docno"]
vector_q3_BM25 = br_BM25.search(query3)["docno"]
vector_q4_BM25 = br_BM25.search(query4)["docno"]
vector_q5_BM25 = br_BM25.search(query5)["docno"]

In [8]:
coef_spearman10_q1, _ = scipy.stats.spearmanr(vector_q1_TF_IDF[:10], vector_q1_BM25[:10])
coef_spearman25_q1, _ = scipy.stats.spearmanr(vector_q1_TF_IDF[:25], vector_q1_BM25[:25])
coef_spearman50_q1, _ = scipy.stats.spearmanr(vector_q1_TF_IDF[:50], vector_q1_BM25[:50])

coef_spearman10_q2, _ = scipy.stats.spearmanr(vector_q2_TF_IDF[:10], vector_q2_BM25[:10])
coef_spearman25_q2, _ = scipy.stats.spearmanr(vector_q2_TF_IDF[:25], vector_q2_BM25[:25])
coef_spearman50_q2, _ = scipy.stats.spearmanr(vector_q2_TF_IDF[:50], vector_q2_BM25[:50])

coef_spearman10_q3, _ = scipy.stats.spearmanr(vector_q3_TF_IDF[:10], vector_q3_BM25[:10])
coef_spearman25_q3, _ = scipy.stats.spearmanr(vector_q3_TF_IDF[:25], vector_q3_BM25[:25])
coef_spearman50_q3, _ = scipy.stats.spearmanr(vector_q3_TF_IDF[:50], vector_q3_BM25[:50])

coef_spearman10_q4, _ = scipy.stats.spearmanr(vector_q4_TF_IDF[:10], vector_q4_BM25[:10])
coef_spearman25_q4, _ = scipy.stats.spearmanr(vector_q4_TF_IDF[:25], vector_q4_BM25[:25])
coef_spearman50_q4, _ = scipy.stats.spearmanr(vector_q4_TF_IDF[:50], vector_q4_BM25[:50])

coef_spearman10_q5, _ = scipy.stats.spearmanr(vector_q5_TF_IDF[:10], vector_q5_BM25[:10])
coef_spearman25_q5, _ = scipy.stats.spearmanr(vector_q5_TF_IDF[:25], vector_q5_BM25[:25])
coef_spearman50_q5, _ = scipy.stats.spearmanr(vector_q5_TF_IDF[:50], vector_q5_BM25[:50])

dataframes = [
    pd.DataFrame([coef_spearman10_q1, coef_spearman25_q1, coef_spearman50_q1], index=["@10", "@25", "@50"], columns=["Coef. Corr. q1"]),
    pd.DataFrame([coef_spearman10_q2, coef_spearman25_q2, coef_spearman50_q2], index=["@10", "@25", "@50"], columns=["Coef. Corr. q2"]),
    pd.DataFrame([coef_spearman10_q3, coef_spearman25_q3, coef_spearman50_q3], index=["@10", "@25", "@50"], columns=["Coef. Corr. q3"]),
    pd.DataFrame([coef_spearman10_q4, coef_spearman25_q4, coef_spearman50_q4], index=["@10", "@25", "@50"], columns=["Coef. Corr. q4"]),
    pd.DataFrame([coef_spearman10_q5, coef_spearman25_q5, coef_spearman50_q5], index=["@10", "@25", "@50"], columns=["Coef. Corr. q5"])
]

df_merged = pd.concat(dataframes, axis=1)
df_merged

Unnamed: 0,Coef. Corr. q1,Coef. Corr. q2,Coef. Corr. q3,Coef. Corr. q4,Coef. Corr. q5
@10,1.0,0.890909,0.769697,0.951515,1.0
@25,1.0,0.972308,0.52,0.832308,1.0
@50,1.0,0.988379,0.087635,0.598559,0.829244


Para las 5 querys, los primeros 10 puestos del ranking se comportan muy similares, mientras que en los primeros 25 la query 3 desvia la mitad de los documentos entre modelos, y finalmente para los 50, la query 3 solo coincide con 1 o 2 documentos entre modelos, la query 4 coinciden 60% de los documentos, las demas querys se comportan muy similares entre modelos.