In [1]:
pip install ir_datasets

Collecting ir_datasets
  Downloading ir_datasets-0.5.7-py3-none-any.whl (337 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.9/337.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting inscriptis>=2.2.0 (from ir_datasets)
  Downloading inscriptis-2.5.0-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting trec-car-tools>=2.5.4 (from ir_datasets)
  Downloading trec_car_tools-2.6-py3-none-any.whl (8.4 kB)
Collecting lz4>=3.1.10 (from ir_datasets)
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting warc3-wet>=0.2.3 (from ir_datasets)
  Downloading warc3_wet-0.2.3-py3-none-any.whl (13 kB)
Collecting warc3-wet-clueweb09>=0.2.5 (from ir_datasets)
  Downloading warc3-wet-clueweb09-0.2.5.tar.

In [2]:
import ir_datasets
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [3]:
import nltk
# Fetch the NLTK data packages this notebook needs before any text processing
# runs: the stopword corpus (read by stopwords.words('english')) and 'punkt'.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Load data: obtain handles for both corpora by their ir_datasets identifiers.
# The documents themselves are iterated later through docs_iter().
dataset1 = ir_datasets.load('antique/test/non-offensive')
dataset2 = ir_datasets.load('beir/quora/test')

In [5]:
# Shared helpers read by preprocess_text: one Porter stemmer instance and the
# English stopword list, held in a set for fast membership tests.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [6]:
# preprocess data
def preprocess_text(text):
    """Normalise a raw string into a space-separated sequence of stems.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, discard tokens
    present in the module-level ``stop_words`` set, and stem the remaining
    tokens with the module-level Porter stemmer ``ps``.

    Args:
        text: The raw input string.

    Returns:
        A single string with the surviving stems joined by one space.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    stems = []
    for token in word_tokenize(cleaned):
        # Stopwords are filtered on the unstemmed token.
        if token in stop_words:
            continue
        stems.append(ps.stem(token))
    return ' '.join(stems)

In [7]:
# process_documents
def process_documents(dataset):
    """Preprocess every document yielded by ``dataset.docs_iter()``.

    Args:
        dataset: Object exposing ``docs_iter()``; each yielded item must have
            ``doc_id`` and ``text`` attributes.

    Returns:
        A list of dicts, one per document in iteration order, with keys
        ``'doc_id'`` (the original id) and ``'text'`` (the output of
        ``preprocess_text`` for that document's text).
    """
    return [
        {'doc_id': record.doc_id, 'text': preprocess_text(record.text)}
        for record in dataset.docs_iter()
    ]

In [8]:
# Preprocess every document of both corpora once, up front. The resulting lists
# of {'doc_id', 'text'} dicts are reused by the TF-IDF and ranking cells.
documents1 = process_documents(dataset1)
documents2 = process_documents(dataset2)

[INFO] Please confirm you agree to the authors' data usage agreement found at <https://ciir.cs.umass.edu/downloads/Antique/readme.txt>
[INFO] If you have a local copy of https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/684f7015aff377062a758e478476aac8
[INFO] [starting] https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt
[INFO] [finished] https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: [00:01] [93.6MB] [67.2MB/s]
[INFO] [starting] building docstore
[INFO] [starting] opening zip file
[INFO] If you have a local copy of https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/quora.zip, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/18fb154900ba42a600f84b839c173167
[INFO] [starting] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/quora.zip
docs_iter:   0%|                                  

In [9]:
# TF-IDF
def create_tfidf_vectors(documents):
    """Fit a fresh TF-IDF vectorizer on the documents and vectorize them.

    Args:
        documents: Sequence of dicts that each carry a ``'text'`` entry
            (already preprocessed text).

    Returns:
        Tuple ``(vectorizer, vectors)``: the fitted ``TfidfVectorizer`` and the
        result of ``fit_transform`` over the texts, in input order.
    """
    corpus = []
    for entry in documents:
        corpus.append(entry['text'])

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(corpus)
    return tfidf, matrix

In [10]:
# Each corpus gets its own fitted vectorizer; a query must be transformed with
# the vectorizer that belongs to the matrix it is compared against.
vectorizer1, doc_vectors1 = create_tfidf_vectors(documents1)
vectorizer2, doc_vectors2 = create_tfidf_vectors(documents2)

In [11]:
# Sanity check on the TF-IDF matrices: first dimension is reported as the
# document count, second as the feature count.
print(f"Dataset 1: {doc_vectors1.shape[0]} documents, {doc_vectors1.shape[1]} features")
print(f"Dataset 2: {doc_vectors2.shape[0]} documents, {doc_vectors2.shape[1]} features")

Dataset 1: 403666 documents, 237358 features
Dataset 2: 522931 documents, 85297 features


In [12]:
#process_query
def process_query(query, vectorizer):
    """Vectorize a raw query with an already-fitted vectorizer.

    The query goes through ``preprocess_text`` (the same normalisation applied
    to the documents) before being transformed.

    Args:
        query: Raw query string.
        vectorizer: Fitted object exposing ``transform``.

    Returns:
        The result of ``vectorizer.transform`` on a one-element list holding
        the preprocessed query.
    """
    return vectorizer.transform([preprocess_text(query)])

In [None]:
# match_and_rank
#def match_and_rank(query, vectorizer, doc_vectors):
#    query_vector = process_query(query, vectorizer)
#    scores = cosine_similarity(query_vector, doc_vectors).flatten()
#    ranked_doc_indices = scores.argsort()[::-1]
#    return ranked_doc_indices, scores

In [13]:
def match_and_rank(query, vectorizer, doc_vectors, documents):
    """Rank all documents by cosine similarity to the query.

    Args:
        query: Raw query string.
        vectorizer: Fitted vectorizer used to embed the query.
        doc_vectors: Document matrix compared against the query vector.
        documents: Sequence of dicts with a ``'doc_id'`` entry; position ``i``
            is looked up for row ``i`` of ``doc_vectors``, so the caller must
            pass the list the matrix was built from.

    Returns:
        A list of ``(doc_id, score)`` tuples covering every document, ordered
        by descending score.
    """
    similarity = cosine_similarity(process_query(query, vectorizer), doc_vectors).flatten()

    ranking = []
    # argsort is ascending; reversing it yields best-first order.
    for position in similarity.argsort()[::-1]:
        ranking.append((documents[position]['doc_id'], similarity[position]))
    return ranking

In [14]:
# Test query: one free-text example that is ranked against both corpora.
search_query = "how to improve search engine performance"

In [None]:
# similarity TF-IDF
#ranked_indices1, scores1 = match_and_rank(search_query, vectorizer1, doc_vectors1)
#ranked_indices2, scores2 = match_and_rank(search_query, vectorizer2, doc_vectors2)

In [15]:
# TF-IDF similarity: rank every document of each corpus against the query,
# passing the vectorizer, matrix and document list that belong to that corpus.
ranked_docs1 = match_and_rank(search_query, vectorizer1, doc_vectors1, documents1)
ranked_docs2 = match_and_rank(search_query, vectorizer2, doc_vectors2, documents2)

In [None]:
#print("TF-IDF Ranking Results for Dataset 1:")
#for idx in ranked_indices1[:10]:
#    print(f"Document ID: {documents1[idx]['doc_id']}, Score: {scores1[idx]}")

TF-IDF Ranking Results for Dataset 1:
Document ID: 2538279_4, Score: 0.6402472367428559
Document ID: 3774613_1, Score: 0.5390514687446829
Document ID: 296379_5, Score: 0.509534647726315
Document ID: 2075461_1, Score: 0.48487446772435294
Document ID: 140691_1, Score: 0.48487446772435294
Document ID: 1018545_3, Score: 0.4842181881021651
Document ID: 4295697_1, Score: 0.4592582673175454
Document ID: 547868_3, Score: 0.45561109289893525
Document ID: 2877208_3, Score: 0.4314629214060672
Document ID: 4431784_1, Score: 0.42441823698800113


In [16]:
# ranked_docs1 is ordered best-first, so the first 10 entries are the top hits.
print("TF-IDF Ranking Results for Dataset 1:")
for doc_id, score in ranked_docs1[:10]:
    print(f"Document ID: {doc_id}, Score: {score}")

TF-IDF Ranking Results for Dataset 1:
Document ID: 4237774_1, Score: 0.6710499975277999
Document ID: 3493286_2, Score: 0.5881821964165571
Document ID: 1698674_2, Score: 0.5732694835610638
Document ID: 4115722_0, Score: 0.5246257305238725
Document ID: 607425_0, Score: 0.5239244655227959
Document ID: 4431784_1, Score: 0.5035777198769322
Document ID: 2076701_3, Score: 0.5030860281142819
Document ID: 2344295_4, Score: 0.4955479165590613
Document ID: 1759207_3, Score: 0.47951935092804154
Document ID: 2075461_1, Score: 0.4746355297580474


In [17]:
# ranked_docs2 is ordered best-first, so the first 10 entries are the top hits.
print("TF-IDF Ranking Results for Dataset 2:")
for doc_id, score in ranked_docs2[:10]:
    print(f"Document ID: {doc_id}, Score: {score}")

TF-IDF Ranking Results for Dataset 2:
Document ID: 67268, Score: 0.7385739697136127
Document ID: 482830, Score: 0.6884830150245347
Document ID: 89570, Score: 0.6884830150245347
Document ID: 89571, Score: 0.6884830150245347
Document ID: 169139, Score: 0.6741724492008523
Document ID: 67269, Score: 0.6305590549141409
Document ID: 404651, Score: 0.6305590549141409
Document ID: 482831, Score: 0.6268292244597347
Document ID: 404833, Score: 0.6242788042971892
Document ID: 156260, Score: 0.6242788042971892


In [None]:
#print("TF-IDF Ranking Results for Dataset 2:")
#for idx in ranked_indices2[:10]:
#    print(f"Document ID: {documents2[idx]['doc_id']}, Score: {scores2[idx]}")

TF-IDF Ranking Results for Dataset 2:
Document ID: 233080, Score: 0.6938439099533338
Document ID: 210216, Score: 0.6923011121673248
Document ID: 233081, Score: 0.6773129463878851
Document ID: 256946, Score: 0.6670372859153133
Document ID: 108503, Score: 0.6134935471549455
Document ID: 108504, Score: 0.6134935471549455
Document ID: 59515, Score: 0.6083386508660562
Document ID: 59516, Score: 0.6010703833537651
Document ID: 263692, Score: 0.5509190665897268
Document ID: 417903, Score: 0.5440910539548532


In [22]:
# Documents Clustering using K-Means
def cluster_documents(doc_vectors, num_clusters=5, random_state=42):
    """Cluster document vectors with K-Means.

    Args:
        doc_vectors: Feature matrix handed to ``KMeans.fit``.
        num_clusters: Number of clusters to form.
        random_state: Seed forwarded to ``KMeans`` so that centroid
            initialisation, and therefore the resulting labels, are
            reproducible across kernel restarts. Pass ``None`` to get the
            previous unseeded behaviour.

    Returns:
        Tuple ``(labels, centers)`` taken from the fitted model's ``labels_``
        and ``cluster_centers_`` attributes.
    """
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    kmeans.fit(doc_vectors)
    return kmeans.labels_, kmeans.cluster_centers_

In [23]:
# Cluster each corpus independently, using the same number of clusters for both.
num_clusters = 5
labels1, centers1 = cluster_documents(doc_vectors1, num_clusters)
labels2, centers2 = cluster_documents(doc_vectors2, num_clusters)

In [24]:
# Display Clustering Results
#def display_clusters(labels, documents, num_clusters):
#    clusters = {i: [] for i in range(num_clusters)}
#    for label, doc in zip(labels, documents):
#        clusters[label].append(doc['doc_id'])
#    return clusters

In [25]:
#clusters1 = display_clusters(labels1, documents1, num_clusters)
#clusters2 = display_clusters(labels2, documents2, num_clusters)

In [26]:
# print("\nClustering Results for Dataset 1:")
# for cluster_id, doc_ids in clusters1.items():
#     print(f"Cluster {cluster_id}: {doc_ids}")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# print("\nClustering Results for Dataset 2:")
# for cluster_id, doc_ids in clusters2.items():
#     print(f"Cluster {cluster_id}: {doc_ids}")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [27]:
from collections import defaultdict

In [28]:
def group_documents_by_cluster(labels, documents=None):
    """Group cluster members by their cluster label.

    Args:
        labels: Iterable of cluster labels, one per document, in document order.
        documents: Optional sequence of dicts with a ``'doc_id'`` entry, aligned
            with ``labels``. When given, each member is recorded by its real
            ``doc_id``; when omitted, members are recorded by their position in
            ``labels`` (the original behaviour).

    Returns:
        A ``defaultdict(list)`` mapping each label to its members, with members
        listed in input order.

    Raises:
        ValueError: If ``documents`` is given and its length differs from
            ``labels``.
    """
    if documents is not None and len(documents) != len(labels):
        raise ValueError(
            f"documents has {len(documents)} entries but labels has {len(labels)}"
        )

    clusters = defaultdict(list)
    for position, cluster_id in enumerate(labels):
        # A position is only an index into the label array, not a dataset id;
        # resolve it to the real id when the document list is available.
        member = position if documents is None else documents[position]['doc_id']
        clusters[cluster_id].append(member)
    return clusters

In [29]:
# Map each cluster label to its members; called with labels only, so members
# are recorded as positions within labels1 / labels2.
clusters1 = group_documents_by_cluster(labels1)
clusters2 = group_documents_by_cluster(labels2)

In [30]:
def print_cluster_summary(clusters, dataset_name, sample_size=10):
    """Print the size of each cluster and a short sample of its members.

    Only a slice of each member list is printed, which keeps the output small
    even for clusters with hundreds of thousands of members.

    Args:
        clusters: Mapping from cluster id to a list of members.
        dataset_name: Label inserted into the heading line.
        sample_size: Maximum number of members shown per cluster.

    Returns:
        None. Output goes to stdout; clusters appear in the mapping's
        iteration order.
    """
    print(f"\nClustering Results for {dataset_name}:")
    for cluster_id, doc_ids in clusters.items():
        print(f"Cluster {cluster_id}: {len(doc_ids)} documents")
        print(f"Sample document IDs: {doc_ids[:sample_size]}")

In [32]:
# Report cluster sizes and a short member sample for both corpora.
print_cluster_summary(clusters1, "Dataset 1")
print_cluster_summary(clusters2, "Dataset 2")


Clustering Results for Dataset 1:
Cluster 0: 264182 documents
Sample document IDs: [0, 1, 4, 5, 6, 8, 9, 10, 11, 12]
Cluster 3: 76396 documents
Sample document IDs: [2, 3, 7, 15, 17, 32, 47, 56, 60, 72]
Cluster 2: 35794 documents
Sample document IDs: [13, 20, 25, 43, 61, 70, 71, 83, 88, 91]
Cluster 4: 17429 documents
Sample document IDs: [69, 119, 128, 149, 153, 198, 232, 270, 280, 357]
Cluster 1: 9865 documents
Sample document IDs: [201, 267, 281, 293, 299, 303, 394, 395, 397, 541]

Clustering Results for Dataset 2:
Cluster 2: 446539 documents
Sample document IDs: [0, 1, 2, 3, 5, 6, 7, 8, 9, 10]
Cluster 0: 18276 documents
Sample document IDs: [4, 16, 17, 20, 78, 321, 322, 407, 436, 442]
Cluster 1: 13784 documents
Sample document IDs: [37, 70, 138, 166, 234, 235, 269, 334, 351, 352]
Cluster 4: 37159 documents
Sample document IDs: [38, 39, 55, 56, 67, 68, 114, 117, 131, 175]
Cluster 3: 7173 documents
Sample document IDs: [69, 224, 225, 320, 461, 574, 640, 712, 1142, 1145]
