In [10]:
!pip install kagglehub



In [11]:
import os

print(f"contents of {dataset_path}:")
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if 'cisi.all' in file.lower():
            print(f"found cisi.all at: {os.path.join(root, file)}")

contents of /kaggle/input/cisi-a-dataset-for-information-retrieval:
found cisi.all at: /kaggle/input/cisi-a-dataset-for-information-retrieval/CISI.ALL


In [12]:
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import kagglehub
import os

# --- 1. data loading and parsing ---
def parse_cisi_data(file_path):
    """parses cisi file into documents."""
    documents = []
    current_doc = {}
    content_key = None

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            if line.startswith('.I'):
                if current_doc:
                    doc_text = " ".join(current_doc.get('content', []))
                    documents.append({'id': current_doc['id'], 'text': doc_text})

                current_doc = {'id': int(line.split()[-1]), 'content': []}
                content_key = None

            elif line.startswith(('.T', '.A', '.W', '.B')):
                content_key = line.split()[0]

            elif content_key:
                current_doc['content'].append(line)

        if current_doc:
            doc_text = " ".join(current_doc.get('content', []))
            documents.append({'id': current_doc['id'], 'text': doc_text})

    return documents

# --- 2. custom tokenizer (preprocessing) ---
porter = PorterStemmer()

def custom_tokenizer(text):
    """
    tokenizes, lowercases, stems.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return [porter.stem(token) for token in tokens]

# --- 3. indexing and retrieval function ---

def create_index_and_retrieve(documents, query, k=5):
    """
    1. creates tf-idf index.
    2. processes query, retrieves top k documents.
    """
    doc_ids = [doc['id'] for doc in documents]
    doc_texts = [doc['text'] for doc in documents]

    vectorizer = TfidfVectorizer(
        tokenizer=custom_tokenizer,
        stop_words='english',
        use_idf=True,
        min_df=3,
        max_df=0.85
    )

    tfidf_matrix = vectorizer.fit_transform(doc_texts)

    print(f"total vocabulary size: {len(vectorizer.get_feature_names_out())}")
    print(f"tf-idf matrix shape: {tfidf_matrix.shape}")
    print("\n--- retrieval started ---")

    query_vector = vectorizer.transform([query])

    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    ranked_indices = np.argsort(similarity_scores)[::-1]

    results = []
    for i in ranked_indices[:k]:
        results.append({
            'rank': len(results) + 1,
            'doc_id': doc_ids[i],
            'score': similarity_scores[i]
        })

    return pd.DataFrame(results)

# --- 4. main execution ---
if __name__ == "__main__":
    cisi_documents = None

    try:
        print("downloading cisi dataset...")
        dataset_path = kagglehub.dataset_download("dmaso01dsta/cisi-a-dataset-for-information-retrieval")
        print(f"dataset downloaded to: {dataset_path}")

        cisi_file_path = os.path.join(dataset_path, 'CISI.ALL')

        if not os.path.exists(cisi_file_path):
            print(f"error: 'cisi.all' not found at {cisi_file_path}")
        else:
            cisi_documents = parse_cisi_data(cisi_file_path)
            print(f"successfully loaded {len(cisi_documents)} documents.")

    except Exception as e:
        print(f"an error occurred: {e}")

    if cisi_documents is not None:
        test_query = "what are the methods for automatic document indexing and retrieval systems?"
        print(f"\ntest query: {test_query}")

        retrieval_results = create_index_and_retrieve(cisi_documents, test_query, k=10)

        print("\n--- top 10 retrieval results ---")
        print(retrieval_results.to_markdown(index=False, floatfmt=".4f"))
    else:
        print("document loading failed. skipping retrieval.")

downloading cisi dataset...
Using Colab cache for faster access to the 'cisi-a-dataset-for-information-retrieval' dataset.
dataset downloaded to: /kaggle/input/cisi-a-dataset-for-information-retrieval
successfully loaded 1460 documents.

test query: what are the methods for automatic document indexing and retrieval systems?




total vocabulary size: 4189
tf-idf matrix shape: (1460, 4189)

--- retrieval started ---

--- top 10 retrieval results ---
|    rank |    doc_id |   score |
|--------:|----------:|--------:|
|  1.0000 | 1245.0000 |  0.2046 |
|  2.0000 | 1191.0000 |  0.2016 |
|  3.0000 | 1124.0000 |  0.1814 |
|  4.0000 | 1136.0000 |  0.1596 |
|  5.0000 |  571.0000 |  0.1564 |
|  6.0000 | 1190.0000 |  0.1354 |
|  7.0000 |  309.0000 |  0.1349 |
|  8.0000 |  341.0000 |  0.1295 |
|  9.0000 |  701.0000 |  0.1236 |
| 10.0000 | 1132.0000 |  0.1208 |
