In [None]:
!gdown 1N5V40HQsnCHHytdmG98YfXvN_3DCjV67

Downloading...
From (original): https://drive.google.com/uc?id=1N5V40HQsnCHHytdmG98YfXvN_3DCjV67
From (redirected): https://drive.google.com/uc?id=1N5V40HQsnCHHytdmG98YfXvN_3DCjV67&confirm=t&uuid=2c134875-ca2a-4d14-8377-49b0c1e02f09
To: /content/dataframe_hamshahri2.tar
100% 160M/160M [00:02<00:00, 55.5MB/s]


In [None]:
!tar -xf dataframe_hamshahri2.tar

In [None]:
import pandas as pd

df_docs=pd.read_csv('/content/hamshahri2/docs.csv.gz', compression='gzip')
df_judgments=pd.read_csv('/content/hamshahri2/judgments_dataframe.csv.gz', compression='gzip')
df_queries_fa=pd.read_csv('/content/hamshahri2/queries_fa.csv.gz', compression='gzip')

In [None]:
qrels={}
for item in df_judgments[df_judgments['relevancy'] == 1].groupby('query_id')[['doc_id', 'relevancy']]:
  qrels[str(item[0])]={val[0]:int(val[1]) for val in zip(item[1]['doc_id'],item[1]['relevancy'])}

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Tuple

class RankerTFIDF:

    def __init__(self, docs) -> None:
        self.docs = docs
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.docs)

    def search(self, query: str, k: int = 5) -> List[Tuple[float, str]]:
        query_vec = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
        top_indices = similarities.argsort()[-k:][::-1]
        return [
            (similarities[i], self.docs[i])
            for i in top_indices
        ]

    def batch_search(
        self,
        queries: List[str],
        k: int = 5
    ) -> Dict[str, List[Tuple[int, float]]]:

        query_vecs = self.vectorizer.transform(queries)
        sim_matrix = cosine_similarity(query_vecs, self.tfidf_matrix)

        results = {}
        for i, query in enumerate(queries):
            similarities = sim_matrix[i]
            top_indices = similarities.argsort()[-k:][::-1]
            results[query] = [
                (doc_id, float(similarities[doc_id]))
                for doc_id in top_indices
            ]
        return results

In [None]:
!pip install beir

Collecting beir
  Downloading beir-2.2.0-py3-none-any.whl.metadata (28 kB)
Collecting pytrec-eval-terrier (from beir)
  Downloading pytrec_eval_terrier-0.5.10-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Downloading beir-2.2.0-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytrec_eval_terrier-0.5.10-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (304 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.8/304.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytrec-eval-terrier, beir
Successfully installed beir-2.2.0 pytrec-eval-terrier-0.5.10


In [None]:
from beir.retrieval.evaluation import EvaluateRetrieval
import numpy as np

In [None]:
def get_metrics_dataframe(metrics):
  data = []
  for metric_group in metrics:
      for metric_name, score in metric_group.items():
          metric, k = metric_name.split('@')
          data.append({
              'Metric': metric,
              'k': int(k),
              'Score': score
          })
  df = pd.DataFrame(data)
  return df.pivot(index='k', columns='Metric', values='Score').reset_index()

In [52]:
def evaluate_docs_and_save(doc_series, df_docs, df_queries_fa, qrels, filename):
    print(f"\nRunning experiment for: {filename}")

    scorer = RankerTFIDF(doc_series)

    queries=list(df_queries_fa.TITLE.values)
    results = scorer.batch_search(queries, k=1000)

    ranked_results={}
    i=0
    for query, doc_scores in results.items():
      i=i+1
      ranked_results[str(i)]={df_docs.at[doc_id,'DOCID']:float(score) for doc_id, score in doc_scores}

    top_k_values=[1, 3, 5, 10, 100, 1000]
    metrics = EvaluateRetrieval.evaluate(qrels, ranked_results, top_k_values)

    metrics_df = get_metrics_dataframe(metrics)
    metrics_df.to_csv(filename, index=False)
    print(f"Saved: {filename}")

    # ← اضافه شده: بازگرداندن metrics برای استفاده بعدی
    return metrics

    # ---- Experiment 1: TITLE ----
metrics_title = evaluate_docs_and_save(
    df_docs["TITLE"],
    df_docs,
    df_queries_fa,
    qrels,
    "metrics_title.csv"
)

# ---- Experiment 2: TEXT ----
metrics_text = evaluate_docs_and_save(
    df_docs["TEXT"],
    df_docs,
    df_queries_fa,
    qrels,
    "metrics_text.csv"
)

# ---- Experiment 3: TITLE + TEXT ----
metrics_title_text = evaluate_docs_and_save(
    df_docs["TITLE"].fillna("") + " " + df_docs["TEXT"].fillna(""),
    df_docs,
    df_queries_fa,
    qrels,
    "metrics_title_text.csv"
)




Running experiment for: metrics_title.csv
Saved: metrics_title.csv

Running experiment for: metrics_text.csv
Saved: metrics_text.csv

Running experiment for: metrics_title_text.csv
Saved: metrics_title_text.csv


In [55]:
from IPython.display import display

# === SHOW RESULTS FOR PART 1 ===
print("=== RESULTS: PART 1 (Title / Text / Title+Text) ===")

print("\n--- TITLE ---")
metrics_df_title = get_metrics_dataframe(metrics_title)
display(metrics_df_title)

print("\n--- TEXT ---")
metrics_df_text = get_metrics_dataframe(metrics_text)
display(metrics_df_text)

print("\n--- TITLE + TEXT ---")
metrics_df_title_text = get_metrics_dataframe(metrics_title_text)
display(metrics_df_title_text)


=== RESULTS: PART 1 (Title / Text / Title+Text) ===

--- TITLE ---


Metric,k,MAP,NDCG,P,Recall
0,1,0.00641,0.48,0.48,0.00641
1,3,0.01711,0.52614,0.53333,0.02006
2,5,0.02576,0.53291,0.54,0.0327
3,10,0.04043,0.4962,0.484,0.05555
4,100,0.113,0.31309,0.2216,0.2221
5,1000,0.14576,0.40397,0.05,0.46653



--- TEXT ---


Metric,k,MAP,NDCG,P,Recall
0,1,0.0059,0.52,0.52,0.0059
1,3,0.0149,0.50816,0.50667,0.01684
2,5,0.0215,0.4951,0.488,0.02591
3,10,0.03488,0.47853,0.468,0.04732
4,100,0.12723,0.35148,0.2698,0.25076
5,1000,0.17236,0.43379,0.05658,0.50316



--- TITLE + TEXT ---


Metric,k,MAP,NDCG,P,Recall
0,1,0.0072,0.58,0.58,0.0072
1,3,0.01864,0.59061,0.59333,0.02154
2,5,0.02636,0.55148,0.536,0.03143
3,10,0.04122,0.51218,0.488,0.05305
4,100,0.15564,0.3925,0.3006,0.28548
5,1000,0.22808,0.54029,0.07252,0.65865


In [None]:
!pip install parsivar

Collecting parsivar
  Downloading parsivar-0.2.3.1-py3-none-any.whl.metadata (242 bytes)
Downloading parsivar-0.2.3.1-py3-none-any.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: parsivar
Successfully installed parsivar-0.2.3.1


In [None]:
from parsivar import Normalizer, Tokenizer

In [54]:
print("\n=== PART 2: Normalizer + Tokenizer on best system (TITLE+TEXT) ===\n")

# best option = TITLE + TEXT
best_docs = df_docs["TITLE"].fillna("") + " " + df_docs["TEXT"].fillna("")

normalizer = Normalizer()
tokenizer = Tokenizer()

def normalize_and_tokenize(text):
    t = normalizer.normalize(text)
    tokens = tokenizer.tokenize_words(t)
    return " ".join(tokens)

print("Applying Parsivar Normalizer + Tokenizer...")

best_docs_norm_tok = best_docs.apply(normalize_and_tokenize)

print("Done preprocessing. Now evaluating...")

evaluate_docs_and_save(
    best_docs_norm_tok,
    df_docs,
    df_queries_fa,
    qrels,
    "metrics_title_text_norm_tok.csv"
)

print("\n=== PART 2 DONE: Saved metrics_title_text_norm_tok.csv ===\n")


=== PART 2: Normalizer + Tokenizer on best system (TITLE+TEXT) ===

Applying Parsivar Normalizer + Tokenizer...
Done preprocessing. Now evaluating...

Running experiment for: metrics_title_text_norm_tok.csv
Saved: metrics_title_text_norm_tok.csv

=== PART 2 DONE: Saved metrics_title_text_norm_tok.csv ===



In [56]:
import pandas as pd

csv_file = "metrics_title_text_norm_tok.csv"

metrics_df = pd.read_csv(csv_file)

metrics_df


Unnamed: 0,k,MAP,NDCG,P,Recall
0,1,0.00313,0.26,0.26,0.00313
1,3,0.00651,0.23754,0.23333,0.00764
2,5,0.00974,0.23208,0.228,0.01227
3,10,0.01744,0.22773,0.224,0.02412
4,100,0.06248,0.17944,0.1356,0.13726
5,1000,0.09437,0.28195,0.04016,0.37327


In [None]:
from collections import Counter

print("\n=== PART 3: Stopword Extraction & Removal ===\n")

docs_tokens = best_docs_norm_tok.copy()


print("Counting word frequencies...")

all_tokens = []
for text in docs_tokens:
    all_tokens.extend(text.split())

freq = Counter(all_tokens)

print("Vocabulary size:", len(freq))


def remove_top_k_stopwords(text, stopwords):
    return " ".join([w for w in text.split() if w not in stopwords])


k_values = [100, 500, 1000]

for k in k_values:

    print(f"\nProcessing stopwords for k = {k} ...")

    stopwords_k = set([w for w, _ in freq.most_common(k)])

    docs_cleaned = docs_tokens.apply(lambda t: remove_top_k_stopwords(t, stopwords_k))

    print("Re-evaluating system with stopword removal...")

    out_file = f"metrics_stopwords_{k}.csv"

    evaluate_docs_and_save(
        docs_cleaned,
        df_docs,
        df_queries_fa,
        qrels,
        out_file
    )

    print(f"Saved: {out_file}")

print("\n=== PART 3 DONE ===\n")


=== PART 3: Stopword Extraction & Removal ===

Counting word frequencies...
Vocabulary size: 523764

Processing stopwords for k = 100 ...
Re-evaluating system with stopword removal...

Running experiment for: metrics_stopwords_100.csv
Saved: metrics_stopwords_100.csv
Saved: metrics_stopwords_100.csv

Processing stopwords for k = 500 ...
Re-evaluating system with stopword removal...

Running experiment for: metrics_stopwords_500.csv
Saved: metrics_stopwords_500.csv
Saved: metrics_stopwords_500.csv

Processing stopwords for k = 1000 ...
Re-evaluating system with stopword removal...

Running experiment for: metrics_stopwords_1000.csv
Saved: metrics_stopwords_1000.csv
Saved: metrics_stopwords_1000.csv

=== PART 3 DONE ===



In [None]:
from IPython.display import display

# === SHOW RESULTS FOR PART 3 ===
print("=== RESULTS: PART 3 (Stopword Removal k=100,500,1000) ===")

print("\n--- k = 100 ---")
metrics_sw_100 = pd.read_csv("metrics_stopwords_100.csv")
display(metrics_sw_100)

print("\n--- k = 500 ---")
metrics_sw_500 = pd.read_csv("metrics_stopwords_500.csv")
display(metrics_sw_500)

print("\n--- k = 1000 ---")
metrics_sw_1000 = pd.read_csv("metrics_stopwords_1000.csv")
display(metrics_sw_1000)


=== RESULTS: PART 3 (Stopword Removal k=100,500,1000) ===

--- k = 100 ---


Unnamed: 0,k,MAP,NDCG,P,Recall
0,1,0.0031,0.26,0.26,0.0031
1,3,0.00671,0.24224,0.24,0.00773
2,5,0.0101,0.23052,0.224,0.01223
3,10,0.01784,0.22847,0.224,0.02357
4,100,0.0628,0.17749,0.1328,0.13489
5,1000,0.09365,0.27643,0.03916,0.36376



--- k = 500 ---


Unnamed: 0,k,MAP,NDCG,P,Recall
0,1,0.00203,0.18,0.18,0.00203
1,3,0.00531,0.18123,0.18,0.00642
2,5,0.00806,0.18119,0.18,0.01012
3,10,0.01475,0.17948,0.178,0.01966
4,100,0.05246,0.14602,0.112,0.11402
5,1000,0.08065,0.24157,0.0341,0.3272



--- k = 1000 ---


Unnamed: 0,k,MAP,NDCG,P,Recall
0,1,0.00157,0.14,0.14,0.00157
1,3,0.00405,0.14123,0.14,0.00471
2,5,0.0063,0.14935,0.152,0.00787
3,10,0.01194,0.15063,0.154,0.01643
4,100,0.04869,0.13131,0.1048,0.10415
5,1000,0.07633,0.21073,0.03084,0.28031


In [65]:

homework_id = 'hm01'

best_metrics_df = pd.read_csv("metrics_title_text_norm_tok.csv")

best_csv_file = f"{homework_id}_best.csv"
best_metrics_df.to_csv(best_csv_file, index=False)
print(f"{best_csv_file} saved.")

!rm -rf irws

import getpass
your_github_id = getpass.getpass("Enter GitHub PAT (hidden): ")
your_github_name = input("Enter GitHub username: ")
your_email = input('Please enter your email: ')

!rm -rf irws
!git clone https://{your_github_name}:{your_github_id}@github.com/{your_github_name}/irws


!cp hm01_best.csv irws/irws_hm01_best_value.csv

%cd irws
!git add irws_hm01_best_value.csv
!git config user.email "{your_email}"
!git config user.name "{your_github_name}"
!git commit -m "hm01: add best results"
!git push
%cd /



from google.colab import files
uploaded = files.upload()
nb = list(uploaded.keys())[0]

!cp "{nb}" irws/irws_hm01.ipynb
%cd irws
!git add irws_hm01.ipynb
!git commit -m "hm01: add notebook"
!git push
%cd /


hm01_best.csv saved.
Enter GitHub PAT (hidden): ··········
Enter GitHub username: mohamadzanjani
Please enter your email: mmmn.zanjani@gmail.com
Cloning into 'irws'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 3 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.
/irws
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
Everything up-to-date
/


Saving IRWS_HM01.ipynb to IRWS_HM01 (2).ipynb
/irws
[main 09180fb] hm01: add notebook
 1 file changed, 3558 insertions(+)
 create mode 100644 irws_hm01.ipynb
Enumerating objects: 4, done.
Counting objects: 100% (4/4), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 17.39 KiB | 4.35 MiB/s, done.
Total 3 (delta 0), reused 0 (delta 0), pack-reused 0
remote: [1;31merror[m: GH013: Repository rule violations found for refs/heads/main.[K
remote: 
remote: - GITHUB PUSH PROTECTION[K
remote:   —————————————————————————————————————————[K
remote:     Resolve the following violations before pushing again[K
remote: 
remote:     - Push cannot contain secrets[K
remote: 
remote:     [K
remote:      (?) Learn how to resolve a blocked push[K
remote:      https://docs.github.com/code-security/secret-scanning/working-with-secret-scanning-and-push-protection/working-with-push-protection-from-the-command-line#resolving-a-blocked-push