In [4]:
!pip install transformers datasets evaluate accelerate



In [3]:
!pip install pylatexenc



In [5]:
# Mount Google Drive at /content/drive so the dataset files under "My Drive" are reachable.
# This only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Sanity check: list the dataset directory on the mounted drive to confirm the
# expected collection / query / aggregate files are present before evaluating.
from os import listdir
from os.path import join
dirfiles = listdir("/content/drive/My Drive/MathIRData/ARQMathAgg/dataset_v2/")
print(dirfiles)

['queries_train.tsv', 'collection_train.tsv', 'triples_train.jsonl', 'qrel_train', 'queries_test.tsv', 'qrel_test', 'triples_test.jsonl', '.gitignore', 'collection_test.tsv', 'aggregates']


In [2]:
!pip install ijson

Collecting ijson
  Downloading ijson-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading ijson-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/114.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.3.0


In [12]:
from pylatexenc.latex2text import LatexNodes2Text
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch
import numpy as np
from os import listdir
from os.path import join
import re

import pandas as pd
import json

def main(base_path='/content/drive/My Drive/MathIRData/ARQMathAgg/dataset_v2/',
         collection_name='collection_test.tsv',
         queries_name='queries_test.tsv',
         meta_name='aggregates/collection_agg_test.json'):
    """Run the re-ranking evaluation on one dataset split.

    All arguments default to the test split on the mounted Google Drive, so
    calling ``main()`` with no arguments behaves exactly as before; pass other
    values to evaluate a different split without editing the function.

    Args:
        base_path: Directory containing the dataset files.
        collection_name: Paragraph collection file name, relative to base_path.
        queries_name: Queries file name, relative to base_path.
        meta_name: Aggregate JSON file name, relative to base_path.
    """
    parse_all_documents(base_path, collection_name, queries_name, meta_name)

# Already-loaded (tokenizer, model) pairs, keyed by their names, so that repeated
# calls to get_top_k_paragraph do not reload the weights for every query.
_RERANKER_CACHE = {}


def _load_reranker(tokenizer_name="albert/albert-base-v2",
                   model_name="AnReu/albert-for-math-ar-base-ft"):
    """Return the (tokenizer, model) pair, loading it only on first use."""
    key = (tokenizer_name, model_name)
    if key not in _RERANKER_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        _RERANKER_CACHE[key] = (tokenizer, model)
    return _RERANKER_CACHE[key]


def get_top_k_paragraph(paragraphs, query, k):
    """Score every paragraph against the query and return the k best.

    The query and paragraphs are first converted from LaTeX to plain text
    (math is kept verbatim). Each (query, paragraph) pair is then scored with
    the sequence-classification model; the score is the softmax probability of
    class index 1 (presumably the "relevant" class -- TODO confirm against the
    model card).

    Args:
        paragraphs: List of paragraph strings. The list is NOT modified.
        query: Query string.
        k: Number of top results to return; must satisfy 0 <= k <= len(paragraphs).

    Returns:
        A tuple ``(top_k_results, rel_scores)`` where ``top_k_results`` is a list
        of ``(paragraph_index, score)`` pairs sorted by descending score and cut
        to k entries, and ``rel_scores`` is the list of all scores in the same
        descending order.

    Raises:
        ValueError: If k is out of range, or the query / any paragraph is None.
    """
    # Validate everything up front, before any expensive model work.
    if k > len(paragraphs):
        raise ValueError(f"k ({k}) cannot be greater than the number of paragraphs ({len(paragraphs)}).")
    if k < 0:
        raise ValueError(f"k ({k}) cannot be negative.")
    if query is None or any(paragraph is None for paragraph in paragraphs):
        raise ValueError("query and every paragraph must be a string, got None.")

    # Turn latex into plaintext first. One converter is reused for all texts and
    # the results go into a new list so the caller's list is left untouched.
    converter = LatexNodes2Text(math_mode='verbatim', strict_latex_spaces=True)
    query_text = converter.latex_to_text(query)
    plain_paragraphs = [converter.latex_to_text(paragraph) for paragraph in paragraphs]

    tokenizer, model = _load_reranker()

    results = []
    with torch.no_grad():
        for idx, paragraph in enumerate(plain_paragraphs):
            # Calling the tokenizer directly is the current API for encoding a
            # text pair (replaces the legacy encode_plus).
            inputs = tokenizer(query_text, paragraph, return_tensors="pt", truncation=True)
            logits = model(**inputs).logits
            score = torch.nn.functional.softmax(logits, dim=-1)[0][1].item()
            results.append((idx, score))

    results.sort(key=lambda pair: pair[1], reverse=True)
    rel_scores = [score for _, score in results]
    top_k_results = results[:k]
    return top_k_results, rel_scores

def _read_id_text_file(path):
    """Read lines of the form ``<numeric id><whitespace><text>`` into ``{int id: text}``.

    Lines that do not start with a number followed by whitespace are ignored.
    """
    entries = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            match = re.match(r"(\d+)\s+(.*)", line)
            if match:
                entries[int(match.group(1))] = match.group(2).strip()
    return entries


def parse_all_documents(base_path, collection_name, queries_name, meta_name):
    """Re-rank the candidate paragraphs of every query and print the metrics.

    For each entry of the aggregate JSON file (keys ``qid``, ``pids`` and
    ``corr_idx``) the candidate paragraphs are ranked with
    ``get_top_k_paragraph`` and the ranking is evaluated against the ground
    truth with ``evaluate_metrics`` at k=1.

    Args:
        base_path: Directory containing the dataset files.
        collection_name: Paragraph collection file, relative to base_path.
        queries_name: Queries file, relative to base_path.
        meta_name: Aggregate JSON file, relative to base_path.

    Raises:
        json.JSONDecodeError / OSError: If the aggregate file cannot be read or
            parsed. The error is printed and re-raised, because nothing can be
            evaluated without it (previously this surfaced later as a NameError).
    """
    dirfiles = listdir(base_path)
    print(dirfiles)

    collection_path = join(base_path, collection_name)
    queries_path = join(base_path, queries_name)
    meta_path = join(base_path, meta_name)

    try:
        with open(meta_path, 'r', encoding='utf-8') as file:
            meta = json.load(file)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise

    collection = _read_id_text_file(collection_path)
    queries = _read_id_text_file(queries_path)

    evaluated = 0
    for obj in meta:
        qid = obj['qid']
        pids = obj['pids']
        correct_idx = obj['corr_idx']
        paragraphs = [collection.get(pid) for pid in pids]
        query = queries.get(qid)

        # An id missing from the loaded files yields None; skip such entries
        # instead of passing None into the LaTeX converter / model.
        if not paragraphs or query is None or any(p is None for p in paragraphs):
            print(f"Skipping qid {qid}: query or paragraph text is missing")
            continue

        # Ask for the full ranking so the ground truth can be located in it.
        ranking, _ = get_top_k_paragraph(paragraphs, query, len(paragraphs))
        print(f"Predicted {ranking[0]}, correct {correct_idx}")

        # Evaluate against the ground truth, not against the model's own scores:
        # softmax scores are always > 0, which made every metric trivially 1.0.
        # NOTE(review): assumes corr_idx is the position of the correct paragraph
        # within pids (same index space as the predicted index) -- confirm.
        relevance = [1 if idx == correct_idx else 0 for idx, _ in ranking]
        res = evaluate_metrics(relevance, 1)
        print(f"Evaluated metrics : {res}")
        evaluated += 1

    # Print a summary rather than dumping the whole aggregate file.
    print(f"Evaluated {evaluated} of {len(meta)} queries")
    print("Done!")

def evaluate_metrics(rel_scores, k=None):
    """Compute all ranking metrics for one ranked list of relevance values.

    Args:
        rel_scores: Relevance values in ranked order (best-ranked first); a
            value > 0 is treated as relevant by the individual metrics.
        k: Cut-off rank. Defaults to the length of rel_scores.

    Returns:
        Dict with the keys "nDCG", "Precision@k", "Judged@k", "MRR" and "MAP".
    """
    if k is None:
        k = len(rel_scores)
    # The full list is passed on: every metric restricts itself to the first k
    # entries, while nDCG needs the complete list to build the ideal ranking.
    # Truncating here would hide relevant items ranked below k and inflate nDCG.
    return {
        "nDCG": nDCG(rel_scores, k),
        "Precision@k": precision_at_k(rel_scores, k),
        "Judged@k": judged_at_k(rel_scores, k),
        "MRR": MRR_at_k(rel_scores, k),
        "MAP": MAP_at_k(rel_scores, k)
    }

def nDCG(rel_scores, k):
    """Normalized discounted cumulative gain over the first k ranks.

    The gain at 0-based rank i is discounted by log2(i + 2). The ideal ranking
    is rel_scores sorted in descending order. Returns 0 when the ideal DCG is
    not positive.
    """
    def _dcg(gains):
        # Accumulate rank by rank, limited to k or the list length.
        total = 0
        for rank in range(min(k, len(gains))):
            total += gains[rank] / np.log2(rank + 2)
        return total

    ideal_order = sorted(rel_scores, reverse=True)
    actual = _dcg(rel_scores)
    ideal = _dcg(ideal_order)
    if ideal > 0:
        return actual / ideal
    return 0

def precision_at_k(rel_scores, k):
    """Fraction of the first k ranks whose relevance value is > 0.

    The denominator is always k, so a list shorter than k counts the missing
    ranks as non-relevant. Returns 0 for k <= 0 instead of dividing by zero,
    matching the other metrics, which return 0 when there is nothing to score.
    """
    if k <= 0:
        return 0
    rel_docs = sum(1 for rel in rel_scores[:k] if rel > 0)
    return rel_docs / k

def judged_at_k(rel_scores, k):
    """Fraction of the first k ranks whose value is > 0.

    Returns 0 for k <= 0 instead of dividing by zero, matching the other
    metrics, which return 0 when there is nothing to score.

    NOTE(review): as written this counts exactly the same entries as
    precision_at_k. If "judged" is meant to be "has any relevance judgement",
    the input would need a way to mark unjudged items -- confirm the intent.
    """
    if k <= 0:
        return 0
    top_k = rel_scores[:k]
    judged_items = sum(1 for rel in top_k if rel > 0)
    return judged_items / k

def MRR_at_k(rel_scores, k):
    """Reciprocal rank of the first entry with a value > 0 within the first k ranks.

    Ranks are 1-based. Returns 0 when no such entry exists in the inspected range.
    """
    depth = min(k, len(rel_scores))
    first_hit = next((pos for pos in range(depth) if rel_scores[pos] > 0), None)
    if first_hit is None:
        return 0
    return 1 / (first_hit + 1)

def MAP_at_k(rel_scores, k):
    """Average precision over the first k ranks of a single ranked list.

    At every rank whose value is > 0, the precision up to that rank is added;
    the sum is divided by the number of such ranks found within the first k.
    Returns 0 when none is found.
    """
    depth = min(k, len(rel_scores))
    hits = 0
    precision_sum = 0
    for pos in range(depth):
        if not rel_scores[pos] > 0:
            continue
        hits += 1
        precision_sum += hits / (pos + 1)
    if hits > 0:
        return precision_sum / hits
    return 0

def parse_latex_into_array(file_path):
    """Read a LaTeX file and return its plain-text paragraphs.

    The file is read as UTF-8 (like every other file opened in this notebook,
    rather than relying on the platform default encoding), converted to plain
    text with math kept verbatim, and split into paragraphs.

    Args:
        file_path: Path of the LaTeX file to read.

    Returns:
        List of non-empty, stripped paragraph strings.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        latex_text = file.read()

    text = LatexNodes2Text(math_mode='verbatim', strict_latex_spaces=True).latex_to_text(latex_text)
    # A paragraph boundary is a period followed by a blank line; the period is
    # consumed by the split, so only the final paragraph can keep a trailing ".".
    paragraphs = [para.strip() for para in text.split(".\n\n") if para.strip()]
    return paragraphs

# Entry point: run the evaluation when this code is executed directly (as a
# notebook cell or a script) rather than imported as a module.
if __name__=='__main__':
    main()



['queries_train.tsv', 'collection_train.tsv', 'triples_train.jsonl', 'qrel_train', 'queries_test.tsv', 'qrel_test', 'triples_test.jsonl', '.gitignore', 'collection_test.tsv', 'aggregates']
Predicted (1, 0.9997264742851257), correct 1
Evaluated metrics : {'nDCG': 1.0, 'Precision@k': 1.0, 'Judged@k': 1.0, 'MRR': 1.0, 'MAP': 1.0}
Predicted (1, 0.9997496008872986), correct 1
Evaluated metrics : {'nDCG': 1.0, 'Precision@k': 1.0, 'Judged@k': 1.0, 'MRR': 1.0, 'MAP': 1.0}
Predicted (1, 0.9995967745780945), correct 1
Evaluated metrics : {'nDCG': 1.0, 'Precision@k': 1.0, 'Judged@k': 1.0, 'MRR': 1.0, 'MAP': 1.0}
Predicted (1, 0.9997803568840027), correct 1
Evaluated metrics : {'nDCG': 1.0, 'Precision@k': 1.0, 'Judged@k': 1.0, 'MRR': 1.0, 'MAP': 1.0}
Predicted (2, 0.9995365142822266), correct 2
Evaluated metrics : {'nDCG': 1.0, 'Precision@k': 1.0, 'Judged@k': 1.0, 'MRR': 1.0, 'MAP': 1.0}
Predicted (1, 0.839702844619751), correct 1
Evaluated metrics : {'nDCG': 1.0, 'Precision@k': 1.0, 'Judged@k': 1

KeyboardInterrupt: 