In [1]:
from ranking import rank

In [2]:
models = ["tbs17/MathBERT", "allenai/scibert_scivocab_uncased", "math-similarity/Bert-MLM_arXiv-MP-class_zbMath", "allenai/longformer-base-4096"]

In [3]:
import pandas as pd

problemset_df = pd.read_csv("C:\\Users\\mokrota\\Documents\\GitHub\\math_problem_recommender\\math_problem_recommender\\benchmark\\benchmarkv3\\df.csv")
qa_df = pd.read_csv("C:\\Users\\mokrota\\Documents\\GitHub\\math_problem_recommender\\math_problem_recommender\\benchmark\\benchmarkv3\\q&a.csv")

In [4]:
qa_df

Unnamed: 0,Anchor,Golden,Silver,Wrong,Query,query
0,431,439,475,592,,Find problems that use divisibility to limit n...
1,468,467,63,614,,Find problems that explicitly use idea of all ...
2,194,196,537,100,,Find problems that explore pigeonhole principle
3,66,64,451,145,,Find problems that involve calculating answer ...
4,150,152,161,517,,Find problems where we have to solve floor fun...
5,42,39,440,598,,Find problems that practice divisibility toget...
6,228,542,233,373,,Find calculative problems where we need to use...


In [5]:
def parse_text(row):
    names = ["Anchor", "Golden", "Silver", "Wrong"]
    texts = {}

    for name in names:
        id_name = row[name]
        t = problemset_df[problemset_df['id'] == id_name]['Problem&Solution'].iloc[0]
        texts[name] = t

    return texts

In [6]:
qa_df['Problem&Solution'] = qa_df.apply(parse_text, axis=1)

In [7]:
qa_df

Unnamed: 0,Anchor,Golden,Silver,Wrong,Query,query,Problem&Solution
0,431,439,475,592,,Find problems that use divisibility to limit n...,{'Anchor': 'Problem. Find all odd positive int...
1,468,467,63,614,,Find problems that explicitly use idea of all ...,{'Anchor': 'Problem. For any positive integer ...
2,194,196,537,100,,Find problems that explore pigeonhole principle,{'Anchor': 'Problem. Prove that among any inte...
3,66,64,451,145,,Find problems that involve calculating answer ...,{'Anchor': 'Problem. Find the smallest positiv...
4,150,152,161,517,,Find problems where we have to solve floor fun...,{'Anchor': 'Problem. Determine the number of r...
5,42,39,440,598,,Find problems that practice divisibility toget...,{'Anchor': 'Problem. Let $m\geq2$ be an intege...
6,228,542,233,373,,Find calculative problems where we need to use...,{'Anchor': 'Problem. Let $\tau(n)$ denote the ...


In [8]:
problemsolution = qa_df['Problem&Solution'].to_list()

anchors = []
texts = []
for d in problemsolution:
    anchors.append(d['Anchor'])
    group = [d['Golden'], d['Silver'], d['Wrong']]
    texts.append(group)

In [9]:
true_ranks = {}
pred_ranks = {}
for model in models:
    true_ranks[model] = [[1, 2, 3] for _ in range(len(texts))]
    pred_ranks[model] = [rank(model, anchor, t) for anchor, t in zip(anchors, texts)]
    pred_ranks[model] = list(map(lambda x: list(x), pred_ranks[model]))

Input ids are automatically padded to be a multiple of `config.attention_window`: 512


In [10]:
pred_ranks

{'tbs17/MathBERT': [[1, 2, 3],
  [1, 3, 2],
  [3, 1, 2],
  [2, 1, 3],
  [1, 2, 3],
  [2, 3, 1],
  [1, 3, 2]],
 'allenai/scibert_scivocab_uncased': [[1, 2, 3],
  [1, 3, 2],
  [3, 1, 2],
  [3, 1, 2],
  [1, 2, 3],
  [3, 2, 1],
  [1, 3, 2]],
 'math-similarity/Bert-MLM_arXiv-MP-class_zbMath': [[2, 3, 1],
  [1, 2, 3],
  [2, 1, 3],
  [1, 2, 3],
  [2, 1, 3],
  [2, 1, 3],
  [1, 2, 3]],
 'allenai/longformer-base-4096': [[1, 2, 3],
  [2, 3, 1],
  [2, 1, 3],
  [3, 1, 2],
  [1, 2, 3],
  [3, 2, 1],
  [2, 3, 1]]}

In [11]:
true_ranks
import numpy as np

accuracies = {}
for model in models:
    pair_check = np.array(true_ranks[model]) == np.array(pred_ranks[model])
    accuracies[model] = pair_check.mean()

accuracies

{'tbs17/MathBERT': 0.42857142857142855,
 'allenai/scibert_scivocab_uncased': 0.42857142857142855,
 'math-similarity/Bert-MLM_arXiv-MP-class_zbMath': 0.5714285714285714,
 'allenai/longformer-base-4096': 0.38095238095238093}

In [12]:
from scipy.stats import spearmanr

spearmans = {}

for model in models:
    s = []
    for true_rank, pred_rank in zip(true_ranks[model], pred_ranks[model]):
        rho, _ = spearmanr(true_rank, pred_rank)
        s.append(rho)
    spearmans[model] = np.mean(s)
spearmans

{'tbs17/MathBERT': 0.35714285714285715,
 'allenai/scibert_scivocab_uncased': 0.14285714285714285,
 'math-similarity/Bert-MLM_arXiv-MP-class_zbMath': 0.5714285714285714,
 'allenai/longformer-base-4096': 0.0}

In [13]:
metrics = {"Accuracy": accuracies, 
           "Spearman": spearmans}
metrics

{'Accuracy': {'tbs17/MathBERT': 0.42857142857142855,
  'allenai/scibert_scivocab_uncased': 0.42857142857142855,
  'math-similarity/Bert-MLM_arXiv-MP-class_zbMath': 0.5714285714285714,
  'allenai/longformer-base-4096': 0.38095238095238093},
 'Spearman': {'tbs17/MathBERT': 0.35714285714285715,
  'allenai/scibert_scivocab_uncased': 0.14285714285714285,
  'math-similarity/Bert-MLM_arXiv-MP-class_zbMath': 0.5714285714285714,
  'allenai/longformer-base-4096': 0.0}}

### Show metrics

In [14]:
data = []
for model in models:
    d = {}
    d['Model'] = model
    for key in metrics:
        d[key] = metrics[key][model]
    data.append(d)
metrics_df = pd.DataFrame(data)
metrics_df

Unnamed: 0,Model,Accuracy,Spearman
0,tbs17/MathBERT,0.428571,0.357143
1,allenai/scibert_scivocab_uncased,0.428571,0.142857
2,math-similarity/Bert-MLM_arXiv-MP-class_zbMath,0.571429,0.571429
3,allenai/longformer-base-4096,0.380952,0.0


In [15]:
from transformers import AutoModel, AutoTokenizer
best_model = metrics_df.sort_values('Spearman', ascending=False).iloc[0,0]
best_model, tokenizer = AutoModel.from_pretrained(best_model, output_attentions=True), AutoTokenizer.from_pretrained(best_model)

ex_anchor = anchors[0][:30]
ex_t = texts[0][:30]
ex_anchor, ex_t

('Problem. Find all odd positive',
 ['Problem. Find the smallest positive integer $K$ such that every $K$ -element subset of $\\{1,2,\\ldots,50\\}$ contains two distinct elements $a,b$ such that $a+b$ divides ab.  \n\n(1996 Chinese Mathematical Olympiad)  \n\nSolution. The minimal value is $k=39$ . Suppose $a,b\\in S$ are such that $a+b$ divides $a b$ . Let $c=g c d(a,b)$ , and put $a=c a_{1}$ , $b=c b_{1}$ , so that $a_{1}$ and $b_{1}$ are relatively prime. Then $c(a_{1}+b_{1})$ divides $c^{2}a_{1}b_{1}$ , so $a_{1}+b_{1}$ divides $c a_{1}b_{1}$ . Since $a_{1}$ and $b_{1}$ have no common factor, neither do $a_{1}$ and $a_{1}+b_{1}$ , or $b_{1}$ and $a_{1}+b_{1}$ . In short, $a_{1}+b_{1}$ divides $c$ .  \n\nSince $S\\subseteq\\{1,\\dots,50\\}$ , we have $a+b\\leq99$ , so $c(a_{1}+b_{1})\\leq99$ , which implies $a_{1}+b_{1}\\leq9$ ; on the other hand, of course $a_{1}+b_{1}\\geq3$ . An exhaustive search produces 23 pairs $a,b$ satisfying the condition:  \n\n$$\n\\begin{array}{c c c}{{a_

In [16]:
max_length = 512
an_inputs = tokenizer(ex_anchor, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
an_tokens = tokenizer.convert_ids_to_tokens(an_inputs['input_ids'][0])
an_attention = best_model(**an_inputs, output_attentions=True)

t_inputs = tokenizer(ex_t, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
t_tokens = tokenizer.convert_ids_to_tokens(t_inputs['input_ids'][0])
t_attention = best_model(**t_inputs, output_attentions=True)



In [17]:
from bertviz import head_view
head_view(an_attention.attentions, an_tokens)

<IPython.core.display.Javascript object>