In [2]:
import os
import pandas as pd
from ast import literal_eval

df = pd.read_csv(os.path.abspath(r"C:\Users\mokrota\Documents\GitHub\math_problem_recommender\math_problem_recommender\andreescu-andrica-problems-on-number-theory\benchmark_v1.csv"))
df["Problem&Solution"] = df["Problem&Solution"].apply(literal_eval)
df = df.explode(column=["Problem&Solution"]).reset_index().rename(columns={"index": 'label'})
df = df[["label", "Problem&Solution"]].rename(columns={"Problem&Solution": 'text'})

In [7]:
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import AutoModel

# Define your sentence transformer model using CLS pooling
model_name = "math-similarity/Bert-MLM_arXiv-MP-class_arXiv"
# word_embedding_model = models.Transformer(model_name)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(model_name)

# Define a list with sentences (1k - 100k sentences)
train_sentences = df['text'].tolist()

# Create the special denoising dataset that adds noise on-the-fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)

# DataLoader to batch your data
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Use the denoising auto-encoder loss
train_loss = losses.DenoisingAutoEncoderLoss(
    model, decoder_name_or_path=model_name, tie_encoder_decoder=True
)

# Call the fit method
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    weight_decay=0,
    scheduler="constantlr",
    optimizer_params={"lr": 3e-5},
    show_progress_bar=True,
)

model.save("models/tsdae-model")

pytorch_model.bin:  45%|####5     | 199M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.
Some weights of BertLMHeadModel were not initialized from the model checkpoint at math-similarity/Bert-MLM_arXiv-MP-class_arXiv and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.laye

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss


In [19]:
from similarity import CosineSimScorer, BERTCLSMeanPooler, BERTCLSFirstPooler, EmbSummarizer
models_names = [model_name]
models = [SentenceTransformer("models/tsdae-model")]
tokenizers = [models[0].tokenizer]
sum_classes = {"First CLS": BERTCLSFirstPooler, "Mean CLS": BERTCLSMeanPooler}

In [8]:
import pandas as pd

problemset_df = pd.read_csv("C:\\Users\\mokrota\\Documents\\GitHub\\math_problem_recommender\\math_problem_recommender\\benchmark\\benchmarkv3\\df.csv")
qa_df = pd.read_csv("C:\\Users\\mokrota\\Documents\\GitHub\\math_problem_recommender\\math_problem_recommender\\benchmark\\benchmarkv3\\q&a.csv")

In [10]:
def parse_text(row):
    names = ["Anchor", "Golden", "Silver", "Wrong"]
    texts = {}

    for name in names:
        id_name = row[name]
        t = problemset_df[problemset_df['id'] == id_name]['Problem&Solution'].iloc[0]
        texts[name] = t

    return texts

In [11]:
qa_df['Problem&Solution'] = qa_df.apply(parse_text, axis=1)

In [12]:
problemsolution = qa_df['Problem&Solution'].to_list()
# queries = qa_df['query'].to_list()

anchors = []
texts = []
for d in problemsolution:
    anchors.append(d['Anchor'])
    group = [d['Golden'], d['Silver'], d['Wrong']]
    texts.append(group)

In [13]:
def embed(summarizer: EmbSummarizer):
    anchors_emb = summarizer.summarize(anchors)
    texts_emb = []
    for ts in texts:
        texts_emb.append(summarizer.summarize(ts))
    return anchors_emb, texts_emb

In [14]:
import numpy as np
from scipy.stats import spearmanr
def calc_metrics(pred_ranks, true_ranks):
    pair_check = np.array(true_ranks) == np.array(pred_ranks)
    accuracy = pair_check.mean()
    s = []
    for true_rank, pred_rank in zip(true_ranks, pred_ranks):
        rho, _ = spearmanr(true_rank, pred_rank)
        s.append(rho)
    spearmans = np.mean(s)
    return {"accuracy": accuracy,
            "spearman": spearmans}

In [15]:
def evaluate(summarizer):
    anchors_emb, texts_emb = embed(summarizer)
    ranker = CosineSimScorer()
    pred_ranks = []
    for anchor, ts in zip(anchors_emb, texts_emb):
        pred_ranks.append(ranker.rank(anchor, ts).cpu().numpy())
    pred_ranks = np.stack(pred_ranks)
    true_ranks = np.arange(len(ts))
    true_ranks = np.tile(true_ranks, (len(anchors), 1))
    metrics = calc_metrics(pred_ranks, true_ranks)
    return metrics

In [20]:
results = []
for model, tokenizer, name in zip(models, tokenizers, models_names):
    for s_name in sum_classes:
        s_c = sum_classes[s_name]
        summarizer = s_c(model, tokenizer)
        res = evaluate(summarizer)
        res['name'] = name
        res['pooling method'] = s_name
        results.append(res)

TypeError: SentenceTransformer.forward() missing 1 required positional argument: 'input'