In [1]:
import pandas as pd
import os

df = pd.read_json(os.path.abspath("..\\..\\benchmark\\benchmark_v2\\benchmark_v2.json"))
df

Unnamed: 0,text
Chinese Remainder Theorem,[Solving $x \equiv 2 \mod 3$ and $x \equiv 3 \...
Diophantine Equations,[Solving $3x + 5y = 1$ using the extended Eucl...
Divisibility,[From (1) it follows that $A(x_{1}+y_{1})=p^{k...
Euler’s Theorem,"[By Euler’s Theorem, $10^{\varphi(f k)}\equiv1..."
Extremal Principles,"[If $x\geq3$ , $y\geq3$ , $z\geq3$ then $x y z..."
Fermat’s Little Theorem,[The formula in our problem shows that the sum...
Modular Arithmetics,[Assume that we have a prime $p$ such that $p|...
Pigeonhole Principle,[Let $S$ be the set of nonnegative integers le...
Prime Numbers,[Assume that we have a prime $p$ such that $p|...
Quadratic Residues,"[If $n$ is even, then $p\equiv3$ (mod 4) and $..."


In [2]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = df.explode("text").reset_index().rename({"index": "label"}, axis=1)
labels = df['label'].unique()
label_maping = {labels[i]: i for i in range(len(labels))}
df['label'] = df.apply(lambda x: label_maping[x['label']], axis=1)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [3]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('tbs17/MathBERT', output_hidden_states=True)
model = BertModel.from_pretrained("tbs17/MathBERT")
device = torch.device("cuda")
model = model.to(device)

In [4]:
def encode(text, tokenizer):
    tokens = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt", max_length=256)
    tokens = {k: v.to(device) for k, v in tokens.items()}
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.last_hidden_state[:, 0, :] # [cls] tokens

In [9]:
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, return_tensors="pt", max_length=256)

In [6]:
from pytorch_metric_learning.miners import TripletMarginMiner
miner = TripletMarginMiner(margin=0.2, type_of_triplets="semihard")

In [None]:
from torch import nn
from transformers import AutoTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
from pytorch_metric_learning.miners import TripletMarginMiner
from pytorch_metric_learning.losses import TripletMarginLoss
from pytorch_metric_learning.samplers import MPerClassSampler
import torch
import pandas as pd
from datasets import Dataset

# Assuming you have train_df and a tokenize function defined
# Example tokenize function (adjust based on your needs):
def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

class MathBertEmbeddingModel(nn.Module):
    def __init__(self, projection_dim=128):
        super().__init__()
        self.encoder = BertModel.from_pretrained("tbs17/MathBERT")
        self.projection = nn.Linear(self.encoder.config.hidden_size, projection_dim)

    def forward(self, toks):
        out = self.encoder(**toks).last_hidden_state[:, 0, :]  # CLS token
        return self.projection(out)  # embedding (B, D)

# Load your data and tokenize it
train_df = pd.DataFrame({'text': ["some math text 1", "another math text 2", "yet more math 1", "example 2"], 'label': [0, 1, 0, 1]}) # Example DataFrame
tokenizer = AutoTokenizer.from_pretrained("tbs17/MathBERT")
dataset = Dataset.from_pandas(train_df)
dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) # Ensure correct format for DataLoader

# Create the DataLoader with a sampler (optional but recommended)
sampler = MPerClassSampler(labels=train_df['label'].values, m=2, batch_size=4) # Adjust m and batch_size
dataloader = DataLoader(dataset, batch_sampler=sampler)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model = MathBertEmbeddingModel(projection_dim=128).to(device)
optimizer = torch.optim.AdamW(embedding_model.parameters(), lr=2e-5)

loss_fn = TripletMarginLoss(margin=0.3)
miner = TripletMarginMiner(margin=0.3, type_of_triplets="semihard")

embedding_model.train()
for epoch in range(5):
    total_loss = 0
    for batch in dataloader:
        inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        embeddings = embedding_model(inputs)  # (B, projection_dim)
        triplets = miner(embeddings, labels)

        if triplets[0].nelement() == 0:
            continue  # skip if miner returns no triplets

        loss = loss_fn(embeddings, labels, triplets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

embedding_model.eval() # Set model to evaluation mode after training

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

{'label': [5, 8, 2, 4, 6, 1, 4, 9, 0, 7], 'text': ['For prime $p$, $a^p \\equiv a \\mod p$ holds for all integers $a$.', 'To each number, associate the triple $(x_{2},x_{3},x_{5})$ recording the parity of the exponents of 2, 3, and 5 in its prime factorization.', "If a prime $p$ divides $ab$, then $p$ divides $a$ or $p$ divides $b$ (Euclid's lemma).", 'Find $a,b$ such that $\\operatorname*{min}\\{a_{5}a+b_{5}b,a_{2}a+b_{2}b\\}=98$ and $a b$ is minimal.', 'Wilson’s theorem states that $(p-1)! \\equiv -1 \\mod p$ for prime $p$.', 'Parametrizing solutions to $x^2 - y^2 = N$ as $(x+y)(x-y) = N$.', 'If $x\\geq3$ , $y\\geq3$ , $z\\geq3$ then $x y z\\geq3y z$ , $x y z\\geq3x z$ , $x y z\\geq3x y$ .', 'The equation $x^2 \\equiv -1 \\mod p$ has solutions if and only if $p \\equiv 1 \\mod 4$.', 'Using CRT to find a number congruent to 1 mod 2, 2 mod 3, and 3 mod 5.', 'Observe that from any 2001 consecutive natural numbers, at least one is a term of the sequence.'], '__index_level_0__': [55, 88, 

TypeError: only integer tensors of a single element can be converted to an index

In [None]:
from torch import nn
from transformers import AutoModel, BertModel
from torch.utils.data import DataLoader, Dataset
from pytorch_metric_learning.miners import TripletMarginMiner
from pytorch_metric_learning.losses import TripletMarginLoss
from pytorch_metric_learning.samplers import MPerClassSampler
import torch
import pandas as pd
from datasets import Dataset

# Assuming you have train_df and a tokenize function defined
# Example tokenize function (adjust based on your needs):
def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

class MathBertEmbeddingModel(nn.Module):
    def __init__(self, projection_dim=128):
        super().__init__()
        self.encoder = BertModel.from_pretrained("tbs17/MathBERT")
        self.projection = nn.Linear(self.encoder.config.hidden_size, projection_dim)

    def forward(self, toks):
        out = self.encoder(**toks).last_hidden_state[:, 0, :]  # CLS token
        return self.projection(out)  # embedding (B, D)

# Load your data and tokenize it
train_df = pd.DataFrame({'text': ["some math text 1", "another math text 2", "yet more math 1", "example 2"], 'label': [0, 1, 0, 1]}) # Example DataFrame
tokenizer = BertTokenizer.from_pretrained("tbs17/MathBERT")
dataset = Dataset.from_pandas(train_df)
dataset = dataset.map(tokenize, batched=True)
# dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) # Ensure correct format for DataLoader

# Create the DataLoader with a sampler (optional but recommended)
sampler = MPerClassSampler(labels=train_df['label'].values, m=2, batch_size=4) # Adjust m and batch_size
dataloader = DataLoader(dataset, batch_sampler=sampler)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model = MathBertEmbeddingModel(projection_dim=128).to(device)
optimizer = torch.optim.AdamW(embedding_model.parameters(), lr=2e-5)

loss_fn = TripletMarginLoss(margin=0.3)
miner = TripletMarginMiner(margin=0.3, type_of_triplets="semihard")

embedding_model.train()
for epoch in range(5):
    total_loss = 0
    for batch in dataloader:
        inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        embeddings = embedding_model(inputs)  # (B, projection_dim)
        triplets = miner(embeddings, labels)

        if triplets[0].nelement() == 0:
            continue  # skip if miner returns no triplets

        loss = loss_fn(embeddings, labels, triplets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

embedding_model.eval() # Set model to evaluation mode after training

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

AttributeError: 'list' object has no attribute 'to'