In [2]:
import pandas as pd
import os

df = pd.read_json(os.path.abspath("..\\..\\benchmark\\benchmark_v2\\benchmark_v2.json"))
df

Unnamed: 0,text
Chinese Remainder Theorem,[Solving $x \equiv 2 \mod 3$ and $x \equiv 3 \...
Diophantine Equations,[Solving $3x + 5y = 1$ using the extended Eucl...
Divisibility,[From (1) it follows that $A(x_{1}+y_{1})=p^{k...
Euler’s Theorem,"[By Euler’s Theorem, $10^{\varphi(f k)}\equiv1..."
Extremal Principles,"[If $x\geq3$ , $y\geq3$ , $z\geq3$ then $x y z..."
Fermat’s Little Theorem,[The formula in our problem shows that the sum...
Modular Arithmetics,[Assume that we have a prime $p$ such that $p|...
Pigeonhole Principle,[Let $S$ be the set of nonnegative integers le...
Prime Numbers,[Assume that we have a prime $p$ such that $p|...
Quadratic Residues,"[If $n$ is even, then $p\equiv3$ (mod 4) and $..."


In [3]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = df.explode("text").reset_index().rename({"index": "label"}, axis=1)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    "train": train_dataset,
    "eval": test_dataset
})
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 80
    })
    eval: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 20
    })
})

In [4]:
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
import torch

tokenizer = BertTokenizer.from_pretrained('tbs17/MathBERT', output_hidden_states=True)
model = BertModel.from_pretrained("tbs17/MathBERT")
device = torch.device("cuda")
model = model.to(device)

In [5]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    texts = [item['text'] for item in batch]
    encoding = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    return {'input_ids': encoding['input_ids'], 'attention_mask': encoding['attention_mask']}

train_dataloader = DataLoader(dataset['train'], batch_size=16, shuffle=True, collate_fn=collate_fn)
eval_dataloader = DataLoader(dataset['eval'], batch_size=16, collate_fn=collate_fn)

In [6]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3) # Example 3 epochs

In [8]:
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import torch
import datasets
import torch.nn.functional as F
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np

# Assume your dataset has 'text' and 'label' columns

class LabelledPairDataset(Dataset):
    def __init__(self, dataset, tokenizer, text_column='text', label_column='label'):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.label_column = label_column
        self.labels = sorted(list(set(dataset[label_column])))
        self.label_to_indices = {label: np.where(np.array(dataset[label_column]) == label)[0] for label in self.labels}

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        anchor_text = self.dataset[idx][self.text_column]
        anchor_label = self.dataset[idx][self.label_column]

        # Create a positive pair (another item with the same label)
        positive_index = np.random.choice(self.label_to_indices[anchor_label])
        while positive_index == idx:
            positive_index = np.random.choice(self.label_to_indices[anchor_label])
        positive_text = self.dataset[positive_index][self.text_column]

        # Create a negative pair (an item with a different label)
        negative_label = np.random.choice([l for l in self.labels if l != anchor_label])
        negative_index = np.random.choice(self.label_to_indices[negative_label])
        negative_text = self.dataset[negative_index][self.text_column]

        return anchor_text, positive_text, negative_text, anchor_label

def collate_fn_pair(batch):
    anchor_texts, positive_texts, negative_texts, labels = zip(*batch)
    anchor_encoding = tokenizer(anchor_texts, padding=True, truncation=True, return_tensors='pt')
    positive_encoding = tokenizer(positive_texts, padding=True, truncation=True, return_tensors='pt')
    negative_encoding = tokenizer(negative_texts, padding=True, truncation=True, return_tensors='pt')
    labels = torch.tensor(labels)
    return {'anchor_input': anchor_encoding, 'positive_input': positive_encoding, 'negative_input': negative_encoding, 'labels': labels}

# Load your dataset
try:
    df = pd.read_json(os.path.abspath("..\\..\\benchmark\\benchmark_v2\\benchmark_v2.json"))
    df = df.explode("text").reset_index().rename({"index": "label"}, axis=1)

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    train_dataset = datasets.Dataset.from_pandas(train_df)
    test_dataset = datasets.Dataset.from_pandas(test_df)

    dataset = datasets.DatasetDict({
        "train": train_dataset,
        "eval": test_dataset
    })
    text_column = 'text'
    label_column = 'label' if 'label' in dataset['train'].column_names else None
except FileNotFoundError:
    print("Error: 'your_dataset.csv' not found.")
    exit()

tokenizer = BertTokenizer.from_pretrained('tbs17/MathBERT')
model = BertModel.from_pretrained("tbs17/MathBERT")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Split dataset
train_dataset, eval_dataset = train_test_split(dataset['train'], test_size=0.2, random_state=42, stratify=dataset['train'][label_column] if label_column else None)
train_dataset = LabelledPairDataset(train_dataset, tokenizer, text_column, label_column)
eval_dataset = LabelledPairDataset(eval_dataset, tokenizer, text_column, label_column)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn_pair)
eval_dataloader = DataLoader(eval_dataset, batch_size=16, collate_fn=collate_fn_pair)

optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)

def get_sentence_embedding(model_output):
    return model_output.pooler_output

def triplet_loss(anchor_embedding, positive_embedding, negative_embedding, margin=1.0):
    # Triplet loss aims to minimize the distance between anchor and positive,
    # and maximize the distance between anchor and negative.
    positive_distance = F.cosine_similarity(anchor_embedding, positive_embedding)
    negative_distance = F.cosine_similarity(anchor_embedding, negative_embedding)
    loss = torch.relu(negative_distance - positive_distance + margin).mean()
    return loss

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()

        anchor_inputs = {k: v.to(device) for k, v in batch['anchor_input'].items()}
        positive_inputs = {k: v.to(device) for k, v in batch['positive_input'].items()}
        negative_inputs = {k: v.to(device) for k, v in batch['negative_input'].items()}

        anchor_output = model(**anchor_inputs)
        positive_output = model(**positive_inputs)
        negative_output = model(**negative_inputs)

        anchor_embedding = get_sentence_embedding(anchor_output)
        positive_embedding = get_sentence_embedding(positive_output)
        negative_embedding = get_sentence_embedding(negative_output)

        loss = triplet_loss(anchor_embedding, positive_embedding, negative_embedding)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

# After fine-tuning, you would get embeddings as before.

TypeError: '<' not supported between instances of 'int' and 'ellipsis'