In [1]:
import os
import sys

# Make the local `math_problem_recommender` package importable from this notebook.
# The location can be overridden with the MATH_PROBLEM_RECOMMENDER_PATH environment
# variable; the fallback is the original machine-specific checkout path.
PROJECT_PATH = os.path.abspath(
    os.environ.get(
        "MATH_PROBLEM_RECOMMENDER_PATH",
        "C:/Users/mokrota/Documents/GitHub/math_problem_recommender/math_problem_recommender",
    )
)
# Guard the append so that re-running the cell does not pile up duplicate entries.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

In [2]:
import pandas as pd
import os

# Build the path from components so the separators are correct on POSIX as well
# as on Windows (a literal "..\\.." only resolves on Windows).
benchmark_path = os.path.abspath(
    os.path.join("..", "..", "benchmark", "benchmark_v2", "benchmark_v2.json")
)

# The displayed frame is indexed by topic name with a single "text" column
# holding a list of texts per topic.
df = pd.read_json(benchmark_path)
df

Unnamed: 0,text
Chinese Remainder Theorem,[Solving $x \equiv 2 \mod 3$ and $x \equiv 3 \...
Diophantine Equations,[Solving $3x + 5y = 1$ using the extended Eucl...
Divisibility,[From (1) it follows that $A(x_{1}+y_{1})=p^{k...
Euler’s Theorem,"[By Euler’s Theorem, $10^{\varphi(f k)}\equiv1..."
Extremal Principles,"[If $x\geq3$ , $y\geq3$ , $z\geq3$ then $x y z..."
Fermat’s Little Theorem,[The formula in our problem shows that the sum...
Modular Arithmetics,[Assume that we have a prime $p$ such that $p|...
Pigeonhole Principle,[Let $S$ be the set of nonnegative integers le...
Prime Numbers,[Assume that we have a prime $p$ such that $p|...
Quadratic Residues,"[If $n$ is even, then $p\equiv3$ (mod 4) and $..."


In [3]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Flatten to one row per text and move the topic name from the index into a
# "label" column. The guard keeps the cell idempotent: without it, a second run
# would reset the index again and add a duplicate "label" column.
if "label" not in df.columns:
    df = df.explode("text").reset_index().rename({"index": "label"}, axis=1)

# Fixed seed so the 80/20 split is reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# The held-out split is exposed under the "eval" key.
dataset = DatasetDict({
    "train": train_dataset,
    "eval": test_dataset
})
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 80
    })
    eval: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 20
    })
})

In [5]:
from transformers import BertTokenizer, BertModel
import torch

# Load the MathBERT tokenizer and encoder from the Hugging Face Hub.
# `output_hidden_states` is a model option, not a tokenizer option, so it is
# not passed to the tokenizer.
tokenizer = BertTokenizer.from_pretrained('tbs17/MathBERT')
model = BertModel.from_pretrained("tbs17/MathBERT")

# Use the GPU when one is available and fall back to CPU otherwise, so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

### Trying on a single text

In [6]:
# Take the text of the first row as a smoke-test input.
first_row = df.iloc[0]
test_tex = first_row['text']

# Encode it as PyTorch tensors and move them to the model's device.
single_encoding = tokenizer(test_tex, return_tensors="pt")
tokens = single_encoding.to(device)

test_tex

'Solving $x \\equiv 2 \\mod 3$ and $x \\equiv 3 \\mod 5$ gives $x \\equiv 8 \\mod 15$.'

In [7]:
# Inference only: run the forward pass without tracking gradients.
with torch.no_grad():
    output = model(**tokens)

# Display the shape of the pooled output for the single input.
pooled_shape = output.pooler_output.shape
pooled_shape

torch.Size([1, 768])

### Applying to the whole dataset

In [8]:
def tokenize(batch, max_length=512):
    """Tokenize the ``text`` entries of a batch for use with ``Dataset.map``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode.
        max_length: Maximum number of tokens per sequence; longer inputs are
            truncated. Without an explicit limit the tokenizer reports that it
            has no predefined maximum length and performs no truncation.
            The default of 512 is the usual BERT limit -- TODO confirm against
            ``model.config.max_position_embeddings``.

    Returns:
        The tokenizer output as PyTorch tensors on ``device``, padded to the
        longest sequence in the batch.
    """
    return tokenizer(
        batch['text'],
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)

In [9]:
# Tokenize every split. batch_size=None is passed so that each split is handed
# to `tokenize` as one batch, which means its padding=True pads to the longest
# text of the whole split.
encoded = dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [10]:
# Display the tokenized DatasetDict to inspect the columns added by `tokenize`.
encoded

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 80
    })
    eval: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20
    })
})

In [11]:
def get_embed(batch):
    """Run the model on a tokenized batch and return its outputs as NumPy arrays.

    Only the entries of ``batch`` whose key is listed in
    ``tokenizer.model_input_names`` are forwarded to the model; every other
    column is ignored.

    Returns:
        A dict with ``"pooling_layer"`` (the model's ``pooler_output``) and
        ``"hidden_state"`` (the model's ``last_hidden_state``), both moved to
        the CPU and converted to NumPy.
    """
    model_keys = tokenizer.model_input_names

    # Convert the accepted columns to tensors on the model's device.
    inputs = {}
    for key, values in batch.items():
        if key in model_keys:
            inputs[key] = torch.asarray(values).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    pooled = outputs.pooler_output.cpu().numpy()
    hidden = outputs.last_hidden_state.cpu().numpy()
    return {"pooling_layer": pooled, "hidden_state": hidden}

In [12]:
import numpy as np

# Compute model outputs for every split of the tokenized dataset.
embeddings_dataset = encoded.map(get_embed, batched=True)

# Only the pooled vectors of the "eval" split are used for clustering.
eval_embeddings_split = embeddings_dataset['eval']
embeddings = np.array(eval_embeddings_split['pooling_layer'])

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [13]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import plotly.express as px

cluster_column = 'label'
text_column = 'text'


def _scatter_clusters(frame, color_column, legend_title, title):
    """Return an interactive scatter of the 2D projection coloured by ``color_column``."""
    return px.scatter(frame,
                      x='x',
                      y='y',
                      color=color_column,
                      hover_name='text',
                      title=title,
                      labels={color_column: legend_title,
                              'x': 'TSNE Dimension 1',
                              'y': 'TSNE Dimension 2'})


# One cluster per distinct ground-truth label when labels are available,
# otherwise one per sample; never more clusters than there are embeddings.
if cluster_column:
    n_clusters = len(dataset['eval'].unique(cluster_column))
else:
    n_clusters = dataset['eval'].num_rows
n_clusters = min(n_clusters, embeddings.shape[0])

kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
predicted_clusters = kmeans.fit_predict(embeddings)

# Add predicted clusters back to dataset['eval']. When the cell is re-run the
# column already exists, so drop it first; otherwise add_column fails and the
# stale assignments from the previous run would silently be kept.
if "predicted_cluster" in dataset['eval'].column_names:
    dataset['eval'] = dataset['eval'].remove_columns("predicted_cluster")
dataset['eval'] = dataset['eval'].add_column("predicted_cluster", predicted_clusters.tolist())

# Evaluate (optional): only possible when ground-truth labels exist.
if cluster_column:
    true_clusters = dataset['eval'][cluster_column]
    ari = adjusted_rand_score(true_clusters, predicted_clusters)
    nmi = normalized_mutual_info_score(true_clusters, predicted_clusters)
    print(f"Adjusted Rand Index: {ari:.4f}")
    print(f"Normalized Mutual Information: {nmi:.4f}")

# 2D projection using TSNE. The perplexity is capped just below the sample
# count so that small evaluation sets are accepted.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(embeddings) - 1))
reduced_embeddings = tsne.fit_transform(embeddings)

# Create the DataFrame for plotting. Cluster ids are converted to strings so
# plotly treats them as categories (discrete colours) instead of mapping the
# integers onto a continuous colour scale.
plot_df = pd.DataFrame({'x': reduced_embeddings[:, 0],
                        'y': reduced_embeddings[:, 1],
                        'text': dataset['eval'][text_column],
                        'expected_cluster': dataset['eval'][cluster_column] if cluster_column else ['Unknown'] * len(embeddings),
                        'predicted_cluster': [str(cluster_id) for cluster_id in predicted_clusters]})

# --- Plot 1: Interactive Expected Clusters ---
fig_expected = _scatter_clusters(plot_df,
                                 'expected_cluster',
                                 'Expected Cluster',
                                 'Interactive 2D Projection - Expected Clusters')
fig_expected.show()

# --- Plot 2: Interactive Predicted Clusters ---
fig_predicted = _scatter_clusters(plot_df,
                                  'predicted_cluster',
                                  'Predicted Cluster',
                                  'Interactive 2D Projection - Predicted Clusters')
fig_predicted.show()

Adjusted Rand Index: 0.0662
Normalized Mutual Information: 0.6137
