# Exploration

In [None]:
import pandas as pd
df = pd.read_excel("./data/output_test.xlsx")
print(df.head(10))
print(df.columns)
print(df.dtypes)
print(df.info())

                                 aid                        categories  \
0  http://arxiv.org/abs/2504.15253v1                       cs.CL,cs.LG   
1  http://arxiv.org/abs/2504.01456v1                       astro-ph.GA   
2  http://arxiv.org/abs/2504.02811v1                             cs.IR   
3  http://arxiv.org/abs/2504.02637v1                             cs.NI   
4  http://arxiv.org/abs/2504.15250v1  cond-mat.stat-mech,cond-mat.soft   
5  http://arxiv.org/abs/2504.02646v1         cs.LG,cs.AI,cs.IR,stat.ML   
6  http://arxiv.org/abs/2504.02538v1                       astro-ph.GA   
7  http://arxiv.org/abs/2504.01395v1                       cs.CR,cs.AI   
8  http://arxiv.org/abs/2504.02539v1                           math.HO   
9  http://arxiv.org/abs/2504.16528v1                             cs.GT   

        main_category             published  \
0               cs.CL  2025-04-21T17:33:23Z   
1         astro-ph.GA  2025-04-02T08:10:05Z   
2               cs.IR  2025-04-03T17:55:12Z 

In [35]:
# No missings
print(df.isna().sum())

aid              0
categories       0
main_category    0
published        0
summary          0
title            0
dtype: int64


In [36]:
# Lot of Class imbalance, most come from computer science (computer vision and pattern recognition the most)

print(df.value_counts("categories", sort=True))
print(df.value_counts("main_category", sort=True)) #145 differnet categories

categories
cs.CV                                                                           861
quant-ph                                                                        318
cs.CL                                                                           283
cs.LG                                                                           263
math.AP                                                                         213
                                                                               ... 
astro-ph.HE,astro-ph.CO,astro-ph.SR,hep-ph,nucl-th                                1
astro-ph.EP,physics.soc-ph                                                        1
astro-ph.GA,astro-ph.CO,astro-ph.IM,astro-ph.SR,physics.ed-ph,physics.soc-ph      1
astro-ph.GA,astro-ph.CO,astro-ph.SR                                               1
stat.CO,62-08                                                                     1
Name: count, Length: 2580, dtype: int64
main_category
cs.CV      

In [None]:
df_summary_title = df.loc[:, ["title","summary", "categories"]].copy()
print(df_summary_title.head(10))

                                                                                                                           title  \
0                       Evaluating Judges as Evaluators: The JETTS Benchmark of LLM-as-Judges as\n  Test-Time Scaling Evaluators   
1                               Multiscale exploration of SMACS J0723.3--7327's intracluster light and\n  past dynamical history   
2  An Assessment of the CO2 Emission Reduction Potential of Residential\n  Load Management in Developing and Developed Countries   
3                                                           Medium Access for Push-Pull Data Transmission in 6G Wireless Systems   
4                                            Tracer dynamics in an interacting active bath: fluctuations and energy\n  partition   
5                                                                                    Prompt Optimization with Logged Bandit Data   
6                                              Detection of Deuterated Hydro

In [38]:
df['combined'] = df['title'] + " [SUMMARY] " + df['summary']
print(df[['combined']].head(10))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [44]:
from sklearn.model_selection import train_test_split

# Only keep needed columns
newdf = df[['combined','main_category', 'categories']].copy()
print(newdf.head(10))

# Preparation for multiclass main category classification
# Create a new column 'main_cat_main' based on the value in 'main_category'
def map_main_category(value):
    prefix_map = {
        "cs.": "cs",
        "econ.": "econ",
        "eess.": "eess",
        "math.": "math",
        "q-bio.": "q-bio",
        "q-fin.": "q-fin",
        "stat.": "stat"
    }

    for prefix, category in prefix_map.items():
        if value.startswith(prefix):
            return category

    physics_prefixes = [
        "astro-", "cond-mat", "gr-", "hep-", "math-", "nlin.",
        "nucl-", "physics.", "quant-"
    ]
    if any(value.startswith(prefix) for prefix in physics_prefixes):
        return "physics"

    return "other"

newdf['main_cat_main'] = newdf['main_category'].apply(map_main_category)
print(newdf.head(10))
print(newdf['main_cat_main'].value_counts())


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [46]:
newdf["combined"] = newdf["combined"].str.replace(r"[^a-zA-Z\s]", "", regex=True)
newdf["combined"] = newdf["combined"].str.replace(r"\s+", " ", regex=True)

pd.set_option("display.max_colwidth", None)
print(newdf)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [48]:
# Perform train-test-split on newdf, ensure to add stratify to keep class imbalance intact
train_df, test_df = train_test_split(newdf, test_size=0.2, random_state=42, stratify=newdf['main_cat_main'])
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df['main_cat_main'])
print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Validation set shape: {val_df.shape}")

Train set shape: (8895, 4)
Test set shape: (2471, 4)
Validation set shape: (989, 4)


In [50]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

CUDA available: True
GPU: NVIDIA GeForce RTX 4060 Laptop GPU


In [52]:
from transformers import pipeline
llm = pipeline(model="distilbert/distilbert-base-uncased",device=0)
print(llm.model.config.is_decoder)
print(llm.model.config.is_encoder_decoder)
print(llm.model)
print(llm.model.config)

Device set to use cuda:0


False
False
DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout):

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix, classification_report
import numpy as np
from datasets import Dataset
from transformers import TrainerCallback
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import confusion_matrix, classification_report



unique_categories = newdf['main_cat_main'].unique().tolist()
num_labels = len(unique_categories)
print(f"Number of unique categories: {num_labels}")

# Use transform to apply the same mapping to the test and validation set
label_encoder = LabelEncoder()
train_df['labels'] = label_encoder.fit_transform(train_df['main_cat_main'])
for df in [test_df, val_df]:
    df['labels'] = label_encoder.transform(df['main_cat_main'])

# Use this to easy trainer and to interpret predictions
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}

print(f"\nNumber of unique categories in newdf: {num_labels}")
print(f"Label to ID mapping (example): {label2id}")
print(f"ID to Label mapping (example):: {id2label}")

# Transform the DataFrames to Hugging Face datasets, to speed up training and evaluation when tokenizing (used later in trainer)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

# Load model and tokenizer
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label=id2label, label2id=label2id)

# Tokenization function for Dataset.map() method to speed up tokenization, maybe add max_length=64 to speed up further?
def tokenize_function(examples):
    return tokenizer(examples['combined'], padding=True, truncation=True)

# Apply function to datasets.map() to tokenize on batch size. Allready tokenize by batch to speed up
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True,\
                                             remove_columns=['combined', 'main_category', 'categories', 'main_cat_main'])
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, \
                                           remove_columns=['combined', 'main_category', 'categories', 'main_cat_main'])
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True, \
                                        remove_columns=['combined', 'main_category', 'categories', 'main_cat_main'])

# compute_metrics function for Trainer: class imbalance, so weighted f1 (average of precision and recall) and confusion matrix are best metrics to use
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    weighted_f1 = f1_score(labels, predictions, average='weighted')
    return {
        'weighted_f1': weighted_f1
    }

# Hyperparameter Search Space
def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 6),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16]),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3),
    }

lr = 2e-5
batch_size = 8
num_epochs = 10 

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_dir="./results",
    logging_strategy="epoch",
    save_strategy="epoch",
    eval_strategy="epoch",
    metric_for_best_model="eval_weighted_f1",
    load_best_model_at_end=True,
    weight_decay=0.01
)


def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )

# We have class imbalance, so weighted f1 (average of precision and recall) and confusion matrix are best metrics to use
## Class imbalance: use class weights
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer, 
    compute_metrics= compute_metrics,
)
trainer.train()
print("\nStart training...")
print("Training completed")


def compute_objective(metrics):
    return metrics["eval_weighted_f1"]

# Run Hyperparameter Tuning 
print("\nStarting hyperparameter tuning...")
best_run = trainer.hyperparameter_search(
    direction="maximize",
    hp_space=hp_space,
    compute_objective=compute_objective,
    n_trials=10
)

print("\nBest hyperparameters found:")
print(best_run.hyperparameters)

# Update Trainer Arguments with Best Parameters
trainer.args.learning_rate = best_run.hyperparameters["learning_rate"]
trainer.args.num_train_epochs = best_run.hyperparameters["num_train_epochs"]
trainer.args.per_device_train_batch_size = best_run.hyperparameters["per_device_train_batch_size"]
trainer.args.weight_decay = best_run.hyperparameters["weight_decay"]

# Retrain Model with Best Hyperparameters
print("\nRetraining with best hyperparameters...")
trainer.train()
print("Training completed.")

# Evaluate on Test Set
predictions = trainer.predict(tokenized_test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

print("\nClassification Report on the Testset:")
print(classification_report(true_labels, predicted_labels, target_names=label_encoder.classes_))

print("\nConfusion Matrix on the Testset:")
print(confusion_matrix(true_labels, predicted_labels))

Number of unique categories: 8

Aantal unieke categorieën in newdf: 8
Label to ID mapping (voorbeeld): {'cs': 0, 'econ': 1, 'eess': 2, 'math': 3, 'physics': 4, 'q-bio': 5, 'q-fin': 6, 'stat': 7}
ID to Label mapping (voorbeeld): {0: 'cs', 1: 'econ', 2: 'eess', 3: 'math', 4: 'physics', 5: 'q-bio', 6: 'q-fin', 7: 'stat'}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8895 [00:00<?, ? examples/s]

Map:   0%|          | 0/2471 [00:00<?, ? examples/s]

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.5654,0.443555,0.862031
2,0.354,0.453476,0.869086
3,0.2396,0.485263,0.884562
4,0.1421,0.582506,0.884837
5,0.0827,0.658291,0.886622
6,0.0459,0.757856,0.878606
7,0.0291,0.834854,0.872926
8,0.0161,0.834629,0.87832
9,0.011,0.847642,0.882728
10,0.0032,0.857472,0.881196



Start training...
Training completed

Starting hyperparameter tuning...


[I 2025-05-28 18:42:27,784] A new study created in memory with name: no-name-d82231a6-d777-42da-978d-04dc34475896
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.6153,0.488472,0.861173
2,0.4096,0.509951,0.877071
3,0.2787,0.581369,0.879998
4,0.1697,0.58674,0.893691
5,0.0882,0.652961,0.887212
6,0.0511,0.700268,0.888797


[I 2025-05-28 19:09:48,747] Trial 0 finished with value: 0.888797317050634 and parameters: {'learning_rate': 1.935919576485628e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 4, 'weight_decay': 0.10127885994499723}. Best is trial 0 with value: 0.888797317050634.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.6349,0.442601,0.881121
2,0.414,0.497282,0.886976
3,0.2682,0.596448,0.879952
4,0.158,0.691605,0.883905
5,0.0775,0.787576,0.878586
6,0.0414,0.806898,0.877187


[I 2025-05-28 19:36:57,286] Trial 1 finished with value: 0.8771869472898052 and parameters: {'learning_rate': 4.683859944137985e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 4, 'weight_decay': 0.03998220005599627}. Best is trial 0 with value: 0.888797317050634.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.5934,0.415661,0.856208
2,0.3491,0.376013,0.876857
3,0.2437,0.376916,0.890072
4,0.1605,0.388078,0.89346
5,0.1032,0.452698,0.884931
6,0.0729,0.457719,0.89014


[I 2025-05-28 19:59:48,056] Trial 2 finished with value: 0.8901403520274012 and parameters: {'learning_rate': 1.817238607053195e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 16, 'weight_decay': 0.27776385810057325}. Best is trial 2 with value: 0.8901403520274012.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.6153,0.486319,0.860993
2,0.4138,0.499511,0.881484
3,0.2926,0.561316,0.884838
4,0.1866,0.585288,0.888813
5,0.1031,0.658497,0.888448
6,0.0702,0.671296,0.88655


[I 2025-05-28 20:27:01,961] Trial 3 finished with value: 0.886550277878506 and parameters: {'learning_rate': 1.3802174516551864e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 4, 'weight_decay': 0.148664915246595}. Best is trial 2 with value: 0.8901403520274012.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.6077,0.472926,0.8654
2,0.3769,0.431755,0.879


[I 2025-05-28 20:36:12,114] Trial 4 finished with value: 0.8790004038908618 and parameters: {'learning_rate': 1.947054786272091e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 4, 'weight_decay': 0.004276486272880864}. Best is trial 2 with value: 0.8901403520274012.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.5942,0.448246,0.857493


[I 2025-05-28 20:40:16,888] Trial 5 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.5584,0.423271,0.869826


[I 2025-05-28 20:44:22,214] Trial 6 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.6027,0.440434,0.849595


[I 2025-05-28 20:48:14,215] Trial 7 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.581,0.447782,0.861628


[I 2025-05-28 20:52:19,100] Trial 8 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.6173,0.449224,0.845194


[I 2025-05-28 20:56:09,550] Trial 9 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Best hyperparameters found:
{'learning_rate': 1.817238607053195e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 16, 'weight_decay': 0.27776385810057325}

Retraining with best hyperparameters...


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.5934,0.415776,0.856208
2,0.3491,0.376725,0.876857
3,0.2437,0.377133,0.888093
4,0.1607,0.389097,0.892425
5,0.1031,0.453437,0.884991
6,0.073,0.458862,0.89014


Training completed.



Classification Report on the Testset:
              precision    recall  f1-score   support

          cs       0.91      0.91      0.91      1051
        econ       0.50      0.29      0.37        17
        eess       0.55      0.58      0.57       105
        math       0.83      0.86      0.85       423
     physics       0.94      0.95      0.94       795
       q-bio       0.50      0.30      0.38        20
       q-fin       0.86      0.67      0.75         9
        stat       0.60      0.51      0.55        51

    accuracy                           0.88      2471
   macro avg       0.71      0.63      0.66      2471
weighted avg       0.88      0.88      0.88      2471


Confusion Matrix on the Testset:
[[955   0  34  32  23   1   0   6]
 [  4   5   1   2   1   1   0   3]
 [ 34   0  61   8   2   0   0   0]
 [ 28   3   7 363  17   0   0   5]
 [ 14   0   4  21 752   3   0   1]
 [  9   0   1   1   2   6   0   1]
 [  0   1   0   1   0   0   6   1]
 [ 11   1   2   7   2   1   1  