In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
import ast

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [6]:
train_df = pd.read_csv("train_df.csv")
test_df = pd.read_csv("test_df.csv")
eval_df = pd.read_csv("eval_df.csv")
train_df_ft = pd.read_csv("train_df_ft.csv")

In [7]:
train_df = train_df[train_df["tags_filtered"].notna()]

In [8]:
from huggingface_hub import login
login(new_session=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
model_dir = "/content/drive/MyDrive/code_classification_dataset/finetuning/best_run_v3"

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.eval()

id2label = model.config.id2label
print(id2label)

{0: 'games', 1: 'geometry', 2: 'graphs', 3: 'math', 4: 'number theory', 5: 'probabilities', 6: 'strings', 7: 'trees'}


In [25]:
model_name = "Mallard74/codebert-xcode-tags-classification"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [26]:
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [15]:
BATCH_SIZE = 32
MAX_LEN = 512
THRESHOLD = 0.5

In [None]:
focus_tags_eval = ['math', 'graphs', 'strings', 'number theory',
               'trees', 'geometry', 'games', 'probabilities']

In [None]:
def parse_tags(x):
    """Convert \"['math', 'graphs']\" -> ['math', 'graphs']."""
    if isinstance(x, str):
        return ast.literal_eval(x)
    return x

def make_input(row):
    # Simple concat with separator
    return f"[DESC] {row['description_clean']} [CODE] {row['code_clean']}"

def encode_batch(rows):
    texts = [make_input(row) for _, row in rows.iterrows()]
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    )
def probs_to_tag_lists(probs, focus_tags, threshold=0.5, force_one=True):
    """
    Convert probabilities (n_samples, n_labels) to list of tag lists.
    """
    tag_lists = []
    for p in probs:
        indices = np.where(p >= threshold)[0]
        if len(indices) == 0 and force_one:
            indices = [int(np.argmax(p))]
        tag_lists.append([focus_tags_eval[i] for i in indices])
    return tag_lists




In [31]:
text_col = ["description_clean" , "code_clean"]
tags_col = "tags_filtered"

In [14]:
y_true_tags = [parse_tags(t) for t in test_df[tags_col]]

In [None]:
all_probs = []

with torch.no_grad():
    for start in range(0, len(test_df), BATCH_SIZE):
        batch_df = test_df.iloc[start:start + BATCH_SIZE]   
        batch_texts = [make_input(row) for _, row in batch_df.iterrows()]

        enc = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        enc = {k: v.to(device) for k, v in enc.items()}

        outputs = model(**enc)
        logits = outputs.logits
        batch_probs = torch.sigmoid(logits).cpu().numpy()
        all_probs.append(batch_probs)

probs = np.concatenate(all_probs, axis=0)

In [None]:
y_pred_tags = probs_to_tag_lists(probs, focus_tags_eval, threshold=0.5)

mlb = MultiLabelBinarizer(classes=focus_tags_eval)
mlb.fit([[]])  

Y_true = mlb.transform(y_true_tags)
Y_pred = mlb.transform(y_pred_tags)

print("Y_true shape:", Y_true.shape)
print("Y_pred shape:", Y_pred.shape)

Y_true shape: (531, 8)
Y_pred shape: (531, 8)


In [None]:
# baseline
y_true = np.array(Y_true)
y_pred = np.array(Y_pred)

accuracy = accuracy_score(y_true,y_pred)
f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
recall = recall_score(y_true, y_pred, average='micro', zero_division=0)

print(f" f1: {f1}| accuracy: {accuracy} | precision: {precision} | recall: {recall}")

print("\nPer-tag report:")
print(classification_report(y_true, y_pred, target_names=focus_tags_eval, zero_division=0))

 f1: 0.6764227642276422| accuracy: 0.5404896421845574 | precision: 0.7577413479052824 | recall: 0.6108663729809104

Per-tag report:
               precision    recall  f1-score   support

         math       0.73      0.90      0.81       282
       graphs       0.72      0.62      0.67       108
      strings       0.89      0.79      0.84        84
number theory       0.00      0.00      0.00        70
        trees       0.88      0.35      0.51        65
     geometry       1.00      0.15      0.26        33
        games       0.00      0.00      0.00        21
probabilities       0.00      0.00      0.00        18

    micro avg       0.76      0.61      0.68       681
    macro avg       0.53      0.35      0.38       681
 weighted avg       0.66      0.61      0.60       681
  samples avg       0.76      0.66      0.69       681



In [None]:
from sklearn.metrics import multilabel_confusion_matrix

cm = multilabel_confusion_matrix(y_true, y_pred)

for i, tag in enumerate(focus_tags_eval):
    tn, fp, fn, tp = cm[i].ravel()
    print(f"\nTag: {tag}")
    print(f"TP: {tp} | FP: {fp} | FN: {fn} | TN: {tn}")


Tag: math
TP: 255 | FP: 96 | FN: 27 | TN: 153

Tag: graphs
TP: 67 | FP: 26 | FN: 41 | TN: 397

Tag: strings
TP: 66 | FP: 8 | FN: 18 | TN: 439

Tag: number theory
TP: 0 | FP: 0 | FN: 70 | TN: 461

Tag: trees
TP: 23 | FP: 3 | FN: 42 | TN: 463

Tag: geometry
TP: 5 | FP: 0 | FN: 28 | TN: 498

Tag: games
TP: 0 | FP: 0 | FN: 21 | TN: 510

Tag: probabilities
TP: 0 | FP: 0 | FN: 18 | TN: 513


In [None]:
#best run v3 (on full train set)

y_true = np.array(Y_true)
y_pred = np.array(Y_pred)

accuracy = accuracy_score(y_true,y_pred)
f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
recall = recall_score(y_true, y_pred, average='micro', zero_division=0)

print(f" f1: {f1}| accuracy: {accuracy} | precision: {precision} | recall: {recall}")

print("\nPer-tag report:")
print(classification_report(y_true, y_pred, target_names=focus_tags_eval, zero_division=0))

 f1: 0.7588075880758808| accuracy: 0.5536723163841808 | precision: 0.7044025157232704 | recall: 0.8223201174743024

Per-tag report:
               precision    recall  f1-score   support

         math       0.83      0.82      0.82       282
       graphs       0.58      0.83      0.68       108
      strings       0.84      0.88      0.86        84
number theory       0.57      0.76      0.65        70
        trees       0.61      0.85      0.71        65
     geometry       0.80      0.85      0.82        33
        games       0.67      0.76      0.71        21
probabilities       0.42      0.78      0.55        18

    micro avg       0.70      0.82      0.76       681
    macro avg       0.67      0.82      0.73       681
 weighted avg       0.73      0.82      0.77       681
  samples avg       0.76      0.85      0.78       681



In [None]:
from sklearn.metrics import multilabel_confusion_matrix

cm = multilabel_confusion_matrix(y_true, y_pred)

for i, tag in enumerate(focus_tags_eval):
    tn, fp, fn, tp = cm[i].ravel()
    print(f"\nTag: {tag}")
    print(f"TP: {tp} | FP: {fp} | FN: {fn} | TN: {tn}")



Tag: math
TP: 225 | FP: 53 | FN: 57 | TN: 196

Tag: graphs
TP: 87 | FP: 56 | FN: 21 | TN: 367

Tag: strings
TP: 70 | FP: 15 | FN: 14 | TN: 432

Tag: number theory
TP: 51 | FP: 43 | FN: 19 | TN: 418

Tag: trees
TP: 58 | FP: 43 | FN: 7 | TN: 423

Tag: geometry
TP: 29 | FP: 6 | FN: 4 | TN: 492

Tag: games
TP: 16 | FP: 9 | FN: 5 | TN: 501

Tag: probabilities
TP: 13 | FP: 23 | FN: 5 | TN: 490


In [None]:
y_true_int = y_true.astype(int)
y_pred_int = y_pred.astype(int)

conf_8x8 = y_true_int.T @ y_pred_int   

conf_df = pd.DataFrame(
    conf_8x8,
    index=[f"true_{t}" for t in focus_tags_eval],
    columns=[f"pred_{t}" for t in focus_tags_eval]
)

print(conf_df)

                    pred_math  pred_graphs  pred_strings  pred_number theory  \
true_math                 255           15            10                   0   
true_graphs                38           67             2                   0   
true_strings               18            0            66                   0   
true_number theory         64            4             1                   0   
true_trees                 13           42             2                   0   
true_geometry              24            3             1                   0   
true_games                 16            2             2                   0   
true_probabilities         16            2             0                   0   

                    pred_trees  pred_geometry  pred_games  pred_probabilities  
true_math                    1              1           0                   0  
true_graphs                 12              0           0                   0  
true_strings                 0         