In [None]:
!pip install -U "transformers>=4.34.0" "datasets>=2.15.0" "accelerate>=0.23.0" \
    "pandas>=2.0.0" "numpy>=1.26.0" "scikit-learn>=1.3.0" \
    "joblib>=1.3.0" "pyarrow>=14.0.0" "fastparquet>=2023.8.0" "tqdm>=4.66.0"


Collecting transformers>=4.34.0
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=2.15.0
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pandas>=2.0.0
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.26.0
  Downloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn>=1.3.0
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pyarrow>=14.0.0
  Downloading pyarrow-22.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer
import pandas as pd

def load_parquet_dataset(parquet_path, text_field="code", label_field="label"):
    """
    Open a parquet file as a streaming Hugging Face dataset.

    Args:
        parquet_path: Value forwarded to ``load_dataset`` as ``data_files``.
        text_field: Name of the column holding the text/code.
        label_field: Name of the column holding the label.

    Returns:
        The streamed "train" split after a ``map`` that re-emits the text and
        label columns under their own names. Each example is indexed by both
        field names when the mapping is applied.

    Note:
        A later cell in this notebook defines a non-streaming function with
        the same name; once that cell runs, it replaces this one.
    """
    def _reemit_fields(example):
        # Look up both fields on the example and hand them back unchanged.
        return {
            text_field: example[text_field],
            label_field: example[label_field],
        }

    stream = load_dataset(
        "parquet",
        data_files=parquet_path,
        split="train",
        streaming=True,
    )
    return stream.map(_reemit_fields)


class CodeDatasetPreprocessor:
    """
    Tokenizes one text column of a Hugging Face dataset for transformer models.

    This variant asks the tokenizer to truncate and to pad every sequence to
    ``max_length``. A later cell in this notebook defines a class with the same
    name (without fixed-length padding); once that cell runs, it replaces this one.
    """

    def __init__(self, model_name_or_path, max_length=512, text_field="code"):
        """
        Args:
            model_name_or_path: Passed to ``AutoTokenizer.from_pretrained``.
            max_length: Token limit used for truncation and padding.
            text_field: Name of the column that holds the text to tokenize.
        """
        self.text_field = text_field
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    def tokenize_batch(self, examples):
        """Tokenize ``examples[self.text_field]`` and return the tokenizer's output."""
        texts = examples[self.text_field]
        tokenizer_options = dict(
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )
        return self.tokenizer(texts, **tokenizer_options)

    def prepare(self, dataset):
        """
        Apply tokenization to a Hugging Face Dataset (non-streaming).

        Runs ``tokenize_batch`` through ``dataset.map`` in batched mode and
        drops the raw text column from the result.
        """
        columns_to_drop = [self.text_field]
        return dataset.map(self.tokenize_batch, batched=True, remove_columns=columns_to_drop)


In [None]:
# src/metrics.py
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import math
from typing import Dict

def classification_metrics(preds, labels) -> Dict:
    """Return accuracy plus weighted precision, recall and F1.

    Args:
        preds: Either already-predicted labels (1-D) or a higher-rank array of
            scores, in which case the label is the argmax along axis 1.
        labels: Reference labels, passed to the sklearn metric functions.

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1".
    """
    import numpy as _np

    scores = _np.array(preds)
    # Anything with more than one dimension is treated as per-class scores.
    predicted = scores if scores.ndim <= 1 else scores.argmax(axis=1)

    accuracy = accuracy_score(labels, predicted)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        labels,
        predicted,
        average='weighted',
        zero_division=0,
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f_score,
    }

def perplexity(loss):
    """Return ``exp(loss)``, or ``inf`` when the exponential overflows.

    A diverged run can report a loss large enough that ``math.exp`` raises
    ``OverflowError``; reporting infinity keeps metric logging from crashing.
    """
    try:
        return math.exp(loss)
    except OverflowError:
        return float("inf")

In [None]:
# src/models.py
from transformers import AutoModelForSequenceClassification, AutoConfig

def get_sequence_classification_model(name_or_path, num_labels, from_pretrained=True):
    """Build a sequence-classification model with ``num_labels`` outputs.

    Args:
        name_or_path: Model identifier or path handed to the ``Auto*`` loaders.
        num_labels: Number of labels passed to the loader / config.
        from_pretrained: When truthy, load via
            ``AutoModelForSequenceClassification.from_pretrained``; otherwise
            load only the config and build the model with ``from_config``.

    Returns:
        The constructed model.
    """
    if not from_pretrained:
        # Config-only path: the model is created from the configuration alone.
        config = AutoConfig.from_pretrained(name_or_path, num_labels=num_labels)
        return AutoModelForSequenceClassification.from_config(config)

    return AutoModelForSequenceClassification.from_pretrained(
        name_or_path,
        num_labels=num_labels,
    )

In [None]:
# ==============================
# ✅ SETUP
# ==============================
!pip install -q transformers datasets accelerate evaluate scikit-learn

import os
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch, gc

# ==============================
# ✅ DATA HANDLING
# ==============================
def load_parquet_dataset(parquet_path, text_field="code", label_field="label"):
    """Load the "train" split of a parquet file and verify its required columns.

    Args:
        parquet_path: Value forwarded to ``load_dataset`` as ``data_files``.
        text_field: Column that must hold the text/code.
        label_field: Column that must hold the label.

    Returns:
        The loaded dataset, unchanged.

    Raises:
        ValueError: If ``text_field`` or ``label_field`` is not a column.
    """
    dataset = load_dataset("parquet", data_files=parquet_path)["train"]
    # Inspect the schema rather than the first row, so an empty file is
    # validated too instead of failing with an IndexError on dataset[0].
    columns = list(dataset.column_names)
    for field in (text_field, label_field):
        if field not in columns:
            raise ValueError(f"'{field}' not found in dataset columns: {columns}")
    return dataset


def subsample_dataset(dataset, max_size=12000, seed=42):
    """Cap a dataset at ``max_size`` rows (keeps Colab GPU runs manageable).

    Datasets with at most ``max_size`` rows are returned untouched. Larger
    ones are shuffled with ``seed`` and the first ``max_size`` rows of the
    shuffled result are selected.
    """
    if len(dataset) <= max_size:
        return dataset

    shuffled = dataset.shuffle(seed=seed)
    return shuffled.select(range(max_size))


# ==============================
# ✅ TOKENIZATION
# ==============================
class CodeDatasetPreprocessor:
    """Tokenizes one text column of a dataset with a pretrained tokenizer.

    Sequences are truncated to ``max_length`` but no padding is requested
    here; padding is left to whatever collates the batches.
    """

    def __init__(self, model_name_or_path, max_length=512, text_field="code"):
        """
        Args:
            model_name_or_path: Passed to ``AutoTokenizer.from_pretrained``.
            max_length: Token limit used for truncation.
            text_field: Name of the column that holds the text to tokenize.
        """
        self.text_field = text_field
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    def tokenize_batch(self, examples):
        """Tokenize ``examples[self.text_field]`` with truncation enabled."""
        texts = examples[self.text_field]
        tokenizer_options = dict(truncation=True, max_length=self.max_length)
        return self.tokenizer(texts, **tokenizer_options)

    def prepare(self, dataset):
        """Map ``tokenize_batch`` over ``dataset`` in batches and drop the text column."""
        columns_to_drop = [self.text_field]
        return dataset.map(
            self.tokenize_batch,
            batched=True,
            remove_columns=columns_to_drop,
        )


# ==============================
# ✅ METRICS
# ==============================
def compute_metrics(eval_pred):
    """Turn an evaluation ``(logits, labels)`` pair into summary metrics.

    The predicted class is the argmax over the last axis of the logits.
    Returns a dict with "accuracy" and weighted "precision", "recall", "f1".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy = accuracy_score(gold, predicted)
    precision, recall, f1_score, _support = precision_recall_fscore_support(
        gold,
        predicted,
        average="weighted",
        zero_division=0,
    )
    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1_score)


# ==============================
# ✅ MAIN TRAINING FUNCTION
# ==============================
def main():
    """Fine-tune CodeBERT for sequence classification on parquet data from Drive.

    Steps: load the training/validation parquet files, subsample them,
    tokenize the "code" column, train with the Hugging Face ``Trainer``
    (evaluating and checkpointing every epoch, reloading the checkpoint with
    the best weighted F1 at the end), then write the model and tokenizer to
    ``out_dir``.
    """
    model_name = "microsoft/codebert-base"
    train_path = "/content/drive/MyDrive/trainingset.parquet"
    val_path = "/content/drive/MyDrive/validationset.parquet"
    out_dir = "/content/drive/MyDrive/experiments_codebert"
    num_labels = 11
    epochs = 15
    batch_size = 8
    grad_accum = 8           # effective batch per device = batch_size * grad_accum = 64
    lr = 1.5e-5
    # Mixed precision needs a CUDA device; fall back to fp32 on a CPU runtime
    # instead of failing when TrainingArguments is constructed.
    use_cuda = torch.cuda.is_available()

    os.makedirs(out_dir, exist_ok=True)
    print("📂 Loading parquet datasets...")
    train_ds = load_parquet_dataset(train_path)
    val_ds = load_parquet_dataset(val_path)

    print("📉 Subsampling datasets (if large)...")
    train_ds = subsample_dataset(train_ds, max_size=12000)
    val_ds = subsample_dataset(val_ds, max_size=4000)

    print("🔧 Tokenizing datasets...")
    preproc = CodeDatasetPreprocessor(model_name, max_length=512, text_field="code")
    train_tok = preproc.prepare(train_ds)
    val_tok = preproc.prepare(val_ds)

    print("🤖 Initializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # Trade compute for memory: activations are recomputed during backward.
    model.gradient_checkpointing_enable()

    args = TrainingArguments(
        output_dir=out_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.05,
        logging_steps=200,
        fp16=use_cuda,
        gradient_accumulation_steps=grad_accum,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        report_to="none",
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        optim="adamw_torch",           # plain PyTorch AdamW (the fused variant is "adamw_torch_fused")
        dataloader_num_workers=2,
        push_to_hub=False,
    )

    # The tokenizer step does not pad, so batches are padded here per batch.
    data_collator = DataCollatorWithPadding(tokenizer=preproc.tokenizer)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
        processing_class=preproc.tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Release anything left over from earlier cells before training starts.
    gc.collect()
    if use_cuda:
        torch.cuda.empty_cache()

    print("🚀 Starting fine-tuning with CodeBERT...")
    trainer.train()

    # load_best_model_at_end=True, so the model saved here is the best-F1 one.
    trainer.save_model(out_dir)
    preproc.tokenizer.save_pretrained(out_dir)
    print(f"✅ Training complete. Best model saved to {out_dir}")


if __name__ == "__main__":
    main()

📂 Loading parquet datasets...


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

📉 Subsampling datasets (if large)...
🔧 Tokenizing datasets...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

🤖 Initializing model...


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Starting fine-tuning with CodeBERT...


  trainer = Trainer(


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.491492,0.88575,0.784553,0.88575,0.832086
2,1.195300,0.430036,0.88225,0.877706,0.88225,0.873736
3,0.430800,0.346289,0.90825,0.87109,0.90825,0.887177
4,0.333900,0.305672,0.90575,0.894839,0.90575,0.898539
5,0.293200,0.320327,0.90225,0.894353,0.90225,0.89418
6,0.266400,0.300815,0.90625,0.899094,0.90625,0.900478
7,0.234100,0.350966,0.897,0.909891,0.897,0.898337
8,0.203700,0.347339,0.89725,0.910762,0.89725,0.902564
9,0.192200,0.332239,0.9035,0.909423,0.9035,0.903959
10,0.162500,0.340069,0.9055,0.911377,0.9055,0.90538


✅ Training complete. Best model saved to /content/drive/MyDrive/experiments_codebert


In [None]:
# src/predict_binary.py
from transformers import logging
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from datasets import Dataset
import os

os.environ["WANDB_DISABLED"] = "true"
logging.set_verbosity_error()

def main(
    model_dir="/content/drive/MyDrive/experiments_codebert/checkpoint-1880",
    test_path="/content/drive/MyDrive/test.parquet",
    output_csv="/content/drive/MyDrive/taskb.csv",
    max_length=512,
):
    """Predict labels for a test parquet file and write an ID/Label CSV.

    Args:
        model_dir: Directory holding the fine-tuned model and tokenizer.
        test_path: Parquet file with a "code" column to classify.
        output_csv: Destination of the two-column submission file.
        max_length: Truncation length for tokenization. Defaults to 512 to
            match the ``max_length=512`` used when tokenizing for training in
            this notebook (this cell previously truncated at 256).
    """
    # Imported here so the cell works without relying on another cell's imports.
    from transformers import DataCollatorWithPadding

    # --- Step 2: Load test data ---
    print(f"📂 Loading test data from {test_path} ...")
    test_df = pd.read_parquet(test_path).reset_index(drop=True)
    # IDs are the row positions after the index reset.
    test_df["ID"] = test_df.index

    # --- Step 3: Load tokenizer and model ---
    print(f"🚀 Loading model and tokenizer from {model_dir} ...")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)

    # --- Step 4: Tokenize test data ---
    print("🔡 Tokenizing test data ...")

    def preprocess(batch):
        # Truncate only; padding is applied per batch by the collator below.
        return tokenizer(batch["code"], truncation=True, max_length=max_length)

    test_ds = Dataset.from_pandas(test_df)
    test_tokenized = test_ds.map(preprocess, batched=True)

    # --- Step 5: Initialize Trainer for prediction ---
    trainer = Trainer(
        model=model,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )

    # --- Step 6: Predict ---
    print("🤖 Running predictions ...")
    preds = trainer.predict(test_tokenized)
    logits = preds.predictions
    # One row of class scores per example; take the highest-scoring class.
    y_pred = logits.argmax(axis=1)

    # --- Step 7: Save ONLY ID & predicted label ---
    submission = pd.DataFrame({
        "ID": test_df["ID"],
        "Label": y_pred
    })
    submission.to_csv(output_csv, index=False)
    print(f"✅ Predictions saved to {output_csv} (shape: {submission.shape})")
if __name__ == "__main__":
    main()

📂 Loading test data from /content/drive/MyDrive/test.parquet ...
🚀 Loading model and tokenizer from /content/drive/MyDrive/experiments_codebert/checkpoint-1880 ...
🔡 Tokenizing test data ...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

🤖 Running predictions ...




✅ Predictions saved to /content/drive/MyDrive/taskb.csv (shape: (1000, 2))


In [None]:
# src/predict_binary.py
from transformers import logging
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from datasets import Dataset
import os

os.environ["WANDB_DISABLED"] = "true"
logging.set_verbosity_error()

def main(
    model_dir="/content/drive/MyDrive/experiments_codebert/checkpoint-1880",
    test_path="/content/drive/MyDrive/Test.parquet",
    output_csv="/content/drive/MyDrive/taskb.csv",
    max_length=512,
):
    """Predict labels for a test parquet file and write an ID/Label CSV.

    Args:
        model_dir: Directory holding the fine-tuned model and tokenizer.
        test_path: Parquet file with a "code" column to classify.
        output_csv: Destination of the two-column submission file.
        max_length: Truncation length for tokenization. Defaults to 512 to
            match the ``max_length=512`` used when tokenizing for training in
            this notebook (this cell previously truncated at 256).
    """
    # Imported here so the cell works without relying on another cell's imports.
    from transformers import DataCollatorWithPadding

    # --- Step 2: Load test data ---
    print(f"📂 Loading test data from {test_path} ...")
    test_df = pd.read_parquet(test_path)
    # IDs are taken from the frame's index as loaded (the index is not reset).
    # NOTE(review): the other prediction cell resets the index first; confirm
    # which ID convention the submission expects.
    test_df["ID"] = test_df.index

    # --- Step 3: Load tokenizer and model ---
    print(f"🚀 Loading model and tokenizer from {model_dir} ...")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)

    # --- Step 4: Tokenize test data ---
    print("🔡 Tokenizing test data ...")

    def preprocess(batch):
        # Truncate only; padding is applied per batch by the collator below.
        return tokenizer(batch["code"], truncation=True, max_length=max_length)

    # The index is already copied into "ID", so it need not be stored again.
    test_ds = Dataset.from_pandas(test_df, preserve_index=False)
    test_tokenized = test_ds.map(preprocess, batched=True)

    # --- Step 5: Initialize Trainer for prediction ---
    trainer = Trainer(
        model=model,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )

    # --- Step 6: Predict ---
    print("🤖 Running predictions ...")
    preds = trainer.predict(test_tokenized)
    logits = preds.predictions
    # One row of class scores per example; take the highest-scoring class.
    y_pred = logits.argmax(axis=1)

    # --- Step 7: Save ONLY ID & predicted label ---
    submission = pd.DataFrame({
        "ID": test_df["ID"],
        "Label": y_pred
    })
    submission.to_csv(output_csv, index=False)
    print(f"✅ Predictions saved to {output_csv} (shape: {submission.shape})")
if __name__ == "__main__":
    main()

📂 Loading test data from /content/drive/MyDrive/Test.parquet ...
🚀 Loading model and tokenizer from /content/drive/MyDrive/experiments_codebert/checkpoint-1880 ...
🔡 Tokenizing test data ...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

🤖 Running predictions ...




✅ Predictions saved to /content/drive/MyDrive/taskb.csv (shape: (1000, 2))
