In [None]:
# Pin to the commit this notebook was last run against (it built transformers 4.48.0.dev0)
# so that re-running the notebook installs the same code instead of whatever is on main today.
!pip install git+https://github.com/huggingface/transformers.git@919220dab1e29f4d04eacd61a197a45a4fec2613

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-xbmzy7tg
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-xbmzy7tg
  Resolved https://github.com/huggingface/transformers.git to commit 919220dab1e29f4d04eacd61a197a45a4fec2613
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10328720 sha256=0c790a3026565bcdb083659a94e424a6b20d037fc057da50efb5714ce55e3818
  Stored in directory: /tmp/pip-ephem-wheel-cache-r0n2u7xn/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16
Successfully b

In [None]:
# Pin to the version resolved when this notebook was last run, for reproducible re-runs.
!pip install datasets==3.2.0

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from huggingface_hub import login

import os
from getpass import getpass

# Never paste the token into the notebook source: it would be saved (and shared) with the file.
# Prefer the HF_TOKEN environment variable; otherwise prompt for it without echoing the input.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

login(
    token=hf_token,
    add_to_git_credential=True,  # also store the token in the git credential helper
)


In [None]:
import random
import numpy as np
import torch

def set_seeds(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG, and PyTorch
    (the default generator and all CUDA devices), then switches cuDNN to
    deterministic mode with auto-tuning (benchmark) disabled.

    :param seed: integer seed applied to every generator (default 42)
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # for multi-GPU
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN autotuning for deterministic kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs once, up front, with the default seed (42) before any data sampling or model setup.
set_seeds()

In [None]:
import zipfile
import os

# Location of the competition archive on the mounted Google Drive
zip_path = '/content/drive/My Drive/Colab Notebooks/Datasets/MLDSLatestComptetion.zip'

# Destination directory for the extracted files
extract_to = '/content/'  # Current working directory in Colab

# Extract every member of the archive; the context manager closes the zip file afterwards
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("Unzipping completed.")

Unzipping completed.


In [None]:
# import torch
# from transformers import pipeline
# from pprint import pprint

# pipe = pipeline(
#     "text-classification",
#     model="answerdotai/ModernBERT-base",
#     torch_dtype=torch.bfloat16,
# )

# input_text = "He walked to the Bar."
# results = pipe(input_text)
# pprint(results)

## Helper Functions

In [None]:
def find_all_linear_names(model):
    """Return the distinct leaf names of every ``torch.nn.Linear`` submodule.

    Walks ``model.named_modules()``, keeps only the last component of each
    dotted module path (e.g. ``"layers.0.attn.Wqkv"`` -> ``"Wqkv"``) and
    drops ``"lm_head"`` if it is present.

    :param model: module whose submodules are inspected
    :return: list of unique leaf names (order is not guaranteed)
    """
    linear_cls = torch.nn.Linear
    leaf_names = {
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_cls)
    }

    # needed for 16-bit: the output head is excluded from the result
    leaf_names.discard('lm_head')
    return list(leaf_names)

In [None]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: object exposing ``named_parameters()``; each parameter must
        provide ``numel()`` and ``requires_grad``
    :param use_4bit: when True the trainable count is halved before printing
        (NOTE(review): behaviour kept from the original; the rationale for the
        halving is not visible in this notebook — confirm)
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Integer division keeps the count an int; true division produced a
        # float, which made the ``:,d`` format below raise ValueError.
        trainable_params //= 2

    # Guard against a model with no parameters instead of dividing by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [None]:
def create_peft_config(modules):
    """
    Build the LoRA configuration used for parameter-efficient fine-tuning.

    NOTE(review): ``LoraConfig`` is not imported in any cell visible here —
    presumably ``from peft import LoraConfig``; confirm before calling.

    :param modules: Names of the modules to apply Lora to
    :return: the constructed ``LoraConfig`` for a sequence-classification task
    """
    lora_settings = dict(
        r=64,  # dimension of the updated matrices
        lora_alpha=32,  # parameter for scaling
        target_modules=modules,
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="SEQ_CLS",
    )
    return LoraConfig(**lora_settings)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
from collections import defaultdict

def compute_metrics(eval_pred, num_classes=None):
    """Compute per-class and overall accuracy from evaluation predictions.

    :param eval_pred: pair ``(logits, labels)``; the predicted class is the
        argmax of ``logits`` over its last axis
    :param num_classes: number of classes to report. When omitted it is taken
        from the size of the last axis of ``logits``, so the function no longer
        assumes exactly 10 classes.
    :return: dict with one ``class_{i}_accuracy`` entry per class (NaN when the
        class has no true examples in ``labels``) plus ``overall_accuracy``
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    predictions = np.argmax(logits, axis=-1)

    if num_classes is None:
        num_classes = logits.shape[-1]

    # Rows of the confusion matrix are true classes, columns are predictions.
    cm = confusion_matrix(labels, predictions, labels=list(range(num_classes)))

    # Per-class accuracy = correct predictions / true examples of that class.
    support = cm.sum(axis=1)
    correct = np.diag(cm)
    class_accuracies = np.full(num_classes, np.nan)  # NaN marks classes with no examples
    has_support = support > 0
    class_accuracies[has_support] = correct[has_support] / support[has_support]

    results = {f"class_{i}_accuracy": class_accuracies[i] for i in range(num_classes)}
    results["overall_accuracy"] = accuracy_score(labels, predictions)

    return results

In [None]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

## Loading the dataset

In [None]:
import pandas as pd

# Read the training CSV into a DataFrame.
train_df = pd.read_csv("/content/Dataset/Train.csv")

In [None]:
# Display the loaded DataFrame (the notebook renders a truncated preview of its rows).
train_df

Unnamed: 0.1,Unnamed: 0,text,genre,label,label_model,text_cleaned
0,0,"It starts with pain, followed by hate\nFueled ...",rock,9,LABEL_9,"It starts with pain, followed by hate\nFueled ..."
1,1,Freedom!\nAlone again again alone\nPatiently w...,rock,9,LABEL_9,Freedom!\nAlone again again alone\nPatiently w...
2,2,"Biting the hand that feeds you, lying to the v...",rock,9,LABEL_9,"Biting the hand that feeds you, lying to the v..."
3,3,You say you know just who I am\nBut you can't ...,rock,9,LABEL_9,You say you know just who I am\nBut you can't ...
4,4,My heart is beating faster can't control these...,rock,9,LABEL_9,My heart is beating faster can't control these...
...,...,...,...,...,...,...
290178,290178,I'm the best friend he's got I'd give him the ...,r&b,8,LABEL_8,I'm the best friend he's got I'd give him the ...
290179,290179,"Bad Boys Blue ""I Totally Miss You"" I did you w...",pop,7,LABEL_7,"Bad Boys Blue ""I Totally Miss You"" I did you w..."
290180,290180,Forgive me for the things That I never said to...,pop,7,LABEL_7,Forgive me for the things That I never said to...
290181,290181,The day they found a cure for AIDS The day the...,indie,4,LABEL_4,The day they found a cure for AIDS The day the...


In [None]:
import pandas as pd
import numpy as np

# Replace missing values in the 'text_cleaned' column with empty strings (updates train_df in place)
train_df['text_cleaned'] = train_df['text_cleaned'].fillna("")

# Count how many rows carry each 'label_model' value and print the counts
category_counts = train_df['label_model'].value_counts()
print("Unique categories and their counts:")
print(category_counts)

# Sorted list of the distinct label names; the id/label mappings are built from it in a later cell
unique_categories = sorted(train_df['label_model'].unique())

# Preview the first rows of the DataFrame (optional)
print(train_df.head())

# Print the shape of the DataFrame
print(f"DataFrame shape: {train_df.shape}")

Unique categories and their counts:
label_model
LABEL_9    121404
LABEL_7    108714
LABEL_6     20291
LABEL_5     13545
LABEL_2      8644
LABEL_4      8449
LABEL_8      2793
LABEL_3      2240
LABEL_1      2213
LABEL_0      1890
Name: count, dtype: int64
   Unnamed: 0                                               text genre  label  \
0           0  It starts with pain, followed by hate\nFueled ...  rock      9   
1           1  Freedom!\nAlone again again alone\nPatiently w...  rock      9   
2           2  Biting the hand that feeds you, lying to the v...  rock      9   
3           3  You say you know just who I am\nBut you can't ...  rock      9   
4           4  My heart is beating faster can't control these...  rock      9   

  label_model                                       text_cleaned  
0     LABEL_9  It starts with pain, followed by hate\nFueled ...  
1     LABEL_9  Freedom!\nAlone again again alone\nPatiently w...  
2     LABEL_9  Biting the hand that feeds you, lying to th

In [None]:
# Integer class id -> label name, numbered in the sorted order of unique_categories
id2label = dict(enumerate(unique_categories))

# Inverse lookup: label name -> integer class id
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Inspect the id -> label name mapping.
id2label

{0: 'LABEL_0',
 1: 'LABEL_1',
 2: 'LABEL_2',
 3: 'LABEL_3',
 4: 'LABEL_4',
 5: 'LABEL_5',
 6: 'LABEL_6',
 7: 'LABEL_7',
 8: 'LABEL_8',
 9: 'LABEL_9'}

In [None]:
# Inspect the label name -> id mapping.
label2id

{'LABEL_0': 0,
 'LABEL_1': 1,
 'LABEL_2': 2,
 'LABEL_3': 3,
 'LABEL_4': 4,
 'LABEL_5': 5,
 'LABEL_6': 6,
 'LABEL_7': 7,
 'LABEL_8': 8,
 'LABEL_9': 9}

In [None]:
import pandas as pd
import numpy as np

# Down-sample the two dominant classes of train_df (LABEL_9 and LABEL_7)
# to 20,000 rows each; every other class is kept in full.
is_label_9 = train_df['label_model'] == 'LABEL_9'
is_label_7 = train_df['label_model'] == 'LABEL_7'

# Fixed random_state so the same rows are drawn on every run
label_9_sample = train_df.loc[is_label_9].sample(n=20000, random_state=42)
label_7_sample = train_df.loc[is_label_7].sample(n=20000, random_state=42)

# Rows belonging to neither of the two down-sampled classes
other_labels = train_df.loc[~(is_label_7 | is_label_9)]

# Stack the pieces, shuffle the rows, and renumber the index from zero
balanced_df = (
    pd.concat([label_9_sample, label_7_sample, other_labels])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Verify the new class distribution
print(balanced_df['label_model'].value_counts())

label_model
LABEL_6    20291
LABEL_9    20000
LABEL_7    20000
LABEL_5    13545
LABEL_2     8644
LABEL_4     8449
LABEL_8     2793
LABEL_3     2240
LABEL_1     2213
LABEL_0     1890
Name: count, dtype: int64


In [None]:
# Display the rebalanced, shuffled DataFrame (the notebook renders a truncated preview).
balanced_df

Unnamed: 0.1,Unnamed: 0,text,genre,label,label_model,text_cleaned
0,205129,I remember looking up\nTo look up to him\nAnd ...,rock,9,LABEL_9,I remember looking up\nTo look up to him\nAnd ...
1,12877,The mouths of envious\nAlways find another doo...,rock,9,LABEL_9,The mouths of envious\nAlways find another doo...
2,107195,words by Joni Mitchell\nmusic by Charles Mingu...,folk,2,LABEL_2,words by Joni Mitchell\nmusic by Charles Mingu...
3,182774,"Have you ever been walking, walking down that ...",rock,9,LABEL_9,"Have you ever been walking, walking down that ..."
4,210650,When an irresistible force such as you\nMeets ...,jazz,5,LABEL_5,When an irresistible force such as you\nMeets ...
...,...,...,...,...,...,...
100060,145377,My squelchy life\nMy squelchy life\nMy squelch...,rock,9,LABEL_9,My squelchy life\nMy squelchy life\nMy squelch...
100061,150977,So much love could drag you down\nBurn a hole ...,indie,4,LABEL_4,So much love could drag you down\nBurn a hole ...
100062,230264,I gaze at the moon and it's staring back at me...,metal,6,LABEL_6,I gaze at the moon and it's staring back at me...
100063,211591,I can't stand it for another day\nWhen you liv...,rock,9,LABEL_9,I can't stand it for another day\nWhen you liv...


In [None]:
from datasets import Dataset

# Convert the rebalanced pandas DataFrame into a Hugging Face `datasets.Dataset`.
train_main_dataset = Dataset.from_pandas(balanced_df)

In [None]:
import torch

model_checkpoint = 'answerdotai/ModernBERT-base'

# generate classification model from model_checkpoint; the number of output
# classes is derived from the label mapping so the two cannot drift apart
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=len(id2label), id2label=id2label, label2id=label2id)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Always define `device` (previously it only existed when CUDA was available),
# then place the model on it; moving to "cpu" is a no-op for a CPU model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
if device.type == "cuda":
    print("Model loaded to GPU.")
else:
    print("CUDA is not available. Model is on CPU.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to GPU.


In [None]:
# tokenization step applied to the dataset via `Dataset.map`
def tokenize_function(examples):
    """Tokenize the ``text_cleaned`` field of ``examples``.

    Inputs longer than 512 tokens are truncated; no padding is applied here.

    :param examples: mapping with a ``"text_cleaned"`` entry holding the text
    :return: whatever the module-level ``tokenizer`` returns for that text
    """
    tokenizer_kwargs = {
        "return_tensors": "np",
        "truncation": True,  # cut sequences that exceed max_length
        "max_length": 512,
    }
    return tokenizer(examples["text_cleaned"], **tokenizer_kwargs)

# If the tokenizer has no pad token, register '[PAD]' as one and then resize the
# model's token embeddings to the new vocabulary size so the added id has an embedding row.
if tokenizer.pad_token is None:
    print("Setting the pad token")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# Tokenize the whole dataset in batches; the tokenizer outputs are added as new columns.
tokenized_train_dataset = train_main_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/100065 [00:00<?, ? examples/s]

In [None]:
# Drop the raw text and metadata columns; only the label and the tokenizer outputs remain.
columns_to_remove = ['Unnamed: 0', 'text','genre','label_model', 'text_cleaned']
tokenized_train_dataset = tokenized_train_dataset.remove_columns(columns_to_remove)

In [None]:
# Rename the integer 'label' column to 'labels', the column name the model/Trainer consume as targets.
tokenized_train_dataset = tokenized_train_dataset.rename_column("label", "labels")

In [None]:
# # Function to convert labels to IDs
# def label_to_id(examples):
#     # Replace each label in the 'labels' column with its corresponding ID
#     examples['labels'] = [label2id[label] for label in examples['labels']]
#     return examples

# # Apply the function to the entire dataset
# tokenized_train_dataset = tokenized_train_dataset.map(label_to_id, batched=True)

# # Checking the updated labels
# print(tokenized_train_dataset['labels'])

In [None]:
# Show the dataset summary (feature names and row count) after tokenization and column cleanup.
tokenized_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 100065
})

In [None]:
# Hold out 20% of the tokenized examples for evaluation. The seed is fixed so
# the same rows land in each split on every run of the notebook.
train_test_split = tokenized_train_dataset.train_test_split(test_size=0.20, seed=42)

# Extract the training split and the held-out split (used for evaluation during training)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
# Show the held-out split summary (features and row count).
test_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 20013
})

In [None]:
# Show the training split summary (features and row count).
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 80052
})

In [None]:
# Collect the leaf names of all nn.Linear submodules in the model and display them.
# NOTE(review): `modules` is not consumed by any later cell shown here — the Trainer
# below is given the plain model, not a LoRA-wrapped one.
modules = find_all_linear_names(model)
modules

['Wqkv', 'Wi', 'Wo', 'dense', 'classifier']

In [None]:
from transformers import DataCollatorWithPadding,Trainer, TrainingArguments
# Collator that pads the examples of each batch to a common length using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# hyperparameters
lr = 5e-5 # size of optimization step
batch_size = 32 # number of examples processed per optimization step
num_epochs = 1 # number of times model runs through training data

# define training arguments
training_args = TrainingArguments(
    output_dir='modenbert' + "-mldsclassifier-fullmodel",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,  # same batch size for evaluation
    num_train_epochs=num_epochs,  # NOTE: a positive max_steps takes precedence over this value
    max_steps=2000,  # hard cap on the number of optimization steps
    eval_strategy="steps",  # evaluate at fixed step intervals (replaces the deprecated `evaluation_strategy` kwarg)
    eval_steps=500,  # Evaluate every 500 steps
    save_steps=500,  # Save model every 500 steps (matching eval_steps)
    weight_decay=0.01,
    fp16=True,
    logging_steps=500,
    warmup_steps=600,
    lr_scheduler_type="cosine",
    logging_dir='./logs',
    push_to_hub=True,
)



In [None]:
# create trainer object
trainer = Trainer(
    model=model, # the full classification model (no PEFT/LoRA wrapper is applied in the cells shown)
    args=training_args, # hyperparameters
    train_dataset=train_dataset, # training data
    eval_dataset=test_dataset, # validation data
    processing_class=tokenizer, # tokenizer; `processing_class` replaces the deprecated `tokenizer=` argument
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics, # evaluates model using compute_metrics() function from before
)

# train model
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33makashmaggon[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Class 0 Accuracy,Class 1 Accuracy,Class 2 Accuracy,Class 3 Accuracy,Class 4 Accuracy,Class 5 Accuracy,Class 6 Accuracy,Class 7 Accuracy,Class 8 Accuracy,Class 9 Accuracy,Class 10 Accuracy,Class 11 Accuracy,Class 12 Accuracy,Class 13 Accuracy,Class 14 Accuracy,Class 15 Accuracy,Class 16 Accuracy,Class 17 Accuracy,Class 18 Accuracy,Class 19 Accuracy,Class 20 Accuracy,Class 21 Accuracy,Class 22 Accuracy,Class 23 Accuracy,Class 24 Accuracy,Class 25 Accuracy,Class 26 Accuracy,Class 27 Accuracy,Class 28 Accuracy,Class 29 Accuracy,Class 30 Accuracy,Class 31 Accuracy,Class 32 Accuracy,Class 33 Accuracy,Class 34 Accuracy,Class 35 Accuracy,Class 36 Accuracy,Class 37 Accuracy,Class 38 Accuracy,Class 39 Accuracy,Class 40 Accuracy,Class 41 Accuracy,Overall Accuracy
500,1.6227,1.494488,0.159204,0.025701,0.398847,0.740319,0.054577,0.465406,0.872924,0.435388,0.001876,0.39162,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.462449


KeyboardInterrupt: 