In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-sentiment-treebank-v2-sst2/BERT-SST2-Dataset-Paper.pdf
/kaggle/input/stanford-sentiment-treebank-v2-sst2/Writing Code for NLP Research.pdf
/kaggle/input/stanford-sentiment-treebank-v2-sst2/SST2-Data/SST2-Data/stanfordSentimentTreebankRaw/stanfordSentimentTreebankRaw/README.txt
/kaggle/input/stanford-sentiment-treebank-v2-sst2/SST2-Data/SST2-Data/stanfordSentimentTreebankRaw/stanfordSentimentTreebankRaw/sentlex_exp12.txt
/kaggle/input/stanford-sentiment-treebank-v2-sst2/SST2-Data/SST2-Data/stanfordSentimentTreebankRaw/stanfordSentimentTreebankRaw/rawscores_exp12.txt
/kaggle/input/stanford-sentiment-treebank-v2-sst2/SST2-Data/SST2-Data/trainDevTestTrees_PTB/trees/test.txt
/kaggle/input/stanford-sentiment-treebank-v2-sst2/SST2-Data/SST2-Data/trainDevTestTrees_PTB/trees/train.txt
/kaggle/input/stanford-sentiment-treebank-v2-sst2/SST2-Data/SST2-Data/trainDevTestTrees_PTB/trees/dev.txt
/kaggle/input/stanford-sentiment-treebank-v2-sst2/SST2-Data/SST2-Data/stanfordSentim

In [1]:
# Import libraries
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer
from datasets import load_dataset
from sklearn.metrics import f1_score, matthews_corrcoef
import numpy as np
import torch
from torch.utils.data import DataLoader
import random
import time

# Load CoLA dataset from GLUE benchmark
datasets = load_dataset("glue", "cola")

# Initialize the BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

# Tokenizing the datasets
tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Convert the datasets to PyTorch tensors
train_dataset = tokenized_datasets["train"].remove_columns(["idx"]).with_format("torch")
eval_dataset = tokenized_datasets["validation"].remove_columns(["idx"]).with_format("torch")

# Define metric computation function for Matthews Correlation Coefficient (MCC) and F1 score
def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)
    mcc = matthews_corrcoef(labels, predictions)  # MCC for CoLA
    f1 = f1_score(labels, predictions, average="weighted")  # Weighted F1 score
    return {"mcc": mcc, "f1": f1}

# Placeholder functions for computational cost, inference time, and energy consumption
def compute_computational_cost(model):
    return sum(p.numel() for p in model.parameters())

def compute_inference_time(model, input_data):
    model.eval()
    start_time = time.time()
    with torch.no_grad():
        model(**input_data)
    return time.time() - start_time

def compute_energy_consumption(model):
    return sum(p.numel() for p in model.parameters()) * random.uniform(0.01, 0.05)

# Define pruning function
def prune_model_weights(model, pruning_ratio):
    pruned_model = model
    for name, param in pruned_model.named_parameters():
        if "weight" in name and param.requires_grad:
            flattened_weights = param.abs().flatten()
            threshold_index = int(len(flattened_weights) * pruning_ratio)
            sorted_weights, _ = torch.sort(flattened_weights)
            threshold = sorted_weights[threshold_index]
            param.data[param.abs() < threshold] = 0
    return pruned_model

# Define training arguments
training_args = TrainingArguments(
    output_dir="custom_cola_bert",
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    metric_for_best_model="mcc",  # Use MCC as the evaluation metric
    load_best_model_at_end=True,
    learning_rate=1e-5,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_cola_bert")

# Optimize pruning
best_pruned_model = prune_model_weights(model, pruning_ratio=0.3)
predictions = trainer.predict(eval_dataset)  # Evaluate the pruned model
pruned_metrics = compute_metrics(predictions)
pruned_mcc = pruned_metrics["mcc"]
pruned_cost = compute_computational_cost(best_pruned_model)

# Define collate function to handle DataLoader batches
def collate_fn(batch):
    collated = {}
    for key in batch[0]:
        if isinstance(batch[0][key], torch.Tensor):
            collated[key] = torch.stack([item[key] for item in batch])
        else:
            collated[key] = torch.tensor([item[key] for item in batch])  # Convert non-tensor elements to tensors
    return collated

# Ensure DataLoader uses the collate function
dataloader = DataLoader(eval_dataset, batch_size=1, collate_fn=collate_fn)
input_batch = next(iter(dataloader))
input_batch = {k: v.to(next(best_pruned_model.parameters()).device) for k, v in input_batch.items()}

pruned_time = compute_inference_time(best_pruned_model, input_batch)
pruned_energy = compute_energy_consumption(best_pruned_model)

# Display results
print(f"Pruned Model MCC: {pruned_mcc}")
print(f"Pruned Computational Cost: {pruned_cost}")
print(f"Pruned Inference Time: {pruned_time}")
print(f"Pruned Energy Consumption: {pruned_energy}")

# Save the pruned model
best_pruned_model.save_pretrained("best_pruned_model")


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/251k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113757522222538, max=1.0…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss,Mcc,F1
100,No log,0.479983,0.451401,0.764435
200,No log,0.458861,0.48565,0.775404
300,No log,0.464909,0.523493,0.794647
400,No log,0.485621,0.502412,0.78051
500,0.446000,0.45953,0.544292,0.801908
600,0.446000,0.470417,0.562623,0.809286
700,0.446000,0.489979,0.536666,0.7973
800,0.446000,0.488557,0.539283,0.79851


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


ValueError: too many dimensions 'str'