In [2]:
!pip install -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!pip install torch



# Step 1: Importing Libraries

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

In [5]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Step 2: Loading and Tokenizing the Dataset

In [6]:
# Mount Google Drive before touching the dataset: the CSV is read from
# /content/drive, which is only populated once the drive has been mounted.
# Mounting here keeps the notebook runnable top-to-bottom on a fresh runtime.
from google.colab import drive
drive.mount('/content/drive')

# Loading the CSV file into a pandas DataFrame
df = pd.read_csv("/content/drive/MyDrive/0-Folder-for-NLP-A3/classification-dataset-numerical-classes.csv")

In [7]:
# Mount Google Drive at /content/drive; the dataset CSV and the saved model
# directory used by this notebook both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239232 entries, 0 to 239231
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   phrase_id        239232 non-null  int64  
 1   sentiment_value  239232 non-null  float64
 2   class            239232 non-null  int64  
 3   phrase           239231 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 7.3+ MB


### Pre-Processing

In [9]:
# Tokenizing the raw frame failed with:
#   ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.
# Rows whose 'phrase' is missing are therefore filtered out before tokenization.

df = df[df['phrase'].notna()]

In [10]:
# Fetch the tokenizer published alongside the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Bhautiksinh/BertPretrain")

# Encode every phrase in a single call. Sequences are truncated to at most
# 128 tokens and padded, and the result is returned as PyTorch tensors.
tokenized_data = tokenizer(
    df['phrase'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Keep the target class of each phrase next to its encoding.
tokenized_data['labels'] = df['class'].tolist()

tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

In [11]:
# Lay the encoded batch out as a DataFrame with one row per phrase, so that it
# can be split with scikit-learn. Each tensor row becomes a plain Python list.
df_tokenized = pd.DataFrame(
    {
        'input_ids': tokenized_data['input_ids'].tolist(),
        'attention_mask': tokenized_data['attention_mask'].tolist(),
        'labels': tokenized_data['labels'],
    }
)

# Step 3: Splitting the dataset into training and validation sets

In [12]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
train_data, val_data = train_test_split(
    df_tokenized,
    test_size=0.2,
    random_state=1,
)

In [13]:
# Sanity-check the training split: row count, columns and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191384 entries, 87693 to 128037
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   input_ids       191384 non-null  object
 1   attention_mask  191384 non-null  object
 2   labels          191384 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 5.8+ MB


# Step 4: Creating DataLoaders

Creating PyTorch DataLoader objects for efficient loading of data during training and validation.

In [14]:
def _frame_to_loader(frame, shuffle=False, batch_size=32):
    """Wrap the encoded columns of ``frame`` in a DataLoader.

    Args:
        frame: DataFrame with 'input_ids', 'attention_mask' and 'labels' columns.
        shuffle: passed through to ``DataLoader``.
        batch_size: number of rows per batch.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask, labels)`` tensor batches.
    """
    columns = ('input_ids', 'attention_mask', 'labels')
    tensors = [torch.tensor(frame[name].tolist()) for name in columns]
    return DataLoader(TensorDataset(*tensors), batch_size=batch_size, shuffle=shuffle)


# Training batches are shuffled; validation batches keep the frame's row order.
train_dataloader = _frame_to_loader(train_data, shuffle=True)
val_dataloader = _frame_to_loader(val_data)


# Step 5: Fine-tuning the BERT Model

Fine-tuning the pre-trained BERT model for sequence classification.
Using the AdamW optimizer for training.
Training the model for a specified number of epochs.

In [15]:
from torch.optim import AdamW as AdamW_torch

# Our dataset has five classes: 0, 1, 2, 3, 4
num_classes = 5
num_epochs = 4

# Load the pretrained checkpoint with a classification head sized for
# `num_classes` labels, then place the model on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "Bhautiksinh/BertPretrain",
    num_labels=num_classes,
)
model = model.to(device)

# PyTorch's AdamW, imported under an alias so it does not clash with the
# AdamW name imported from transformers earlier in the notebook.
optimizer = AdamW_torch(model.parameters(), lr=2e-5)


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Bhautiksinh/BertPretrain and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from time import time
from tqdm import tqdm

In [17]:
# Fine-tuning the pre-trained BERT model

# Training loop: one full pass over train_dataloader per epoch.
# The progress bar is labelled with the epoch and shows the running mean of the
# batch losses, so convergence can be monitored while training and remains
# visible in the cell output afterwards (leave=True keeps each finished bar).

for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_dataloader, leave=True, desc=f"Epoch {epoch + 1}/{num_epochs}")
    running_loss = 0.0
    for step, batch in enumerate(loop, start=1):
        optimizer.zero_grad()
        # Each batch is (input_ids, attention_mask, labels); passing labels
        # makes the model return the classification loss.
        inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device)}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # refresh=False: let tqdm redraw at its normal rate instead of on every step.
        loop.set_postfix(loss=f"{running_loss / step:.4f}", refresh=False)

100%|██████████| 5981/5981 [39:51<00:00,  2.50it/s]
100%|██████████| 5981/5981 [39:52<00:00,  2.50it/s]
100%|██████████| 5981/5981 [39:52<00:00,  2.50it/s]
100%|██████████| 5981/5981 [39:51<00:00,  2.50it/s]


# Step 6: Saving the fine-tuned model

In [18]:
# Persist the fine-tuned model, and the tokenizer it was trained with, to the
# same directory so the checkpoint can later be reloaded without this notebook.
model.save_pretrained('/content/drive/MyDrive/0-Folder-for-NLP-A3')
tokenizer.save_pretrained('/content/drive/MyDrive/0-Folder-for-NLP-A3')

In [19]:
# Load the fine-tuned model for evaluation
from transformers import AutoModelForSequenceClassification

# Move the reloaded model to `device` right away so the evaluation batches,
# which are sent to `device`, meet the weights on the same device.
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/0-Folder-for-NLP-A3').to(device)


# Step 7: Evaluation Loop

# Evaluation Metrics

In [20]:
# Inspect the validation split (encoded inputs, attention masks and labels).
val_data

Unnamed: 0,input_ids,attention_mask,labels
53842,"[101, 2009, 9794, 102, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
59358,"[101, 2000, 2202, 1037, 2784, 6812, 2005, 4827...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
24915,"[101, 2021, 4416, 1011, 1999, 1011, 5048, 1746...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
98976,"[101, 2104, 12735, 2130, 1996, 10634, 4355, 27...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",2
173432,"[101, 1996, 6320, 3976, 5469, 102, 0, 0, 0, 0,...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
...,...,...,...
39011,"[101, 2216, 27547, 2774, 102, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
118995,"[101, 6832, 10908, 102, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
62231,"[101, 1004, 1001, 4008, 17624, 2100, 1998, 140...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",1
165980,"[101, 8817, 1997, 17026, 4870, 102, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2


In [21]:
from torch.utils.data import DataLoader, TensorDataset
import torch

# val_data is the validation DataFrame produced by the train/validation split.
# Turn each of its three columns into a tensor.
input_ids, attention_mask, labels = (
    torch.tensor(val_data[column].tolist())
    for column in ('input_ids', 'attention_mask', 'labels')
)

# Bundle the tensors so that indexing yields one (input_ids, attention_mask, label) triple.
val_dataset = TensorDataset(input_ids, attention_mask, labels)

# Fixed-order batches of 32 for evaluation.
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=32)


In [22]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the fine-tuned model on the validation split.
#
# `val_dataloader` (built in the previous cell) yields batches of
# (input_ids, attention_mask, labels). The dataset itself is not copied to the
# GPU; each batch is moved to `device` as it is consumed.

# Move the model to `device` once, up front, rather than on every batch.
model.to(device)
model.eval()

# Accumulators for true labels and predicted labels
true_labels = []
predicted_labels = []

# Evaluation loop (no gradients are needed for inference)
with torch.no_grad():
    for batch_input_ids, batch_attention_mask, batch_labels in val_dataloader:
        # Labels are not passed to the model: only the logits are used here,
        # so there is no reason to have the model compute a loss.
        outputs = model(
            input_ids=batch_input_ids.to(device),
            attention_mask=batch_attention_mask.to(device),
        )

        # Predicted class = index of the largest logit. Softmax is monotonic,
        # so this is the same class as the argmax of the softmax probabilities.
        predicted_batch_labels = outputs.logits.argmax(dim=1).cpu().numpy()
        predicted_labels.extend(predicted_batch_labels)

        # True labels
        true_labels.extend(batch_labels.cpu().numpy())

# Convert lists to numpy arrays for convenience
true_labels = np.array(true_labels)
predicted_labels = np.array(predicted_labels)

# Calculating metrics (precision/recall/F1 are averaged over classes, weighted by support)
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

# Log the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.6665
Precision: 0.6647
Recall: 0.6665
F1 Score: 0.6607


# Question 8

Calculating the number of parameters after fine-tuning


In [23]:
# Total number of scalar parameters: the element counts of all parameter
# tensors of the model, summed.
k = sum(p.numel() for p in model.parameters())
k

109486085

In [29]:
def count_model_parameters(model):
    """Return the total number of scalar elements across ``model``'s parameters.

    Args:
        model: any object whose ``parameters()`` yields items exposing ``numel()``.

    Returns:
        The sum of ``numel()`` over all parameters (0 if there are none).
    """
    return sum(param.numel() for param in model.parameters())



# Count the parameters of the fine-tuned model with the helper defined above
# and report the total.
total_params = count_model_parameters(model)

print(f"Total number of parameters in your customized model: {total_params}")


Total number of parameters in your customized model: 109486085


In [31]:
# Number of parameter tensors in the model (each weight and each bias counts once).
count = len(list(model.parameters()))
print(count)

201


In [32]:
# Print the qualified name of every parameter tensor, one per line.
for name, tensor in model.named_parameters():
    print(name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

# Pushing the model to the Hugging Face Hub

In [24]:
pip install huggingface_hub



In [25]:
# Authenticate with the Hugging Face Hub through the interactive login widget,
# so no access token has to be written into the notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
# Upload the fine-tuned model to the given Hub repository.
model.push_to_hub("KareenaBeniwal/Fine-tuned-bert-model-classification")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/KareenaBeniwal/Fine-tuned-bert-model-classification/commit/24c4e51f1e3c8829ebbc72e1844bbb78bcbe62a9', commit_message='Upload BertForSequenceClassification', commit_description='', oid='24c4e51f1e3c8829ebbc72e1844bbb78bcbe62a9', pr_url=None, pr_revision=None, pr_num=None)