<a href="https://colab.research.google.com/github/madala-veerendra/Lecture-3-ChiperSchools/blob/main/NLP_empathy_ai_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# PyTorch and related imports
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch.nn.functional as F

# Hugging Face transformers library
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# Other utility libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Check if GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [2]:
import pandas as pd

# Define the file paths for each split
splits = {
    'train': 'hf://datasets/google-research-datasets/go_emotions/simplified/train-00000-of-00001.parquet',
    'validation': 'hf://datasets/google-research-datasets/go_emotions/simplified/validation-00000-of-00001.parquet',
    'test': 'hf://datasets/google-research-datasets/go_emotions/simplified/test-00000-of-00001.parquet'
}

# Load the training data
train_data = pd.read_parquet(splits['train'])




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Display the first few rows of the training dataset
print(train_data.head())


                                                text labels       id
0  My favourite food is anything I didn't have to...   [27]  eebbqej
1  Now if he does off himself, everyone will thin...   [27]  ed00q6i
2                     WHY THE FUCK IS BAYLESS ISOING    [2]  eezlygj
3                        To make her feel threatened   [14]  ed7ypvh
4                             Dirty Southern Wankers    [3]  ed0bdzj


In [29]:
print(train_data.tail())

                                                    text labels       id  \
43405  Added you mate well I’ve just got the bow and ...   [18]  edsb738   
43406  Always thought that was funny but is it a refe...    [6]  ee7fdou   
43407  What are you talking about? Anything bad that ...    [3]  efgbhks   
43408            More like a baptism, with sexy results!   [13]  ed1naf8   
43409                                    Enjoy the ride!   [17]  eecwmbq   

       sentiment  
43405          2  
43406          0  
43407          0  
43408          2  
43409          2  


In [22]:
print(train_data.describe)

<bound method NDFrame.describe of                                                     text labels       id  \
0      My favourite food is anything I didn't have to...   [27]  eebbqej   
1      Now if he does off himself, everyone will thin...   [27]  ed00q6i   
2                         WHY THE FUCK IS BAYLESS ISOING    [2]  eezlygj   
3                            To make her feel threatened   [14]  ed7ypvh   
4                                 Dirty Southern Wankers    [3]  ed0bdzj   
...                                                  ...    ...      ...   
43405  Added you mate well I’ve just got the bow and ...   [18]  edsb738   
43406  Always thought that was funny but is it a refe...    [6]  ee7fdou   
43407  What are you talking about? Anything bad that ...    [3]  efgbhks   
43408            More like a baptism, with sexy results!   [13]  ed1naf8   
43409                                    Enjoy the ride!   [17]  eecwmbq   

       sentiment  
0              1  
1              

In [4]:
print(train_data.columns)

Index(['text', 'labels', 'id'], dtype='object')


In [5]:
# Preview the labels column to understand its structure
print(train_data['labels'].head())  # Confirm if this prints lists like [27], [2], etc.

# Emotion ID to emotion name mapping, assuming each integer represents a specific emotion
emotion_to_sentiment = {
    0: 'admiration', 1: 'amusement', 2: 'anger', 3: 'annoyance', 4: 'approval',
    5: 'caring', 6: 'confusion', 7: 'curiosity', 8: 'desire', 9: 'disappointment',
    10: 'disapproval', 11: 'disgust', 12: 'embarrassment', 13: 'excitement', 14: 'fear',
    15: 'gratitude', 16: 'grief', 17: 'joy', 18: 'love', 19: 'nervousness',
    20: 'optimism', 21: 'pride', 22: 'realization', 23: 'relief', 24: 'remorse',
    25: 'sadness', 26: 'surprise', 27: 'neutral'
}

# Map emotion names to sentiment categories
sentiment_mapping = {
    'positive': ['admiration', 'amusement', 'approval', 'caring', 'desire', 'excitement',
                 'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief'],
    'neutral': ['neutral', 'curiosity', 'realization', 'surprise'],
    'negative': ['anger', 'annoyance', 'confusion', 'disappointment', 'disapproval',
                 'disgust', 'embarrassment', 'fear', 'grief', 'nervousness', 'remorse', 'sadness']
}

# Function to map labels to sentiment based on emotions
def categorize_sentiment(label_list):
    for label in label_list:
        emotion = emotion_to_sentiment.get(label, 'neutral')
        for sentiment, emotions in sentiment_mapping.items():
            if emotion in emotions:
                return sentiment
    return 'neutral'

# Apply function to create a 'sentiment' column
train_data['sentiment'] = train_data['labels'].apply(categorize_sentiment)

# Check if sentiment mapping was successful
print(train_data[['text', 'labels', 'sentiment']].head())


0    [27]
1    [27]
2     [2]
3    [14]
4     [3]
Name: labels, dtype: object
                                                text labels sentiment
0  My favourite food is anything I didn't have to...   [27]   neutral
1  Now if he does off himself, everyone will thin...   [27]   neutral
2                     WHY THE FUCK IS BAYLESS ISOING    [2]  negative
3                        To make her feel threatened   [14]  negative
4                             Dirty Southern Wankers    [3]  negative


In [6]:
from transformers import BertTokenizer
import torch

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize the text data
def tokenize_data(data, max_length=128):
    # Tokenize the text, returning input IDs and attention masks
    tokenized = tokenizer.batch_encode_plus(
        data['text'].tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return tokenized

# Tokenize the training data
tokenized_train_data = tokenize_data(train_data)

# Check the shapes of the tokenized outputs
print(tokenized_train_data['input_ids'].shape)      # Should show (number_of_samples, max_length)
print(tokenized_train_data['attention_mask'].shape)  # Should show (number_of_samples, max_length)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



torch.Size([43410, 128])
torch.Size([43410, 128])


In [7]:
# Define a mapping from sentiment strings to integers
sentiment_mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# Map the sentiment labels to numeric values
train_data['sentiment'] = train_data['sentiment'].map(sentiment_mapping)

# Function to create a DataLoader
def create_dataloader(tokenized_data, labels, batch_size=32):
    input_ids = tokenized_data['input_ids']
    attention_masks = tokenized_data['attention_mask']
    labels = torch.tensor(labels.values, dtype=torch.long)  # Now labels should be numeric

    # Create a TensorDataset from the tokenized inputs and labels
    dataset = TensorDataset(input_ids, attention_masks, labels)
    return DataLoader(dataset, batch_size=batch_size)

# Create DataLoaders for training and validation
train_dataloader = create_dataloader(tokenized_train_data, train_data['sentiment'])

# Check the size of the DataLoader
print(f'Train DataLoader created with {len(train_dataloader)} batches.')


Train DataLoader created with 1357 batches.


In [8]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the model
class SentimentClassifier(torch.nn.Module):
    def __init__(self, num_labels=3):
        super(SentimentClassifier, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.logits  # Return the logits

# Instantiate the model
model = SentimentClassifier(num_labels=3)

# Check the model architecture
print(model)




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SentimentClassifier(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_featu

In [14]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, DistilBertForSequenceClassification, DistilBertTokenizer
import torch
from tqdm import tqdm

# Initialize DistilBERT model and tokenizer for a smaller, faster model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Freeze all layers except the classification layer to reduce computation
for param in model.distilbert.parameters():
    param.requires_grad = False

# Define training parameters
epochs = 1
batch_size = 4  # Reduced batch size for faster CPU training
learning_rate = 1e-4  # Lower learning rate for stability

# Prepare dataset as TensorDataset
input_ids = torch.tensor(tokenized_train_data['input_ids'])
attention_masks = torch.tensor(tokenized_train_data['attention_mask'])
labels = torch.tensor(train_data['sentiment'].values)  # Replace `sentiment` with the correct label column name
train_dataset = TensorDataset(input_ids, attention_masks, labels)

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)

# Set model to training mode
model.train()

# Define optimizer using PyTorch's AdamW implementation
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0

    # Loop through the DataLoader
    for batch in tqdm(train_dataloader):
        input_ids, attention_masks, labels = batch

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_masks)
        loss = torch.nn.CrossEntropyLoss()(outputs.logits, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.3f}')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  input_ids = torch.tensor(tokenized_train_data['input_ids'])
  attention_masks = torch.tensor(tokenized_train_data['attention_mask'])
100%|██████████| 10853/10853 [3:02:50<00:00,  1.01s/it]

Epoch 1/1, Loss: 0.837





In [16]:
# Define paths to save the model and tokenizer
output_dir = "/content/sentiment_model/"

# Create directory if it doesn't exist
import os
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the trained model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to /content/sentiment_model/


In [17]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the model and tokenizer from the output directory
model = DistilBertForSequenceClassification.from_pretrained(output_dir)
tokenizer = DistilBertTokenizer.from_pretrained(output_dir)

print("Model and tokenizer loaded for inference.")


Model and tokenizer loaded for inference.


In [18]:
# Test the model with a sample input
sample_text = "I am feeling great today!"
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True).to(device)

# Ensure model is in evaluation mode
model.eval()

# Perform the prediction
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

# Print the prediction
sentiment_labels = ["neutral", "positive", "negative"]
predicted_sentiment = sentiment_labels[predictions[0]]
print(f"Predicted sentiment for the sample text is: {predicted_sentiment}")


Predicted sentiment for the sample text is: negative


In [20]:
import torch
from transformers import DistilBertTokenizer

# Load tokenizer and model if not already loaded
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Set the model to evaluation mode
model.eval()

def predict_sentiment(text):
    # Tokenize and prepare inputs
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True, padding='max_length')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        confidence, predicted_label = torch.max(probabilities, dim=1)

    # Map labels to human-friendly sentiments
    sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return sentiment_map[predicted_label.item()], confidence.item()

# Chatbot loop
print("Hello! I can help you analyze the sentiment of your messages. Type 'exit' to quit.")
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Goodbye!")
        break

    # Get prediction
    sentiment, confidence = predict_sentiment(user_input)
    print(f"Bot: The sentiment is {sentiment} with confidence {confidence:.2f}")


Hello! I can help you analyze the sentiment of your messages. Type 'exit' to quit.
You: i am not feeling good
Bot: The sentiment is Negative with confidence 0.76
You: say something good
Bot: The sentiment is Positive with confidence 0.60
You: i am feeling low 
Bot: The sentiment is Negative with confidence 0.64
You: this is messed up
Bot: The sentiment is Negative with confidence 0.65
You: its okay
Bot: The sentiment is Positive with confidence 0.89
You: wow that is fine
Bot: The sentiment is Positive with confidence 0.98
You: shame on you
Bot: The sentiment is Negative with confidence 0.88
You: exit
Goodbye!
