In [1]:
!pip install transformers torch pandas numpy scipy tqdm spacy
!pip install spacy




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.optim as optim
from torch.utils.data import DataLoader, random_split,Dataset
from scipy.special import softmax
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv(r"C:\Users\amil\OneDrive\Documents\AI-Driven Personalized Therapy Recommendations system\Module_2\data_set\processed_data\balanced_emotions.csv")

In [4]:
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

In [5]:
import pandas as pd
emotion_to_response_style = {
    'anger': "De-escalation & Validation",
    'disgust': "Reframing & Encouragement",
    'fear': "Reassurance & Coping Strategies",
    'joy': "Encouragement & Positive Reinforcement",
    'neutral': "Active Listening & Encouragement",
    'sadness': "Compassion & Support",
    'surprise': "Clarification & Stability"
}
df['Detected Emotion'] = df['Detected Emotion'].map(emotion_to_response_style)


In [6]:
ai_response_mapping = {
    "De-escalation & Validation": 0,
    "Reframing & Encouragement": 1,
    "Reassurance & Coping Strategies": 2,
    "Encouragement & Positive Reinforcement": 3,
    "Active Listening & Encouragement": 4,
    "Compassion & Support": 5,
    "Clarification & Stability": 6
}


df["Detected Emotion"] = df["Detected Emotion"].map(ai_response_mapping)

In [7]:
df['Detected Emotion'].value_counts()


Detected Emotion
0    6350
2    6350
5    6350
3    6350
6    6250
4    6250
1    1987
Name: count, dtype: int64

In [8]:
df.dropna(inplace = True)

In [9]:
 df.isnull().sum()

text                0
label               0
Detected Emotion    0
dtype: int64

In [10]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

labels = df['Detected Emotion']
unique_labels = np.unique(labels)

class_weights = compute_class_weight(class_weight='balanced', classes=unique_labels, y=labels)

class_weights_dict = {label: weight for label, weight in zip(unique_labels, class_weights)}

print("Computed Class Weights:", class_weights_dict)


Computed Class Weights: {0: 0.8919235095613048, 1: 2.8532565671104715, 2: 0.8919235095613048, 3: 0.8919235095613048, 4: 0.9422249685101124, 5: 0.8919235095613048, 6: 0.9061942857142857}


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
print(device)

cuda


In [12]:
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

In [13]:
df = df[['text', 'Detected Emotion']]
df

Unnamed: 0,text,Detected Emotion
0,hi assuming that many of us have a certain ten...,1
1,my one friend comes to mind for this question ...,1
2,my one friend comes to mind for this question ...,1
3,this is it this is the worst roomive never had...,1
4,animal hoarding usually starts because a perso...,1
...,...,...
39882,best historical armys what country do think h...,3
39883,comparing trauma i know that this post may be...,0
39884,treatments for panic disorder\nsince panic dis...,2
39885,tired of hearing youre being ridiculous ive h...,0


In [14]:
contractions = {
    "don't": "do not",
    "can't": "cannot",
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "that's": "that is",
    "what's": "what is",
    "where's": "where is",
    "there's": "there is",
    "didn't": "did not",
    "won't": "will not",
    "wouldn't": "would not",
    "couldn't": "could not",
    "shouldn't": "should not",
    "haven't": "have not",
    "hasn't": "has not",
    "wasn't": "was not",
    "weren't": "were not",
    "isn't": "is not",
    "aren't": "are not",
    "doesn't": "does not"
}
def expand_contractions(text):
    for contraction, expanded in contractions.items():
        text = re.sub(r"\b" + re.escape(contraction) + r"\b", expanded, text)
    return text

def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = expand_contractions(text)
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@w+|\#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text

def preprocess_data(df, text_column, label_column):
    df[text_column] = df[text_column].fillna("").astype(str)
    df[text_column] = df[text_column].apply(clean_text)
    df = df[[text_column, label_column]].dropna().reset_index(drop=True)
    return df

text_column = 'text'
label_column = 'Detected Emotion'
df = preprocess_data(df, text_column, label_column)

In [15]:
df

Unnamed: 0,text,Detected Emotion
0,hi assuming that many of us have a certain ten...,1
1,my one friend comes to mind for this question ...,1
2,my one friend comes to mind for this question ...,1
3,this is it this is the worst roomive never had...,1
4,animal hoarding usually starts because a perso...,1
...,...,...
39641,best historical armys what country do think h...,3
39642,comparing trauma i know that this post may be...,0
39643,treatments for panic disorder\nsince panic dis...,2
39644,tired of hearing youre being ridiculous ive h...,0


In [16]:
MODEL_NAME = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [17]:
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['text'].tolist(), df['Detected Emotion'].tolist(), test_size=0.2, random_state=42)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42)

In [18]:
class AIDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        encodings = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

        self.input_ids = encodings["input_ids"]
        self.attention_mask = encodings["attention_mask"]
        self.labels = torch.as_tensor(labels, dtype=torch.long)  # Efficient conversion

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

train_dataset = AIDataset(train_texts, train_labels, tokenizer)
val_dataset = AIDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(set(labels))
)

model.config.hidden_dropout_prob = 0.3  # Dropout for hidden layers
model.config.attention_probs_dropout_prob = 0.3  # Dropout for attention layers
model.config.classifier_dropout = 0.3  # Dropout before classification layer
model.to(device)

  return self.fget.__get__(instance, owner)()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [20]:
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor.to(device))
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)


In [21]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from tqdm import tqdm  # ✅ Import tqdm for progress bar

EPOCHS = 15
PATIENCE = 3  # Stop if validation loss does not improve for 'PATIENCE' epochs
best_val_loss = float("inf")  # Track best validation loss
epochs_no_improve = 0  # Count epochs without improvement
base_save_directory = r'C:/Users/amil/OneDrive/Documents/AI-Driven Personalized Therapy Recommendations system/Module_2/Model'

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    all_preds, all_labels = [], []  # Track predictions for accuracy

    print(f"\n🔹 Epoch {epoch+1}/{EPOCHS} - Training...")
    train_progress = tqdm(train_loader, desc="Training", leave=False)

    for batch in train_progress:
        input_ids, attention_mask, labels = (
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            batch["labels"].to(device),
        )

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevent exploding gradients
        optimizer.step()

        total_train_loss += loss.item()

        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

        # ✅ Update tqdm description with the latest loss
        train_progress.set_postfix(loss=loss.item())

    avg_train_loss = total_train_loss / len(train_loader)
    train_acc = accuracy_score(all_labels, all_preds)

    model.eval()
    total_val_loss = 0
    val_preds, val_labels = [], []

    print(f"\n📊 Epoch {epoch+1}/{EPOCHS} - Validating...")
    val_progress = tqdm(val_loader, desc="Validation", leave=False)

    with torch.no_grad():
        for batch in val_progress:
            input_ids, attention_mask, labels = (
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["labels"].to(device),
            )

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

            total_val_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(labels.cpu().numpy())

            # ✅ Update tqdm description with the latest loss
            val_progress.set_postfix(loss=loss.item())

    avg_val_loss = total_val_loss / len(val_loader)
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"✅ Epoch {epoch+1} Results:")
    print(f"   🏋️ Training Loss: {avg_train_loss:.4f}, Accuracy: {train_acc:.4f}")
    print(f"   📊 Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_acc:.4f}")

    # ✅ Early Stopping Logic
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0  # Reset counter
        best_model_path = os.path.join(base_save_directory, "best_model")
        os.makedirs(best_model_path, exist_ok=True)
        model.save_pretrained(best_model_path)
        tokenizer.save_pretrained(best_model_path)
        print(f"🔥 Best model saved at: {best_model_path}")
    else:
        epochs_no_improve += 1
        print(f"⚠️ No improvement in validation loss for {epochs_no_improve}/{PATIENCE} epochs.")

    # ✅ Stop training if no improvement for 'PATIENCE' epochs
    if epochs_no_improve >= PATIENCE:
        print(f"🛑 Early stopping triggered after {epoch+1} epochs. Best model restored from {best_model_path}")
        break  # Stop training

    # ✅ Save model every two epochs
    if (epoch + 1) % 2 == 0:
        epoch_save_directory = os.path.join(base_save_directory, f"epoch_{epoch+1}")
        os.makedirs(epoch_save_directory, exist_ok=True)
        model.save_pretrained(epoch_save_directory)
        tokenizer.save_pretrained(epoch_save_directory)
        print(f"✅ Model saved at: {epoch_save_directory}")

print("🎉 Fine-tuning completed! Best model saved based on validation loss.")



🔹 Epoch 1/15 - Training...


                                                                                                                       


📊 Epoch 1/15 - Validating...


                                                                                                                       

✅ Epoch 1 Results:
   🏋️ Training Loss: 0.6035, Accuracy: 0.7883
   📊 Validation Loss: 0.5662, Accuracy: 0.8066
🔥 Best model saved at: C:/Users/amil/OneDrive/Documents/AI-Driven Personalized Therapy Recommendations system/Module_2/Model\best_model

🔹 Epoch 2/15 - Training...


                                                                                                                       


📊 Epoch 2/15 - Validating...


                                                                                                                       

✅ Epoch 2 Results:
   🏋️ Training Loss: 0.4212, Accuracy: 0.8588
   📊 Validation Loss: 0.5425, Accuracy: 0.8310
🔥 Best model saved at: C:/Users/amil/OneDrive/Documents/AI-Driven Personalized Therapy Recommendations system/Module_2/Model\best_model
✅ Model saved at: C:/Users/amil/OneDrive/Documents/AI-Driven Personalized Therapy Recommendations system/Module_2/Model\epoch_2

🔹 Epoch 3/15 - Training...


                                                                                                                       


📊 Epoch 3/15 - Validating...


                                                                                                                       

✅ Epoch 3 Results:
   🏋️ Training Loss: 0.3206, Accuracy: 0.9022
   📊 Validation Loss: 0.7343, Accuracy: 0.8156
⚠️ No improvement in validation loss for 1/3 epochs.

🔹 Epoch 4/15 - Training...


                                                                                                                       


📊 Epoch 4/15 - Validating...


                                                                                                                       

✅ Epoch 4 Results:
   🏋️ Training Loss: 0.2703, Accuracy: 0.9277
   📊 Validation Loss: 0.8138, Accuracy: 0.8323
⚠️ No improvement in validation loss for 2/3 epochs.
✅ Model saved at: C:/Users/amil/OneDrive/Documents/AI-Driven Personalized Therapy Recommendations system/Module_2/Model\epoch_4

🔹 Epoch 5/15 - Training...


                                                                                                                       


📊 Epoch 5/15 - Validating...


                                                                                                                       

✅ Epoch 5 Results:
   🏋️ Training Loss: 0.2293, Accuracy: 0.9442
   📊 Validation Loss: 0.9304, Accuracy: 0.8303
⚠️ No improvement in validation loss for 3/3 epochs.
🛑 Early stopping triggered after 5 epochs. Best model restored from C:/Users/amil/OneDrive/Documents/AI-Driven Personalized Therapy Recommendations system/Module_2/Model\best_model
🎉 Fine-tuning completed! Best model saved based on validation loss.


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Define the path where your fine-tuned model is saved
MODEL_PATH = r"C:\Users\amil\OneDrive\Documents\AI-Driven Personalized Therapy Recommendations system\Module_2\sentiment_analysis\Model\fine_tuned_model"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set to evaluation mode


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.3, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [8]:
ai_response_mapping = {
    0: "De-escalation & Validation",
    1: "Reframing & Encouragement",
    2: "Reassurance & Coping Strategies",
    3: "Encouragement & Positive Reinforcement",
    4: "Active Listening & Encouragement",
    5: "Compassion & Support",
    6: "Clarification & Stability"
}


In [12]:
def predict_emotion(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    # Move inputs to the same device as model
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted label
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    
    return predicted_class  # Returns the predicted emotion index

# Example usagetext = "I feel like I want to die"

# Predict the numerical label
predicted_label = predict_emotion('Im feeling lonely, i cant take it anymore')

# Convert numerical label to category text
predicted_category = ai_response_mapping.get(predicted_label, "Unknown Category")

print(f"Predicted Label: {predicted_label}")
print(f"Predicted AI Response Style: {predicted_category}")

Predicted Label: 5
Predicted AI Response Style: Compassion & Support


In [64]:
from sklearn.metrics import classification_report
import torch
from torch.utils.data import DataLoader

test_dataset = AIDataset(test_texts, test_labels, tokenizer)  # Using your separate test set
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
all_preds = []
all_labels = []
with torch.no_grad(): 
    for batch in test_loader:
        input_ids, attention_mask, labels = (
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            batch["labels"].to(device),
        )
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)  
        all_preds.extend(preds.cpu().numpy()) 
        all_labels.extend(labels.cpu().numpy())  
print("📊 Classification Report (Test Set):")
print(classification_report(all_labels, all_preds, digits=4))


📊 Classification Report (Test Set):
              precision    recall  f1-score   support

           0     0.8284    0.8284    0.8284       606
           1     0.8186    0.8302    0.8244       212
           2     0.7791    0.9017    0.8359       661
           3     0.8810    0.7658    0.8194       619
           4     0.8434    0.9018    0.8716       621
           5     0.8376    0.7698    0.8023       643
           6     0.8771    0.8524    0.8646       603

    accuracy                         0.8366      3965
   macro avg     0.8379    0.8357    0.8352      3965
weighted avg     0.8391    0.8366    0.8361      3965

