In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModel, AutoTokenizer
from transformers import AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter

In [3]:
import kagglehub
path = kagglehub.dataset_download("muhammedamil1/preprocessed-data")
print("Path to dataset f")

Path to dataset f


In [4]:
import kagglehub
import os
import pandas as pd

preprocessed_data_path = kagglehub.dataset_download("muhammedamil1/preprocessed-data")

print("Files in dataset:", os.listdir(preprocessed_data_path))

file_name = "preprocessed_data1.csv"  
file_path = os.path.join(preprocessed_data_path, file_name)

# Step 4: Load dataset
df = pd.read_csv(file_path)  # Change to pd.read_parquet(file_path) if it's a .parquet file

# Step 5: Verify dataset
print(df.head())

Files in dataset: ['preprocessed_data1.csv']
                                                text                 label
0  stress and anxiety are common issues that peop...                stress
1  i am living in your walls i am living in your ...         mentalillness
2  how helpful have they been for you if youve ta...  personality disorder
3  i guess im just not good enough for anyone fuc...            depression
4  i am struggling with the outcome of a court ma...                   bpd


In [5]:
df

Unnamed: 0,text,label
0,stress and anxiety are common issues that peop...,stress
1,i am living in your walls i am living in your ...,mentalillness
2,how helpful have they been for you if youve ta...,personality disorder
3,i guess im just not good enough for anyone fuc...,depression
4,i am struggling with the outcome of a court ma...,bpd
...,...,...
141995,i cannot suffer from an anxiety disorder when ...,panic_disorder
141996,a question of mine about a question of mine th...,offmychest
141997,welcome to this months personal accountability...,hoarding disorder
141998,agree that you should stop smoking weed usuall...,social anxiety


In [6]:
df['label'].unique()

array(['stress', 'mentalillness', 'personality disorder', 'depression',
       'bpd', 'ocd', 'normal', 'ptsd', 'anxiety', 'bipolar',
       'schizophrenia', 'suicidal', 'bdd', 'social anxiety',
       'hoarding disorder', 'adhd', 'panic_disorder', 'offmychest',
       'eating disorder'], dtype=object)

In [7]:
df['label'].value_counts()

label
stress                  7500
mentalillness           7500
personality disorder    7500
depression              7500
bpd                     7500
normal                  7500
ptsd                    7500
bipolar                 7500
anxiety                 7500
schizophrenia           7500
suicidal                7500
adhd                    7500
bdd                     7500
social anxiety          7500
hoarding disorder       7500
offmychest              7500
panic_disorder          7500
eating disorder         7500
ocd                     7000
Name: count, dtype: int64

In [8]:
df = df.dropna(subset=["text"])

In [9]:
df.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace = True)


In [10]:
label_encoder = LabelEncoder()
df["encoded_labels"] = label_encoder.fit_transform(df["label"])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["encoded_labels"] = label_encoder.fit_transform(df["label"])


Unnamed: 0,text,label,encoded_labels
0,stress and anxiety are common issues that peop...,stress,17
1,i am living in your walls i am living in your ...,mentalillness,8
2,how helpful have they been for you if youve ta...,personality disorder,13
3,i guess im just not good enough for anyone fuc...,depression,5
4,i am struggling with the outcome of a court ma...,bpd,4
...,...,...,...
141990,so i got diagnosed with panic disorder a while...,panic_disorder,12
141992,backgroundi think i have an eating disorder i ...,eating disorder,6
141995,i cannot suffer from an anxiety disorder when ...,panic_disorder,12
141996,a question of mine about a question of mine th...,offmychest,11


In [11]:
for index, class_name in enumerate(label_encoder.classes_):
    print(f"{class_name} --> {index}")


adhd --> 0
anxiety --> 1
bdd --> 2
bipolar --> 3
bpd --> 4
depression --> 5
eating disorder --> 6
hoarding disorder --> 7
mentalillness --> 8
normal --> 9
ocd --> 10
offmychest --> 11
panic_disorder --> 12
personality disorder --> 13
ptsd --> 14
schizophrenia --> 15
social anxiety --> 16
stress --> 17
suicidal --> 18


In [12]:
df['label'].value_counts()

label
suicidal                7496
depression              7467
anxiety                 7445
offmychest              7439
normal                  7416
bipolar                 7376
ptsd                    7350
adhd                    7286
mentalillness           7276
bpd                     7258
schizophrenia           7195
bdd                     6539
panic_disorder          6370
eating disorder         6226
ocd                     6123
personality disorder    6073
hoarding disorder       5922
stress                  4671
social anxiety          4438
Name: count, dtype: int64

In [13]:
import pandas as pd
categories = ['stress', 'mentalillness', 'personality disorder', 'depression',
       'bpd', 'ocd', 'normal', 'ptsd', 'anxiety', 'bipolar',
       'schizophrenia', 'suicidal', 'bdd', 'social anxiety',
       'hoarding disorder', 'adhd', 'panic_disorder', 'offmychest',
       'eating disorder']
target_size = 6000  # Maximum allowed samples per class
balanced_df_list = []

for category in categories:
    category_df = df[df['label'] == category]
    balanced_df_list.append(category_df.sample(n=min(len(category_df), target_size), random_state=42).reset_index(drop=True))

df = pd.concat(balanced_df_list).reset_index(drop=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [14]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["encoded_labels"].tolist(), test_size=0.2, random_state=42)

In [15]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoConfig

# ✅ Define model checkpoint (DeBERTa Base)
MODEL_CHECKPOINT = "microsoft/deberta-v3-base"
NUM_LABELS = 19  # Set the correct number of output classes

# ✅ Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# ✅ Load Pretrained DeBERTa Model
deberta_model = AutoModel.from_pretrained(MODEL_CHECKPOINT)

# ✅ Custom Classifier
class CustomDebertaClassifier(nn.Module):
    def __init__(self, deberta_model, num_labels):
        super(CustomDebertaClassifier, self).__init__()
        self.deberta = deberta_model  # Pretrained DeBERTa model
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, num_labels)  # 19 output classes
        self.criterion = nn.CrossEntropyLoss()  # ✅ Correct loss function

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token representation
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)  
        return {"loss": loss, "logits": logits}

# ✅ Initialize model with Pretrained DeBERTa
model = CustomDebertaClassifier(deberta_model, NUM_LABELS)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("✅ Custom DeBERTa Model Initialized!")


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

✅ Custom DeBERTa Model Initialized!


In [16]:
import torch
from transformers import AutoTokenizer
from torch.optim import AdamW

# ✅ Define model checkpoint path
checkpoint_path = "/kaggle/input/deberta-v3-base/pytorch/default/1/checkpoint.pth"
# ✅ Load tokenizer (remains unchanged)
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/deberta-v3-base/pytorch/default/1")

# ✅ Load the model architecture (Ensure this matches the trained model)
from transformers import AutoModel
import torch.nn as nn

MODEL_CHECKPOINT = "microsoft/deberta-v3-base"
NUM_LABELS = 19  # Adjust as per your model

# Load pretrained DeBERTa model
deberta_model = AutoModel.from_pretrained(MODEL_CHECKPOINT)

# Define the custom classifier
class CustomDebertaClassifier(nn.Module):
    def __init__(self, deberta_model, num_labels):
        super(CustomDebertaClassifier, self).__init__()
        self.deberta = deberta_model  # Pretrained DeBERTa model
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, num_labels)  # 19 output classes
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token representation
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)  
        return {"loss": loss, "logits": logits}

# ✅ Initialize model
model = CustomDebertaClassifier(deberta_model, NUM_LABELS)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ✅ Load the fine-tuned model checkpoint
checkpoint = torch.load(checkpoint_path, map_location=device)

# Load model weights correctly
if "model_state_dict" in checkpoint:
    model.load_state_dict(checkpoint["model_state_dict"])
else:
    model.load_state_dict(checkpoint)  # Fallback if it's just the model state


optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
if "optimizer_state_dict" in checkpoint:
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

# ✅ Set model to evaluation mode (for inference)
model.train()

print("✅ Fine-Tuned Custom DeBERTa Model Successfully Loaded!")


  checkpoint = torch.load(checkpoint_path, map_location=device)


✅ Fine-Tuned Custom DeBERTa Model Successfully Loaded!


In [17]:
class MentalHealthDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = int(self.labels[idx])
        tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),  # Now label is int
        }

In [18]:
train_dataset = MentalHealthDataset(train_texts, train_labels, tokenizer)
test_dataset = MentalHealthDataset(test_texts, test_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True,num_workers=1, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=6, shuffle=True,num_workers=1, pin_memory=True)


In [19]:
print(type(train_texts), train_texts[:5])  # Print first few elements

<class 'list'> ['ive tried to search up on this for a long time from what i can see in dsm  there were two types of bdd delusional and nondelusional delusional bdd was a delusional disorder and nondelusional was somatic \n\n\n\nobviously now its  disorder in the ocd spectrum and i think this is a good classification for it \n\n\n\nive had bdd for  years the intensity waxes and wanes but its always pretty bad aside from the general dissatisfation and obsession over insert feature here stuff i get tactile sensations all over my face and im convinced its wrinkles forming i feel my face drooping decaying i can feel every microexpression constantly changing in the mirror like i can shapeshift many days i truly do not look human ive had periods of time ive thought i was truly alien when i was  or so i compiled pictures of myself as a child to see if they were all truly the same child because i was sure i must have switched with a human child and now im infiltrating life as an alien when i ha

In [20]:
batch = next(iter(train_dataloader))  # Get the first batch

# Print shape of inputs (tokenized text)
print("Input IDs shape:", batch["input_ids"].shape)
print("Attention Mask shape:", batch["attention_mask"].shape)

# Print shape of labels
print("Labels shape:", batch["labels"].shape)


Input IDs shape: torch.Size([4, 512])
Attention Mask shape: torch.Size([4, 512])
Labels shape: torch.Size([4])


In [21]:
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


Train dataset size: 88824
Test dataset size: 22207


In [22]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_labels, y_encoded = np.unique(df['label'], return_inverse=True)

class_weights = compute_class_weight(class_weight="balanced", classes=np.arange(len(class_labels)), y=y_encoded)
class_weights_dict = {label: weight for label, weight in zip(class_labels, class_weights)}

print("Class Weights:", class_weights_dict)


Class Weights: {'adhd': 0.9739561403508772, 'anxiety': 0.9739561403508772, 'bdd': 0.9739561403508772, 'bipolar': 0.9739561403508772, 'bpd': 0.9739561403508772, 'depression': 0.9739561403508772, 'eating disorder': 0.9739561403508772, 'hoarding disorder': 0.9867843367283456, 'mentalillness': 0.9739561403508772, 'normal': 0.9739561403508772, 'ocd': 0.9739561403508772, 'offmychest': 0.9739561403508772, 'panic_disorder': 0.9739561403508772, 'personality disorder': 0.9739561403508772, 'ptsd': 0.9739561403508772, 'schizophrenia': 0.9739561403508772, 'social anxiety': 1.316750077085458, 'stress': 1.2510676176632975, 'suicidal': 0.9739561403508772}


In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)
num_training_steps = len(train_dataloader) * 3
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class_weights_tensor = torch.tensor(list(class_weights_dict.values()), dtype=torch.float).to(device)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)


In [None]:
import os
import torch
import numpy as np
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

save_dir = "/kaggle/working/saved_models"
os.makedirs(save_dir, exist_ok=True)

epochs = 14
patience = 3
best_val_loss = float("inf")
early_stop_counter = 0
grad_clip = 1.0
scaler = GradScaler()

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    print(f"\n🔄 Epoch {epoch+1}/{epochs}")
    train_progress = tqdm(train_dataloader, desc="Training")
    
    for batch in train_progress:
        batch = {k: v.to(device) for k, v in batch.items()} 
        optimizer.zero_grad()
        
        with autocast():
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            logits = outputs["logits"]
            loss = loss_fn(logits, batch["labels"].long()) 
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        scaler.step(optimizer)
        scaler.update()
        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            lr_scheduler.step(loss.item()) 
        else:
            lr_scheduler.step()
        total_train_loss += loss.item()
        train_progress.set_postfix(loss=loss.item())
    avg_train_loss = total_train_loss / len(train_dataloader)
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        val_progress = tqdm(test_dataloader, desc="Validating")
        for batch in val_progress:
            batch = {k: v.to(device) for k, v in batch.items()}  
            with autocast():
                outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
                logits = outputs["logits"]
                loss = loss_fn(logits, batch["labels"].long())
            total_val_loss += loss.item()
            val_progress.set_postfix(loss=loss.item())
    
    avg_val_loss = total_val_loss / len(test_dataloader)
    print(f"📊 Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    
    epoch_save_dir = f"{save_dir}/epoch_{epoch+1}"
    os.makedirs(epoch_save_dir, exist_ok=True)
    
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        early_stop_counter = 0
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': lr_scheduler.state_dict() if lr_scheduler else None,
            'best_val_loss': best_val_loss
        }, f"{epoch_save_dir}/checkpoint.pth")
        tokenizer.save_pretrained(epoch_save_dir)
        print(f"✅ Model checkpoint saved at {epoch_save_dir}/checkpoint.pth with best Val Loss: {avg_val_loss:.4f}")
    else:
        early_stop_counter += 1
        print(f"⚠️ No improvement. Early stop: {early_stop_counter}/{patience}")
    
    if early_stop_counter >= patience:
        print("🚀 Early stopping triggered! Training stopped.")
        break
print("🎉 Training complete!")

  scaler = GradScaler()



🔄 Epoch 1/14


  with autocast():
Training: 100%|██████████| 22206/22206 [2:00:02<00:00,  3.08it/s, loss=1.24]     
  with autocast():
Validating: 100%|██████████| 3702/3702 [08:42<00:00,  7.08it/s, loss=0.000131]


📊 Epoch 1/14 | Train Loss: 0.7978 | Val Loss: 1.0677
✅ Model checkpoint saved at /kaggle/working/saved_models/epoch_1/checkpoint.pth with best Val Loss: 1.0677

🔄 Epoch 2/14


Training: 100%|██████████| 22206/22206 [1:59:58<00:00,  3.08it/s, loss=0.828]   
Validating: 100%|██████████| 3702/3702 [08:42<00:00,  7.09it/s, loss=1.76]   


📊 Epoch 2/14 | Train Loss: 0.7944 | Val Loss: 1.0550
✅ Model checkpoint saved at /kaggle/working/saved_models/epoch_2/checkpoint.pth with best Val Loss: 1.0550

🔄 Epoch 3/14


Training: 100%|██████████| 22206/22206 [1:59:43<00:00,  3.09it/s, loss=0.225]   
Validating: 100%|██████████| 3702/3702 [08:40<00:00,  7.11it/s, loss=0.236]  


📊 Epoch 3/14 | Train Loss: 0.7925 | Val Loss: 1.0434
✅ Model checkpoint saved at /kaggle/working/saved_models/epoch_3/checkpoint.pth with best Val Loss: 1.0434

🔄 Epoch 4/14


Training:   5%|▍         | 1022/22206 [05:30<1:54:18,  3.09it/s, loss=0.607]  