In [1]:
# PyTorch with CUDA 11.8
!pip install torch==2.0.1+cu118 \
              torchvision==0.15.2+cu118 \
              torchaudio==2.0.2+cu118 \
              --index-url https://download.pytorch.org/whl/cu118

# Transformers, Accelerate, and other utilities
!pip install transformers accelerate sentencepiece scikit-learn ipywidgets tqdm

# Install tf-keras (backwards compatibility)
!pip install tf-keras

# If necessary, downgrade Keras
!pip install keras==2.11.0
!pip install --upgrade "numpy<2" "protobuf<3.21"

Looking in indexes: https://download.pytorch.org/whl/cu118
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Collecting keras>=3.5.0 (from tensorflow<2.19,>=2.18->tf-keras)
  Using cached keras-3.7.0-py3-none-any.whl.metadata (5.8 kB)
Using cached keras-3.7.0-py3-none-any.whl (1.2 MB)
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.11.0
    Uninstalling keras-2.11.0:
      Successfully uninstalled keras-2.11.0
Successfully installed keras-3.7.0
[0m
[

In [2]:
!pip install pandas

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import sentencepiece

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)

2025-01-02 12:21:09.864954: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-02 12:21:09.868763: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-02 12:21:09.878163: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735820469.892993     888 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735820469.897280     888 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-02 12:21:09.914692: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [6]:
# Read the raw CSV, then keep only the rows whose `body` is present and
# renumber them from 0.
file_path = "./Shuffled_Combined_Data_Without_Unnecessary_Column (1).csv"
data = pd.read_csv(file_path)
has_body = data['body'].notna()
data_cleaned = data[has_body].reset_index(drop=True)

In [7]:
def format_data_for_t5(data):
    """Build the two-column text-to-text frame used for T5.

    The input frame is left untouched. The returned frame keeps the input's
    index and has exactly these columns:

    - ``input_text``: the ``body`` column prefixed with ``"classify: "``
    - ``target_text``: the ``label`` column, unchanged
    """
    with_text_columns = data.assign(
        input_text="classify: " + data['body'],
        target_text=data['label'],
    )
    return with_text_columns[['input_text', 'target_text']]

In [8]:
# 80/20 train/validation split with a fixed seed; each half is re-indexed
# from 0 so positional and label-based lookups agree.
formatted_data = format_data_for_t5(data_cleaned)
train_data, val_data = (
    part.reset_index(drop=True)
    for part in train_test_split(formatted_data, test_size=0.2, random_state=42)
)

In [9]:
# Checkpoint identifier shared by the tokenizer and model `from_pretrained` calls below.
model_name = "google/flan-t5-xl"
tokenizer  = T5Tokenizer.from_pretrained(model_name, legacy=False)  # legacy=False -> new T5 tokenizer
# The recorded output of this cell shows two weight shards (9.45G and 1.95G) being fetched.
model      = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
class MentalHealthDataset(Dataset):
    """Dataset that tokenizes one (input_text, target_text) row per item.

    Rows are read from `dataframe` by position. No `padding` argument is
    passed to the tokenizer here, so batching/padding is left to whatever
    collates the items.
    """

    def __init__(self, dataframe, tokenizer, source_max_len=512, target_max_len=10):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source_max_len = source_max_len
        self.target_max_len = target_max_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text, limit):
        """Call the tokenizer on `text` with truncation on and `max_length=limit`."""
        return self.tokenizer(
            text,
            truncation=True,
            max_length=limit,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return `input_ids`, `attention_mask` and `labels` for row `idx`."""
        row = self.data.iloc[idx]
        source = self._encode(str(row["input_text"]), self.source_max_len)
        target = self._encode(str(row["target_text"]), self.target_max_len)

        # The tokenizer output carries a leading batch axis of size 1; drop it.
        labels = target["input_ids"].squeeze(0)

        # Any pad token id in the labels becomes -100 so the loss skips it.
        pad_positions = labels == self.tokenizer.pad_token_id
        labels[pad_positions] = -100

        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": labels,
        }

In [11]:
# Wrap each split in a MentalHealthDataset, relying on the class's default max lengths.
train_dataset, val_dataset = (
    MentalHealthDataset(frame, tokenizer) for frame in (train_data, val_data)
)

In [12]:
# Batch collator later handed to the trainer as `data_collator`.
# MentalHealthDataset passes no `padding` argument to the tokenizer, so
# padding is requested here, per batch, instead.
collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",     # dynamically pads each batch to the longest sequence
    return_tensors="pt",
)

In [13]:
def compute_metrics(eval_pred):
    """
    Exact-match accuracy between decoded predictions and decoded labels.

    With Seq2SeqTrainer + `predict_with_generate=True`,
    `eval_pred` is a tuple of (generated_ids, label_ids).

    Returns:
        {"accuracy": float}. Strings are compared after stripping surrounding
        whitespace and lower-casing. An empty evaluation set yields 0.0
        rather than raising ZeroDivisionError.
    """
    preds, labels = eval_pred

    # If the model returns a tuple of preds (e.g. (logits, ...)),
    # just take the first element for actual predictions.
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Convert -100 in labels back to pad_token_id for decoding
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

    # Map any id outside [0, vocab_size) to the unknown-token id so decoding
    # does not fail with "piece id is out of range".
    in_vocab = (preds >= 0) & (preds < tokenizer.vocab_size)
    preds = np.where(in_vocab, preds, tokenizer.unk_token_id)

    decoded_preds  = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Nothing to score: report 0.0 instead of dividing by zero.
    if len(decoded_labels) == 0:
        return {"accuracy": 0.0}

    correct = sum(
        pred.strip().lower() == label.strip().lower()
        for pred, label in zip(decoded_preds, decoded_labels)
    )
    return {"accuracy": correct / len(decoded_labels)}

In [14]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best "accuracy" (as reported by compute_metrics)
# when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_flant5_xl_gpu",  # Where to save checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",                # Save a checkpoint at end of every epoch
    save_total_limit=3,                   # Keep only the last 3 checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,

    # Mixed precision: use BF16 on A100
    bf16=True,
    fp16=False,           # Make sure fp16 is off
    # official T5 training often uses Adafactor; selected through `optim`
    # because the boolean `adafactor` flag is deprecated in Transformers.
    optim="adafactor",
    learning_rate=5e-5,   # or 1e-4 if your data is small

    predict_with_generate=True,  # evaluation scores generated token ids
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=50,
    seed=42,
)



In [15]:
# Early stopping is configured with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Assemble the trainer from the pieces built in the cells above.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=collator,  # dynamic padding
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [16]:
# Run training, then call `trainer.evaluate()` with no arguments and print
# the metrics dict it returns.
trainer.train()
results = trainer.evaluate()
print("Final Evaluation:", results)

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3536,0.151237,0.854561
2,0.1561,0.267892,0.860365


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Final Evaluation: {'eval_loss': 0.17169636487960815, 'eval_accuracy': 0.863681592039801, 'eval_runtime': 669.1729, 'eval_samples_per_second': 9.011, 'eval_steps_per_second': 2.254, 'epoch': 2.9991706750704927}


In [17]:
def predict(input_text, max_target_length=10):
    """Generate the predicted label string for a single piece of text.

    Args:
        input_text: Raw text to classify; the "classify: " prefix is added here.
        max_target_length: Passed to `model.generate` as `max_length`.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    text = "classify: " + input_text
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")

    # Move the inputs to the device the model is actually on. Checking
    # torch.cuda.is_available() is not sufficient: a GPU may exist while the
    # model sits on the CPU (or on a non-default GPU), which would make
    # generate() fail with a device mismatch.
    target_device = model.device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    outputs = model.generate(
        **inputs,
        max_length=max_target_length,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [18]:
# Smoke test: run one hand-written sentence through predict() and show the
# input next to the label string it returns.
sample_input = "I love going to school"
predicted_label = predict(sample_input)
print(f"Sample Input: {sample_input}")
print(f"Predicted Label: {predicted_label}")

Sample Input: I love going to school
Predicted Label: normal


In [19]:
# Model and tokenizer files are written to the same directory.
export_dir = "./flan_t5_xl_gpu_mental_health"
trainer.save_model(export_dir)  # Saves the final best model
tokenizer.save_pretrained(export_dir)  # Saves tokenizer files

('./flan_t5_xl_gpu_mental_health/tokenizer_config.json',
 './flan_t5_xl_gpu_mental_health/special_tokens_map.json',
 './flan_t5_xl_gpu_mental_health/spiece.model',
 './flan_t5_xl_gpu_mental_health/added_tokens.json')