# Install & Import Libraries and Check GPU

In [None]:
# Install necessary libraries

!pip install transformers[torch] datasets evaluate accelerate -q
!pip install pandas scikit-learn -q

# The accelerate library is useful for optimizing training, especially on multiple GPUs or TPUs,
# and is a dependency for the Trainer.
# pandas is for data manipulation if needed, and scikit-learn for metrics.

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate # Hugging Face's library for evaluation metrics

# Hub identifier of the Clinical_ModernBERT checkpoint used throughout. [3]
MODEL_CHECKPOINT = "Simonlee711/Clinical_ModernBERT"

# Pick the compute device once, then report which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"PyTorch is using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("PyTorch is using CPU. Training may be very slow.")

# 1: General Language Sentiment Prediction Fine-Tuning

## 1.1. Dataset Selection and Loading

In [None]:
# Load the SST-2 dataset from the GLUE benchmark.
# SST-2 is a binary sentiment classification task (positive/negative). [16]
raw_datasets_stage1 = load_dataset("glue", "sst2")

print("Stage 1 Dataset (SST-2):")
print(raw_datasets_stage1)

# SST-2 has 'sentence', 'label' and 'idx' columns.
# 'label' is 0 for negative, 1 for positive.
# 'sentence' is used as the text and 'label' as the sentiment.
#
# Splits provided by GLUE:
#   raw_datasets_stage1['train']       training set (large)
#   raw_datasets_stage1['validation']  development/validation set (small)
#   raw_datasets_stage1['test']        test set (labels often unavailable publicly)
# For GLUE tasks the standard splits are usually preferred, so the complete
# official train split is used for training and the official validation split
# serves as the test/eval set for Stage 1.
train_df_stage1 = raw_datasets_stage1['train'].to_pandas()
val_df_stage1 = raw_datasets_stage1['validation'].to_pandas()

# For demonstration, if you want to further split the training set:
# train_df_stage1, test_df_stage1 = train_test_split(train_df_stage1, test_size=0.1, random_state=42, stratify=train_df_stage1['label'])

# Convert pandas DataFrames back to Hugging Face Datasets
train_dataset_stage1 = Dataset.from_pandas(train_df_stage1)
eval_dataset_stage1 = Dataset.from_pandas(val_df_stage1)

# Combine into a DatasetDict
processed_datasets_stage1 = DatasetDict({
    'train': train_dataset_stage1,
    'validation': eval_dataset_stage1,
})

print("\nProcessed Stage 1 Datasets:")
print(processed_datasets_stage1)
print("\nExample from Stage 1 training set:")
# Index the split so that one record is shown instead of the dataset summary.
print(processed_datasets_stage1['train'][0])

## 1.2. Preprocessing and Tokenization

In [None]:
# Token budget for Stage 1 inputs; SST-2 sentences are relatively short.
# [19] suggests 128 for SST-2 with DistilBERT.
MAX_LENGTH_STAGE1 = 128

# Tokenizer belonging to the Clinical_ModernBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Tokenization function
def tokenize_function_stage1(examples):
    """Tokenize a batch of SST-2 examples.

    Args:
        examples: Batch mapping whose "sentence" entry holds the raw text.

    Returns:
        The tokenizer output for the batch, truncated to at most
        MAX_LENGTH_STAGE1 tokens per example and left unpadded.
    """
    # Padding is deliberately not applied here: the DataCollatorWithPadding
    # created for the Trainer pads each batch, so padding every example to
    # MAX_LENGTH_STAGE1 up front would only add wasted pad tokens.
    return tokenizer(examples["sentence"], truncation=True, max_length=MAX_LENGTH_STAGE1)

# Apply tokenization to every split of the Stage 1 DatasetDict.
tokenized_datasets_stage1 = processed_datasets_stage1.map(tokenize_function_stage1, batched=True)

# Remove columns that are not needed by the model and rename 'label' to 'labels'.
# The model expects the label column to be named 'labels'. [14, 20]
tokenized_datasets_stage1 = tokenized_datasets_stage1.remove_columns(["sentence", "idx"])  # SST-2 specific columns
tokenized_datasets_stage1 = tokenized_datasets_stage1.rename_column("label", "labels")
tokenized_datasets_stage1.set_format("torch")  # Ensure datasets return PyTorch tensors

print("\nTokenized Stage 1 Datasets (showing train features):")
print(tokenized_datasets_stage1['train'].features)
print("\nExample of tokenized input_ids from Stage 1 training set:")
# Show a single example; selecting the bare column would materialise and
# print the input_ids of the entire training split.
print(tokenized_datasets_stage1['train'][0]['input_ids'])

# Data collator used by the Trainer: pads the examples of each batch to the
# longest sequence contained in that batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 1.3. Model Initialization

In [None]:
# SST-2 is binary (Positive, Negative), so the Stage 1 head has two outputs.
NUM_LABELS_STAGE1 = 2

# Build Clinical_ModernBERT with a sequence classification head and place it
# on the configured device (GPU or CPU) in one expression.
# Optional from_pretrained keyword arguments that are left disabled:
#   trust_remote_code=True  - may be needed if the model has custom code,
#                             though often not for standard architectures
#   torch_dtype="auto"      - to load in optimal memory data type [20]
model_stage1 = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS_STAGE1,
).to(device)

print(f"\nStage 1 Model ({MODEL_CHECKPOINT}) loaded with a sequence classification head for {NUM_LABELS_STAGE1} labels.")

## 1.4. Stage-1 General Training Protocol

In [None]:
# Define TrainingArguments for Stage 1.
# NOTE: the evaluation schedule is passed as `eval_strategy`; the older
# `evaluation_strategy` spelling is deprecated and is rejected by recent
# transformers releases (which ModernBERT checkpoints require).
training_args_stage1 = TrainingArguments(
    output_dir="./results_stage1",
    learning_rate=2e-5,  # [15, 19]
    per_device_train_batch_size=16,  # Adjust based on GPU memory
    per_device_eval_batch_size=16,
    num_train_epochs=3,  # [19]
    weight_decay=0.01,
    eval_strategy="epoch",        # Evaluate every epoch
    save_strategy="epoch",        # Save checkpoint every epoch (must match the eval schedule for load_best_model_at_end)
    load_best_model_at_end=True,  # Load the best model based on metric_for_best_model
    metric_for_best_model="accuracy",  # For SST-2, accuracy is a common metric [16]
    logging_dir='./logs_stage1',
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available
    # report_to="tensorboard"  # Optional: if you want to use tensorboard
)

# Define a function to compute metrics for evaluation
def compute_metrics_stage1(eval_pred):
    """Compute accuracy and binary F1 for a Stage 1 (SST-2) evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the argmax over the last axis
            of ``logits`` is taken as the predicted class.

    Returns:
        dict with float entries ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # The scikit-learn metrics imported at the top of the file are used
    # directly, so nothing has to be (re)loaded with evaluate.load() on each
    # evaluation pass. SST-2 is binary, hence average="binary"; for
    # multi-class data "macro" or "weighted" would be used instead.
    accuracy = accuracy_score(labels, predictions)
    _, _, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )

    return {
        "accuracy": float(accuracy),
        "f1": float(f1),
    }

# Initialize Trainer for Stage 1.
# The tokenizer is handed over as `processing_class`; the `tokenizer=`
# keyword of Trainer is deprecated in current transformers releases.
trainer_stage1 = Trainer(
    model=model_stage1,
    args=training_args_stage1,
    train_dataset=tokenized_datasets_stage1["train"],
    eval_dataset=tokenized_datasets_stage1["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_stage1,
)

# Start Stage 1 training
print("\nStarting Stage 1 training...")
trainer_stage1.train()
print("Stage 1 training finished.")

## 1.5. Evaluation and Model Saving

In [None]:
# Directory that receives the fine-tuned Stage 1 model and its tokenizer.
STAGE1_MODEL_PATH = "./fine_tuned_clinical_modernbert_stage1"

# Score the fine-tuned Stage 1 model on its validation set and show the result.
print("\nEvaluating Stage 1 model...")
eval_results_stage1 = trainer_stage1.evaluate()
print("\nStage 1 Evaluation Results:", eval_results_stage1, sep="\n")

# Persist the model, then store the tokenizer in the same directory.
trainer_stage1.save_model(STAGE1_MODEL_PATH)
tokenizer.save_pretrained(STAGE1_MODEL_PATH)

print(f"\nStage 1 model and tokenizer saved to {STAGE1_MODEL_PATH}")

# 2: Healthcare-Domain Sentiment Prediction Fine-Tuning

## 2.1. Load Stage-2 Healthcare Dataset

In [None]:
# --- STAGE 2: HEALTHCARE-DOMAIN SENTIMENT PREDICTION ---

# IMPORTANT: Replace 'stage2_data.csv' with the actual path to your dataset.
# The dataset should have 'text' and 'label' columns.
# 'label' should be numerical (e.g., 0 for negative, 1 for neutral, 2 for positive).
STAGE2_DATA_PATH = 'stage2_data.csv'  # Placeholder

# Example: a small dummy dataset for demonstration purposes.
# In a real scenario, you would upload or point to your actual data file.
from pathlib import Path

import pandas as pd

dummy_stage2_data = {
    # Illustrative placeholder posts only; five per class so that the
    # stratified 80/20 split performed when the data is loaded is feasible.
    'text': [
        # 0 = Negative
        "The waiting time at the clinic was unbearable and nobody answered my questions.",
        "This medication gave me terrible headaches and made my nausea worse.",
        "I felt completely ignored by the nurse during my appointment.",
        "The side effects were so bad that I had to stop the treatment after a week.",
        "My symptoms keep getting worse and the new prescription is not helping at all.",
        # 1 = Neutral
        "I have an appointment with the cardiologist next Tuesday at 10 am.",
        "The doctor prescribed 20 mg of the medication to be taken once daily.",
        "Does anyone know whether this drug should be taken with food?",
        "My blood test results are expected to arrive in about three days.",
        "The pharmacy is open from 9 am to 6 pm on weekdays.",
        # 2 = Positive
        "The new treatment has really improved my energy levels and I feel great.",
        "My doctor was incredibly supportive and explained everything clearly.",
        "After switching medications my pain is finally under control.",
        "The physiotherapy sessions helped me recover much faster than I expected.",
        "I am so grateful to the staff for the excellent care I received.",
    ],
    # Assuming 3 labels: 0=Negative, 1=Neutral, 2=Positive
    'label': [0] * 5 + [1] * 5 + [2] * 5,
}
dummy_stage2_df = pd.DataFrame(dummy_stage2_data)

# Never clobber a real dataset that already sits at STAGE2_DATA_PATH; the
# dummy file is only written when nothing exists there yet.
if Path(STAGE2_DATA_PATH).exists():
    print(f"Existing Stage 2 data found at {STAGE2_DATA_PATH}; dummy data was not written.")
else:
    dummy_stage2_df.to_csv(STAGE2_DATA_PATH, index=False)
    print(f"Dummy Stage 2 data created at {STAGE2_DATA_PATH}")


try:
    # Load the custom dataset: a CSV file with 'text' and 'label' columns.
    df_stage2 = pd.read_csv(STAGE2_DATA_PATH)
    print(f"\nSuccessfully loaded Stage 2 data from {STAGE2_DATA_PATH}")
    print("Stage 2 DataFrame head:")
    print(df_stage2.head())

    # Ensure 'text' and 'label' columns exist
    if 'text' not in df_stage2.columns or 'label' not in df_stage2.columns:
        raise ValueError("Stage 2 data must contain 'text' and 'label' columns.")

    # Determine the number of unique labels for Stage 2
    NUM_LABELS_STAGE2 = df_stage2['label'].nunique()
    print(f"Number of unique labels in Stage 2 data: {NUM_LABELS_STAGE2}")
    if NUM_LABELS_STAGE2 <= 1:
        raise ValueError("Stage 2 data must have at least two unique labels for classification.")

    # The classification head is sized from NUM_LABELS_STAGE2, so the label
    # ids have to be exactly 0 .. NUM_LABELS_STAGE2 - 1; anything else
    # (gaps, strings, missing values, ids starting at 1) is rejected here.
    observed_labels = sorted(df_stage2['label'].unique().tolist())
    if observed_labels != list(range(NUM_LABELS_STAGE2)):
        raise ValueError(
            "Stage 2 labels must be the integers 0.."
            f"{NUM_LABELS_STAGE2 - 1}, but found {observed_labels}."
        )

    # Split data into training and validation sets. More than one label is
    # guaranteed at this point, so the split is always stratified.
    train_df_stage2, val_df_stage2 = train_test_split(
        df_stage2,
        test_size=0.2,  # 20% for validation
        random_state=42,
        stratify=df_stage2['label'],
    )

    # Convert pandas DataFrames to Hugging Face Datasets. The split shuffles
    # the row index; preserve_index=False keeps it from becoming a column.
    train_dataset_stage2 = Dataset.from_pandas(train_df_stage2, preserve_index=False)
    eval_dataset_stage2 = Dataset.from_pandas(val_df_stage2, preserve_index=False)

    # Combine into a DatasetDict
    raw_datasets_stage2 = DatasetDict({
        'train': train_dataset_stage2,
        'validation': eval_dataset_stage2,
    })

    print("\nRaw Stage 2 Datasets:")
    print(raw_datasets_stage2)
    print("\nExample from Stage 2 training set:")
    # Index the split so that one record is shown instead of the dataset summary.
    print(raw_datasets_stage2['train'][0])

except FileNotFoundError:
    print(f"ERROR: Stage 2 data file not found at {STAGE2_DATA_PATH}. Please upload your data.")
except ValueError as e:
    print(f"ERROR: {e}")
except Exception as e:
    # Best-effort notebook cell: report the problem so later cells can skip.
    print(f"An unexpected error occurred while loading Stage 2 data: {e}")

## 2.2. Model Initialization from Stage-1

In [None]:
# Path where Stage 1 model was saved
# STAGE1_MODEL_PATH = "./fine_tuned_clinical_modernbert_stage1" # Defined in Stage 1

if 'NUM_LABELS_STAGE2' in locals() and NUM_LABELS_STAGE2 > 1:
    # Load the fine-tuned Stage 1 model with a head sized for Stage 2.
    # The Stage 1 checkpoint carries a classification head for the Stage 1
    # label count; when NUM_LABELS_STAGE2 differs, loading raises a
    # size-mismatch error unless ignore_mismatched_sizes=True is passed, in
    # which case the mismatching head weights are freshly initialised.
    # When the counts are equal the Stage 1 head is simply reused.
    model_stage2 = AutoModelForSequenceClassification.from_pretrained(
        STAGE1_MODEL_PATH,
        num_labels=NUM_LABELS_STAGE2,
        ignore_mismatched_sizes=True,
        # trust_remote_code=True # If applicable
    )

    # The tokenizer is the same as Stage 1, already loaded.
    # If not, load it: tokenizer = AutoTokenizer.from_pretrained(STAGE1_MODEL_PATH)

    model_stage2.to(device)
    print(f"\nStage 2 Model loaded from {STAGE1_MODEL_PATH} with a new classification head for {NUM_LABELS_STAGE2} labels.")
else:
    print("Skipping Stage 2 model initialization as NUM_LABELS_STAGE2 is not properly defined.")

## 2.3. Stage-2 Healthcare-Domain Training Protocol

In [None]:
# NOTE(review): no cell in the visible part of this notebook defines
# `tokenized_datasets_stage2`; unless it is created elsewhere (a Stage 2
# tokenization step), the guard below is False and training is skipped.
if 'model_stage2' in locals() and model_stage2 is not None and \
   'tokenized_datasets_stage2' in locals() and tokenized_datasets_stage2 is not None:

    # `eval_strategy` replaces the deprecated `evaluation_strategy` keyword,
    # which recent transformers releases no longer accept.
    training_args_stage2 = TrainingArguments(
        output_dir="./results_stage2",
        learning_rate=1e-5,  # Lower learning rate for second stage fine-tuning
        per_device_train_batch_size=8,  # Potentially smaller due to longer MAX_LENGTH_STAGE2
        per_device_eval_batch_size=8,
        num_train_epochs=3,  # Adjust as needed
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro" if NUM_LABELS_STAGE2 > 2 else "f1",  # Use f1_macro for multi-class
        logging_dir='./logs_stage2',
        logging_steps=50,
        fp16=torch.cuda.is_available(),
        # report_to="tensorboard"
    )

    def compute_metrics_stage2(eval_pred):
        """Compute accuracy, precision, recall and F1 for a Stage 2 evaluation pass.

        Args:
            eval_pred: Pair ``(logits, labels)``; the argmax over the last
                axis of ``logits`` is taken as the predicted class.

        Returns:
            dict with ``"accuracy"`` plus precision/recall/F1. The latter
            three are macro-averaged and keyed ``"precision_macro"``,
            ``"recall_macro"``, ``"f1_macro"`` when NUM_LABELS_STAGE2 > 2,
            otherwise binary and keyed ``"precision"``, ``"recall"``, ``"f1"``.
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)

        # Adjust averaging for multi-class if NUM_LABELS_STAGE2 > 2
        avg_method = "macro" if NUM_LABELS_STAGE2 > 2 else "binary"

        # The scikit-learn helpers imported at the top of the file are used
        # directly: one call yields all three scores, zero_division=0 is
        # applied uniformly, and nothing is re-loaded on each evaluation.
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average=avg_method, zero_division=0
        )

        # The key suffix must stay in sync with metric_for_best_model above.
        suffix = "_macro" if avg_method == "macro" else ""
        return {
            "accuracy": float(accuracy_score(labels, predictions)),
            f"precision{suffix}": float(precision),
            f"recall{suffix}": float(recall),
            f"f1{suffix}": float(f1),
        }

    # The tokenizer is passed as `processing_class`; the `tokenizer=` keyword
    # of Trainer is deprecated in current transformers releases.
    trainer_stage2 = Trainer(
        model=model_stage2,
        args=training_args_stage2,
        train_dataset=tokenized_datasets_stage2["train"],
        eval_dataset=tokenized_datasets_stage2["validation"],
        processing_class=tokenizer,  # Same tokenizer
        data_collator=data_collator,  # Same data collator
        compute_metrics=compute_metrics_stage2,
    )

    print("\nStarting Stage 2 training...")
    trainer_stage2.train()
    print("Stage 2 training finished.")
else:
    print("Skipping Stage 2 training as prerequisite variables are not available.")

## 2.4. Evaluation and Final Model Saving

In [None]:
# Guard first: without a Stage 2 trainer there is nothing to evaluate or save.
if 'trainer_stage2' not in locals() or trainer_stage2 is None:
    print("Skipping Stage 2 evaluation and saving as trainer_stage2 is not available.")
else:
    # Directory that receives the final fine-tuned model and its tokenizer.
    FINAL_MODEL_PATH = "./final_sentiment_model_clinical_modernbert_sdn"

    print("\nEvaluating final Stage 2 model...")
    eval_results_stage2 = trainer_stage2.evaluate()
    print("\nStage 2 Evaluation Results (Final Model):", eval_results_stage2, sep="\n")

    # Persist the model, then store the tokenizer in the same directory.
    trainer_stage2.save_model(FINAL_MODEL_PATH)
    tokenizer.save_pretrained(FINAL_MODEL_PATH)

    print(f"\nFinal domain-adapted model and tokenizer saved to {FINAL_MODEL_PATH}")

# Utilizing HUBERT for Sentiment Analysis

In [None]:
## 1.1. Loading Model for Inference

In [None]:
# --- INFERENCE WITH THE FINAL MODEL ---
# FINAL_MODEL_PATH = "./final_sentiment_model_clinical_modernbert_sdn" # Defined in Stage 2

# NOTE(review): `MAX_LENGTH_STAGE2` and `preprocess_forum_text` are not defined
# in the visible part of this file. If they (or FINAL_MODEL_PATH) are missing
# when this cell runs, the NameError handler at the bottom reports it.

# Load the final fine-tuned model and tokenizer
try:
    final_model = AutoModelForSequenceClassification.from_pretrained(FINAL_MODEL_PATH)
    # The tokenizer is loaded from the final model path as well, to ensure consistency
    final_tokenizer = AutoTokenizer.from_pretrained(FINAL_MODEL_PATH)

    final_model.to(device)  # Move model to GPU if available
    final_model.eval()      # Set model to evaluation mode

    print(f"\nFinal model and tokenizer loaded from {FINAL_MODEL_PATH} for inference.")

    # Human-readable names for the class ids. This depends on how labels were
    # encoded (0, 1, 2) during Stage 2 data prep; if your labels are different,
    # adjust the maps so they align with the numeric labels in stage2_data.csv.
    if 'NUM_LABELS_STAGE2' in locals() and NUM_LABELS_STAGE2 == 3:
        sentiment_labels_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    elif 'NUM_LABELS_STAGE2' in locals() and NUM_LABELS_STAGE2 == 2:  # For binary like SST-2
        sentiment_labels_map = {0: "Negative", 1: "Positive"}
    else:
        sentiment_labels_map = None
        print("Warning: NUM_LABELS_STAGE2 not clearly defined for sentiment_labels_map. Predictions will be numeric.")

    # Prediction function
    def predict_sentiment(text, model, tokenizer, max_length=MAX_LENGTH_STAGE2):  # Use MAX_LENGTH_STAGE2
        """Predict the sentiment class of a single text.

        Args:
            text: Raw input text; it is passed through `preprocess_forum_text`
                before tokenization.
            model: Sequence classification model used for the forward pass.
            tokenizer: Tokenizer used to encode the cleaned text.
            max_length: Truncation/padding length handed to the tokenizer.

        Returns:
            dict with
              "predicted_label": entry of `sentiment_labels_map` for the
                  predicted class ("Unknown" if the id is not in the map), or
                  the numeric class id when no map is available;
              "predicted_class_id": index of the highest-probability class;
              "probabilities": flat list with one softmax probability per class.
        """
        # Preprocess the input text (apply the same cleaning as in Stage 2)
        cleaned_text = preprocess_forum_text(text)

        inputs = tokenizer(
            cleaned_text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",  # Return PyTorch tensors
        )
        # Follow the device of the model that was actually passed in rather
        # than the global `device`, so the two can never disagree.
        model_device = next(model.parameters()).device
        inputs = {k: v.to(model_device) for k, v in inputs.items()}

        with torch.no_grad():  # Disable gradient calculations for inference
            logits = model(**inputs).logits
            # A single text is encoded, so the batch axis has length 1; drop
            # it so that callers receive one flat probability per class.
            probabilities = torch.softmax(logits, dim=-1)[0]
            predicted_class_id = int(torch.argmax(probabilities).item())

        if sentiment_labels_map:
            predicted_label = sentiment_labels_map.get(predicted_class_id, "Unknown")
        else:
            predicted_label = predicted_class_id

        return {
            "predicted_label": predicted_label,
            "predicted_class_id": predicted_class_id,
            "probabilities": probabilities.cpu().tolist(),
        }

    # Example forum post snippets for prediction (illustrative placeholders).
    sample_posts = [
        "I've been on this new medication for two weeks and my migraines are finally gone. So relieved!",
        "The waiting list for the specialist is six months long and nobody returns my calls.",
        "Has anyone here switched from the tablets to the liquid form of this drug?",
    ]

    print("\n--- Example Predictions ---")
    for i, post_text in enumerate(sample_posts):
        if final_model is not None and final_tokenizer is not None:
            prediction_result = predict_sentiment(post_text, final_model, final_tokenizer)
            print(f"\nPost {i+1}: \"{post_text}\"")
            print(f"  Predicted Sentiment: {prediction_result['predicted_label']} (ID: {prediction_result['predicted_class_id']})")
            if sentiment_labels_map and prediction_result['probabilities']:
                # Print probabilities per label
                prob_strings = [f"{sentiment_labels_map.get(j, str(j))}: {prob:.4f}" for j, prob in enumerate(prediction_result['probabilities'])]
                print(f"  Probabilities: [{', '.join(prob_strings)}]")
            else:
                print(f"  Probabilities: {prediction_result['probabilities']}")
        else:
            print("Final model or tokenizer not available for prediction.")

except NameError as e:
    print(f"Error during inference setup (likely a variable like FINAL_MODEL_PATH was not defined due to earlier skips): {e}")
except Exception as e:
    # Best-effort notebook cell: report the failure instead of raising.
    print(f"An unexpected error occurred during inference: {e}")