#### This cell imports all the necessary libraries for model training, evaluation, and data processing, including tools for natural language processing (NLP) from the Hugging Face Transformers library, data handling with pandas, and metrics calculation.

In [None]:
# Import required libraries for handling model training, evaluation, and data processing.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

#### This cell loads the pre-trained RoBERTa model and its corresponding tokenizer, specifically fine-tuned for emotion classification. Additionally, it loads the SetFit/emotion dataset from Hugging Face for training and evaluation purposes.

In [None]:
# One checkpoint name feeds both loaders so the model and tokenizer always come from the same source
base_checkpoint = "SamLowe/roberta-base-go_emotions"

# Tokenizer first, then the sequence-classification model with a 28-way output head
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint, num_labels=28)

# SetFit/emotion is the corpus used for the first fine-tuning and evaluation pass
emotion_corpus_name = "SetFit/emotion"
dataset = load_dataset(emotion_corpus_name)

#### This cell defines a preprocessing function that tokenizes the text data and converts labels into a multi-label format suitable for emotion classification. The function is applied to the entire dataset, and the data is formatted for PyTorch compatibility.

In [None]:
# Define a preprocessing function to tokenize the text and encode labels into a multi-label format
def preprocess_function(examples, num_labels=28):
    """Tokenize a batch of texts and attach one-hot float label vectors.

    Args:
        examples: Batch mapping with a 'text' list and a 'label' list of
            integer class ids, as passed by ``dataset.map(..., batched=True)``.
        num_labels: Width of every label vector. Defaults to 28, matching the
            ``num_labels`` the model is loaded with.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an added
        "labels" entry: a float32 tensor of shape (batch, num_labels) holding
        a single 1 per row at the column given by that row's class id.

    Raises:
        IndexError: If any class id is negative or >= ``num_labels``.
    """
    inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    label_ids = np.asarray(examples['label'], dtype=np.int64)
    # Reject ids outside [0, num_labels): a negative id would otherwise wrap
    # around and silently mark one of the last columns instead of failing.
    if label_ids.size and (label_ids.min() < 0 or label_ids.max() >= num_labels):
        raise IndexError(
            f"label ids must lie in [0, {num_labels}); "
            f"got range [{label_ids.min()}, {label_ids.max()}]"
        )

    # NOTE(review): class ids are used directly as column indices. Confirm the
    # dataset's id order matches the label order of the model's output head.
    labels = np.zeros((len(label_ids), num_labels))
    labels[np.arange(len(label_ids)), label_ids] = 1

    inputs["labels"] = torch.tensor(labels, dtype=torch.float32)
    return inputs

# Run the preprocessing in batches over the dataset, then expose only the model inputs as torch tensors
model_input_columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=model_input_columns)

#### This cell creates smaller subsets of the tokenized dataset for training and validation: after shuffling each split with a fixed seed, it keeps 1,000 examples from the train split for training and 500 examples from the test split for validation.

In [None]:
# Shuffle each split with the same fixed seed so the subsets are reproducible
subset_seed = 42
shuffled_train_split = tokenized_dataset["train"].shuffle(seed=subset_seed)
shuffled_test_split = tokenized_dataset["test"].shuffle(seed=subset_seed)

# Keep the leading rows of each shuffled split; the "validation" subset comes from the test split
small_train_dataset = shuffled_train_split.select(range(1000))
small_validation_dataset = shuffled_test_split.select(range(500))

#### This cell defines the training arguments, such as batch size, number of epochs, and logging settings, to fine-tune the model on a CPU. It initializes the Trainer object with the specified arguments and starts the fine-tuning process on the training dataset.

In [None]:
# Collect the hyper-parameters for the CPU-only fine-tuning run in one mapping
cpu_run_settings = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",  # evaluate at the end of every epoch
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 10,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "logging_steps": 50,
    "save_total_limit": 2,  # cap on how many checkpoints are kept on disk
    "no_cuda": True,  # keep the run on the CPU
}
training_args = TrainingArguments(**cpu_run_settings)

# Wire the model, the settings and the two small subsets into a Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_validation_dataset, tokenizer=tokenizer)

# Run the fine-tuning loop
trainer.train()

#### This cell saves the fine-tuned model and tokenizer to a local directory for future use or further fine-tuning.

In [None]:
# Write the model and the tokenizer into the same directory so they can be reloaded together
fine_tuned_dir = "./fine_tuned_model_epoch_1"
for artifact in (model, tokenizer):
    artifact.save_pretrained(fine_tuned_dir)

#### This cell reloads the fine-tuned model and tokenizer, reinitializes the Trainer object for evaluation, and evaluates the model on the validation dataset to check its performance.

In [None]:
# Restore the tokenizer and the model from the directory they were saved to
saved_model_dir = "./fine_tuned_model_epoch_1"
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)

# Build a Trainer with default arguments; it is only used to run the evaluation loop
trainer = Trainer(model=model, eval_dataset=small_validation_dataset)

# Evaluate on the small validation subset and show the reported metrics
results = trainer.evaluate()
print(results)

#### This cell loads the accuracy metric, computes predictions from the model, and evaluates its accuracy on the validation dataset, providing a quantitative measure of performance.

In [None]:
# Load the accuracy metric to evaluate the model performance
metric = load_metric("accuracy", trust_remote_code=True)

# A Trainer created without no_cuda may have moved the model to an accelerator,
# so every input batch is sent to the model's own device before the forward pass.
device = model.device
model.eval()  # inference only: make sure dropout is disabled

# Run inference in small batches instead of one pass over all texts,
# which keeps peak memory bounded.
texts = list(small_validation_dataset["text"])
batch_size = 32
batch_predictions = []
with torch.no_grad():
    for start in range(0, len(texts), batch_size):
        encoded_inputs = tokenizer(
            texts[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
        ).to(device)
        outputs = model(**encoded_inputs)
        logits = outputs.logits
        # Bring the predicted class ids back to the CPU so they can be compared with the labels
        batch_predictions.append(torch.argmax(logits, dim=-1).cpu())

# Compare the predicted class ids against the dataset's integer labels
predictions = torch.cat(batch_predictions)
labels = torch.tensor(small_validation_dataset["label"])
results = metric.compute(predictions=predictions, references=labels)
print(results)

#### This cell loads a custom dataset, preprocesses it by one-hot encoding the sentiment labels and tokenizing the text, and converts the processed data into a PyTorch-compatible dataset for further fine-tuning.

In [None]:
# Read the tweet CSV that supplies the data for the second fine-tuning pass
file_path = "F:/metal health app/mood tracking/tweet_emotions.csv"
new_dataset = pd.read_csv(file_path)

# Binarize the 'sentiment' column with LabelBinarizer
lb = LabelBinarizer()
labels = lb.fit_transform(new_dataset['sentiment'])

# Left-align the binarized columns inside a zero matrix as wide as the model's label count
# NOTE(review): columns follow the binarizer's class order; confirm this matches the model's label order
num_labels = 28
adjusted_labels = np.zeros((len(labels), num_labels))
adjusted_labels[:, :labels.shape[1]] = labels

# Tokenize every 'content' entry in a single call and attach the label matrix as float32
tweet_texts = list(new_dataset['content'])
encoded_inputs = tokenizer(tweet_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
encoded_inputs["labels"] = torch.tensor(adjusted_labels, dtype=torch.float32)

# Convert the preprocessed data into a PyTorch dataset
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of index-aligned fields.

    Args:
        encodings: Mapping from field name to an indexable container (tensor
            or nested list). It must contain an "input_ids" entry, whose
            length defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict with an independent tensor copy of row ``idx`` for every field."""
        item = {}
        for key, val in self.encodings.items():
            field = val[idx]
            if torch.is_tensor(field):
                # Copy an existing tensor with clone().detach(); wrapping it in
                # torch.tensor() again triggers a UserWarning on every access.
                item[key] = field.clone().detach()
            else:
                item[key] = torch.tensor(field)
        return item

    def __len__(self):
        """Return the number of rows, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

# Wrap the tokenized tweets and their label matrix so they can be indexed row by row
custom_dataset = CustomDataset(encodings=encoded_inputs)


#### This cell sets up the training arguments and initializes the Trainer for fine-tuning the model on the custom dataset. It then starts the fine-tuning process.

In [None]:
# Hyper-parameters for the second, shorter fine-tuning run on the custom dataset
custom_run_settings = {
    "output_dir": "./results_custom",
    "evaluation_strategy": "epoch",  # evaluate at the end of every epoch
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "logging_steps": 50,
    "save_total_limit": 2,  # cap on how many checkpoints are kept on disk
    "no_cuda": True,  # keep the run on the CPU
}
training_args = TrainingArguments(**custom_run_settings)

# The custom dataset is used for both training and per-epoch evaluation
trainer = Trainer(model=model, args=training_args, train_dataset=custom_dataset, eval_dataset=custom_dataset, tokenizer=tokenizer)

# Run the fine-tuning loop on the custom dataset
trainer.train()

#### This cell reloads the fine-tuned model from a checkpoint, sets it to evaluation mode, and uses the Trainer object to evaluate the model's performance on the custom dataset. The predictions are then converted to probabilities.

In [None]:
# Restore the tokenizer and the model from a saved training checkpoint
# NOTE(review): this is a raw string with doubled backslashes, so the path keeps both backslashes; confirm that is intended
checkpoint_path = r"F:\\metal health app\\mood tracking\\results_custom\\checkpoint-11000"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
model.eval()

# Build a Trainer with default arguments; it is only used to run prediction
trainer = Trainer(model=model, tokenizer=tokenizer, eval_dataset=custom_dataset)

# Run prediction over the whole custom dataset and show the raw outputs
results = trainer.predict(custom_dataset)
print("Predictions:", results.predictions)

# Apply an element-wise sigmoid to turn each logit into an independent probability
raw_logits = torch.tensor(results.predictions)
probabilities = torch.sigmoid(raw_logits)


#### This cell provides custom input text and generates predictions for each emotion label using the fine-tuned model. It displays the probability of each emotion and makes binary predictions based on a specified threshold to identify the most likely emotions.

In [None]:
# Define emotion labels and provide custom input text for emotion prediction
# The list is zipped position by position with the model's output columns.
emotion_labels = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude", "grief", "joy", "love",
    "nervousness", "optimism", "pride", "realization", "relief", "remorse",
    "sadness", "surprise", "neutral"
]

custom_input = ["I am feeling wonderful today!", "I am so annoyed and angry right now!", "I'm confused about what to do next."]

# Tokenize custom input texts
encoded_inputs = tokenizer(custom_input, return_tensors="pt", padding=True, truncation=True, max_length=128)
# A Trainer created without no_cuda may have moved the model to an accelerator;
# the inputs must live on the same device as the model's weights.
encoded_inputs = encoded_inputs.to(model.device)

# Generate predictions using the model (no gradients needed for inference)
with torch.no_grad():
    outputs = model(**encoded_inputs)

# Convert logits to independent per-label probabilities and bring them back to the CPU
probabilities = torch.sigmoid(outputs.logits).cpu()

# Display probabilities for each emotion label for each input
print("\nProbabilities for each emotion label for each input:")
for text, text_probabilities in zip(custom_input, probabilities):
    print(f"\nText: {text}")
    for emotion, prob in zip(emotion_labels, text_probabilities.tolist()):
        print(f"  {emotion}: {prob:.4f}")

# Adjust threshold and make binary predictions
threshold = 0.5
predictions = (probabilities > threshold).int()

# Display predicted emotions based on threshold
print("\nPredicted emotions based on adjusted threshold:")
for text, text_predictions in zip(custom_input, predictions):
    print(f"\nText: {text}")
    selected = [
        emotion
        for emotion, pred in zip(emotion_labels, text_predictions.tolist())
        if pred == 1
    ]
    # Join the names so the line does not end with a dangling separator
    print("Predicted Emotions: " + ", ".join(selected))