In [None]:
pip install transformers datasets pandas scikit-learn



In [None]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.52.4
    Uninstalling transformers-4.52.4:
      Successfully uninstalled transformers-4.52.4
Successfully installed transformers-4.53.0


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    GPTNeoForSequenceClassification,
    Trainer,
    TrainingArguments,
)

In [None]:
# Make Google Drive available under /content/drive; the data file and the
# saved figures both live there.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# ----------------------
# Step 1: Load and Clean Data
# ----------------------
# Spreadsheet with the cleaned drugs data on the mounted Google Drive.
DATA_PATH = "/content/drive/MyDrive/GP(AI2025)/Data/External Data/Filtered Drugs Data (Cleaned).xlsx"
df = pd.read_excel(DATA_PATH)

In [None]:
# Drop rows where the rating is missing. The explicit .copy() makes `df` an
# independent frame, so the column assignment below no longer triggers
# pandas' SettingWithCopyWarning.
df = df.dropna(subset=["Filtered Rating"]).copy()

# Rename the label column (reassign instead of inplace=True, which pandas
# also flags on a frame derived from another one).
df = df.rename(columns={"Filtered Rating": "label"})

# Remove the "/5" suffix: keep the part before "/" and convert it to an integer.
df["label"] = df["label"].astype(str).str.split("/").str[0].astype(int)

# Map numeric labels to sentiment categories
def map_sentiment(score):
    """Collapse a 0-5 rating into a three-class sentiment id.

    Args:
        score: Integer rating, expected to be between 0 and 5 inclusive.

    Returns:
        0 (negative) for ratings 0-2, 1 (neutral) for rating 3 and
        2 (positive) for ratings 4-5.

    Raises:
        ValueError: If ``score`` is outside 0-5. Such values used to be
            labelled "positive" silently, which hides data errors.
    """
    if score in (0, 1, 2):
        return 0  # negative
    if score == 3:
        return 1  # neutral
    if score in (4, 5):
        return 2  # positive
    raise ValueError(f"Unexpected rating {score!r}; expected an integer from 0 to 5")

# Collapse the numeric ratings into the three sentiment classes.
df["label"] = df["label"].apply(map_sentiment)

# Optional: check distribution
print(df["label"].value_counts())

# Rename text column to "text". Reassigning (rather than inplace=True) avoids
# the SettingWithCopyWarning pandas raises for in-place edits on derived frames.
df = df.rename(columns={"Feedback": "text"})

# ----------------------
# Step 2: Convert to Hugging Face Datasets
# ----------------------
# 80% train, then the remaining 20% is halved into validation and test.
# Both splits are stratified on the label and use a fixed seed.
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

# reset_index(drop=True) discards the original row positions left over from the split.
split_frames = {"train": train_df, "validation": val_df, "test": test_df}
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in split_frames.items()
})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"Filtered Rating": "label"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(str).str.split('/').str[0].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].apply(map_sentiment)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveat

label
2    18007
0     2103
1      462
Name: count, dtype: int64


In [None]:
from transformers import AutoTokenizer

# DistilGPT-2 checkpoint, shared by the tokenizer and the classifier.
checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# GPT-2 has no native pad token, so the end-of-sequence token doubles as padding.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    """Tokenize the ``text`` field of a batch with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens.

    Args:
        example: Mapping with a ``"text"`` entry (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "padding": "max_length",  # pad every sequence up to max_length
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize every split in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Expose only the columns the model consumes, as torch tensors.
for split in tokenized_datasets:
    tokenized_datasets[split].set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "label"]
    )

# model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# The GPT-2 sequence-classification head uses config.pad_token_id to locate the
# last non-padding token of each sequence and raises
# "Cannot handle batch sizes > 1 if no padding token is defined" when it is
# unset. Setting pad_token on the tokenizer does not update the model config,
# so mirror it here.
model.config.pad_token_id = tokenizer.pad_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/16457 [00:00<?, ? examples/s]

Map:   0%|          | 0/2057 [00:00<?, ? examples/s]

Map:   0%|          | 0/2058 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict with ``"accuracy"`` and the weighted-average ``"f1"``.
    """
    raw_scores, gold = eval_pred
    # np.asarray keeps both inputs as plain arrays for scikit-learn.
    predicted = np.asarray(np.argmax(raw_scores, axis=-1))
    gold = np.asarray(gold)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted, average="weighted")
    return metrics

# Training configuration: evaluate, log and checkpoint every 100 steps, then
# reload the checkpoint with the best validation F1 once training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Explicitly set to steps
    eval_steps=100,        # Evaluate every 100 steps
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,     # Log every 100 steps
    report_to="none",
    disable_tqdm=False,
    save_strategy="steps",
    save_steps=100,        # Checkpoint on the same schedule as evaluation
    # Without a limit every 100-step checkpoint stays on disk. Only the most
    # recent ones are kept; the best checkpoint is still retained because
    # load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# Wire the model, data and metrics together. `processing_class` is the current
# name of the argument that receives the tokenizer; the older `tokenizer=`
# keyword is deprecated in recent transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Train model. Any failure is reported and then re-raised so the cell still
# stops with the original traceback.
print("Starting training...")
try:
    train_result = trainer.train()
except Exception as e:
    print(f"Training failed with error: {str(e)}")
    raise
else:
    print("Training completed!")

# Pull the metric records the Trainer accumulated while training.
train_log_history = trainer.state.log_history

# Training records carry a 'loss' key; evaluation records carry 'eval_*' keys.
# Each series is stored as (step, value) pairs.
train_loss = [
    (entry['step'], entry['loss'])
    for entry in train_log_history
    if 'loss' in entry and 'step' in entry
]
eval_entries = [
    entry for entry in train_log_history
    if 'eval_loss' in entry and 'step' in entry
]
val_loss = [(entry['step'], entry['eval_loss']) for entry in eval_entries]
val_accuracy = [(entry['step'], entry['eval_accuracy']) for entry in eval_entries]
val_f1 = [(entry['step'], entry['eval_f1']) for entry in eval_entries]

# Debug: Print logged metrics
for caption, series in (
    ("Training loss entries:", train_loss),
    ("Validation loss entries:", val_loss),
    ("Validation accuracy entries:", val_accuracy),
    ("Validation F1 entries:", val_f1),
):
    print(caption, series)

# Plot Learning Curves (Training and Validation Loss)
fig, ax = plt.subplots(figsize=(10, 6))

# Split the (step, value) pairs into parallel sequences; fall back to empty
# series when nothing was logged.
train_steps, train_loss_values = zip(*train_loss) if train_loss else ([], [])
val_steps, val_loss_values = zip(*val_loss) if val_loss else ([], [])

ax.plot(train_steps, train_loss_values, label='Training Loss', marker='o')
ax.plot(val_steps, val_loss_values, label='Validation Loss', marker='o')
ax.set_xlabel('Training Steps')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Steps (DistilGPT-2)')
ax.legend()
ax.grid(True)

# Save to Drive before showing the figure.
fig.savefig('/content/drive/MyDrive/GP(AI2025)/Data/sentiment_loss_distlgpt2.png')
plt.show()

# Plot Validation Accuracy and F1 Score
fig, ax = plt.subplots(figsize=(10, 6))

# Both series were logged at the same evaluation steps, so val_steps is
# simply overwritten by the second unpacking.
val_steps, val_acc_values = zip(*val_accuracy) if val_accuracy else ([], [])
val_steps, val_f1_values = zip(*val_f1) if val_f1 else ([], [])

ax.plot(val_steps, val_acc_values, label='Validation Accuracy', marker='o')
ax.plot(val_steps, val_f1_values, label='Validation F1 Score', marker='o')
ax.set_xlabel('Training Steps')
ax.set_ylabel('Score')
ax.set_title('Validation Accuracy vs F1 Score Over Steps (DistilGPT-2)')
ax.legend()
ax.grid(True)

# Save to Drive before showing the figure.
fig.savefig('/content/drive/MyDrive/GP(AI2025)/Data/sentiment_accuracy_distlgpt2.png')
plt.show()

# Evaluate on test set
print("\n✅ Final Test Evaluation:")

# One predict() pass returns the raw logits, the gold labels and the metrics,
# so the test set is not run through the model a second time.
# metric_key_prefix="eval" keeps the metric names ('eval_accuracy', 'eval_f1')
# identical to what trainer.evaluate() would report.
predictions = trainer.predict(tokenized_datasets["test"], metric_key_prefix="eval")
test_results = predictions.metrics
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

# Predicted class ids and gold labels for the confusion matrix
preds = np.asarray(np.argmax(predictions.predictions, axis=1))
labels = np.asarray(predictions.label_ids)

# Plot Confusion Matrix
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}
class_ids = sorted(id2label)
class_names = [id2label[i] for i in class_ids]

# Passing labels= fixes the matrix at one row/column per class, in this order,
# even if a class never appears among the predictions. The tick labels below
# therefore always line up with the cells.
cm = confusion_matrix(labels, preds, labels=class_ids)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix (DistilGPT-2)')
plt.savefig('/content/drive/MyDrive/GP(AI2025)/Data/sentiment_confusionmatrix_distlgpt2.png')
plt.show()

# **STOP HERE — DO NOT RUN THE NEXT CELL** (it saves, zips and downloads the trained model)

In [None]:
import os
import zipfile
from google.colab import files

# Folder that receives the fine-tuned model and its tokenizer
model_name = "distilgpt2-sentiment"
os.makedirs(model_name, exist_ok=True)

# Write the model weights/config and the tokenizer files side by side
trainer.save_model(model_name)
tokenizer.save_pretrained(model_name)

# Zip the folder; entries are stored relative to the folder itself, so the
# archive has no leading "distilgpt2-sentiment/" component.
zip_filename = f"{model_name}.zip"
with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for folder, _, names in os.walk(model_name):
        for name in names:
            source = os.path.join(folder, name)
            archive.write(source, arcname=os.path.relpath(source, model_name))

# Hand the archive to the browser as a download (Colab only)
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>