In [None]:
#!git clone https://github.com/nathan-limjw/smart_dining_assistant.git

# Change directory into the sentiment_analysis folder of the cloned repository
%cd smart_dining_assistant/sentiment_analysis

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import json
import os
import shutil
from datetime import datetime
import numpy as np
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments, pipeline


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def load_data():
    """Build balanced train/validation/test splits of Yelp restaurant reviews.

    Loads the ``Johnnyeee/Yelpdata_663`` dataset, keeps the rows whose
    ``categories`` mention "restaurant", maps the ``stars_x`` rating to a
    three-class sentiment label (0: below 3 stars, 1: exactly 3 stars,
    2: above 3 stars), down-samples every class to the same size and writes
    the result to ``data/train.csv``, ``data/val.csv`` and ``data/test.csv``.

    Returns:
        tuple: ``(train_final, val, test_balanced)`` DataFrames holding the
        columns ``text`` and ``sentiment``.

    Raises:
        ValueError: if a split contains no reviews for one of the three
            sentiment classes, so a balanced sample cannot be drawn.
    """
    seed = 42
    classes = (0, 1, 2)

    def keep_restaurants(frame):
        mask = frame["categories"].str.contains("restaurant", case=False, na=False)
        return frame[mask]

    def label_sentiment(frame):
        # Drop incomplete rows BEFORE labelling: a missing star rating fails
        # both comparisons below and would otherwise be labelled positive (2).
        frame = frame.dropna(subset=["text", "stars_x"])
        # assign() returns a new frame, so no column is written onto a
        # filtered slice (avoids pandas' SettingWithCopyWarning).
        frame = frame.assign(
            sentiment=frame["stars_x"].apply(
                lambda stars: 0 if stars < 3 else (1 if stars == 3 else 2)
            )
        )
        return frame[["text", "sentiment"]]

    def smallest_class_size(frame, split_name):
        counts = frame["sentiment"].value_counts()
        missing = [label for label in classes if label not in counts.index]
        if missing:
            raise ValueError(
                f"Cannot balance the {split_name} split: no reviews for sentiment class(es) {missing}"
            )
        return int(counts.min())

    def balanced_sample(frame, per_class):
        return pd.concat(
            [
                frame[frame["sentiment"] == label].sample(n=per_class, random_state=seed)
                for label in classes
            ]
        )

    print("Loading dataset...")
    ds = load_dataset("Johnnyeee/Yelpdata_663")

    train_df = keep_restaurants(ds["train"].to_pandas())
    test_df = keep_restaurants(ds["test"].to_pandas())
    print(f"Filtered dataset to only retain restaurant reviews: train = {len(train_df)}, test = {len(test_df)} ")

    train_df = label_sentiment(train_df)
    test_df = label_sentiment(test_df)

    # Cap each class at 100k training reviews (or the rarest class size).
    samples_per_class = min(100000, smallest_class_size(train_df, "train"))
    print(f"Sampling {samples_per_class} from each class")
    train_balanced = balanced_sample(train_df, samples_per_class)

    train_final, val = train_test_split(
        train_balanced, test_size=0.2, stratify=train_balanced["sentiment"], random_state=seed
    )

    # Same balancing for the held-out test split, capped at 15k per class.
    test_samples_per_class = min(15000, smallest_class_size(test_df, "test"))
    test_balanced = balanced_sample(test_df, test_samples_per_class)

    # Make the function usable on its own, without a prior setup cell.
    os.makedirs("data", exist_ok=True)
    train_final.to_csv("data/train.csv", index=False)
    val.to_csv("data/val.csv", index=False)
    test_balanced.to_csv("data/test.csv", index=False)

    print("Data saved to /data")

    return train_final, val, test_balanced



In [None]:
def compute_metrics(eval_pred):
    """Summarise an evaluation pass as accuracy, precision, recall and F1.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores and is reduced to class ids with an argmax over
            the last axis.

    Returns:
        dict: ``accuracy`` plus support-weighted ``precision``, ``recall``
        and ``f1``. Classes without predictions count as 0 instead of
        raising a warning (``zero_division=0``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    weighted = precision_recall_fscore_support(
        gold, predicted, average="weighted", zero_division=0
    )

    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": weighted[0],
        "recall": weighted[1],
        "f1": weighted[2],
    }


def load_and_tokenize_data(max_samples=None):
    """Read the saved CSV splits and tokenize them for DistilBERT.

    Reads ``data/train.csv``, ``data/val.csv`` and ``data/test.csv``,
    tokenizes the ``text`` column (padded/truncated to 512 tokens), renames
    ``sentiment`` to ``labels`` and exposes ``input_ids``,
    ``attention_mask`` and ``labels`` as torch tensors.

    Args:
        max_samples: optional cap on the number of rows kept from each
            split. ``None`` (default) uses every row; pass a small number
            such as ``20`` for a quick smoke test. This replaces a
            hard-coded ``[:20]`` debugging slice that silently reduced
            every run to 20 rows.

    Returns:
        tuple: ``(train_dataset, val_dataset, test_dataset)``.
    """
    print("Loading data splits...")
    frames = []
    for split in ("train", "val", "test"):
        frame = pd.read_csv(f"data/{split}.csv")
        # An empty review is written as an empty CSV field and read back as
        # NaN, which the tokenizer cannot handle.
        frame = frame.dropna(subset=["text"])
        if max_samples is not None:
            frame = frame.head(max_samples)
        frames.append(frame.reset_index(drop=True))

    print("Loading tokenizer...")
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize(batch):
        return tokenizer(
            batch["text"], padding="max_length", truncation=True, max_length=512
        )

    datasets = [Dataset.from_pandas(frame, preserve_index=False) for frame in frames]

    print("Tokenizing all datasets...")
    tensor_columns = ["input_ids", "attention_mask", "labels"]
    prepared = []
    for dataset in datasets:
        dataset = dataset.map(tokenize, batched=True)
        dataset = dataset.rename_column("sentiment", "labels")
        dataset.set_format("torch", columns=tensor_columns)
        prepared.append(dataset)

    train_dataset, val_dataset, test_dataset = prepared

    print("Data tokenized!")
    return train_dataset, val_dataset, test_dataset


def run_hyperparameter_tuning():
    """Fine-tune DistilBERT under several configurations and keep the best.

    For each configuration a fresh ``distilbert-base-uncased`` classifier
    with three labels is trained on the training split and evaluated on the
    validation split. The model and tokenizer are saved to
    ``models/<config name>``. Afterwards the function writes

    * ``results/metrics/hyperparam_tuning_results.csv`` - one row per
      configuration, sorted by validation accuracy (best first),
    * ``results/metrics/best_config.json`` - the winning configuration,
    * ``models/sentiment_model`` - a copy of the winning model directory
      (intermediate ``checkpoint-*`` folders are not copied).

    Returns:
        None
    """
    print("Finding the best hyperparameters...")

    configs = [
        {
            "name": "config_1_default",
            "learning_rate": 2e-5,
            "batch_size": 16,
            "epochs": 3,
            "weight_decay": 0.01,
        },
        {
            "name": "config_2_higher_lr",
            "learning_rate": 3e-5,
            "batch_size": 16,
            "epochs": 3,
            "weight_decay": 0.01,
        },
        {
            "name": "config_3_lower_lr_more_epochs",
            "learning_rate": 1e-5,
            "batch_size": 16,
            "epochs": 5,
            "weight_decay": 0.01,
        },
        {
            "name": "config_4_larger_batch",
            "learning_rate": 2e-5,
            "batch_size": 32,
            "epochs": 3,
            "weight_decay": 0.01,
        },
        {
            "name": "config_5_lower_weight_decay",
            "learning_rate": 2e-5,
            "batch_size": 16,
            "epochs": 3,
            "weight_decay": 0.001,
        },
    ]

    # Only the train and validation splits take part in model selection.
    train_dataset, val_dataset, _ = load_and_tokenize_data()

    # The tokenizer is the same for every configuration, so load it once
    # instead of once per loop iteration.
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    results = []

    for config_number, config in enumerate(configs, start=1):
        model_dir = f"models/{config['name']}"

        print(f"Current Config: {config['name']}")
        print(f"Learning Rate: {config['learning_rate']}")
        print(f"Batch Size: {config['batch_size']}")
        print(f"Epochs: {config['epochs']}")
        print(f"Weight Decay: {config['weight_decay']}")

        model = DistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=3
        )

        training_args = TrainingArguments(
            output_dir=model_dir,
            num_train_epochs=config["epochs"],
            per_device_train_batch_size=config["batch_size"],
            per_device_eval_batch_size=32,
            learning_rate=config["learning_rate"],
            weight_decay=config["weight_decay"],
            warmup_steps=500,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            logging_dir=f"results/logs/{config['name']}",
            logging_steps=100,
            save_total_limit=2,
            report_to="none",
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
        )

        print("\nTraining with configs...")
        # Time only the training itself; saving the model and tokenizer is
        # bookkeeping and should not count as training time.
        start_time = datetime.now()
        trainer.train()
        training_time = (datetime.now() - start_time).total_seconds() / 60

        # Save the model together with its tokenizer so the directory can be
        # loaded directly by a pipeline.
        trainer.save_model(model_dir)
        tokenizer.save_pretrained(model_dir)

        print("\nEvaluating on validation dataset...")
        eval_results = trainer.evaluate()

        results.append(
            {
                "config_name": config["name"],
                "config_number": config_number,
                "learning_rate": config["learning_rate"],
                "batch_size": config["batch_size"],
                "epochs": config["epochs"],
                "weight_decay": config["weight_decay"],
                "val_accuracy": eval_results["eval_accuracy"],
                "val_precision": eval_results["eval_precision"],
                "val_recall": eval_results["eval_recall"],
                "val_f1": eval_results["eval_f1"],
                "val_loss": eval_results["eval_loss"],
                "training_time_minutes": training_time,
            }
        )

        print("\nTraining Results:")
        print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
        print(f"Time: {training_time:.1f} mins")

    os.makedirs("results/metrics", exist_ok=True)

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values("val_accuracy", ascending=False)
    results_df.to_csv("results/metrics/hyperparam_tuning_results.csv", index=False)

    print("Best configuration for Model!")
    best_config = results_df.iloc[0]
    print(f"Config: {best_config['config_name']}")
    print(f"Accuracy: {best_config['val_accuracy']:.4f}")

    # Cast to built-in types: values read from a DataFrame row are numpy
    # scalars, which json cannot serialise.
    best_config_dict = {
        "config_name": best_config["config_name"],
        "learning_rate": float(best_config["learning_rate"]),
        "batch_size": int(best_config["batch_size"]),
        "epochs": int(best_config["epochs"]),
        "weight_decay": float(best_config["weight_decay"]),
        "val_accuracy": float(best_config["val_accuracy"]),
        "val_f1": float(best_config["val_f1"]),
    }

    with open("results/metrics/best_config.json", "w") as f:
        json.dump(best_config_dict, f, indent=4)

    source_dir = f"models/{best_config['config_name']}"
    dest_dir = "models/sentiment_model"

    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)

    # The final model lives at the top level of source_dir; the checkpoint-*
    # sub-folders are only needed to resume training, so leave them out.
    shutil.copytree(source_dir, dest_dir, ignore=shutil.ignore_patterns("checkpoint-*"))

    print("Best model has been saved under: models/sentiment_model")
    print(
        "Results have been saved under 'results/metrics/hyperparam_tuning_results.csv'"
    )



In [None]:

def plot_confusion_matrix(
    y_true, y_pred, output_path="results/figures/confusion_matrix.png"
):
    """Save a heatmap of the 3-class confusion matrix as an image.

    Args:
        y_true: true sentiment ids (0 = Negative, 1 = Neutral, 2 = Positive).
        y_pred: predicted sentiment ids, same length as ``y_true``.
        output_path: file the figure is written to; its parent directory is
            created when missing.

    Returns:
        str: a message naming the file the figure was saved to.
    """
    class_ids = [0, 1, 2]
    class_names = ["Negative", "Neutral", "Positive"]

    # dirname() is "" for a bare file name, and os.makedirs("") raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Fix the label set so the matrix is always 3x3 and lines up with the
    # tick labels even when a class never occurs in y_true / y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        cbar_kws={"label": "Count"},
        ax=ax,
    )
    ax.set_title("Confusion Matrix for Sentiment Classification")
    ax.set_ylabel("True Label")
    ax.set_xlabel("Predicted Label")
    fig.tight_layout()
    fig.savefig(output_path, dpi=300, bbox_inches="tight")
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    return f"Confusion matrix saved to {output_path}"


def evaluate_model(model_path="models/sentiment_model", test_path="data/test.csv"):
    """Evaluate a saved sentiment model on the test split and store reports.

    Classifies every review in ``test_path`` and writes the following files
    to ``results/metrics``: ``test_metrics.csv``, ``per_class_metrics.csv``,
    ``test_predictions.csv`` and, when there are misclassifications,
    ``error_examples.csv``. A confusion matrix figure is saved via
    ``plot_confusion_matrix``.

    Args:
        model_path: directory holding the fine-tuned model and tokenizer.
        test_path: CSV file with the columns ``text`` and ``sentiment``.

    Returns:
        dict: overall ``accuracy`` and support-weighted ``precision``,
        ``recall`` and ``f1``.
    """
    class_ids = [0, 1, 2]
    class_names = ["Negative", "Neutral", "Positive"]

    # Make the function usable on its own, without a prior setup cell.
    os.makedirs("results/metrics", exist_ok=True)

    # The default pipeline output is the single best label with its score,
    # so the deprecated return_all_scores flag is not needed.
    classifier = pipeline(
        "text-classification",
        model=model_path,
        tokenizer=model_path,
    )
    label2id = classifier.model.config.label2id

    test_df = pd.read_csv(test_path)
    # Empty reviews are read back from CSV as NaN and cannot be classified.
    test_df = test_df.dropna(subset=["text"]).reset_index(drop=True)

    print("\nMaking predictions on test data...")
    predictions = []
    confidences = []

    for text in test_df["text"]:
        # Truncate to 512 *tokens*, the same limit used when tokenizing the
        # training data, rather than to the first 512 characters.
        best = classifier(str(text), truncation=True, max_length=512)[0]
        predictions.append(int(label2id[best["label"]]))
        confidences.append(best["score"])

    test_df["predicted"] = predictions
    test_df["confidence"] = confidences

    y_true = test_df["sentiment"]
    y_pred = test_df["predicted"]

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )

    print("\n Metrics for Evaluation")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Passing the label set explicitly keeps the reports three rows long and
    # aligned with class_names even if a class is absent from the data.
    print("\nPer-Class Performance:")
    print(
        classification_report(
            y_true,
            y_pred,
            labels=class_ids,
            target_names=class_names,
            zero_division=0,
        )
    )

    metrics_dict = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    metrics_df = pd.DataFrame([metrics_dict])
    metrics_df.to_csv("results/metrics/test_metrics.csv", index=False)
    print("\nEvaluation metrics saved to 'results/metrics/test_metrics.csv'")

    precision_per_class, recall_per_class, f1_per_class, support_per_class = (
        precision_recall_fscore_support(
            y_true, y_pred, labels=class_ids, average=None, zero_division=0
        )
    )
    per_class_df = pd.DataFrame(
        {
            "Class": class_names,
            "Precision": precision_per_class,
            "Recall": recall_per_class,
            "F1-Score": f1_per_class,
            "Support": support_per_class,
        }
    )
    per_class_df.to_csv("results/metrics/per_class_metrics.csv", index=False)
    print("Per-class metrics saved to 'results/metrics/per_class_metrics.csv'")

    plot_confusion_matrix(y_true, y_pred)

    test_df.to_csv("results/metrics/test_predictions.csv", index=False)
    print("Predictions saved to 'results/metrics/test_predictions.csv'")

    errors = test_df[test_df["sentiment"] != test_df["predicted"]]
    # Keep the rate as a fraction: the "%" format spec already multiplies by
    # 100, so pre-multiplying would print a value 100 times too large.
    error_rate = len(errors) / len(test_df)
    print(f"Error percentage: {error_rate:.2%}")

    if len(errors) > 0:
        errors.to_csv("results/metrics/error_examples.csv", index=False)
        print("Errors saved to 'results/metrics/error_examples.csv'")

    return metrics_dict


In [None]:
# Create the output folders the pipeline writes to (no-op if they exist).
for output_dir in ("data", "results/metrics"):
    os.makedirs(output_dir, exist_ok=True)

# Build the balanced splits; load_data() also saves them as CSV files.
train, val, test = load_data()

In [None]:
# Train every candidate configuration and copy the best model to models/sentiment_model.
run_hyperparameter_tuning()

In [None]:
# Evaluate with the default paths: models/sentiment_model on data/test.csv.
evaluate_model()