In [None]:
# Mount Drive (necessary to access your data)
from google.colab import drive
drive.mount('/content/drive')

# Install Libraries (run this next)
!pip install pandas dask[dataframe] pyarrow transformers datasets accelerate peft bitsandbytes evaluate

Mounted at /content/drive
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, evaluate
Successfully installed bitsandbytes-0.48.1 evaluate-0.4.6


In [None]:
# Run this in a new cell
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [None]:
import pandas as pd
import dask.dataframe as dd
import os
import shutil
import time
import json
import sys
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
import torch
import warnings
from sklearn.utils.class_weight import compute_class_weight

# --- 0. Configuration and Target Setup ---
# Project root on the mounted Drive and the data folder beneath it.
PROJECT_BASE_DIR = '/content/drive/MyDrive/Project for cv 1'
DRIVE_DATA_DIR = os.path.join(PROJECT_BASE_DIR, 'data')

# Output locations: the prepared dataset folder on Drive and a local scratch folder.
FINAL_OUTPUT_DIR_NAME = 'final_yelp_data_parquet_CLEAN_FIXED'
LOCAL_OUTPUT_DIR = '/content/final_data_temp'
DRIVE_OUTPUT_DIR = os.path.join(DRIVE_DATA_DIR, FINAL_OUTPUT_DIR_NAME)

# Raw source files (only needed when Phase 1 has to be re-run).
BUSINESS_FILE, REVIEW_FILE = (
    'yelp_academic_dataset_business.json',
    'yelp_academic_dataset_review.json',
)
BUSINESS_FILE_PATH, REVIEW_FILE_PATH = (
    os.path.join(DRIVE_DATA_DIR, source_name)
    for source_name in (BUSINESS_FILE, REVIEW_FILE)
)

# Dask / sampling settings.
DASK_BLOCK_SIZE = '20MiB'
FINAL_ROW_COUNT = 50000

# Model settings. The classifier keeps 5 output logits (label ids 0-4).
MODEL_CHECKPOINT = 'roberta-base'
NUM_LABELS = 5
FINAL_MODEL_DIR = os.path.join(PROJECT_BASE_DIR, 'final_model_saved_4class_training')

# Business categories of interest (only needed when Phase 1 has to be re-run).
TARGET_CATEGORIES = [
    'Restaurants',
    'Shopping',
    'Nightlife',
    'Health & Medical',
    'Food',
    'Beauty & Spas',
    'Active Life',
    'Arts & Entertainment',
    'Financial Services',
    'Hotels & Travel',
    'Automotive',
    'Home Services',
]

# Runtime tweaks: set the accelerate environment flag and silence the
# pin_memory warning.
os.environ.update({"ACCELERATE_USE_CUDA": "true"})
warnings.filterwarnings(
    "ignore",
    message=".*'pin_memory' argument is set as true.*",
)

print("--- Starting Full Pipeline: Data Prep -> Fine-Tuning ---")
# Wall-clock reference for the total-time report printed after training.
start_time_total = time.time()

# -------------------------------------------------------------------------
# PHASE 1: DATA PREP (Check if already done)
# -------------------------------------------------------------------------
print("\n--- Phase 1: Data Preparation Check ---")
phase1_start_time = time.time()
FINAL_DATA_FILE_PATH = os.path.join(DRIVE_OUTPUT_DIR, 'data.parquet')

# Phase 2 cannot run without the prepared parquet file. Raise a regular
# exception (instead of sys.exit, which only surfaces as a bare SystemExit
# inside a notebook kernel) and name the path that was checked so the failure
# can be diagnosed from the traceback alone.
if not os.path.exists(FINAL_DATA_FILE_PATH):
    raise FileNotFoundError(
        f"FATAL: Final data file not found at {FINAL_DATA_FILE_PATH}. "
        "Please ensure Phase 1 ran successfully."
    )

print(f"✅ Phase 1 skipped: Final data file already exists at {FINAL_DATA_FILE_PATH}")
phase1_end_time = time.time()



# PHASE 2: LLM Fine-Tuning Setup (4-Class Training)


print("\n--- Starting PHASE 2: 4-Class Training (for 5-Class Accuracy) ---")
phase2_start_time = time.time()


# --- 1. Load the prepared parquet file and drop the 3-star reviews ---
def _is_not_three_star(review):
    """Return True for every review whose 'stars' value is not 3."""
    return review['stars'] != 3


dataset = load_dataset('parquet', data_files=FINAL_DATA_FILE_PATH).filter(_is_not_three_star)
# From here on only 1, 2, 4 and 5 star reviews remain.
print(f"✅ Dataset filtered to 4 classes (excluding 3-stars): {dataset}")

# --- 1.1 Split: 80% train / 20% test with a fixed seed ---
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

# --- 2. Tokenization and Data Preparation (Map 1,2,4,5 to 0,1,3,4) ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_and_map_labels(examples):
    """Tokenize a batch of reviews and attach zero-based labels.

    Args:
        examples: a batch with a 'text' column and a 'stars' column.

    Returns:
        The tokenizer output (truncated and padded to 128 tokens) with an
        extra 'labels' entry equal to ``stars - 1`` for each example, so
        stars 1, 2, 4, 5 become label ids 0, 1, 3, 4 and line up with the
        5-logit output layer.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    zero_based_labels = []
    for star_rating in examples['stars']:
        zero_based_labels.append(star_rating - 1)
    encoded['labels'] = zero_based_labels
    return encoded


# Tokenize both splits in batches, then drop the raw columns.
tokenized_dataset = dataset.map(tokenize_and_map_labels, batched=True).remove_columns(['text', 'stars'])


# --- 3. Calculate Class Weights (Recalculated on the 4-class set) ---
print("\n⚖️ Calculating class weights on 4-class data...")
train_labels = tokenized_dataset['train']['labels']

# Balanced weights are computed only for the label ids that actually occur in
# the training split; compute_class_weight returns them in the same order as
# `present_classes`.
present_classes = np.unique(train_labels)
class_weights_array = compute_class_weight(class_weight='balanced',
                                           classes=present_classes,
                                           y=train_labels)

# Scatter the weights into a vector with one slot per model logit. Label ids
# that never occur (e.g. id 2, the removed 3-star class) keep weight 0.
# Indexing by the observed label ids, instead of a hard-coded position table,
# keeps each weight aligned with its class even if a class happens to be
# missing from the training split.
class_weights_full = np.zeros(NUM_LABELS, dtype=np.float32)
class_weights_full[present_classes] = class_weights_array

# label id -> position inside class_weights_array. With all four classes
# present this is {0: 0, 1: 1, 3: 2, 4: 3}.
label_map = {int(label_id): position for position, label_id in enumerate(present_classes)}

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights_tensor = torch.tensor(class_weights_full, dtype=torch.float).to(DEVICE)
print(f"Full 5-Class Weights (Note: Index 2/3-star is 0): {class_weights_full}")


# --- 4. Define Custom Trainer with Weighted Loss (Final Corrected Signature) ---
class WeightedLossTrainer(Trainer):
    """Trainer variant whose loss is class-weighted cross-entropy.

    The per-class weights are read from the module-level
    ``class_weights_tensor``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and score it with weighted cross-entropy.

        ``labels`` is removed from ``inputs`` before the forward call, so the
        loss is computed here from the returned logits. Any additional
        keyword arguments are accepted and ignored.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
            just ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)

        num_classes = self.model.config.num_labels
        flat_logits = outputs.get('logits').view(-1, num_classes)
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = criterion(flat_logits, targets.view(-1))

        if return_outputs:
            return loss, outputs
        return loss


# --- 5. Fine-Tuning Configuration and Training ---
# The classification head is not part of the roberta-base checkpoint and is
# randomly initialised when the model is built. Trainer only applies its seed
# when it is constructed, which happens after this point, so seed torch here
# to make the head's starting weights repeatable between runs.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)


# Place the model on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(DEVICE)
if DEVICE.type == 'cuda':
    print(f"Model moved explicitly to CUDA device ({torch.cuda.get_device_name(0)}).")
else:
    print("WARNING: CUDA failed. Model running on CPU (Slow).")


metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``.

    Returns:
        The result of the accuracy metric, comparing the arg-max over the
        last logits axis against the labels cast to int.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    reference_ids = labels.astype(int)  # the metric is given integer references
    return metric.compute(references=reference_ids, predictions=predicted_ids)



# Training configuration: evaluate and checkpoint once per epoch, then reload
# the best checkpoint when training ends.
training_args = TrainingArguments(
    output_dir="./results_4class_training",
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_4class_training',
    logging_steps=200,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the metric that is reported at the end.
    # Left unset, the best checkpoint is chosen by lowest evaluation loss; in
    # the recorded run that reloaded the epoch-2 checkpoint (accuracy 0.7141)
    # even though epoch 3 reached 0.7441.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="none",
    fp16=(DEVICE.type == 'cuda'),  # mixed precision only when running on GPU
    dataloader_pin_memory=False,
)

# `processing_class` is the current name of the deprecated `tokenizer` argument.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

print("\n🔥 Starting 4-Class Training (Optimized)...")
trainer.train()

# --- Save Final Model to Drive ---
# With load_best_model_at_end=True the trainer holds the best checkpoint here.
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

phase2_end_time = time.time()
# Elapsed time from pipeline start through training and saving (the
# evaluation below is not included).
total_time_overall = phase2_end_time - start_time_total

print("\n=========================================================")
print("✅✅ FULL PIPELINE COMPLETE ✅✅")
print(f"   - TOTAL TIME: {total_time_overall:.2f} seconds")
print(f"   - Final Model saved to Drive at: {FINAL_MODEL_DIR}")
print("=========================================================")

# --- Evaluate the final model ---
print("\nEvaluating final model on 4-Class test set (Accuracy should be high)...")
eval_results = trainer.evaluate()
print(f"Final Test Set Accuracy: {eval_results['eval_accuracy']:.4f}")


--- Starting Full Pipeline: Data Prep -> Fine-Tuning ---

--- Phase 1: Data Preparation Check ---
✅ Phase 1 skipped: Final data file already exists at /content/drive/MyDrive/Project for cv 1/data/final_yelp_data_parquet_CLEAN_FIXED/data.parquet

--- Starting PHASE 2: 4-Class Training (for 5-Class Accuracy) ---


Filter:   0%|          | 0/27906 [00:00<?, ? examples/s]

✅ Dataset filtered to 4 classes (excluding 3-stars): DatasetDict({
    train: Dataset({
        features: ['text', 'stars'],
        num_rows: 24714
    })
})


Map:   0%|          | 0/19771 [00:00<?, ? examples/s]

Map:   0%|          | 0/4943 [00:00<?, ? examples/s]


⚖️ Calculating class weights on 4-class data...
Full 5-Class Weights (Note: Index 2/3-star is 0): [2.0680962 2.7893622 0.        0.8615566 0.5006837]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved explicitly to CUDA device (Tesla T4).


  trainer = WeightedLossTrainer(



🔥 Starting 4-Class Training (Optimized)...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7132,0.635452,0.728505
2,0.6171,0.628146,0.714141
3,0.4875,0.745447,0.744083
4,0.3762,0.795717,0.738216
5,0.2994,0.879241,0.738418



✅✅ FULL PIPELINE COMPLETE ✅✅
   - TOTAL TIME: 770.49 seconds
   - Final Model saved to Drive at: /content/drive/MyDrive/Project for cv 1/final_model_saved_4class_training

Evaluating final model on 4-Class test set (Accuracy should be high)...


Final Test Set Accuracy: 0.7141
