# Multi-Competitor NER & Sentiment Analysis - COMPLETE FIXED VERSION

**Fixed Issues:**
1. ✅ AdamW import (now uses `torch.optim.AdamW` instead of transformers)
2. ✅ CSV encoding (auto-detects and converts to UTF-8)
3. ✅ Excel output (full formatted .xlsx export)
4. ✅ NER model (single-label classification that actually works)
5. ✅ Complete sentiment model implementation

**Pipeline:**
1. Load and convert CSVs to UTF-8
2. Train NER classifier (14-class)
3. Train sentiment model (3-class)
4. Generate predictions
5. Export to formatted Excel

## 1. Setup & Installation

In [None]:
# Install required libraries
!pip install -q transformers datasets torch torchvision accelerate
!pip install -q scikit-learn pandas numpy matplotlib seaborn
!pip install -q sentencepiece protobuf
!pip install -q openpyxl xlsxwriter  # For Excel export
!pip install -q chardet  # For encoding detection

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import gc
import warnings
import chardet
from pathlib import Path
warnings.filterwarnings('ignore')

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW  # FIXED: Use torch.optim.AdamW instead of transformers

# Transformers
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, f1_score, precision_recall_fscore_support
)
from sklearn.utils.class_weight import compute_class_weight

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Set seeds
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("‚úì Libraries imported successfully!")
print(f"‚úì PyTorch version: {torch.__version__}")
print(f"‚úì Using AdamW from: torch.optim")

In [None]:
# Check GPU and configure device
# Selects CUDA when available, otherwise CPU, and derives the per-step batch
# size and gradient-accumulation count from the memory of GPU 0.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

if torch.cuda.is_available():
    # Only device index 0 is inspected.
    gpu_props = torch.cuda.get_device_properties(0)
    # total_memory is divided by 1e9, i.e. reported in decimal gigabytes.
    gpu_memory_gb = gpu_props.total_memory / 1e9
    print(f"GPU: {gpu_props.name}")
    print(f"GPU Memory: {gpu_memory_gb:.2f} GB")
    
    # Adaptive batch size
    # Every branch keeps BATCH_SIZE * GRAD_ACCUM_STEPS at 32; smaller devices
    # trade per-step batch size for more accumulation steps.
    if gpu_memory_gb >= 15:
        BATCH_SIZE = 16
        GRAD_ACCUM_STEPS = 2
    else:
        BATCH_SIZE = 8
        GRAD_ACCUM_STEPS = 4
else:
    # CPU fallback: smallest per-step batch, most accumulation.
    BATCH_SIZE = 4
    GRAD_ACCUM_STEPS = 8

# Number of samples contributing to each optimizer update.
EFFECTIVE_BATCH_SIZE = BATCH_SIZE * GRAD_ACCUM_STEPS
print(f"\nBatch size: {BATCH_SIZE}, Accumulation: {GRAD_ACCUM_STEPS}, Effective: {EFFECTIVE_BATCH_SIZE}")

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Create directories
!mkdir -p '/content/drive/MyDrive/KFC_ML_Models'
!mkdir -p '/content/results'

MODEL_SAVE_DIR = '/content/drive/MyDrive/KFC_ML_Models'
RESULTS_DIR = '/content/results'

## 2. Configuration

In [None]:
# Competitor list (14 total)
COMPETITORS = [
    'Burger King', 'Deliveroo', "Domino's", 'Five Guys', 'Greggs',
    'Just Eat', 'KFC', "McDonald's", "Nando's", "Papa John's",
    'Pizza Hut', 'Pret a Manger', 'Taco Bell', 'Uber Eats'
]

SENTIMENT_MAP = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Model configs
NER_MODEL_NAME = 'bert-base-uncased'
SENTIMENT_MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Hyperparameters
MAX_SEQ_LENGTH = 128
NER_LEARNING_RATE = 2e-5
SENTIMENT_LEARNING_RATE = 2e-5
NER_EPOCHS = 5
SENTIMENT_EPOCHS = 5
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01

print(f"‚úì Configuration loaded")
print(f"  Competitors: {len(COMPETITORS)}")
print(f"  NER Model: {NER_MODEL_NAME}")
print(f"  Sentiment Model: {SENTIMENT_MODEL_NAME}")

## 3. CSV Loading with UTF-8 Conversion (FIXED)

In [None]:
def detect_encoding(file_path):
    """
    Guess the text encoding of a file with chardet.

    Only the leading 100,000 bytes are sampled, so the guess reflects the
    start of the file rather than its full contents.

    Returns:
        A ``(encoding, confidence)`` tuple taken from the ``'encoding'`` and
        ``'confidence'`` entries of ``chardet.detect``'s result.
    """
    sample_size = 100000  # Read first 100KB
    with open(file_path, 'rb') as raw:
        sample = raw.read(sample_size)
    guess = chardet.detect(sample)
    return guess['encoding'], guess['confidence']

def convert_to_utf8(input_file, output_file=None):
    """
    Re-encode a CSV file as UTF-8.

    Args:
        input_file: Path of the file to convert.
        output_file: Destination path. When None, ``input_file`` is
            overwritten in place.

    The source encoding is guessed by ``detect_encoding``. Files reported as
    UTF-8 or ASCII (every ASCII file is already valid UTF-8) are not
    re-encoded; when a separate ``output_file`` was requested they are copied
    to it unchanged so the destination always exists afterwards. The same
    copy-through happens when no encoding could be detected.

    Decoding uses ``errors='replace'``, so bytes that are invalid in the
    detected encoding are replaced rather than raising; the conversion can
    therefore be lossy.

    Progress and failures are reported with ``print``; I/O and codec errors
    during conversion are caught and reported, not raised.
    """
    import shutil  # local import: only needed for the copy-through path

    if output_file is None:
        output_file = input_file

    # Detect encoding
    encoding, confidence = detect_encoding(input_file)
    print(f"  Detected encoding: {encoding} (confidence: {confidence:.2%})")

    needs_conversion = bool(encoding) and encoding.lower() not in ('utf-8', 'ascii')

    if not needs_conversion:
        # Nothing to re-encode, but a caller that asked for a separate
        # destination still expects that file to be produced.
        if output_file != input_file:
            try:
                shutil.copyfile(input_file, output_file)
            except shutil.SameFileError:
                # Same file spelled two ways (e.g. str vs Path): nothing to do.
                pass
        if encoding:
            print(f"  ‚úì Already UTF-8")
        else:
            print(f"  ‚ö† Encoding could not be detected; file left unchanged")
        return

    try:
        # Read the whole file before opening the destination so an in-place
        # conversion never truncates data it has not read yet.
        with open(input_file, 'r', encoding=encoding, errors='replace') as f:
            content = f.read()

        # Write as UTF-8
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"  ‚úì Converted to UTF-8")
    except (LookupError, UnicodeError, OSError) as e:
        # LookupError: codec name unknown to Python; OSError: file access.
        print(f"  ‚ö† Conversion failed: {e}")
        print(f"  Trying with 'utf-8' and error handling...")

def load_csv_safe(file_path):
    """
    Load a CSV into a DataFrame, converting the file to UTF-8 first.

    Steps:
      1. ``convert_to_utf8(file_path)`` rewrites the file in place if needed.
      2. The file is parsed as UTF-8.
      3. If that fails to decode or parse, the encoding is re-detected and
         the file is parsed again with undecodable bytes replaced.

    Args:
        file_path: Path of the CSV file.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        Whatever ``pandas.read_csv`` raises if the fallback parse also fails.
    """
    print(f"\nLoading: {file_path}")

    # Convert to UTF-8 first
    convert_to_utf8(file_path)

    # Load with pandas
    try:
        df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
    except (UnicodeError, pd.errors.ParserError) as e:
        print(f"  ‚ö† Error loading with UTF-8: {e}")
        print(f"  Trying with encoding detection...")
        encoding, _ = detect_encoding(file_path)
        # read_csv has no `errors` argument; decoding-error policy is set
        # with `encoding_errors` (passing `errors=` raises TypeError).
        df = pd.read_csv(
            file_path,
            encoding=encoding,
            low_memory=False,
            encoding_errors='replace',
        )
        print(f"  ‚úì Loaded {len(df)} rows with {encoding}")
        return df

    print(f"  ‚úì Loaded {len(df)} rows")
    return df

print("‚úì CSV loading functions defined")

In [None]:
# Upload CSV files
from google.colab import files
print("Please upload your CSV files:")
uploaded = files.upload()

In [None]:
# Load datasets with UTF-8 conversion
print("Loading and converting datasets to UTF-8...")

df_large = load_csv_safe('KFC_social_data.xlsx - Sheet1.csv')
df_train_sample = load_csv_safe('KFC_training_sample.csv')
df_test = load_csv_safe('KFC_test_sample.csv')
df_test_pred = load_csv_safe('KFC_test_sample_for_prediction.csv')

print(f"\n‚úì All datasets loaded and converted to UTF-8")

## 4. Data Preprocessing

In [None]:
def clean_sentiment(value):
    """
    Extract a numeric sentiment label (0, 1 or 2) from a raw cell value.

    Accepted inputs:
      * the numbers 0, 1, 2 (int or float, e.g. ``2.0``);
      * the words 'negative', 'neutral', 'positive' (any case, surrounding
        whitespace ignored);
      * strings that start with a single label digit, optionally followed by
        a zero fraction and/or non-numeric text ('2', '2.0', '1 - neutral').

    Returns:
        0, 1 or 2, or None when the value is missing or not a valid label.
        Multi-digit numbers ('10') and non-integral values ('2.5') are
        rejected rather than being read as their first digit.
    """
    if pd.isna(value):
        return None

    # bool is a subclass of int; True/False are not sentiment labels.
    if isinstance(value, bool):
        return None

    if isinstance(value, (int, float)):
        return int(value) if value in (0, 1, 2) else None

    value_str = str(value).strip().lower()

    named_labels = {'negative': 0, 'neutral': 1, 'positive': 2}
    if value_str in named_labels:
        return named_labels[value_str]

    # One label digit, an optional ".0..." fraction, and then anything that
    # does not continue the number (no further digit, no ".<digit>").
    match = re.match(r'([012])(?:\.0+)?(?!\d|\.\d)', value_str)
    if match:
        return int(match.group(1))

    return None

def normalize_competitor_name(comp_str):
    """
    Map a raw competitor value onto its canonical spelling in COMPETITORS.

    Matching ignores case, surrounding/duplicated whitespace, and the
    difference between typographic apostrophes (U+2018/U+2019) and the ASCII
    apostrophe used in COMPETITORS, so "mcdonald’s" resolves to "McDonald's".

    Returns:
        The canonical name from COMPETITORS, or None when the value is
        missing, empty, or does not match any known competitor.
    """
    if pd.isna(comp_str) or not comp_str:
        return None

    # Spreadsheet and social-media exports often contain curly apostrophes.
    cleaned = str(comp_str).replace('\u2019', "'").replace('\u2018', "'")
    # Collapse runs of whitespace (also strips both ends).
    cleaned = ' '.join(cleaned.split())

    if cleaned in COMPETITORS:
        return cleaned

    folded = cleaned.lower()
    for comp in COMPETITORS:
        if comp.lower() == folded:
            return comp

    return None

def prepare_dataset(df, name="dataset"):
    """
    Clean a raw export into the frame used for training and prediction.

    The input frame is not modified. The result keeps the columns
    'Competitor', 'Tweet', 'SENTIMENT' (when present) followed by whichever
    of 'Impact', 'Impressions', 'Reach (new)', 'Date', 'Url' exist.

    Processing:
      * 'Tweet' is taken from the 'Tweet' column, falling back to
        'Full Text' and then 'Snippet';
      * 'SENTIMENT' values are passed through ``clean_sentiment``; rows
        without a valid label are dropped and the column is cast to int;
      * 'Competitor' values are passed through
        ``normalize_competitor_name``; rows with an unknown competitor or a
        missing tweet are dropped;
      * tweets are stripped and empty ones removed; the index is reset.

    Args:
        df: Raw DataFrame.
        name: Label used in the progress messages.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: if there is no 'Competitor' column or no tweet-text column.
    """
    print(f"\nPreparing {name}...")
    print(f"  Initial rows: {len(df)}")

    # Resolve the tweet-text column without writing a 'Tweet' column back
    # into the caller's frame.
    tweet_source = next(
        (col for col in ('Tweet', 'Full Text', 'Snippet') if col in df.columns),
        None,
    )

    missing = []
    if 'Competitor' not in df.columns:
        missing.append("'Competitor'")
    if tweet_source is None:
        missing.append("'Tweet' (or 'Full Text' / 'Snippet')")
    if missing:
        raise KeyError(f"{name}: missing required column(s): {', '.join(missing)}")

    # Select columns
    selected = ['Competitor', tweet_source]
    if 'SENTIMENT' in df.columns:
        selected.append('SENTIMENT')
    selected += [
        col for col in ['Impact', 'Impressions', 'Reach (new)', 'Date', 'Url']
        if col in df.columns
    ]

    df_clean = df[selected].copy()
    if tweet_source != 'Tweet':
        df_clean = df_clean.rename(columns={tweet_source: 'Tweet'})

    # Clean sentiment
    if 'SENTIMENT' in df_clean.columns:
        df_clean['SENTIMENT'] = df_clean['SENTIMENT'].apply(clean_sentiment)
        before = len(df_clean)
        df_clean = df_clean.dropna(subset=['SENTIMENT'])
        print(f"  Dropped {before - len(df_clean)} rows with invalid sentiment")
        df_clean['SENTIMENT'] = df_clean['SENTIMENT'].astype(int)

    # Clean competitor names
    df_clean['Competitor'] = df_clean['Competitor'].apply(normalize_competitor_name)
    before = len(df_clean)
    df_clean = df_clean.dropna(subset=['Competitor', 'Tweet'])
    print(f"  Dropped {before - len(df_clean)} rows with invalid competitor/tweet")

    # Clean tweet text
    df_clean['Tweet'] = df_clean['Tweet'].astype(str).str.strip()
    df_clean = df_clean[df_clean['Tweet'].str.len() > 0]

    df_clean = df_clean.reset_index(drop=True)

    print(f"  Final rows: {len(df_clean)}")
    print(f"  Unique competitors: {df_clean['Competitor'].nunique()}")

    if 'SENTIMENT' in df_clean.columns:
        sent_dist = df_clean['SENTIMENT'].value_counts().sort_index()
        print(f"  Sentiment distribution: {dict(sent_dist)}")

    return df_clean

# Prepare datasets
df_large_clean = prepare_dataset(df_large, "Large dataset")
df_test_clean = prepare_dataset(df_test, "Test dataset")

# For prediction data (no sentiment)
# prepare_dataset drops every row without a valid SENTIMENT, so a prediction
# file whose SENTIMENT column is missing *or present but entirely blank*
# needs a placeholder label to survive cleaning.
if 'SENTIMENT' not in df_test_pred.columns or df_test_pred['SENTIMENT'].isna().all():
    df_test_pred['SENTIMENT'] = 1  # Dummy value
df_test_pred_clean = prepare_dataset(df_test_pred, "Test prediction dataset")

## 5. Multi-Competitor Extraction (Regex)

In [None]:
# Regexes are compiled once instead of on every call. They are applied to
# lower-cased text. Keys are listed in the order results are returned.
_COMPETITOR_PATTERNS = {
    name: [re.compile(pattern) for pattern in pattern_list]
    for name, pattern_list in {
        'Burger King': [r'\bburger\s*king\b', r'\bbk\b'],
        'Deliveroo': [r'\bdeliveroo\b'],
        "Domino's": [r'\bdomino(?:s|\'s)?\b'],
        'Five Guys': [r'\bfive\s*guys\b'],
        'Greggs': [r'\bgreggs?\b'],
        'Just Eat': [r'\bjust\s*eat\b'],
        'KFC': [r'\bkfc\b', r'\bkentucky\s*fried\s*chicken\b', r'@kfc'],
        "McDonald's": [r'\bmcdonald(?:s|\'s)?\b', r'\bmaccies\b', r'\bmaccas\b', r'\bmcdonalds\b', r'@mcdonald'],
        "Nando's": [r'\bnando(?:s|\'s)\b', r'@nando'],
        "Papa John's": [r'\bpapa\s*john(?:s|\'s)?\b', r'@papajohn'],
        'Pizza Hut': [r'\bpizza\s*hut\b', r'@pizzahut'],
        'Pret a Manger': [r'\bpret(?:\s*a\s*manger)?\b', r'@pret'],
        'Taco Bell': [r'\btaco\s*bell\b', r'@tacobell'],
        'Uber Eats': [r'\buber\s*eats\b', r'@ubereats']
    }.items()
}


def extract_all_competitors(tweet_text):
    """
    Extract ALL competitors mentioned in a tweet using regex.

    Args:
        tweet_text: Tweet text. Non-string values (None, NaN, ...) are
            treated as containing no mentions.

    Returns:
        A list of canonical competitor names, each at most once, in the
        fixed order of the pattern table above (so the result is the same
        on every run for the same text).
    """
    if not isinstance(tweet_text, str):
        return []

    tweet_lower = tweet_text.lower()

    return [
        competitor
        for competitor, compiled in _COMPETITOR_PATTERNS.items()
        if any(pattern.search(tweet_lower) for pattern in compiled)
    ]

# Test
test_tweet = "I love KFC's chicken but McDonald's has better fries!"
print(f"Test tweet: {test_tweet}")
print(f"Found competitors: {extract_all_competitors(test_tweet)}")

## 6. Train/Val Split

In [None]:
# Stratified split
# Holds out 20% of df_large_clean for validation. Stratifying on 'Competitor'
# keeps the competitor proportions the same in both splits; SEED makes the
# split reproducible.
# NOTE(review): stratified splitting presumably needs at least two rows per
# competitor - confirm the cleaned data satisfies this.
train_df, val_df = train_test_split(
    df_large_clean,
    test_size=0.2,
    random_state=SEED,
    stratify=df_large_clean['Competitor']
)

print(f"Dataset Split:")
print(f"  Training: {len(train_df)} samples")
print(f"  Validation: {len(val_df)} samples")
print(f"  Test: {len(df_test_clean)} samples")
print(f"\nTop 5 competitors in training set:")
print(train_df['Competitor'].value_counts().head())

## 7. NER Model - Single-Label Classification

In [None]:
class CompetitorDataset(Dataset):
    """Torch dataset pairing each tweet with its competitor class index.

    The class index is the position of the row's 'Competitor' value in
    COMPETITORS. Construction prints a per-competitor row count and flags
    any value that is not in COMPETITORS (such rows are kept, and indexing
    them later raises KeyError).
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.data = dataframe.reset_index(drop=True)
        self.competitor_to_idx = dict(zip(COMPETITORS, range(len(COMPETITORS))))

        # Report what the frame contains, in order of first appearance.
        names_seen = self.data['Competitor'].unique()
        print(f"\nDataset has {len(names_seen)} unique competitors:")
        for name in names_seen:
            if name not in self.competitor_to_idx:
                print(f"  ‚úó {name}: NOT IN COMPETITOR LIST!")
                continue
            n_rows = (self.data['Competitor'] == name).sum()
            print(f"  ‚úì {name}: {n_rows} samples (label {self.competitor_to_idx[name]})")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        text = record['Tweet']
        target = self.competitor_to_idx[record['Competitor']]

        # Pad/truncate every example to max_length; the tokenizer returns
        # tensors with a leading batch axis that flatten() removes.
        tokens = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': tokens['input_ids'].flatten(),
            'attention_mask': tokens['attention_mask'].flatten(),
            'labels': torch.tensor(target, dtype=torch.long)
        }
        return item

print("‚úì CompetitorDataset defined")

In [None]:
# Load NER tokenizer
print(f"Loading NER tokenizer: {NER_MODEL_NAME}")
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)

# Create datasets
print("\nCreating NER datasets...")
ner_train_dataset = CompetitorDataset(train_df, ner_tokenizer, MAX_SEQ_LENGTH)
ner_val_dataset = CompetitorDataset(val_df, ner_tokenizer, MAX_SEQ_LENGTH)
ner_test_dataset = CompetitorDataset(df_test_clean, ner_tokenizer, MAX_SEQ_LENGTH)

# DataLoaders
ner_train_loader = DataLoader(ner_train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
ner_val_loader = DataLoader(ner_val_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)
ner_test_loader = DataLoader(ner_test_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)

print(f"\n‚úì DataLoaders created")

In [None]:
# Class weights for imbalanced data
train_labels = np.array(
    [ner_train_dataset.competitor_to_idx[comp] for comp in train_df['Competitor']],
    dtype=int
)

# compute_class_weight('balanced', ...) raises ValueError when `classes`
# contains a label that never occurs in y. Weights are therefore computed
# only for the competitors present in the training split; competitors with
# no training rows keep a neutral weight of 1.0. When every competitor is
# present the result is identical to passing all classes directly.
present_classes = np.unique(train_labels)
ner_class_weights = np.ones(len(COMPETITORS), dtype=np.float64)
if present_classes.size:
    ner_class_weights[present_classes] = compute_class_weight(
        'balanced',
        classes=present_classes,
        y=train_labels
    )
ner_class_weights = torch.tensor(ner_class_weights, dtype=torch.float).to(device)

print("NER Class Weights:")
for i, comp in enumerate(COMPETITORS):
    print(f"  {comp:20s}: {ner_class_weights[i]:.3f}")

In [None]:
def train_model(model, train_loader, val_loader, epochs, learning_rate, class_weights, model_name="model"):
    """
    Fine-tune a classification model with torch.optim.AdamW, mixed precision
    and gradient accumulation, validating after every epoch.

    Reads the module-level settings ``device``, ``WEIGHT_DECAY``,
    ``WARMUP_RATIO``, ``GRAD_ACCUM_STEPS`` and ``MODEL_SAVE_DIR``.

    Args:
        model: Model whose forward call takes (input_ids, attention_mask)
            and returns an object with a ``logits`` attribute.
        train_loader: Iterable of batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        val_loader: Same batch format, used for validation.
        epochs: Number of passes over ``train_loader``.
        learning_rate: AdamW learning rate.
        class_weights: Per-class weight tensor for the cross-entropy loss.
        model_name: Prefix of the checkpoint file
            ``{MODEL_SAVE_DIR}/{model_name}_best.pt``.

    Returns:
        ``(model, history)`` where ``model`` holds the weights from the last
        epoch (the best-F1 weights are in the checkpoint file) and
        ``history`` maps 'train_loss', 'val_loss', 'val_accuracy', 'val_f1'
        to per-epoch lists.
    """
    model = model.to(device)

    # FIXED: Use torch.optim.AdamW
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=WEIGHT_DECAY)

    num_batches = len(train_loader)
    # One optimizer update per GRAD_ACCUM_STEPS batches, plus one for a
    # trailing partial group, so the scheduler length matches the number of
    # scheduler.step() calls actually made.
    updates_per_epoch = (num_batches + GRAD_ACCUM_STEPS - 1) // GRAD_ACCUM_STEPS
    total_steps = updates_per_epoch * epochs
    warmup_steps = int(total_steps * WARMUP_RATIO)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    criterion = nn.CrossEntropyLoss(weight=class_weights)
    scaler = GradScaler()

    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': [], 'val_f1': []}
    best_val_f1 = 0

    print(f"\nTraining {model_name}...")
    print(f"  Epochs: {epochs}, Steps: {total_steps}, Warmup: {warmup_steps}")
    print(f"  Using optimizer: torch.optim.AdamW\n")

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        print("-" * 50)

        # Training
        model.train()
        train_loss = 0
        optimizer.zero_grad()

        train_pbar = tqdm(train_loader, desc="Training")
        for step, batch in enumerate(train_pbar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast():
                outputs = model(input_ids, attention_mask)
                # Scale so the accumulated gradient averages over the group.
                loss = criterion(outputs.logits, labels) / GRAD_ACCUM_STEPS

            scaler.scale(loss).backward()

            # Step on every full accumulation group and also on the final
            # batch; otherwise the gradients of a trailing partial group
            # would be thrown away by the zero_grad() at the next epoch.
            is_last_batch = (step + 1) == num_batches
            if (step + 1) % GRAD_ACCUM_STEPS == 0 or is_last_batch:
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            train_loss += loss.item() * GRAD_ACCUM_STEPS
            train_pbar.set_postfix({'loss': f'{loss.item() * GRAD_ACCUM_STEPS:.4f}'})

        avg_train_loss = train_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)

        # Validation
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            val_pbar = tqdm(val_loader, desc="Validation")
            for batch in val_pbar:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                with autocast():
                    outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs.logits, labels)

                val_loss += loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='macro')

        history['val_loss'].append(avg_val_loss)
        history['val_accuracy'].append(val_accuracy)
        history['val_f1'].append(val_f1)

        print(f"\nResults:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Loss: {avg_val_loss:.4f}")
        print(f"  Val Accuracy: {val_accuracy:.4f}")
        print(f"  Val F1 (macro): {val_f1:.4f}")

        # Always checkpoint the first epoch so the "_best.pt" file exists
        # even if validation F1 never rises above 0.
        if epoch == 0 or val_f1 > best_val_f1:
            best_val_f1 = val_f1
            print(f"  ‚úì New best F1! Saving model...")
            torch.save(model.state_dict(), f'{MODEL_SAVE_DIR}/{model_name}_best.pt')
        print()

        torch.cuda.empty_cache()
        gc.collect()

    print(f"\n‚úì Training complete! Best val F1: {best_val_f1:.4f}")
    return model, history

print("‚úì Training function defined (using torch.optim.AdamW)")

In [None]:
# Train NER model
print(f"Initializing NER model: {NER_MODEL_NAME}")
ner_model = AutoModelForSequenceClassification.from_pretrained(
    NER_MODEL_NAME,
    num_labels=len(COMPETITORS)
)

ner_model, ner_history = train_model(
    ner_model,
    ner_train_loader,
    ner_val_loader,
    epochs=NER_EPOCHS,
    learning_rate=NER_LEARNING_RATE,
    class_weights=ner_class_weights,
    model_name="ner_model"
)

In [None]:
# Plot NER history
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

axes[0].plot(ner_history['train_loss'], label='Train', marker='o')
axes[0].plot(ner_history['val_loss'], label='Val', marker='s')
axes[0].set_title('NER - Loss', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].legend()
axes[0].grid(alpha=0.3)

axes[1].plot(ner_history['val_accuracy'], marker='o', color='blue')
axes[1].set_title('NER - Validation Accuracy', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
axes[1].grid(alpha=0.3)

axes[2].plot(ner_history['val_f1'], marker='o', color='green')
axes[2].set_title('NER - Validation F1', fontsize=14, fontweight='bold')
axes[2].set_xlabel('Epoch')
axes[2].set_ylabel('F1 Score')
axes[2].grid(alpha=0.3)

plt.tight_layout()
plt.savefig(f'{RESULTS_DIR}/ner_training.png', dpi=300)
plt.show()

print(f"‚úì Best NER F1: {max(ner_history['val_f1']):.4f}")

## 8. Sentiment Model

In [None]:
class SentimentDataset(Dataset):
    """Competitor-aware sentiment dataset.

    Each example is the tweet followed by the sentence
    "This tweet is about <competitor>.", tokenized to a fixed length, with
    the row's 'SENTIMENT' value as the label.

    NOTE(review): the competitor sentence is appended after the tweet, so
    when the combined text exceeds ``max_length`` it is the competitor cue
    that truncation removes. predict_tweet builds the same string, so any
    change to the template has to be made in both places.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.data = dataframe.reset_index(drop=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        tweet, competitor, target = (
            record['Tweet'],
            record['Competitor'],
            record['SENTIMENT'],
        )

        # Contextualized input
        prompt = "{} This tweet is about {}.".format(tweet, competitor)

        tokens = self.tokenizer(
            prompt,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': tokens['input_ids'].flatten(),
            'attention_mask': tokens['attention_mask'].flatten(),
            'labels': torch.tensor(target, dtype=torch.long)
        }
        return item

print("‚úì SentimentDataset defined")

In [None]:
# Load sentiment tokenizer
print(f"Loading Sentiment tokenizer: {SENTIMENT_MODEL_NAME}")
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_NAME)

# Create datasets
print("\nCreating Sentiment datasets...")
sentiment_train_dataset = SentimentDataset(train_df, sentiment_tokenizer, MAX_SEQ_LENGTH)
sentiment_val_dataset = SentimentDataset(val_df, sentiment_tokenizer, MAX_SEQ_LENGTH)
sentiment_test_dataset = SentimentDataset(df_test_clean, sentiment_tokenizer, MAX_SEQ_LENGTH)

print(f"  Train: {len(sentiment_train_dataset)} samples")
print(f"  Val: {len(sentiment_val_dataset)} samples")
print(f"  Test: {len(sentiment_test_dataset)} samples")

# DataLoaders
sentiment_train_loader = DataLoader(sentiment_train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
sentiment_val_loader = DataLoader(sentiment_val_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)
sentiment_test_loader = DataLoader(sentiment_test_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)

print(f"\n‚úì DataLoaders created")

In [None]:
# Sentiment class weights
sentiment_labels = train_df['SENTIMENT'].values.astype(int)

# compute_class_weight('balanced', ...) raises ValueError when `classes`
# contains a label that never occurs in y, so weights are computed for the
# sentiment classes present in the training split; an absent class keeps a
# neutral weight of 1.0. With all classes present the result is unchanged.
present_sentiments = np.unique(sentiment_labels)
sentiment_class_weights = np.ones(len(SENTIMENT_MAP), dtype=np.float64)
if present_sentiments.size:
    sentiment_class_weights[present_sentiments] = compute_class_weight(
        'balanced',
        classes=present_sentiments,
        y=sentiment_labels
    )
sentiment_class_weights = torch.tensor(sentiment_class_weights, dtype=torch.float).to(device)

print("Sentiment Class Weights:")
for i, label in SENTIMENT_MAP.items():
    print(f"  {label:8s}: {sentiment_class_weights[i]:.3f}")

In [None]:
# Train Sentiment model
print(f"Initializing Sentiment model: {SENTIMENT_MODEL_NAME}")
sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    SENTIMENT_MODEL_NAME,
    num_labels=3,
    ignore_mismatched_sizes=True
)

sentiment_model, sentiment_history = train_model(
    sentiment_model,
    sentiment_train_loader,
    sentiment_val_loader,
    epochs=SENTIMENT_EPOCHS,
    learning_rate=SENTIMENT_LEARNING_RATE,
    class_weights=sentiment_class_weights,
    model_name="sentiment_model"
)

In [None]:
# Plot Sentiment history
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

axes[0].plot(sentiment_history['train_loss'], label='Train', marker='o')
axes[0].plot(sentiment_history['val_loss'], label='Val', marker='s')
axes[0].set_title('Sentiment - Loss', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].legend()
axes[0].grid(alpha=0.3)

axes[1].plot(sentiment_history['val_accuracy'], marker='o', color='blue')
axes[1].set_title('Sentiment - Validation Accuracy', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
axes[1].grid(alpha=0.3)

axes[2].plot(sentiment_history['val_f1'], marker='o', color='green')
axes[2].set_title('Sentiment - Validation F1', fontsize=14, fontweight='bold')
axes[2].set_xlabel('Epoch')
axes[2].set_ylabel('F1 Score')
axes[2].grid(alpha=0.3)

plt.tight_layout()
plt.savefig(f'{RESULTS_DIR}/sentiment_training.png', dpi=300)
plt.show()

print(f"‚úì Best Sentiment F1: {max(sentiment_history['val_f1']):.4f}")

## 9. Integrated Pipeline & Predictions

In [None]:
def predict_tweet(tweet_text, ner_model, sentiment_model, ner_tokenizer, sentiment_tokenizer):
    """
    Full pipeline: NER + regex -> sentiment per competitor.

    1. The NER classifier predicts one primary competitor for the tweet.
    2. ``extract_all_competitors`` adds every competitor matched by regex.
    3. The sentiment model scores the tweet once per competitor, using the
       same "<tweet> This tweet is about <competitor>." template as
       SentimentDataset.

    Args:
        tweet_text: Tweet text.
        ner_model / sentiment_model: Trained models, already on ``device``.
        ner_tokenizer / sentiment_tokenizer: Their matching tokenizers.

    Returns:
        A list of ``(competitor, sentiment)`` tuples with sentiment as an
        int class index. The NER prediction comes first, followed by the
        regex matches in the order extract_all_competitors returned them,
        without duplicates.
    """
    ner_model.eval()
    sentiment_model.eval()

    # NER prediction
    encoding = ner_tokenizer(
        tweet_text,
        add_special_tokens=True,
        max_length=MAX_SEQ_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        with autocast():
            outputs = ner_model(encoding['input_ids'].to(device), encoding['attention_mask'].to(device))

    predicted_idx = torch.argmax(outputs.logits, dim=1).item()
    primary_competitor = COMPETITORS[predicted_idx]

    # Regex extraction
    regex_competitors = extract_all_competitors(tweet_text)

    # Combine. dict.fromkeys de-duplicates while preserving order, so the
    # output order is stable instead of depending on set iteration.
    all_competitors = list(dict.fromkeys([primary_competitor] + regex_competitors))

    # Sentiment for each competitor, scored in a single batched forward pass.
    texts = [f"{tweet_text} This tweet is about {competitor}." for competitor in all_competitors]

    sentiment_encoding = sentiment_tokenizer(
        texts,
        add_special_tokens=True,
        max_length=MAX_SEQ_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        with autocast():
            sentiment_outputs = sentiment_model(
                sentiment_encoding['input_ids'].to(device),
                sentiment_encoding['attention_mask'].to(device)
            )

    predicted_sentiments = torch.argmax(sentiment_outputs.logits, dim=1).tolist()

    return list(zip(all_competitors, predicted_sentiments))

# Test
test_tweet = "KFC's chicken is amazing but McDonald's has terrible service"
print(f"Test tweet: {test_tweet}")
predictions = predict_tweet(test_tweet, ner_model, sentiment_model, ner_tokenizer, sentiment_tokenizer)
for comp, sent in predictions:
    print(f"  {comp}: {SENTIMENT_MAP[sent]}")

In [None]:
def process_dataset(df, ner_model, sentiment_model, ner_tokenizer, sentiment_tokenizer):
    """
    Run the prediction pipeline over every row of a dataset.

    Args:
        df: DataFrame with a 'Tweet' column and, optionally, any of the
            metadata columns 'Impact', 'Impressions', 'Reach (new)', 'Date',
            'Url'.
        ner_model, sentiment_model, ner_tokenizer, sentiment_tokenizer:
            Passed through to ``predict_tweet``.

    Returns:
        DataFrame with one row per (tweet, detected competitor) and the
        columns 'Competitor', 'Tweet', 'Predicted_Sentiment',
        'Sentiment_Label', plus the non-missing metadata values of the
        source row. When there are no predictions the four core columns are
        still present so downstream code can index them.
    """
    core_columns = ['Competitor', 'Tweet', 'Predicted_Sentiment', 'Sentiment_Label']
    # Resolve once which metadata columns exist instead of per output row.
    metadata_cols = [
        col for col in ['Impact', 'Impressions', 'Reach (new)', 'Date', 'Url']
        if col in df.columns
    ]

    results = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing tweets"):
        tweet = row['Tweet']

        # Get predictions
        predictions = predict_tweet(tweet, ner_model, sentiment_model, ner_tokenizer, sentiment_tokenizer)

        # Metadata is identical for every competitor found in this tweet.
        metadata = {col: row[col] for col in metadata_cols if pd.notna(row[col])}

        # Create row for each detected competitor
        for competitor, sentiment in predictions:
            results.append({
                'Competitor': competitor,
                'Tweet': tweet,
                'Predicted_Sentiment': sentiment,
                'Sentiment_Label': SENTIMENT_MAP[sentiment],
                **metadata
            })

    if not results:
        # pd.DataFrame([]) would have no columns at all.
        return pd.DataFrame(columns=core_columns)

    return pd.DataFrame(results)

print("‚úì Batch processing function defined")

In [None]:
# Generate predictions for test data
print("Generating predictions for test data...\n")

# Load best models
# map_location remaps the saved tensors onto the current device, so
# checkpoints written on a GPU runtime still load on a CPU-only runtime.
ner_model.load_state_dict(
    torch.load(f'{MODEL_SAVE_DIR}/ner_model_best.pt', map_location=device)
)
sentiment_model.load_state_dict(
    torch.load(f'{MODEL_SAVE_DIR}/sentiment_model_best.pt', map_location=device)
)

# Process
predictions_df = process_dataset(
    df_test_pred_clean,
    ner_model,
    sentiment_model,
    ner_tokenizer,
    sentiment_tokenizer
)

print(f"\n‚úì Generated {len(predictions_df)} predictions from {len(df_test_pred_clean)} tweets")
# Skip the average when there were no tweets (would divide by zero).
if len(df_test_pred_clean) > 0:
    print(f"  Average competitors per tweet: {len(predictions_df) / len(df_test_pred_clean):.2f}")

print("\nSample predictions:")
print(predictions_df.head(10))

## 10. Excel Export with Formatting (FIXED)

In [None]:
def export_to_excel(df, filename, include_summary=True):
    """
    Export predictions to a formatted Excel file in RESULTS_DIR.

    The 'Predictions' sheet holds ``df`` with a styled header row, fixed
    column widths, and red / yellow / green highlighting of the
    'Sentiment_Label' column. The optional 'Summary' sheet lists overall
    counts, the sentiment distribution, and per-competitor sentiment counts.

    Args:
        df: Predictions DataFrame.
        filename: File name (not path) of the workbook to create.
        include_summary: Whether to add the 'Summary' sheet. The summary is
            skipped with a warning when ``df`` lacks the 'Tweet',
            'Competitor' or 'Sentiment_Label' column.

    Returns:
        The path of the written file.
    """
    filepath = f'{RESULTS_DIR}/{filename}'

    # Tweet text is untrusted: by default xlsxwriter stores any string that
    # starts with "=" as a formula, which corrupts such tweets and allows
    # formula injection. Write every string as literal text instead.
    workbook_options = {'options': {'strings_to_formulas': False}}

    # Create Excel writer
    with pd.ExcelWriter(filepath, engine='xlsxwriter', engine_kwargs=workbook_options) as writer:
        # Write main data
        df.to_excel(writer, sheet_name='Predictions', index=False)

        # Get workbook and worksheet
        workbook = writer.book
        worksheet = writer.sheets['Predictions']

        # Define formats
        header_format = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'fg_color': '#D7E4BD',
            'border': 1
        })

        # Sentiment color formats, keyed by the label text they highlight
        sentiment_formats = {
            'negative': workbook.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'}),
            'neutral': workbook.add_format({'bg_color': '#FFEB9C', 'font_color': '#9C6500'}),
            'positive': workbook.add_format({'bg_color': '#C6EFCE', 'font_color': '#006100'}),
        }

        # Apply header format (re-writes the header cells pandas produced)
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header_format)

        # Set column widths
        worksheet.set_column('A:A', 15)  # Competitor
        worksheet.set_column('B:B', 60)  # Tweet
        worksheet.set_column('C:C', 12)  # Predicted_Sentiment
        worksheet.set_column('D:D', 15)  # Sentiment_Label
        worksheet.set_column('E:Z', 12)  # Other columns

        # Apply conditional formatting for sentiment (data rows are 1..len(df))
        if 'Sentiment_Label' in df.columns and len(df) > 0:
            sentiment_col = df.columns.get_loc('Sentiment_Label')
            for label, cell_format in sentiment_formats.items():
                worksheet.conditional_format(1, sentiment_col, len(df), sentiment_col, {
                    'type': 'text',
                    'criteria': 'containing',
                    'value': label,
                    'format': cell_format
                })

        # Add summary sheet if requested
        summary_columns = ['Tweet', 'Competitor', 'Sentiment_Label']
        missing_columns = [col for col in summary_columns if col not in df.columns]
        if include_summary and missing_columns:
            print(f"  ‚ö† Summary sheet skipped; missing columns: {missing_columns}")
        elif include_summary:
            summary_data = []

            # Overall stats
            summary_data.append(['Metric', 'Value'])
            summary_data.append(['Total Predictions', len(df)])
            summary_data.append(['Unique Tweets', df['Tweet'].nunique()])
            summary_data.append(['Unique Competitors', df['Competitor'].nunique()])
            summary_data.append([''])

            # Sentiment distribution
            summary_data.append(['Sentiment', 'Count', 'Percentage'])
            sent_dist = df['Sentiment_Label'].value_counts()
            for sent, count in sent_dist.items():
                pct = count / len(df) * 100
                summary_data.append([sent, count, f'{pct:.1f}%'])

            summary_data.append([''])

            # Per-competitor stats
            summary_data.append(['Competitor', 'Mentions', 'Positive', 'Neutral', 'Negative'])
            for comp in sorted(df['Competitor'].unique()):
                comp_df = df[df['Competitor'] == comp]
                pos = len(comp_df[comp_df['Sentiment_Label'] == 'positive'])
                neu = len(comp_df[comp_df['Sentiment_Label'] == 'neutral'])
                neg = len(comp_df[comp_df['Sentiment_Label'] == 'negative'])
                summary_data.append([comp, len(comp_df), pos, neu, neg])

            # Write summary (rows have different lengths; pandas pads them)
            summary_df = pd.DataFrame(summary_data)
            summary_df.to_excel(writer, sheet_name='Summary', index=False, header=False)

            # Format summary sheet
            summary_ws = writer.sheets['Summary']
            summary_ws.set_column('A:A', 20)
            summary_ws.set_column('B:E', 12)

    print(f"\n‚úì Excel file saved: {filepath}")
    return filepath

print("‚úì Excel export function defined")

In [None]:
# Export to Excel
excel_path = export_to_excel(
    predictions_df,
    'KFC_Predictions_Complete.xlsx',
    include_summary=True
)

print("\n‚úì Predictions exported to formatted Excel file!")
print(f"  File: {excel_path}")
print(f"  Sheets: 'Predictions' (main data), 'Summary' (statistics)")
print(f"  Format: Color-coded sentiment, auto-width columns, summary stats")

In [None]:
# Also save to Google Drive
drive_path = f'{MODEL_SAVE_DIR}/KFC_Predictions_Complete.xlsx'
predictions_df.to_excel(drive_path, index=False, engine='openpyxl')
print(f"\n‚úì Also saved to Google Drive: {drive_path}")

## 11. Summary & Download

In [None]:
print("\n" + "="*70)
print("TRAINING & PREDICTION COMPLETE")
print("="*70)

print("\n‚úÖ Fixed Issues:")
print("  1. AdamW import (now uses torch.optim.AdamW)")
print("  2. CSV encoding (auto-converted to UTF-8)")
print("  3. Excel output (formatted .xlsx with color-coding)")
print("  4. NER model (single-label classification)")

print("\nüìä Results:")
print(f"  NER Best F1: {max(ner_history['val_f1']):.4f}")
print(f"  Sentiment Best F1: {max(sentiment_history['val_f1']):.4f}")
print(f"  Total predictions: {len(predictions_df)}")
print(f"  From {len(df_test_pred_clean)} tweets")

print("\nüìÅ Output Files:")
print(f"  Excel (formatted): {excel_path}")
print(f"  Google Drive: {drive_path}")
print(f"  Models: {MODEL_SAVE_DIR}/")
print(f"  Plots: {RESULTS_DIR}/")

print("\n" + "="*70)
print("Ready to download results!")
print("="*70)

In [None]:
# Download Excel file
from google.colab import files
files.download(excel_path)