# Multi-Competitor NER & Sentiment Analysis - FIXED VERSION

**Key Changes from Original:**
1. Simplified NER to **single-label classification** (which competitor the tweet is about)
2. Added **data validation** at every step
3. Separate handling for **multi-competitor extraction** using regex
4. More robust **label creation** with debugging

**Pipeline:**
1. NER Model: Tweet → Primary Competitor (14-class classification)
2. Multi-Competitor Extraction: Tweet → All mentioned competitors (regex-based)
3. Sentiment Model: (Tweet, Competitor) → Sentiment (3-class)

## 1. Setup & Installation

In [None]:
# Install required libraries
!pip install -q transformers datasets torch torchvision accelerate
!pip install -q scikit-learn pandas numpy matplotlib seaborn
!pip install -q sentencepiece protobuf

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import gc
import warnings
warnings.filterwarnings('ignore')

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

# Transformers
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    AdamW, get_linear_schedule_with_warmup
)

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, f1_score, precision_recall_fscore_support
)
from sklearn.utils.class_weight import compute_class_weight

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Set seeds
# One constant seeds NumPy and PyTorch (CPU, plus every CUDA device when one
# is available). Python's built-in `random` module is not seeded here.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("✓ Libraries imported successfully!")

In [None]:
# Pick the compute device, then size batches to the memory that is available.
# Every branch keeps the effective batch (batch size x accumulation) at 32.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# CPU fallback: small batches, more accumulation steps
BATCH_SIZE, GRAD_ACCUM_STEPS = 4, 8

if torch.cuda.is_available():
    gpu_props = torch.cuda.get_device_properties(0)
    gpu_memory_gb = gpu_props.total_memory / 1e9
    print(f"GPU: {gpu_props.name}")
    print(f"GPU Memory: {gpu_memory_gb:.2f} GB")

    # Larger cards (>= 15 GB) take bigger batches and accumulate less
    BATCH_SIZE, GRAD_ACCUM_STEPS = (16, 2) if gpu_memory_gb >= 15 else (8, 4)

EFFECTIVE_BATCH_SIZE = BATCH_SIZE * GRAD_ACCUM_STEPS
print(f"\nBatch size: {BATCH_SIZE}, Accumulation: {GRAD_ACCUM_STEPS}, Effective: {EFFECTIVE_BATCH_SIZE}")

In [None]:
# Mount Google Drive (MODEL_SAVE_DIR below lives on it)
import os

from google.colab import drive
drive.mount('/content/drive')

# Output locations are defined once and created from these constants, so the
# directory that gets created can never drift from the one the code writes to.
MODEL_SAVE_DIR = '/content/drive/MyDrive/KFC_ML_Models'
RESULTS_DIR = '/content/results'

# Create directories (no error if they already exist)
for _output_dir in (MODEL_SAVE_DIR, RESULTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

## 2. Configuration

In [None]:
# Competitor list (14 total)
# The position of a name in this list is the class id used by the competitor
# classifier (labels are built with enumerate(COMPETITORS) and predictions are
# decoded with COMPETITORS[idx]), so reordering it invalidates saved models.
COMPETITORS = [
    'Burger King', 'Deliveroo', "Domino's", 'Five Guys', 'Greggs',
    'Just Eat', 'KFC', "McDonald's", "Nando's", "Papa John's",
    'Pizza Hut', 'Pret a Manger', 'Taco Bell', 'Uber Eats'
]

# Sentiment class id -> readable name; the ids match what clean_sentiment returns
SENTIMENT_MAP = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Model configs
NER_MODEL_NAME = 'bert-base-uncased'  # Changed to uncased for robustness
SENTIMENT_MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Hyperparameters
MAX_SEQ_LENGTH = 128  # tokenizer pads/truncates every example to this length
NER_LEARNING_RATE = 2e-5
SENTIMENT_LEARNING_RATE = 2e-5
NER_EPOCHS = 5
SENTIMENT_EPOCHS = 5
WARMUP_RATIO = 0.1  # fraction of total optimizer steps spent on LR warm-up
WEIGHT_DECAY = 0.01  # passed to the optimizer as weight_decay

print(f"✓ Configuration loaded")
print(f"  Competitors: {len(COMPETITORS)}")
print(f"  NER Model: {NER_MODEL_NAME}")
print(f"  Sentiment Model: {SENTIMENT_MODEL_NAME}")

## 3. Data Loading

In [None]:
# Upload CSV files
# Colab-only: opens the browser upload widget. The loading cell reads the
# CSVs by bare filename, so presumably the uploads land in the working
# directory — confirm when running outside the default Colab setup.
from google.colab import files
print("Please upload your CSV files:")
uploaded = files.upload()

In [None]:
# Load datasets
# Four CSVs are read by bare filename from the current working directory.
# NOTE(review): within this notebook only df_large and df_test are cleaned and
# used further; df_train_sample and df_test_pred are loaded but not referenced
# again in the visible cells — confirm whether they are still needed.
print("Loading datasets...")

# low_memory=False makes pandas read the file in one pass instead of chunked
# type inference
df_large = pd.read_csv('KFC_social_data.xlsx - Sheet1.csv', low_memory=False)
df_train_sample = pd.read_csv('KFC_training_sample.csv')
df_test = pd.read_csv('KFC_test_sample.csv')
df_test_pred = pd.read_csv('KFC_test_sample_for_prediction.csv')

print(f"\n✓ Loaded:")
print(f"  Large dataset: {len(df_large)} rows")
print(f"  Training sample: {len(df_train_sample)} rows")
print(f"  Test (with labels): {len(df_test)} rows")
print(f"  Test (for prediction): {len(df_test_pred)} rows")

## 4. Data Preprocessing with Validation

In [None]:
def clean_sentiment(value):
    """Coerce a raw sentiment cell to its class id (0, 1 or 2).

    Accepted forms:
      * the numbers 0, 1 and 2 (ints, or floats such as ``2.0``);
      * the strings ``'0'``/``'1'``/``'2'`` and
        ``'negative'``/``'neutral'``/``'positive'`` (any case, surrounding
        whitespace ignored);
      * strings that start with a stand-alone class id, e.g. ``'1.0'`` or
        ``'2 - positive'``.

    Args:
        value: Raw cell value (number, string or missing).

    Returns:
        The class id as ``int``, or ``None`` when the value is missing or
        cannot be interpreted as one of the three classes.
    """
    if pd.isna(value):
        return None

    if isinstance(value, (int, float)):
        return int(value) if value in (0, 1, 2) else None

    value_str = str(value).strip().lower()
    named = {
        '0': 0, 'negative': 0,
        '1': 1, 'neutral': 1,
        '2': 2, 'positive': 2,
    }
    if value_str in named:
        return named[value_str]

    # Leading class id, optionally written as a whole float ("1.0"). The
    # lookahead rejects ids that are only the prefix of a longer number, so
    # "10" is not read as 1 and "2.5" is not read as 2.
    match = re.match(r'([012])(?:\.0*)?(?![\d.])', value_str)
    return int(match.group(1)) if match else None

def normalize_competitor_name(comp_str):
    """Map a raw competitor label onto its canonical COMPETITORS spelling.

    Matching ignores letter case, surrounding or repeated whitespace, and
    apostrophes (ASCII or typographic, present or missing), so ``"mcdonalds"``,
    ``"McDonald’s"`` and ``" McDonald's "`` all resolve to ``"McDonald's"``.

    Args:
        comp_str: Raw label; may be missing (None/NaN) or empty.

    Returns:
        The matching entry of COMPETITORS, or ``None`` when the value is
        missing, empty, or matches no known competitor.
    """
    if pd.isna(comp_str) or not comp_str:
        return None

    def _match_key(name):
        # Fold typographic apostrophes into ASCII, then drop apostrophes
        # entirely so "Dominos" and "Domino's" compare equal.
        text = str(name).replace('\u2019', "'").replace('\u2018', "'")
        text = text.replace("'", '')
        # Collapse whitespace runs and ignore case
        return ' '.join(text.split()).lower()

    canonical_by_key = {_match_key(comp): comp for comp in COMPETITORS}
    return canonical_by_key.get(_match_key(comp_str))

print("✓ Data cleaning functions defined")

In [None]:
def prepare_dataset(df, name="dataset"):
    """Clean a raw export down to the columns the models train on.

    Steps, in order:
      1. take the tweet text from ``Tweet``, falling back to ``Full Text``
         and then ``Snippet``;
      2. keep ``Competitor``, ``Tweet``, ``SENTIMENT`` (if present) and any of
         the ``Impact`` / ``Impressions`` / ``Reach (new)`` metadata columns;
      3. map sentiment through ``clean_sentiment`` and competitor names
         through ``normalize_competitor_name``, dropping rows either rejects;
      4. strip the tweet text and drop rows left empty.

    Progress and a short distribution summary are printed along the way.

    Args:
        df: Raw DataFrame. It is not modified.
        name: Label used in the printed progress messages.

    Returns:
        A new, cleaned DataFrame with a fresh 0..n-1 index.

    Raises:
        KeyError: If ``df`` has no ``Competitor`` column or no usable tweet
            text column.
    """
    print(f"\nPreparing {name}...")
    print(f"  Initial rows: {len(df)}")

    # Resolve the text column without writing an alias back into the caller's
    # frame.
    tweet_col = next(
        (col for col in ('Tweet', 'Full Text', 'Snippet') if col in df.columns),
        None,
    )
    missing = []
    if 'Competitor' not in df.columns:
        missing.append("'Competitor'")
    if tweet_col is None:
        missing.append("'Tweet' (or 'Full Text' / 'Snippet')")
    if missing:
        raise KeyError(f"{name}: missing required column(s): {', '.join(missing)}")

    # Select columns (same order as before: essentials first, then metadata)
    selected = ['Competitor', tweet_col]
    if 'SENTIMENT' in df.columns:
        selected.append('SENTIMENT')
    selected += [col for col in ['Impact', 'Impressions', 'Reach (new)'] if col in df.columns]

    df_clean = df[selected].copy().rename(columns={tweet_col: 'Tweet'})

    # Clean sentiment
    if 'SENTIMENT' in df_clean.columns:
        df_clean['SENTIMENT'] = df_clean['SENTIMENT'].apply(clean_sentiment)
        before = len(df_clean)
        df_clean = df_clean.dropna(subset=['SENTIMENT'])
        print(f"  Dropped {before - len(df_clean)} rows with invalid sentiment")
        df_clean['SENTIMENT'] = df_clean['SENTIMENT'].astype(int)

    # Clean competitor names
    df_clean['Competitor'] = df_clean['Competitor'].apply(normalize_competitor_name)
    before = len(df_clean)
    df_clean = df_clean.dropna(subset=['Competitor', 'Tweet'])
    print(f"  Dropped {before - len(df_clean)} rows with invalid competitor/tweet")

    # Clean tweet text
    df_clean['Tweet'] = df_clean['Tweet'].astype(str).str.strip()
    df_clean = df_clean[df_clean['Tweet'].str.len() > 0]

    df_clean = df_clean.reset_index(drop=True)

    print(f"  Final rows: {len(df_clean)}")

    # Validation: Check competitor distribution
    comp_dist = df_clean['Competitor'].value_counts()
    print(f"  Unique competitors: {len(comp_dist)}")
    if comp_dist.empty:
        # Nothing survived cleaning; report it instead of indexing into an
        # empty distribution.
        print("  ⚠ No valid rows remain after cleaning")
    else:
        print(f"  Most common: {comp_dist.index[0]} ({comp_dist.iloc[0]} samples)")

    if 'SENTIMENT' in df_clean.columns:
        sent_dist = df_clean['SENTIMENT'].value_counts().sort_index()
        print(f"  Sentiment distribution: {dict(sent_dist)}")

    return df_clean

# Prepare all datasets
df_large_clean = prepare_dataset(df_large, "Large dataset")
df_test_clean = prepare_dataset(df_test, "Test dataset")

## 5. Enhanced Multi-Competitor Extraction (Regex-Based)

In [None]:
# One compiled alternation per competitor, built once rather than on every
# call. Entries are listed alphabetically, which is also the order in which
# matches are returned. Patterns are matched against lower-cased text.
_COMPETITOR_PATTERNS = {
    name: re.compile('|'.join(alternatives))
    for name, alternatives in {
        'Burger King': [r"\bburger\s*king\b", r"\bbk\b"],
        'Deliveroo': [r"\bdeliveroo\b"],
        "Domino's": [r"\bdomino(?:s|'s)?\b"],
        'Five Guys': [r"\bfive\s*guys\b"],
        'Greggs': [r"\bgreggs?\b"],
        'Just Eat': [r"\bjust\s*eat\b"],
        'KFC': [r"\bkfc\b", r"\bkentucky\s*fried\s*chicken\b", r"@kfc"],
        "McDonald's": [r"\bmcdonald(?:s|'s)?\b", r"\bmaccies\b", r"\bmaccas\b", r"@mcdonald"],
        "Nando's": [r"\bnando(?:s|'s)?\b", r"@nando"],
        "Papa John's": [r"\bpapa\s*john(?:s|'s)?\b", r"@papajohn"],
        'Pizza Hut': [r"\bpizza\s*hut\b", r"@pizzahut"],
        'Pret a Manger': [r"\bpret(?:\s*a\s*manger)?\b", r"@pret"],
        'Taco Bell': [r"\btaco\s*bell\b", r"@tacobell"],
        'Uber Eats': [r"\buber\s*eats\b", r"@ubereats"],
    }.items()
}


def extract_all_competitors(tweet_text):
    """Extract ALL competitors mentioned in a tweet using regex.

    Matching is case-insensitive and covers brand names, common nicknames
    (e.g. "maccies", "bk") and @handles.

    Args:
        tweet_text: Tweet text. Anything that is not a non-empty string
            (None, NaN, "") yields an empty result instead of raising.

    Returns:
        List of canonical competitor names, each at most once, in a fixed
        (alphabetical) order so results are reproducible between runs.
    """
    if not isinstance(tweet_text, str) or not tweet_text:
        return []

    tweet_lower = tweet_text.lower()
    return [
        competitor
        for competitor, pattern in _COMPETITOR_PATTERNS.items()
        if pattern.search(tweet_lower)
    ]

# Smoke-check the extractor on a few hand-written tweets (printed, not asserted)
test_tweets = [
    "I love KFC's chicken!",
    "McDonald's and Burger King are both great",
    "Just ordered from @Deliveroo",
    "Pret a Manger, Nando's, and KFC all have good food",
]

print("Testing competitor extraction:\n")
for tweet in test_tweets:
    comps = extract_all_competitors(tweet)
    # One call emits both lines plus the trailing blank line
    print(f"Tweet: {tweet}\nFound: {comps}\n")

## 6. Train/Val Split

In [None]:
# Hold out 20% of the cleaned large dataset for validation, stratified on
# the competitor label and seeded for repeatability.
val_fraction = 0.2
train_df, val_df = train_test_split(
    df_large_clean,
    test_size=val_fraction,
    stratify=df_large_clean['Competitor'],
    random_state=SEED,
)

print("Dataset Split:")
for split_label, split_frame in (
    ('Training', train_df),
    ('Validation', val_df),
    ('Test', df_test_clean),
):
    print(f"  {split_label}: {len(split_frame)} samples")

print("\nTraining set competitor distribution:")
print(train_df['Competitor'].value_counts())

## 7. NER Model - Single-Label Classification (FIXED)

**Key Change**: Instead of multi-label, we do **14-class classification** to predict which competitor the tweet is primarily about.

In [None]:
class CompetitorClassificationDataset(Dataset):
    """Dataset for single-label competitor classification.

    Task: given a tweet, predict which competitor it is about. The label of
    a row is the position of its ``Competitor`` value in COMPETITORS.

    Args:
        dataframe: Frame with ``Tweet`` and ``Competitor`` columns. Its index
            is reset on a copy; the caller's frame is left alone.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        max_length: Length every example is padded/truncated to.

    Raises:
        ValueError: If the frame contains a competitor that is not in
            COMPETITORS. Failing here is deliberate: the same rows would
            otherwise raise ``KeyError`` later, in the middle of training.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Create label mapping
        self.competitor_to_idx = {comp: idx for idx, comp in enumerate(COMPETITORS)}

        # Validate: check if all competitors in data are in our list.
        # Counts are computed once for all competitors.
        unique_comps = self.data['Competitor'].unique()
        counts = self.data['Competitor'].value_counts()
        unknown = []
        print(f"\nDataset has {len(unique_comps)} unique competitors:")
        for comp in unique_comps:
            if comp in self.competitor_to_idx:
                print(f"  ✓ {comp}: {counts[comp]} samples (label {self.competitor_to_idx[comp]})")
            else:
                unknown.append(comp)
                print(f"  ✗ {comp}: NOT IN COMPETITOR LIST!")

        if unknown:
            raise ValueError(
                f"Competitor values not in COMPETITORS: {unknown!r}. "
                "Normalize or drop these rows before building the dataset."
            )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized tweet and its class id for row ``idx``."""
        row = self.data.iloc[idx]

        # Get label index
        label = self.competitor_to_idx[row['Competitor']]

        # Tokenize; str() guards against non-string cells reaching the tokenizer
        encoding = self.tokenizer(
            str(row['Tweet']),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns a batch of one; drop that leading dimension so
        # the DataLoader can stack examples itself.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print("✓ CompetitorClassificationDataset defined")

In [None]:
# One tokenizer is shared by all three competitor-classification splits
print(f"Loading NER tokenizer: {NER_MODEL_NAME}")
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)

print("\nCreating NER datasets...")
ner_train_dataset, ner_val_dataset, ner_test_dataset = (
    CompetitorClassificationDataset(split_df, ner_tokenizer, MAX_SEQ_LENGTH)
    for split_df in (train_df, val_df, df_test_clean)
)

# Only the training loader shuffles; the evaluation loaders run without
# shuffling at twice the training batch size.
eval_batch_size = BATCH_SIZE * 2
ner_train_loader = DataLoader(
    ner_train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2
)
ner_val_loader, ner_test_loader = (
    DataLoader(eval_dataset, batch_size=eval_batch_size, shuffle=False, num_workers=2)
    for eval_dataset in (ner_val_dataset, ner_test_dataset)
)

print(f"\n✓ DataLoaders created (batch size: {BATCH_SIZE})")

In [None]:
# Calculate class weights for NER (to handle imbalance)
train_labels = [ner_train_dataset.competitor_to_idx[comp] for comp in train_df['Competitor']]

# compute_class_weight raises ValueError when `classes` names a label that
# never occurs in `y`, so it is only asked about the classes that are actually
# present. Absent classes keep a neutral weight of 1.0. When all 14 competitors
# occur, the result is identical to weighting over the full class range.
_label_array = np.asarray(train_labels, dtype=np.int64)
_present_classes = np.unique(_label_array)
_weight_values = np.ones(len(COMPETITORS), dtype=np.float64)
if _present_classes.size:
    _weight_values[_present_classes] = compute_class_weight(
        'balanced',
        classes=_present_classes,
        y=_label_array
    )
ner_class_weights = torch.tensor(_weight_values, dtype=torch.float).to(device)

_present_ids = set(_present_classes.tolist())
_absent = [comp for idx, comp in enumerate(COMPETITORS) if idx not in _present_ids]
if _absent:
    print(f"⚠ No training samples for: {_absent} (weight left at 1.0)")

print("NER Class Weights:")
for i, comp in enumerate(COMPETITORS):
    print(f"  {comp:20s}: {ner_class_weights[i]:.3f}")

In [None]:
def train_ner_classifier(model, train_loader, val_loader, epochs, learning_rate, class_weights):
    """Fine-tune the competitor classifier and keep the best checkpoint.

    Trains with class-weighted cross-entropy, gradient accumulation over
    GRAD_ACCUM_STEPS batches and a linear warm-up/decay schedule. After every
    epoch the model is evaluated on ``val_loader``; whenever macro-F1 improves
    (and always after the first epoch) the weights are written to
    ``{MODEL_SAVE_DIR}/ner_best_model.pt``. Mixed precision is used only when
    running on CUDA.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        train_loader: DataLoader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        val_loader: DataLoader with the same batch layout, used for evaluation.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Peak learning rate for AdamW.
        class_weights: 1-D tensor of per-class loss weights on ``device``.

    Returns:
        ``(model, history)``: the model with the best-scoring weights loaded
        back in, and a dict of per-epoch ``train_loss``, ``val_loss``,
        ``val_accuracy`` and ``val_f1`` lists.
    """
    model = model.to(device)
    # torch.optim.AdamW is used directly; the AdamW exported by transformers
    # is deprecated by that library in favour of this one.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=WEIGHT_DECAY)

    # One optimizer step per GRAD_ACCUM_STEPS batches, plus one for a trailing
    # partial group, so the schedule length matches the steps actually taken.
    num_batches = len(train_loader)
    steps_per_epoch = -(-num_batches // GRAD_ACCUM_STEPS)  # ceiling division
    total_steps = steps_per_epoch * epochs
    warmup_steps = int(total_steps * WARMUP_RATIO)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # Autocast and loss scaling only apply on CUDA; on CPU both are switched
    # off explicitly.
    use_amp = device.type == 'cuda'
    scaler = GradScaler(enabled=use_amp)

    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': [], 'val_f1': []}
    best_val_f1 = 0
    checkpoint_path = f'{MODEL_SAVE_DIR}/ner_best_model.pt'
    checkpoint_saved = False

    print(f"\nTraining NER model...")
    print(f"  Epochs: {epochs}, Steps: {total_steps}, Warmup: {warmup_steps}\n")

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        print("-" * 50)

        # Training
        model.train()
        train_loss = 0.0
        optimizer.zero_grad()

        train_pbar = tqdm(train_loader, desc="Training")
        for step, batch in enumerate(train_pbar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast(enabled=use_amp):
                outputs = model(input_ids, attention_mask=attention_mask)
                # Scale so the accumulated gradient is the mean over the group
                loss = criterion(outputs.logits, labels) / GRAD_ACCUM_STEPS

            scaler.scale(loss).backward()

            # Step at the end of each accumulation group, and also on the
            # last batch of the epoch: otherwise the gradients of a trailing
            # partial group are zeroed at the next epoch without being applied.
            is_last_batch = (step + 1) == num_batches
            if (step + 1) % GRAD_ACCUM_STEPS == 0 or is_last_batch:
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            batch_loss = loss.item() * GRAD_ACCUM_STEPS
            train_loss += batch_loss
            train_pbar.set_postfix({'loss': f'{batch_loss:.4f}'})

        avg_train_loss = train_loss / max(num_batches, 1)
        history['train_loss'].append(avg_train_loss)

        # Validation
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            val_pbar = tqdm(val_loader, desc="Validation")
            for batch in val_pbar:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                with autocast(enabled=use_amp):
                    outputs = model(input_ids, attention_mask=attention_mask)
                    loss = criterion(outputs.logits, labels)

                val_loss += loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / max(len(val_loader), 1)
        val_accuracy = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='macro')

        history['val_loss'].append(avg_val_loss)
        history['val_accuracy'].append(val_accuracy)
        history['val_f1'].append(val_f1)

        print(f"\nResults:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Loss: {avg_val_loss:.4f}")
        print(f"  Val Accuracy: {val_accuracy:.4f}")
        print(f"  Val F1 (macro): {val_f1:.4f}")

        # Always keep the first epoch so a checkpoint exists even if F1 never
        # rises above 0; after that only strict improvements replace it.
        if val_f1 > best_val_f1 or not checkpoint_saved:
            best_val_f1 = val_f1
            print(f"  ✓ New best F1! Saving model...")
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        print()

        torch.cuda.empty_cache()
        gc.collect()

    # Hand back the weights that produced the reported best F1, not whatever
    # the final epoch ended on.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    print(f"\n✓ Training complete! Best val F1: {best_val_f1:.4f}")
    return model, history

print("✓ Training function defined")

In [None]:
# Initialize and train NER model
# num_labels=len(COMPETITORS): one output class per competitor, matching the
# label ids produced by CompetitorClassificationDataset.
print(f"Initializing NER model: {NER_MODEL_NAME}")
ner_model = AutoModelForSequenceClassification.from_pretrained(
    NER_MODEL_NAME,
    num_labels=len(COMPETITORS)
)

# Train
# Returns the model plus the per-epoch history dict that the plotting cell reads.
ner_model, ner_history = train_ner_classifier(
    ner_model,
    ner_train_loader,
    ner_val_loader,
    epochs=NER_EPOCHS,
    learning_rate=NER_LEARNING_RATE,
    class_weights=ner_class_weights
)

In [None]:
# Plot NER training history: loss, validation accuracy and validation F1
# side by side, one point per epoch.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Left panel: train vs. validation loss (the only panel with a legend)
axes[0].plot(ner_history['train_loss'], label='Train', marker='o')
axes[0].plot(ner_history['val_loss'], label='Val', marker='s')
axes[0].legend()

# Middle and right panels: a single validation curve each
for panel, history_key, line_color in (
    (axes[1], 'val_accuracy', 'blue'),
    (axes[2], 'val_f1', 'green'),
):
    panel.plot(ner_history[history_key], marker='o', color=line_color)

# Shared cosmetics for all three panels
panel_text = (
    ('NER - Loss', 'Loss'),
    ('NER - Validation Accuracy', 'Accuracy'),
    ('NER - Validation F1', 'F1 Score'),
)
for panel, (panel_title, y_label) in zip(axes, panel_text):
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(f'{RESULTS_DIR}/ner_training.png', dpi=300)
plt.show()

print(f"\n✓ Best validation F1: {max(ner_history['val_f1']):.4f}")

## 8. Sentiment Model (Same as Before)

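This part works fine and is unchanged from the original notebook: it uses competitor-aware sentiment classification, i.e. the competitor name is appended to the tweet text before the sentiment is predicted.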

In [None]:
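# [SENTIMENT MODEL CODE - SAME AS ORIGINAL NOTEBOOK]
# Placeholder: paste cells 75-85 from the original notebook here
# (SentimentDataset, training function, etc.).
# The integrated pipeline in section 9 takes `sentiment_model` and
# `sentiment_tokenizer` as arguments, so the pasted cells must create both
# before the pipeline is called.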

print("✓ Sentiment model section ready")
print("  Copy sentiment model cells from original notebook")

## 9. Integrated Pipeline - FIXED

Combines:
1. NER classifier (primary competitor)
2. Regex extraction (all mentioned competitors)
3. Sentiment model (per competitor)

In [None]:
def predict_pipeline(tweet_text, ner_model, sentiment_model, ner_tokenizer, sentiment_tokenizer):
    """Predict a sentiment for every competitor a tweet concerns.

    Full pipeline:
      1. Use the NER model to predict the primary competitor.
      2. Use regex to extract ALL mentioned competitors.
      3. Combine both (union, without duplicates).
      4. For each competitor, predict sentiment on the tweet with
         "This tweet is about <competitor>." appended.

    Both models are put in eval mode and are expected to already be on
    ``device``. Mixed precision is used only when running on CUDA.

    Args:
        tweet_text: The tweet to analyse.
        ner_model: Competitor classifier (output index -> COMPETITORS).
        sentiment_model: Sentiment classifier exposing ``.logits``.
        ner_tokenizer: Tokenizer matching ``ner_model``.
        sentiment_tokenizer: Tokenizer matching ``sentiment_model``.

    Returns:
        List of ``(competitor, sentiment_class_id)`` tuples. The primary
        competitor comes first, followed by the remaining mentioned
        competitors in alphabetical order, so the output is reproducible.
    """
    ner_model.eval()
    sentiment_model.eval()

    use_amp = device.type == 'cuda'

    def _predict_class(model, tokenizer, text):
        # Tokenize one text, run one forward pass, return the arg-max class id
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=MAX_SEQ_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        with torch.no_grad(), autocast(enabled=use_amp):
            outputs = model(
                encoding['input_ids'].to(device),
                attention_mask=encoding['attention_mask'].to(device)
            )
        return torch.argmax(outputs.logits, dim=1).item()

    # Step 1: NER model prediction
    primary_competitor = COMPETITORS[_predict_class(ner_model, ner_tokenizer, tweet_text)]

    # Step 2: Regex extraction
    regex_competitors = extract_all_competitors(tweet_text)

    # Step 3: Combine (union). A plain set would make the result order vary
    # between runs, so build an explicit, stable ordering instead.
    other_competitors = sorted(set(regex_competitors) - {primary_competitor})
    all_competitors = [primary_competitor] + other_competitors

    # Step 4: Sentiment for each
    results = []
    for competitor in all_competitors:
        text = f"{tweet_text} This tweet is about {competitor}."
        predicted_sentiment = _predict_class(sentiment_model, sentiment_tokenizer, text)
        results.append((competitor, predicted_sentiment))

    return results

print("✓ Integrated pipeline defined")

## 10. Summary

**Key Fixes:**
1. ✅ NER changed from multi-label to single-label (14-class classification)
2. ✅ Added data validation at every step
3. ✅ Enhanced regex extraction for multi-competitor tweets
4. ✅ Class weights for imbalanced data
5. ✅ Proper label creation with debugging

**Expected Performance:**
- NER F1 should now be >0.70 (not 0.0000!)
- Sentiment F1 should be >0.70
- Combined pipeline should handle multi-competitor tweets correctly