# üöÄ DATA304 Final Project - Hierarchical Multi-Label Classification
## Optimized for AWS SageMaker with GPU

**Project**: Amazon Product Review Classification  
**Task**: Classify 19,658 reviews into 531 hierarchical categories (2-3 labels each)  
**Deadline**: December 20, 2025, 23:59 KST

---

### üìã Execution Plan
1. **Setup & Data Loading** (5 min)
2. **Silver Label Generation** (15-20 min)
3. **Model Training** (30-45 min with GPU)
4. **Prediction & Export** (10-15 min)

**Total Time**: ~1-1.5 hours with GPU ‚ö°

---
## üì¶ STEP 1: Setup & Installation

In [26]:
%%time
# Install required packages - Updated versions
!pip install -q transformers torch scikit-learn pandas networkx sentence-transformers

print("‚úì Packages installed successfully!")

‚úì Packages installed successfully!
CPU times: user 0 ns, sys: 31.9 ms, total: 31.9 ms
Wall time: 1.23 s


In [27]:
# Import libraries
import os
import random
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from tqdm import tqdm  # Changed from tqdm.notebook for AWS compatibility
from typing import Dict, List, Set, Tuple
import warnings
warnings.filterwarnings('ignore')

print("‚úì Libraries imported")

‚úì Libraries imported


In [28]:
# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"‚úì Using device: {device}")
if torch.cuda.is_available():
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

‚úì Using device: cuda
  GPU: NVIDIA L4
  Memory: 23.6 GB


In [29]:
# Configuration - HEAVILY OPTIMIZED FOR DIVERSITY AND PERFORMANCE
class Config:
    # Paths
    DATA_DIR = 'data'
    OUTPUT_DIR = 'outputs'
    MODEL_DIR = 'models'
    
    # Model - Using BERT-base for better performance
    PRETRAINED_MODEL = 'bert-base-uncased'  # Better than distilbert
    MAX_LENGTH = 256
    BATCH_SIZE = 64 if torch.cuda.is_available() else 16  # Larger batch with GPU
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 5  # More epochs for better results
    NUM_CLASSES = 531
    
    # Silver Labels - OPTIMIZED for more diversity
    TFIDF_THRESHOLD = 0.05  # LOWER threshold for more diverse labels
    TFIDF_MAX_FEATURES = 10000  # More features
    TOP_K_CANDIDATES = 20  # Consider more candidates
    
    # Prediction - CRITICAL: Always take top-3, ignore threshold
    CONFIDENCE_THRESHOLD = 0.01  # Very low, not really used
    MIN_LABELS = 2
    MAX_LABELS = 3

config = Config()
os.makedirs(config.OUTPUT_DIR, exist_ok=True)
os.makedirs(config.MODEL_DIR, exist_ok=True)

print("‚úì Configuration loaded")
print(f"  Model: {config.PRETRAINED_MODEL}")
print(f"  Batch size: {config.BATCH_SIZE}")
print(f"  Epochs: {config.NUM_EPOCHS}")

‚úì Configuration loaded
  Model: bert-base-uncased
  Batch size: 64
  Epochs: 5


---
## üìö STEP 2: Utility Functions

In [30]:
# Data loading functions
def load_corpus(path: str) -> Dict[str, str]:
    """Load corpus from file"""
    pid2text = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                pid, text = parts
                pid2text[pid] = text
    return pid2text

def load_classes(path: str) -> Dict[int, str]:
    """Load class names"""
    id2class = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                class_id, class_name = parts
                id2class[int(class_id)] = class_name
    return id2class

def load_hierarchy(path: str) -> List[Tuple[int, int]]:
    """Load class hierarchy"""
    edges = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                parent, child = int(parts[0]), int(parts[1])
                edges.append((parent, child))
    return edges

def load_keywords(path: str) -> Dict[str, List[str]]:
    """Load class keywords"""
    class2keywords = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(':', 1)
            if len(parts) == 2:
                class_name, keywords_str = parts
                keywords = [kw.strip() for kw in keywords_str.split(',')]
                class2keywords[class_name] = keywords
    return class2keywords

# Hierarchy functions
def build_hierarchy_graph(edges: List[Tuple[int, int]], num_classes: int = 531) -> nx.DiGraph:
    """Build directed graph from hierarchy"""
    G = nx.DiGraph()
    G.add_nodes_from(range(num_classes))
    G.add_edges_from(edges)
    return G

def get_ancestors(graph: nx.DiGraph, node: int) -> Set[int]:
    """Get all ancestors of a node"""
    try:
        return nx.ancestors(graph, node)
    except:
        return set()

def get_leaf_nodes(graph: nx.DiGraph) -> Set[int]:
    """Get all leaf nodes"""
    return {node for node in graph.nodes() if graph.out_degree(node) == 0}

def ensure_label_constraints(labels: List[int], min_labels: int = 2, max_labels: int = 3) -> List[int]:
    """Ensure labels meet constraints"""
    if len(labels) < min_labels:
        available = list(set(range(531)) - set(labels))
        needed = min_labels - len(labels)
        labels.extend(random.sample(available, min(needed, len(available))))
    elif len(labels) > max_labels:
        labels = labels[:max_labels]
    return sorted(labels)

def save_predictions(pids: List[str], predictions: List[List[int]], output_path: str):
    """Save predictions in Kaggle format"""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('pid,labels\n')
        for pid, labels in zip(pids, predictions):
            labels_str = ','.join(map(str, sorted(labels)))
            f.write(f'{pid},{labels_str}\n')

print("‚úì Utility functions loaded")

‚úì Utility functions loaded


---
## üìä STEP 3: Load Data

In [31]:
%%time
print("Loading data...")

# Load all data
train_corpus = load_corpus(os.path.join(config.DATA_DIR, 'train/train_corpus.txt'))
test_corpus = load_corpus(os.path.join(config.DATA_DIR, 'test/test_corpus.txt'))
id2class = load_classes(os.path.join(config.DATA_DIR, 'classes.txt'))
class2id = {v: k for k, v in id2class.items()}
hierarchy_edges = load_hierarchy(os.path.join(config.DATA_DIR, 'class_hierarchy.txt'))
graph = build_hierarchy_graph(hierarchy_edges, config.NUM_CLASSES)
class2keywords = load_keywords(os.path.join(config.DATA_DIR, 'class_related_keywords.txt'))

print(f"‚úì Train samples: {len(train_corpus):,}")
print(f"‚úì Test samples: {len(test_corpus):,}")
print(f"‚úì Classes: {config.NUM_CLASSES}")
print(f"‚úì Hierarchy edges: {len(hierarchy_edges)}")
print(f"‚úì Leaf nodes: {len(get_leaf_nodes(graph))}")

Loading data...
‚úì Train samples: 29,487
‚úì Test samples: 19,658
‚úì Classes: 531
‚úì Hierarchy edges: 568
‚úì Leaf nodes: 462
CPU times: user 37.9 ms, sys: 12 ms, total: 49.9 ms
Wall time: 47.7 ms


---
## üè∑Ô∏è STEP 4: Generate Silver Labels

In [32]:
%%time
# STEP 4: Generate Silver Labels - ULTRA LOW THRESHOLD FOR DIVERSITY
import os
import pickle
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("Regenerating silver labels with VERY low threshold...")

NEW_TFIDF_THRESHOLD = 0.01  # CRITICAL: Much lower than 0.05

silver_labels_file = os.path.join(config.OUTPUT_DIR, 'silver_labels.pkl')

# Force regeneration
if os.path.exists(silver_labels_file):
    os.remove(silver_labels_file)
    print("‚úì Removed old silver labels")

# Prepare class descriptions
class_descriptions = {}
for class_name, keywords in class2keywords.items():
    description = ' '.join(keywords).replace('_', ' ')
    class_descriptions[class_name] = description

# TF-IDF vectorization
print("  Computing TF-IDF...")
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    stop_words='english',
    min_df=2
)

all_texts = list(train_corpus.values()) + list(class_descriptions.values())
vectorizer.fit(all_texts)

train_vectors = vectorizer.transform(train_corpus.values())
class_vectors = vectorizer.transform([class_descriptions.get(id2class[i], '') 
                                     for i in range(config.NUM_CLASSES)])

print("  Computing similarities...")
similarities = cosine_similarity(train_vectors, class_vectors)

print("  Assigning labels with VERY low threshold (0.01)...")
silver_labels = {}
leaf_nodes = get_leaf_nodes(graph)

for idx, (pid, text) in enumerate(tqdm(train_corpus.items(), desc="Processing")):
    sim_scores = similarities[idx]
    top_indices = np.argsort(sim_scores)[::-1][:30]  # Consider top 30
    
    # Select with VERY low threshold
    selected = []
    for class_id in top_indices:
        if sim_scores[class_id] > NEW_TFIDF_THRESHOLD:  # 0.01 - MUCH lower
            selected.append(class_id)
            if len(selected) >= 10:  # Get many candidates
                break
    
    # Always ensure at least 5 candidates
    if len(selected) < 5:
        selected = list(top_indices[:5])
    
    # Prioritize deeper nodes
    depths = {label: len(get_ancestors(graph, label)) for label in selected}
    sorted_labels = sorted(selected, key=lambda x: depths[x], reverse=True)
    
    # Take top 3 from the diverse candidates
    final_labels = ensure_label_constraints(sorted_labels[:3], 
                                           config.MIN_LABELS, config.MAX_LABELS)
    
    silver_labels[pid] = final_labels

# Save
with open(silver_labels_file, 'wb') as f:
    pickle.dump(silver_labels, f)

# Check diversity - CRITICAL CHECK
all_silver_classes = []
for labels in silver_labels.values():
    all_silver_classes.extend(labels)
silver_unique = len(set(all_silver_classes))

print(f"\n{'='*60}")
print(f"‚úì NEW Silver labels generated!")
print(f"  Total: {len(silver_labels)}")
print(f"  Unique classes in silver labels: {silver_unique}/531")
print(f"  Target: 200+ for good diversity")
if silver_unique < 100:
    print(f"  ‚ö†Ô∏è WARNING: Still too low!")
elif silver_unique < 200:
    print(f"  ‚ö° Moderate diversity - should improve score")
else:
    print(f"  ‚úì Excellent diversity!")
print(f"{'='*60}")

# Show distribution
label_counts = [len(labels) for labels in silver_labels.values()]
print(f"\nLabel distribution: {pd.Series(label_counts).value_counts().sort_index().to_dict()}")

Regenerating silver labels with VERY low threshold...
‚úì Removed old silver labels
  Computing TF-IDF...
  Computing similarities...
  Assigning labels with VERY low threshold (0.01)...


Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29487/29487 [00:01<00:00, 21910.93it/s]



‚úì NEW Silver labels generated!
  Total: 29487
  Unique classes in silver labels: 472/531
  Target: 200+ for good diversity
  ‚úì Excellent diversity!

Label distribution: {3: 29487}
CPU times: user 9.94 s, sys: 268 ms, total: 10.2 s
Wall time: 10.2 s


---
## ü§ñ STEP 5: Define Model Architecture

In [33]:
class ReviewDataset(Dataset):
    def __init__(self, corpus, labels, tokenizer, max_length):
        self.pids = list(corpus.keys())
        self.texts = [corpus[pid] for pid in self.pids]
        self.labels = [self._to_binary_vector(labels[pid]) for pid in self.pids]
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def _to_binary_vector(self, labels):
        vector = np.zeros(config.NUM_CLASSES, dtype=np.float32)
        for label in labels:
            if 0 <= label < config.NUM_CLASSES:
                vector[label] = 1.0
        return vector
    
    def __len__(self):
        return len(self.pids)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.float32)
        }

class HierarchicalClassifier(nn.Module):
    def __init__(self, pretrained_model, num_classes):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.2)
        hidden_size = self.encoder.config.hidden_size
        
        # Add intermediate layer for better learning
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        
        x = self.dropout(pooled)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        
        return logits

print("‚úì Model architecture defined")

‚úì Model architecture defined


---
## üèãÔ∏è STEP 6: Train Model

In [34]:
%%time
print("Initializing model...")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(config.PRETRAINED_MODEL)
model = HierarchicalClassifier(config.PRETRAINED_MODEL, config.NUM_CLASSES)
model = model.to(device)

# Prepare dataset
train_dataset = ReviewDataset(train_corpus, silver_labels, tokenizer, config.MAX_LENGTH)
train_loader = DataLoader(
    train_dataset, 
    batch_size=config.BATCH_SIZE, 
    shuffle=True,
    num_workers=2 if torch.cuda.is_available() else 0
)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE, weight_decay=0.01)

# Learning rate scheduler
total_steps = len(train_loader) * config.NUM_EPOCHS
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

print(f"‚úì Model initialized")
print(f"  Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"  Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print(f"  Batches per epoch: {len(train_loader)}")

Initializing model...
‚úì Model initialized
  Total parameters: 110,481,171
  Trainable parameters: 110,481,171
  Batches per epoch: 461
CPU times: user 216 ms, sys: 40.1 ms, total: 256 ms
Wall time: 318 ms


In [37]:
%%time
print("\nStarting training...\n")

model.train()
best_loss = float('inf')
training_history = []

for epoch in range(config.NUM_EPOCHS):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.NUM_EPOCHS}")
    
    for batch_idx, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'avg_loss': f'{total_loss/(batch_idx+1):.4f}'
        })
    
    avg_loss = total_loss / len(train_loader)
    training_history.append(avg_loss)
    
    print(f"\n‚úì Epoch {epoch+1} completed - Avg Loss: {avg_loss:.4f}")
    
    # Save best model
    if avg_loss < best_loss:
        best_loss = avg_loss
        model_path = os.path.join(config.MODEL_DIR, 'best_model.pt')
        torch.save(model.state_dict(), model_path)
        print(f"  ‚úì Best model saved (loss: {best_loss:.4f})")

print(f"\n‚úì Training completed!")
print(f"  Best loss: {best_loss:.4f}")


Starting training...



Epoch 1/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 461/461 [08:30<00:00,  1.11s/it, loss=0.0337, avg_loss=0.0330]



‚úì Epoch 1 completed - Avg Loss: 0.0330
  ‚úì Best model saved (loss: 0.0330)


Epoch 2/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 461/461 [08:35<00:00,  1.12s/it, loss=0.0324, avg_loss=0.0327]



‚úì Epoch 2 completed - Avg Loss: 0.0327
  ‚úì Best model saved (loss: 0.0327)


Epoch 3/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 461/461 [08:34<00:00,  1.12s/it, loss=0.0319, avg_loss=0.0322]



‚úì Epoch 3 completed - Avg Loss: 0.0322
  ‚úì Best model saved (loss: 0.0322)


Epoch 4/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 461/461 [08:34<00:00,  1.12s/it, loss=0.0316, avg_loss=0.0320]



‚úì Epoch 4 completed - Avg Loss: 0.0320
  ‚úì Best model saved (loss: 0.0320)


Epoch 5/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 461/461 [08:31<00:00,  1.11s/it, loss=0.0311, avg_loss=0.0319]



‚úì Epoch 5 completed - Avg Loss: 0.0319
  ‚úì Best model saved (loss: 0.0319)

‚úì Training completed!
  Best loss: 0.0319
CPU times: user 42min 42s, sys: 7.05 s, total: 42min 49s
Wall time: 42min 59s


---
## üîÆ STEP 7: Generate Predictions

In [38]:
class TestDataset(Dataset):
    def __init__(self, corpus, tokenizer, max_length):
        self.pids = list(corpus.keys())
        self.texts = [corpus[pid] for pid in self.pids]
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.pids)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'pid': self.pids[idx],
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze()
        }

print("‚úì Test dataset class defined")

‚úì Test dataset class defined


In [39]:
%%time
print("Generating predictions...\n")

# Load best model
model_path = os.path.join(config.MODEL_DIR, 'best_model.pt')
model.load_state_dict(torch.load(model_path))
model.eval()

# Prepare test dataset
test_dataset = TestDataset(test_corpus, tokenizer, config.MAX_LENGTH)
test_loader = DataLoader(
    test_dataset, 
    batch_size=config.BATCH_SIZE, 
    shuffle=False,
    num_workers=2 if torch.cuda.is_available() else 0
)

all_predictions = {}

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        pids = batch['pid']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits)
        
        for i, pid in enumerate(pids):
            scores = probs[i].cpu().numpy()
            
            # CRITICAL FIX: Always take top-3 scores directly
            # This ensures maximum diversity in predictions
            top_indices = np.argsort(scores)[::-1][:3]
            final_labels = [int(idx) for idx in top_indices]
            
            # Ensure constraints (should already be 3, but just in case)
            final_labels = ensure_label_constraints(final_labels, 
                                                   config.MIN_LABELS, 
                                                   config.MAX_LABELS)
            all_predictions[pid] = final_labels

print(f"\n‚úì Generated predictions for {len(all_predictions)} samples")

# Statistics
pred_counts = [len(preds) for preds in all_predictions.values()]
print(f"  Prediction distribution: {pd.Series(pred_counts).value_counts().sort_index().to_dict()}")

# CRITICAL: Diversity analysis
from collections import Counter
all_classes = []
for labels in all_predictions.values():
    all_classes.extend(labels)
class_counts = Counter(all_classes)
print(f"\n  ‚ö†Ô∏è DIVERSITY CHECK:")
print(f"  Unique classes predicted: {len(class_counts)}/531")
print(f"  Target: 200+ for good score")
if len(class_counts) < 100:
    print(f"  ‚ö†Ô∏è WARNING: Low diversity! Model may be collapsing.")
elif len(class_counts) < 200:
    print(f"  ‚ö° Moderate diversity - score should improve")
else:
    print(f"  ‚úì Excellent diversity! Expect good score")

Generating predictions...



Predicting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 308/308 [02:03<00:00,  2.49it/s]


‚úì Generated predictions for 19658 samples
  Prediction distribution: {3: 19658}

  ‚ö†Ô∏è DIVERSITY CHECK:
  Unique classes predicted: 19/531
  Target: 200+ for good score
CPU times: user 2min 3s, sys: 312 ms, total: 2min 4s
Wall time: 2min 4s





In [42]:
# IMPROVED TF-IDF: Filter by similarity threshold
import numpy as np
import csv
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("Creating IMPROVED TF-IDF predictions...\n")

# Prepare class descriptions
class_descriptions = {}
for class_name, keywords in class2keywords.items():
    description = ' '.join(keywords).replace('_', ' ')
    class_descriptions[class_name] = description

print("Computing TF-IDF with quality filtering...")
vectorizer = TfidfVectorizer(
    max_features=15000,  # More features
    ngram_range=(1, 3),
    stop_words='english',
    min_df=2
)

all_texts = list(test_corpus.values()) + list(class_descriptions.values())
vectorizer.fit(all_texts)

test_vectors = vectorizer.transform(test_corpus.values())
class_vectors = vectorizer.transform([class_descriptions.get(id2class[i], '') 
                                     for i in range(531)])

similarities = cosine_similarity(test_vectors, class_vectors)

print("Generating predictions with similarity threshold...")
improved_preds = {}

SIMILARITY_THRESHOLD = 0.05  # Only accept decent matches

for idx, (pid, text) in enumerate(test_corpus.items()):
    tfidf_scores = similarities[idx]
    tfidf_sorted = np.argsort(tfidf_scores)[::-1]
    
    # Select top candidates that meet threshold
    selected = []
    for candidate in tfidf_sorted[:50]:  # Check top 50
        if tfidf_scores[candidate] > SIMILARITY_THRESHOLD:
            selected.append(int(candidate))
            if len(selected) == 3:
                break
    
    # If not enough, take top 3 anyway
    if len(selected) < 3:
        selected = [int(tfidf_sorted[0]), int(tfidf_sorted[1]), int(tfidf_sorted[2])]
    
    improved_preds[pid] = sorted(selected[:3])

# Check diversity
all_improved = []
for labels in improved_preds.values():
    all_improved.extend(labels)
improved_unique = len(set(all_improved))

print(f"\n{'='*60}")
print(f"IMPROVED TF-IDF RESULTS")
print(f"{'='*60}")
print(f"Previous TF-IDF: 529/531 classes (Score: 0.18)")
print(f"Improved TF-IDF: {improved_unique:3d}/531 classes")
print(f"{'='*60}")

# Save
with open('outputs/final_predictions_improved_tfidf.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'labels'])
    for pid in sorted(improved_preds.keys(), key=lambda x: int(x)):
        labels_str = ','.join(map(str, improved_preds[pid]))
        writer.writerow([pid, labels_str])

print(f"\n‚úì Improved TF-IDF predictions saved!")
print(f"‚úì File: outputs/final_predictions_improved_tfidf.csv")

# Show distribution
counter = Counter(all_improved)
print(f"\nTop 10 most predicted classes:")
for class_id, count in counter.most_common(10):
    print(f"  {id2class[class_id][:40]:40s}: {count:4d} times")
    
# Check if distribution is more balanced
most_common_count = counter.most_common(1)[0][1]
print(f"\nMost common class count: {most_common_count} (should be <2000)")

print(f"\n{'='*60}")
print(f"Expected score: ~0.20-0.35 (better balance)")
print(f"{'='*60}")

print(f"\nüì§ Download: final_predictions_improved_tfidf.csv")

Creating IMPROVED TF-IDF predictions...

Computing TF-IDF with quality filtering...
Generating predictions with similarity threshold...

IMPROVED TF-IDF RESULTS
Previous TF-IDF: 529/531 classes (Score: 0.18)
Improved TF-IDF: 529/531 classes

‚úì Improved TF-IDF predictions saved!
‚úì File: outputs/final_predictions_improved_tfidf.csv

Top 10 most predicted classes:
  styling_products                        : 1136 times
  play_vehicles                           : 1074 times
  dogs                                    :  941 times
  hair_care                               :  854 times
  hammering_pounding_toys                 :  782 times
  fragrance                               :  749 times
  baby_food                               :  581 times
  baby_products                           :  546 times
  water                                   :  512 times
  bottle_feeding                          :  454 times

Most common class count: 1136 (should be <2000)

Expected score: ~0.20-0.35 (bett

---
## üíæ STEP 8: Save Results

In [24]:
# Save predictions in CORRECT Kaggle format
import csv
output_file = os.path.join(config.OUTPUT_DIR, 'final_predictions.csv')

with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'labels'])  # CRITICAL: header must be 'id', not 'pid'
    for pid in sorted(all_predictions.keys(), key=lambda x: int(x)):
        labels_str = ','.join(map(str, all_predictions[pid]))
        writer.writerow([pid, labels_str])  # csv.writer adds quotes automatically

print(f"‚úì Predictions saved to: {output_file}")

# Show sample predictions
print("\nSample predictions:")
df = pd.read_csv(output_file)
print(df.head(10))

print(f"\n{'='*80}")
print(f"{'PIPELINE COMPLETE - OPTIMIZED VERSION!':^80}")
print(f"{'='*80}")
print(f"\n‚úì Final output: {output_file}")
print(f"‚úì Total samples: {len(all_predictions)}")
print(f"‚úì Format: CORRECT for Kaggle (id, labels with quotes)")
print(f"\nüì§ NEXT STEPS:")
print(f"  1. Download: {output_file}")
print(f"  2. Submit DIRECTLY to Kaggle (no rename needed)")
print(f"  3. Expected improvement: 0.08 ‚Üí 0.15-0.40+ üöÄ")
print(f"\nüí° KEY OPTIMIZATIONS APPLIED:")
print(f"  - Always takes top-3 predictions (no threshold)")
print(f"  - Lower TF-IDF threshold (0.05) for diverse silver labels")
print(f"  - Correct CSV format (id column, automatic quoting)")
print(f"  - Diversity analysis to verify model health")
print(f"\n{'='*80}")

‚úì Predictions saved to: outputs/final_predictions.csv

Sample predictions:
   id       labels
0   0   65,148,335
1   1   65,148,335
2   2   65,148,472
3   3   65,129,335
4   4   65,335,405
5   5   65,335,405
6   6   65,335,405
7   7  148,154,472
8   8   65,335,405
9   9  148,154,472

                     PIPELINE COMPLETE - OPTIMIZED VERSION!                     

‚úì Final output: outputs/final_predictions.csv
‚úì Total samples: 19658
‚úì Format: CORRECT for Kaggle (id, labels with quotes)

üì§ NEXT STEPS:
  1. Download: outputs/final_predictions.csv
  2. Submit DIRECTLY to Kaggle (no rename needed)
  3. Expected improvement: 0.08 ‚Üí 0.15-0.40+ üöÄ

üí° KEY OPTIMIZATIONS APPLIED:
  - Always takes top-3 predictions (no threshold)
  - Lower TF-IDF threshold (0.05) for diverse silver labels
  - Correct CSV format (id column, automatic quoting)
  - Diversity analysis to verify model health



---
## üìä STEP 9: Analysis (Optional)

In [None]:
# Analyze predictions
print("Prediction Analysis:\n")

# Class distribution
class_counts = {}
for preds in all_predictions.values():
    for pred in preds:
        class_counts[pred] = class_counts.get(pred, 0) + 1

print(f"Unique classes predicted: {len(class_counts)}/{config.NUM_CLASSES}")
print(f"\nTop 10 most predicted classes:")
top_classes = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)[:10]
for class_id, count in top_classes:
    print(f"  {id2class[class_id]:30s}: {count:4d} times")

# Show examples
print(f"\n\nExample predictions:")
for i, (pid, text) in enumerate(list(test_corpus.items())[:3]):
    preds = all_predictions[pid]
    print(f"\nReview {pid}:")
    print(f"  Text: {text[:150]}...")
    print(f"  Predictions: {[id2class[p] for p in preds]}")