# 🎯 TF-IDF Pure: Diversity Maximization

Objetivo: 250+ clases únicas → score 0.40-0.50

**Estrategia:**
1. TF-IDF puro sin BERT (BERT causa collapse)
2. Threshold muy bajo para maximizar diversidad
3. Calibración por percentiles
4. Keyword boosting agresivo

In [1]:
%%time
!pip install -q scikit-learn pandas numpy

import os, csv, re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter, defaultdict
from tqdm import tqdm

print('âœ“ Imports ready')

âœ“ Imports ready
CPU times: user 336 ms, sys: 141 ms, total: 477 ms
Wall time: 1.62 s


In [2]:
# Load data
def load_corpus(path):
    """Read a tab-separated corpus file into a dict.

    Each line is stripped of surrounding whitespace and split at the first
    tab; the part before the tab becomes the key and the remainder (which
    may itself contain tabs) becomes the value. Lines without a tab after
    stripping are skipped. A key that appears more than once keeps the
    value from its last occurrence.

    Args:
        path: Path of a UTF-8 text file.

    Returns:
        dict mapping the first field (str) to the rest of the line (str).
    """
    corpus = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            doc_id, tab, text = raw.strip().partition('\t')
            # `tab` is empty when the stripped line contains no tab at all.
            if tab:
                corpus[doc_id] = text
    return corpus

def load_classes(path):
    """Read an ``id<TAB>name`` file into a dict keyed by integer id.

    Only lines that split into exactly two tab-separated fields (after
    stripping surrounding whitespace) are used; all other lines are skipped.

    Args:
        path: Path of a UTF-8 text file.

    Returns:
        dict mapping int(first field) to the second field (str).

    Raises:
        ValueError: if the first field of a two-field line is not an integer.
    """
    id_to_name = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            fields = raw.strip().split('\t')
            if len(fields) != 2:
                continue
            class_id, class_name = fields
            id_to_name[int(class_id)] = class_name
    return id_to_name

def load_keywords(path):
    """Read a ``class_name: kw1, kw2, ...`` file into a dict of keyword lists.

    Each line is split at the first ':'; the left part is the class name and
    the right part is a comma-separated keyword list. Both the class name and
    every keyword are stripped of surrounding whitespace, and empty keywords
    (from a trailing comma or a line with nothing after the colon) are
    dropped. A class with no usable keywords therefore maps to an empty list,
    which lets callers detect "no keywords" with a plain truthiness check.
    Lines without a ':' are skipped.

    Args:
        path: Path of a UTF-8 text file.

    Returns:
        dict mapping class name (str) to a list of non-empty keyword strings.
    """
    d = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            name, sep, rest = line.strip().partition(':')
            if not sep:
                continue
            # Drop blank entries so [''] never masquerades as a real keyword list.
            d[name.strip()] = [w.strip() for w in rest.split(',') if w.strip()]
    return d

# Load the train/test corpora, the class id -> name table and the per-class
# keyword lists from the DATA directory.
DATA = 'data'
train = load_corpus(os.path.join(DATA, 'train/train_corpus.txt'))
test = load_corpus(os.path.join(DATA, 'test/test_corpus.txt'))
id2cls = load_classes(os.path.join(DATA, 'classes.txt'))
kw = load_keywords(os.path.join(DATA, 'class_related_keywords.txt'))

# Derive the class count from the file instead of hard-coding it, and fail
# early with a clear message if the ids are not exactly 0..NCLS-1: later cells
# index id2cls with range(NCLS) and use the class id as a matrix column index.
NCLS = len(id2cls)
assert sorted(id2cls) == list(range(NCLS)), (
    'classes.txt must define contiguous class ids 0..N-1'
)
print('Train:', len(train), 'Test:', len(test), 'Classes:', NCLS)

Train: 29487 Test: 19658 Classes: 531


In [3]:
%%time
# Build one text description per class id (0..NCLS-1) from its keyword list,
# falling back to the class name itself when no keywords are available.
def _class_text(cls_name):
    """Return the description string for one class.

    Underscores are replaced by spaces in every term and the whole term list
    is repeated 5 times before being joined with single spaces.
    """
    terms = kw.get(cls_name, [])
    if not terms:
        terms = [cls_name.replace('_', ' ')]
    spaced = [term.replace('_', ' ') for term in terms]
    return ' '.join(spaced * 5)

cls_desc = [_class_text(id2cls[cls_id]) for cls_id in range(NCLS)]

print('Class descriptions built with 5x keyword boost')

Class descriptions built with 5x keyword boost
CPU times: user 1.29 ms, sys: 0 ns, total: 1.29 ms
Wall time: 1.29 ms


In [6]:
%time
# TF-IDF vectorizer with aggressive parameters
vec = TfidfVectorizer(
    max_features=25000,
    ngram_range=(1,4),
    stop_words='english',
    min_df=1,
    sublinear_tf=True
)

# Fit on combined corpus
all_text = list(train.values()) + list(test.values()) + cls_desc
vec.fit(all_text)
print('Vocabulary size:', len(vec.vocabulary_))

# Transform
T_test = vec.transform(test.values())
C_vec = vec.transform(cls_desc)
print('Test matrix:', T_test.shape, 'Class matrix:', C_vec.shape)

CPU times: user 2 Î¼s, sys: 0 ns, total: 2 Î¼s
Wall time: 5.48 Î¼s
Vocabulary size: 25000
Test matrix: (19658, 25000) Class matrix: (531, 25000)


In [7]:
%%time
# Cosine similarity between every test document (rows) and every class
# description (columns).
S = cosine_similarity(X=T_test, Y=C_vec)
print('Similarity matrix:', S.shape)

Similarity matrix: (19658, 531)
CPU times: user 29 ms, sys: 44.1 ms, total: 73 ms
Wall time: 72.1 ms


In [8]:
%%time
# Predictions with an adaptive, per-sample similarity threshold.
#
# The selection rule is unchanged from the previous version of this cell, but
# the dead "at most 5" cap has been removed: only the first 3 candidates were
# ever stored, so each sample always ends up with 2 or 3 labels.
TOP_K = 30               # number of best-scoring classes considered per sample
THRESH_PERCENTILE = 50   # dynamic threshold = this percentile of the top-K scores
MIN_THRESH = 0.02        # floor applied to the dynamic threshold
MIN_PASS = 2             # fewer passing candidates than this -> fall back to top N_LABELS
N_LABELS = 3             # maximum (and fallback) number of labels per sample


def select_labels(scores, top_k=TOP_K, pct=THRESH_PERCENTILE,
                  min_thresh=MIN_THRESH, min_pass=MIN_PASS, n_labels=N_LABELS):
    """Pick class ids for one sample from its row of similarity scores.

    Args:
        scores: 1-D numpy array of similarities, indexed by class id.
        top_k: how many of the highest-scoring classes to consider.
        pct: percentile of the top-k scores used as the dynamic threshold.
        min_thresh: lower bound on the threshold.
        min_pass: if fewer than this many top-k classes reach the threshold,
            the threshold is ignored and the best `n_labels` are returned.
        n_labels: maximum number of labels returned.

    Returns:
        List of Python ints (class ids), best score first.

    NOTE(review): a row with no score above `min_thresh` (e.g. all zeros)
    still returns `n_labels` classes, chosen purely by argsort tie order.
    """
    # Same ordering call as before so ties resolve identically.
    order = np.argsort(scores)[::-1][:top_k]
    ranked = scores[order]
    thresh = max(np.percentile(ranked, pct), min_thresh)
    # `ranked` is in descending order, so the passing classes form a prefix.
    n_pass = int(np.count_nonzero(ranked >= thresh))
    n_keep = n_labels if n_pass < min_pass else min(n_pass, n_labels)
    return [int(j) for j in order[:n_keep]]


test_pids = list(test.keys())
preds = {}

# Row i of S corresponds to the i-th key of `test`.
for i, pid in enumerate(tqdm(test_pids, desc='Predict')):
    preds[pid] = select_labels(S[i])

print('âœ“ Predictions done')

Predict: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 19658/19658 [00:01<00:00, 11517.49it/s]

âœ“ Predictions done
CPU times: user 1.71 s, sys: 3.96 ms, total: 1.72 s
Wall time: 1.71 s





In [9]:
# Diversity check: how many distinct classes were predicted and how the
# predictions are spread across them.
all_pred_classes = [cls for labels in preds.values() for cls in labels]

cnt = Counter(all_pred_classes)
unique = len(set(all_pred_classes))

# Bucket the per-class prediction counts: >100, 50..100, 10..49, <10.
per_class = list(cnt.values())
n_over_100 = sum(n > 100 for n in per_class)
n_50_to_100 = sum(50 <= n <= 100 for n in per_class)
n_10_to_50 = sum(10 <= n < 50 for n in per_class)
n_under_10 = sum(n < 10 for n in per_class)

print(f'\nDIVERSITY ANALYSIS:')
print(f'Unique classes predicted: {unique}')
print(f'Target: 250+ for score 0.40-0.50')
print(f'\nTop 10 most frequent:')
for cls_id, times in cnt.most_common(10):
    print(f'  Class {cls_id} ({id2cls[cls_id]}): {times} times')
print(f'\nDistribution:')
print(f'  >100 times: {n_over_100}')
print(f'  50-100: {n_50_to_100}')
print(f'  10-50: {n_10_to_50}')
print(f'  <10: {n_under_10}')


DIVERSITY ANALYSIS:
Unique classes predicted: 531
Target: 250+ for score 0.40-0.50

Top 10 most frequent:
  Class 220 (fragrance): 764 times
  Class 199 (dogs): 383 times
  Class 65 (styling_products): 366 times
  Class 242 (men_s): 363 times
  Class 181 (grooming_healthcare_kits): 341 times
  Class 155 (bedding): 326 times
  Class 472 (kickball_playground_balls): 323 times
  Class 25 (deodorants_antiperspirants): 309 times
  Class 221 (women_s): 303 times
  Class 32 (household_batteries): 291 times

Distribution:
  >100 times: 249
  50-100: 190
  10-50: 91
  <10: 1


In [10]:
# Save the predictions as a two-column CSV (id, comma-joined label ids),
# with rows ordered by the integer value of the id.
OUT = 'outputs'
os.makedirs(OUT, exist_ok=True)
out = os.path.join(OUT, 'tfidf_diversity_max.csv')

with open(out, 'w', newline='', encoding='utf-8') as fh:
    writer = csv.writer(fh)
    writer.writerow(['id', 'labels'])
    writer.writerows(
        [pid, ','.join(str(label) for label in preds[pid])]
        for pid in sorted(preds, key=int)
    )

print(f'\nâœ“ Saved to: {out}')
print('\nUPLOAD THIS FILE TO KAGGLE')


âœ“ Saved to: outputs/tfidf_diversity_max.csv

UPLOAD THIS FILE TO KAGGLE
