In [40]:
# Cell 1: Install required libraries
!pip install datasets pandas numpy scikit-learn rapidfuzz sentence-transformers faiss-cpu -q

print("✓ Installation complete!")

✓ Installation complete!


In [41]:
# Cell 2: Import all necessary libraries
import pandas as pd
import numpy as np
from datasets import load_dataset
from collections import defaultdict, Counter
import re
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
from sentence_transformers import SentenceTransformer
import gc
from tqdm.auto import tqdm
tqdm.pandas()

print("✓ All libraries imported successfully!")

✓ All libraries imported successfully!


In [42]:
# Cell 3: Load datasets from Hugging Face
print("Loading datasets...")

# Load training data
train_data = load_dataset("123tushar/Dice_Challenge_2025",
                         data_files="train_data/*.parquet")
df_train = train_data['train'].to_pandas()
print(f"✓ Training data loaded: {len(df_train):,} rows")

# Load test prefixes
test_prefixes_data = load_dataset("123tushar/Dice_Challenge_2025",
                                 data_files="test_prefixes_data/*.parquet")
df_test = test_prefixes_data['train'].to_pandas()
print(f"✓ Test prefixes loaded: {len(df_test):,} rows")

# Load query features
query_features = load_dataset("123tushar/Dice_Challenge_2025",
                             data_files="query_features/*.parquet")
df_query_features = query_features['train'].to_pandas()
print(f"✓ Query features loaded: {len(df_query_features):,} rows")

# Load query pool
pool = load_dataset("123tushar/Dice_Challenge_2025",
                   data_files="pool/*.parquet")
df_pool = pool['train'].to_pandas()
print(f"✓ Query pool loaded: {len(df_pool):,} rows")

# Clear memory
del train_data, test_prefixes_data, query_features, pool
gc.collect()

print("\n✓ All data loaded successfully!")

Loading datasets...


Resolving data files:   0%|          | 0/159 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/159 [00:00<?, ?it/s]

✓ Training data loaded: 6,183,540 rows


Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

✓ Test prefixes loaded: 522,726 rows


Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

✓ Query features loaded: 5,339,026 rows


Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

✓ Query pool loaded: 4,168,668 rows

✓ All data loaded successfully!


In [43]:
# Cell 4: Quick data exploration
print("=== Data Exploration ===\n")

print("Training Data Sample:")
print(df_train.head(3))
print(f"\nShape: {df_train.shape}")

print("\n" + "="*50)
print("\nTest Prefixes Sample:")
print(df_test.head(3))
print(f"\nShape: {df_test.shape}")

print("\n" + "="*50)
print("\nQuery Features Sample:")
print(df_query_features.head(3))
print(f"\nShape: {df_query_features.shape}")

print("\n" + "="*50)
print("\nQuery Pool Sample:")
print(df_pool.head(3))
print(f"\nShape: {df_pool.shape}")

=== Data Exploration ===

Training Data Sample:
              prefix                    query
0        5 meter cot  1 5 meter cotton fabric
1  1 litre oil bottl       1 litre oil bottle
2          1 eeyar g       1 year boy punjabi

Shape: (6183540, 2)


Test Prefixes Sample:
                   prefix
0  full sleeve mehendi st
1             dancing jea
2          vridavan dress

Shape: (522726, 1)


Query Features Sample:
               query  catalog_clicks  orders  volume  catalog_views
0        0 baby girl            50.0     2.0    42.0         1796.0
1  0 bacchon ki gadi             5.0     NaN     8.0           42.0
2             0 bulb           575.0    13.0   553.0        20724.0

Shape: (5339026, 5)


Query Pool Sample:
               query
0      garba nosepin
1  jeans top girl 10
2      350 kurta set

Shape: (4168668, 1)


In [44]:
# Cell 5: Create text normalization functions
def normalize_text(text):
    """Normalize text for better matching"""
    if pd.isna(text):
        return ""
    text = str(text).lower().strip()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    return text

def clean_query(query):
    """Clean query text"""
    query = normalize_text(query)
    # Remove special characters but keep essential ones
    query = re.sub(r'[^a-z0-9\s\-]', '', query)
    return query.strip()

print("✓ Normalization functions created!")

✓ Normalization functions created!


In [45]:
# Cell 6: Preprocess all dataframes
print("Preprocessing data...\n")

# Normalize training data
df_train['prefix_clean'] = df_train['prefix'].apply(normalize_text)
df_train['query_clean'] = df_train['query'].apply(normalize_text)
print("✓ Training data normalized")

# Normalize test prefixes
df_test['prefix_clean'] = df_test['prefix'].apply(normalize_text)
print("✓ Test prefixes normalized")

# Normalize query pool
df_pool['query_clean'] = df_pool['query'].apply(normalize_text)
print("✓ Query pool normalized")

# Normalize query features
df_query_features['query_clean'] = df_query_features['query'].apply(normalize_text)
print("✓ Query features normalized")

print("\n✓ All preprocessing complete!")

Preprocessing data...

✓ Training data normalized
✓ Test prefixes normalized
✓ Query pool normalized
✓ Query features normalized

✓ All preprocessing complete!


In [46]:
# Cell 7: Build prefix-to-query mapping from training data
print("Building prefix-to-query mapping...")

# Create mapping: prefix -> list of queries
prefix_to_queries = defaultdict(list)
for _, row in tqdm(df_train.iterrows(), total=len(df_train), desc="Processing"):
    prefix_to_queries[row['prefix_clean']].append(row['query_clean'])

# Count frequency of each query for each prefix
prefix_query_freq = {}
for prefix, queries in tqdm(prefix_to_queries.items(), desc="Counting"):
    prefix_query_freq[prefix] = Counter(queries)

print(f"✓ Mapping created for {len(prefix_to_queries):,} unique prefixes")
print(f"  Average queries per prefix: {np.mean([len(q) for q in prefix_to_queries.values()]):.2f}")

Building prefix-to-query mapping...


Processing:   0%|          | 0/6183540 [00:00<?, ?it/s]

Counting:   0%|          | 0/736757 [00:00<?, ?it/s]

✓ Mapping created for 736,757 unique prefixes
  Average queries per prefix: 8.39


In [47]:
# Cell 8: Create popularity scores for queries
print("Creating query popularity scores...")

# Merge query features with pool
df_pool_features = df_pool.merge(
    df_query_features[['query_clean', 'orders', 'volume', 'catalog_clicks', 'catalog_views']],
    on='query_clean',
    how='left'
)

# Fill NaN with 0
df_pool_features.fillna(0, inplace=True)

# Create composite popularity score
df_pool_features['popularity_score'] = (
    df_pool_features['orders'] * 10 +
    df_pool_features['volume'] * 5 +
    df_pool_features['catalog_clicks'] * 2 +
    df_pool_features['catalog_views']
)

# Normalize to 0-1 range
max_score = df_pool_features['popularity_score'].max()
if max_score > 0:
    df_pool_features['popularity_norm'] = df_pool_features['popularity_score'] / max_score
else:
    df_pool_features['popularity_norm'] = 0

print("✓ Popularity scores created")
print(f"  Top 5 popular queries:")
top_queries = df_pool_features.nlargest(5, 'popularity_score')[['query_clean', 'popularity_score']]
for idx, row in top_queries.iterrows():
    print(f"    - {row['query_clean']}: {row['popularity_score']:.0f}")

Creating query popularity scores...
✓ Popularity scores created
  Top 5 popular queries:
    - short kurti: 531371527
    - kurti: 499747259
    - earring: 477612337
    - top for women: 436070826
    - kurti set: 374727230


In [58]:
# Cell 8.5: Clean up memory before building index
print("Cleaning up memory before building indexes...")

# Remove unnecessary columns from dataframes
if 'prefix' in df_train.columns:
    df_train = df_train[['prefix_clean', 'query_clean']]
if 'prefix' in df_test.columns:
    df_test_minimal = df_test[['prefix', 'prefix_clean']].copy()

# Keep only necessary columns in pool features
df_pool_features = df_pool_features[['query_clean', 'popularity_score', 'popularity_norm']]

# Reduce pool size if it's too large (keep most popular queries)
if len(df_pool_features) > 700000:
    print(f"  Original pool size: {len(df_pool_features):,}")
    # Keep top 100k by popularity + random sample
    top_queries = df_pool_features.nlargest(560000, 'popularity_score')
    random_sample = df_pool_features.sample(n=140000, random_state=42)
    df_pool_features = pd.concat([top_queries, random_sample]).drop_duplicates()
    print(f"  Reduced pool size: {len(df_pool_features):,}")

# Force garbage collection
gc.collect()

print("✓ Memory optimized!")
print(f"  Training data: {len(df_train):,} rows")
print(f"  Query pool: {len(df_pool_features):,} rows")

Cleaning up memory before building indexes...
✓ Memory optimized!
  Training data: 6,183,540 rows
  Query pool: 490,482 rows


In [59]:
# Cell 9: Create MEMORY-EFFICIENT prefix-based retrieval
print("Building memory-efficient prefix matching structure...")

# Clear some memory first
gc.collect()

# Use a more memory-efficient approach - limit prefix length
MAX_PREFIX_LEN = 15  # Reduced from 15 to save memory

def build_efficient_prefix_index(queries, max_prefix_len=12):
    """Build memory-efficient prefix index"""
    prefix_index = defaultdict(list)

    # Process in chunks to avoid memory issues
    chunk_size = 10000
    total_queries = len(queries)

    for start_idx in tqdm(range(0, total_queries, chunk_size), desc="Indexing chunks"):
        end_idx = min(start_idx + chunk_size, total_queries)
        chunk = queries[start_idx:end_idx]

        for query in chunk:
            query_str = str(query).lower().strip()
            # Only index shorter prefixes to save memory
            for i in range(1, min(len(query_str) + 1, max_prefix_len + 1)):
                prefix = query_str[:i]
                # Store only if list not too long (prevent memory explosion)
                if len(prefix_index[prefix]) < 5000:  # Limit per prefix
                    prefix_index[prefix].append(query_str)

    # Convert lists to sets and then back to lists to remove duplicates
    print("Deduplicating...")
    for prefix in tqdm(list(prefix_index.keys())[:10000], desc="Cleaning"):  # Process subset
        prefix_index[prefix] = list(set(prefix_index[prefix]))[:1000]  # Keep top 1000

    return dict(prefix_index)

# Build index for all queries in pool
all_queries = df_pool_features['query_clean'].unique()
print(f"Total unique queries: {len(all_queries):,}")

prefix_index = build_efficient_prefix_index(all_queries, max_prefix_len=MAX_PREFIX_LEN)

print(f"✓ Prefix index built with {len(prefix_index):,} entries")

# Create query to features mapping for quick lookup
query_to_features = df_pool_features.set_index('query_clean')[['popularity_norm']].to_dict()['popularity_norm']

# Clear memory
del all_queries
gc.collect()

print("✓ Prefix matching system ready!")
print(f"  Memory saved by limiting prefix length and entries per prefix")

Building memory-efficient prefix matching structure...
Total unique queries: 490,481


Indexing chunks:   0%|          | 0/50 [00:00<?, ?it/s]

Deduplicating...


Cleaning:   0%|          | 0/10000 [00:00<?, ?it/s]

✓ Prefix index built with 1,799,860 entries
✓ Prefix matching system ready!
  Memory saved by limiting prefix length and entries per prefix


In [60]:
# Cell 10: Setup LIGHTWEIGHT fuzzy matching
print("Setting up lightweight fuzzy matching...")

# Clear memory first
gc.collect()

# Keep only subset of unique queries for fuzzy matching (to save memory)
# Prioritize popular queries
df_pool_sorted = df_pool_features.sort_values('popularity_score', ascending=False)
unique_queries_subset = df_pool_sorted['query_clean'].head(50000).unique().tolist()

print(f"✓ Using {len(unique_queries_subset):,} queries for fuzzy matching")
print("  (Top queries by popularity to optimize memory)")

def get_fuzzy_matches(prefix, query_list, limit=100, score_threshold=70):
    """Get fuzzy matches for a prefix - lighter version"""
    # Only do fuzzy matching for prefixes of reasonable length
    if len(prefix) < 2 or len(prefix) > 20:
        return []

    matches = process.extract(
        prefix,
        query_list,
        scorer=fuzz.partial_ratio,
        limit=min(limit, 100)  # Limit to save memory
    )
    # Filter by threshold
    filtered = [(match, score) for match, score, _ in matches if score >= score_threshold]
    return filtered

print("✓ Lightweight fuzzy matching ready!")
print("  This will help handle typos like 'ifon' -> 'iphone'")

Setting up lightweight fuzzy matching...
✓ Using 50,000 queries for fuzzy matching
  (Top queries by popularity to optimize memory)
✓ Lightweight fuzzy matching ready!
  This will help handle typos like 'ifon' -> 'iphone'


In [61]:
# Cell 12: OPTIMIZED retrieval function for low memory
def retrieve_candidates(prefix, top_k=150):
    """
    Memory-efficient retrieval using multiple strategies
    """
    candidates = {}  # query -> score
    prefix_clean = normalize_text(prefix)

    # Strategy 1: Historical matches (HIGHEST PRIORITY - 70% weight)
    if prefix_clean in prefix_query_freq:
        for query, freq in prefix_query_freq[prefix_clean].items():
            candidates[query] = candidates.get(query, 0) + freq * 100

    # Strategy 2: Direct prefix matching (30% weight)
    if prefix_clean in prefix_index:
        matching_queries = prefix_index[prefix_clean][:500]  # Limit to save memory
        for query in matching_queries:
            base_score = 50
            # Boost by popularity
            pop_score = query_to_features.get(query, 0) * 20
            candidates[query] = candidates.get(query, 0) + base_score + pop_score

    # Strategy 3: Character-by-character prefix expansion
    # Search for queries starting with first few characters
    if len(candidates) < top_k and len(prefix_clean) >= 2:
        for i in range(min(len(prefix_clean), 6), 1, -1):  # Check shorter prefixes
            partial_prefix = prefix_clean[:i]
            if partial_prefix in prefix_index:
                partial_matches = prefix_index[partial_prefix][:300]
                for query in partial_matches:
                    if query.startswith(prefix_clean[:2]):  # Must match first 2 chars
                        base_score = 25 * (i / len(prefix_clean))
                        pop_score = query_to_features.get(query, 0) * 10
                        if query not in candidates:
                            candidates[query] = base_score + pop_score
                if len(candidates) >= top_k:
                    break

    # Strategy 4: Fuzzy matching for typos (only if needed)
    if len(candidates) < top_k and len(prefix_clean) >= 3:
        fuzzy_matches = get_fuzzy_matches(
            prefix_clean,
            unique_queries_subset,
            limit=50,  # Reduced limit
            score_threshold=60  # Higher threshold
        )
        for query, score in fuzzy_matches:
            if query not in candidates:
                fuzzy_score = (score / 100) * 30
                pop_score = query_to_features.get(query, 0) * 10
                candidates[query] = fuzzy_score + pop_score

    # Strategy 5: Popular queries as fallback (only if really needed)
    if len(candidates) < top_k:
        # Get popular queries efficiently
        popular_queries = df_pool_features.nlargest(
            top_k * 2,
            'popularity_score'
        )['query_clean'].tolist()

        for query in popular_queries:
            if query not in candidates:
                candidates[query] = query_to_features.get(query, 0) * 5
            if len(candidates) >= top_k * 2:  # Enough candidates
                break

    # Sort by score and return top-k
    sorted_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)
    top_queries = [query for query, score in sorted_candidates[:top_k]]

    # Ensure we have exactly top_k queries
    if len(top_queries) < top_k:
        # Fill remaining with any queries from pool
        remaining = top_k - len(top_queries)
        pool_sample = df_pool_features.sample(n=min(remaining * 2, len(df_pool_features)))
        for query in pool_sample['query_clean'].tolist():
            if query not in top_queries:
                top_queries.append(query)
                if len(top_queries) >= top_k:
                    break

    return top_queries[:top_k]

print("✓ Memory-efficient retrieval function ready!")
print("  Strategies optimized to minimize memory usage")

✓ Memory-efficient retrieval function ready!
  Strategies optimized to minimize memory usage


In [62]:
# Cell 13: Test the retrieval function with sample prefixes
print("Testing retrieval function...\n")

test_prefixes_sample = ['sho', 'red dr', 'ifon', 'lapt']

for test_prefix in test_prefixes_sample:
    results = retrieve_candidates(test_prefix, top_k=10)
    print(f"Prefix: '{test_prefix}'")
    print(f"Top 10 predictions: {results[:10]}\n")

Testing retrieval function...

Prefix: 'sho'
Top 10 predictions: ['shorts for women', 'shoes for men', 'short kurti combo', 'shoe rack', 'shorts for men', 'women shoes', 'shoes', 'short kurti', 'shorts', 'short kurti combo set']

Prefix: 'red dr'
Top 10 predictions: ['red dress', 'red dress for women', 'red dress for girl', 'red dress for kids', 'red dress for baby girl', 'red dragon fruit plant', 'red dress for boys', 'baby girl red dress', 'boy red dress', 'red dress for woman']

Prefix: 'ifon'
Top 10 predictions: ['ifb washing machine cover', 'ifb washing liquid', 'ifb descale powder', 'ifb microwave oven stand', 'ifb tub clean powder', 'ifb washing cover 7 kg', 'iftar', 'ifb washing cover', 'ifb washing machine 7 kg', 'ifb front load cover']

Prefix: 'lapt'
Top 10 predictions: ['laptop', 'laptop bag', 'laptop stand', 'laptop table', 'laptop cover', 'laptop sticker', 'laptop cleaner', 'kids toy laptop', 'laptop cleaning kit', 'laptop bag woman']



In [63]:
# Cell 14: Generate predictions for all test prefixes
print("Generating predictions for all test prefixes...")
print(f"Total prefixes to process: {len(df_test):,}\n")

predictions = []

for idx, row in tqdm(df_test.iterrows(), total=len(df_test), desc="Predicting"):
    prefix = row['prefix']
    retrieved_queries = retrieve_candidates(prefix, top_k=150)
    predictions.append({
        'prefix': prefix,
        'retrieved_queries': retrieved_queries
    })

# Create results dataframe
df_results = pd.DataFrame(predictions)

print(f"\n✓ Generated predictions for {len(df_results):,} prefixes")
print(f"  Sample result:")
print(f"  Prefix: {df_results.iloc[0]['prefix']}")
print(f"  Queries: {df_results.iloc[0]['retrieved_queries'][:5]}...")

Generating predictions for all test prefixes...
Total prefixes to process: 522,726



Predicting:   0%|          | 0/522726 [00:00<?, ?it/s]


✓ Generated predictions for 522,726 prefixes
  Sample result:
  Prefix: full sleeve mehendi st
  Queries: ['full sleeve mehendi stick', 'full sleeve mehendi stencil', 'full sleeve blouse', 'full sleeve kurti', 'full sleeve tshirt women']...


In [64]:
# Cell 15: Save predictions to CSV
import json

# Convert list to JSON string for CSV format
df_results['retrieved_queries_str'] = df_results['retrieved_queries'].apply(json.dumps)

# Create final submission dataframe
df_submission = df_results[['prefix', 'retrieved_queries_str']].copy()
df_submission.columns = ['prefix', 'retrieved_queries']

# Save to CSV
df_submission.to_csv('submission2.csv', index=False)

print("✓ Submission saved to 'submission2.csv'")
print(f"  File size: {len(df_submission):,} rows")

# Show first few rows
print("\nFirst 3 rows of submission:")
print(df_submission.head(3))

✓ Submission saved to 'submission2.csv'
  File size: 522,726 rows

First 3 rows of submission:
                   prefix                                  retrieved_queries
0  full sleeve mehendi st  ["full sleeve mehendi stick", "full sleeve meh...
1             dancing jea  ["dancing cactus", "dancing cactus toy", "danc...
2          vridavan dress  ["dress", "man dress", "jean dress", "press", ...


In [55]:
# Cell 16: Save predictions to Parquet (more efficient)
# Keep the list format for parquet
df_results[['prefix', 'retrieved_queries']].to_parquet('submission1.parquet', index=False)

print("✓ Submission also saved to 'submission1.parquet'")
print("\nYou can submit either the CSV or Parquet file!")

✓ Submission also saved to 'submission1.parquet'

You can submit either the CSV or Parquet file!


In [65]:
# Cell 17: Validate submission format
print("=== Submission Validation ===\n")

# Check 1: Row count
print(f"✓ Number of predictions: {len(df_results):,}")
print(f"  Expected: {len(df_test):,}")
print(f"  Match: {len(df_results) == len(df_test)}")

# Check 2: All prefixes covered
missing_prefixes = set(df_test['prefix']) - set(df_results['prefix'])
print(f"\n✓ Missing prefixes: {len(missing_prefixes)}")

# Check 3: Number of queries per prefix
query_counts = df_results['retrieved_queries'].apply(len)
print(f"\n✓ Queries per prefix:")
print(f"  Min: {query_counts.min()}")
print(f"  Max: {query_counts.max()}")
print(f"  Mean: {query_counts.mean():.2f}")
print(f"  All have 150: {(query_counts == 150).all()}")

# Check 4: Sample quality
print(f"\n✓ Sample predictions:")
for i in range(min(3, len(df_results))):
    prefix = df_results.iloc[i]['prefix']
    queries = df_results.iloc[i]['retrieved_queries'][:5]
    print(f"  '{prefix}' -> {queries}")

print("\n" + "="*50)
print("✓ Validation complete!")

=== Submission Validation ===

✓ Number of predictions: 522,726
  Expected: 522,726
  Match: True

✓ Missing prefixes: 0

✓ Queries per prefix:
  Min: 150
  Max: 150
  Mean: 150.00
  All have 150: True

✓ Sample predictions:
  'full sleeve mehendi st' -> ['full sleeve mehendi stick', 'full sleeve mehendi stencil', 'full sleeve blouse', 'full sleeve kurti', 'full sleeve tshirt women']
  'dancing jea' -> ['dancing cactus', 'dancing cactus toy', 'dancing toy', 'dancing monkey toy', 'dancing car']
  'vridavan dress' -> ['dress', 'man dress', 'jean dress', 'press', 'new dress']

✓ Validation complete!


In [66]:
# Cell 18: Analyze which strategies contributed most
print("=== Strategy Performance Analysis ===\n")

# Count coverage from each strategy
total_prefixes = len(df_test)

# Historical matches
historical_coverage = sum(1 for p in df_test['prefix_clean'] if p in prefix_query_freq)
print(f"Historical Match Coverage: {historical_coverage}/{total_prefixes} ({100*historical_coverage/total_prefixes:.1f}%)")

# Prefix index coverage
prefix_coverage = sum(1 for p in df_test['prefix_clean'] if p in prefix_index)
print(f"Prefix Index Coverage: {prefix_coverage}/{total_prefixes} ({100*prefix_coverage/total_prefixes:.1f}%)")

# Analyze prefix length distribution
df_test['prefix_length'] = df_test['prefix_clean'].str.len()
print(f"\nPrefix Length Distribution:")
print(df_test['prefix_length'].describe())

print("\n" + "="*50)

=== Strategy Performance Analysis ===

Historical Match Coverage: 87365/522726 (16.7%)
Prefix Index Coverage: 190552/522726 (36.5%)

Prefix Length Distribution:
count    522726.000000
mean         13.419832
std           4.713659
min           1.000000
25%          10.000000
50%          13.000000
75%          17.000000
max          25.000000
Name: prefix_length, dtype: float64

