In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit

def ndcg_at_k(r, k=5):
    r = np.asfarray(r)[:k]
    if r.size == 0:
        return 0.0
    dcg = np.sum((2**r - 1) / np.log2(np.arange(2, r.size + 2)))
    ideal_r = np.sort(r)[::-1]
    idcg = np.sum((2**ideal_r - 1) / np.log2(np.arange(2, r.size + 2)))
    return dcg / idcg if idcg > 0 else 0.0

# Load training data
full_train_df = pd.read_parquet('training_set_processed.parquet')

# Sample 10% of the data by search_id (to prevent leakage)
sampled_srch_ids = full_train_df['srch_id'].drop_duplicates().sample(frac=0.2, random_state=42)
train_df = full_train_df[full_train_df['srch_id'].isin(sampled_srch_ids)].copy()

# Split into training and validation set (again, by srch_id to avoid leakage)
splitter = GroupShuffleSplit(test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(train_df, groups=train_df['srch_id']))
train_data = train_df.iloc[train_idx].copy()
val_data = train_df.iloc[val_idx].copy()

# Feature columns to use
feature_cols = [
    'prop_starrating', 'prop_review_score', 'prop_brand_bool',
    'prop_location_score1', 'prop_location_score2', 'price_usd',
    'prop_log_historical_price', 'promotion_flag', 'orig_destination_distance'
]

# Handle special case: prop_log_historical_price == 0 means missing
train_data['prop_log_historical_price'] = train_data['prop_log_historical_price'].replace(0, np.nan)

# Add missing value indicator columns
for col in ['prop_review_score', 'orig_destination_distance']:
    train_data[f'{col}_missing'] = train_data[col].isna().astype(int)


# Feature Engineering
# price_per_night
train_data['price_per_night'] = train_data['price_usd'] / train_data['srch_length_of_stay'].replace(0, np.nan)

# price_per_person
group_size = train_data['srch_adults_count'] + train_data['srch_children_count']
train_data['price_per_person'] = train_data['price_usd'] / group_size.replace(0, np.nan)

# price_ratio_to_mean_in_search
mean_price_per_search = train_data.groupby('srch_id')['price_usd'].transform('mean')
train_data['price_ratio_to_mean'] = train_data['price_usd'] / mean_price_per_search.replace(0, np.nan)

# price × star rating interaction
train_data['price_x_star'] = train_data['price_usd'] * train_data['prop_starrating']

# same_country_bool
train_data['same_country'] = (train_data['visitor_location_country_id'] == train_data['prop_country_id']).astype(int)

# Apply imputation
train_data['prop_review_score'] = train_data['prop_review_score'].fillna(train_data['prop_review_score'].median())
train_data['orig_destination_distance'] = train_data['orig_destination_distance'].fillna(train_data['orig_destination_distance'].mean())
train_data['prop_log_historical_price'] = train_data['prop_log_historical_price'].fillna(train_data['prop_log_historical_price'].mean())

# Rebuild feature list with missing indicators
feature_cols.extend(['price_per_night', 'price_per_person', 'price_ratio_to_mean', 'price_x_star', 'same_country'])
feature_cols.extend(['prop_review_score_missing', 'orig_destination_distance_missing'])

# Limit training to top 40000 most frequent hotels
top_props = train_data['prop_id'].value_counts().head(30000).index
train_data = train_data[train_data['prop_id'].isin(top_props)]

# Create item profiles
item_profiles = train_data[['prop_id'] + feature_cols].drop_duplicates('prop_id').set_index('prop_id')
item_profiles.replace([np.inf, -np.inf], np.nan, inplace=True)
item_profiles.fillna(item_profiles.mean(), inplace=True)
item_profiles.fillna(0, inplace=True)  # final fallback

# Clip extreme values to prevent overflow
for col in feature_cols:
    item_profiles[col] = np.clip(item_profiles[col], -1e6, 1e6)

# Standardize
scaler = StandardScaler()
item_profiles_scaled = pd.DataFrame(
    scaler.fit_transform(item_profiles.astype(np.float64)), index=item_profiles.index, columns=feature_cols)

# Similarity matrix
similarity_matrix = pd.DataFrame(
    cosine_similarity(item_profiles_scaled),
    index=item_profiles_scaled.index,
    columns=item_profiles_scaled.index
)

# Precompute top-10 neighbors per hotel
precomputed_neighbors = {}
for prop in similarity_matrix.index:
    sim_series = similarity_matrix.loc[prop].drop(prop, errors='ignore')
    top_k = sim_series.nlargest(10)
    precomputed_neighbors[prop] = list(top_k.items())
similarity_matrix = pd.DataFrame(
    cosine_similarity(item_profiles_scaled),
    index=item_profiles_scaled.index,
    columns=item_profiles_scaled.index
)

# Click/booking rates
click_rate = train_data.groupby('prop_id')['click_bool'].mean().to_dict()
book_rate = train_data.groupby('prop_id')['booking_bool'].mean().to_dict()

# Scoring function
def score_with_knn(val_df, precomputed_neighbors):
    results = []
    for srch_id, group in val_df.groupby('srch_id'):
        props = group['prop_id'].values
        scores = {}
        for prop in props:
            if prop not in similarity_matrix.index:
                scores[prop] = 0
                continue
            sim_series = similarity_matrix[prop].drop(prop, errors='ignore')
            top_k = precomputed_neighbors.get(prop, [])
            score = 0
            for neighbor, sim in top_k.items():
                cr = click_rate.get(neighbor, 0)
                br = book_rate.get(neighbor, 0)
                score += sim * (br * 5 + cr)
            scores[prop] = score
        ranked_props = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for prop, score in ranked_props:
            results.append({'srch_id': srch_id, 'prop_id': prop, 'score': score})
    return pd.DataFrame(results)

# Limit validation set to 5000 search IDs
val_data = val_data[val_data['srch_id'].isin(val_data['srch_id'].unique()[:5000])]

import time
start_time = time.time()
ranked_df = score_with_knn(val_data, similarity_matrix)
print(f'Time to score 5000 validation searches: {time.time() - start_time:.2f} seconds')

# Attach relevance
true_labels = val_data[['srch_id', 'prop_id', 'click_bool', 'booking_bool']]
ranked_df = ranked_df.merge(true_labels, on=['srch_id', 'prop_id'], how='left')
ranked_df['relevance'] = ranked_df['booking_bool'] * 5 + ranked_df['click_bool']

# NDCG@5
ndcg_scores = []
for srch_id, group in ranked_df.groupby('srch_id'):
    sorted_group = group.sort_values('score', ascending=False)
    ndcg_scores.append(ndcg_at_k(sorted_group['relevance'].values, k=5))

print(f"Validation NDCG@5: {np.mean(ndcg_scores):.4f}")

# Save output
ranked_df.to_csv('ranked_knn_imputed_output.csv', index=False)


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  casted = dtype.type(element)


Time to score 5000 validation searches: 662.26 seconds
Validation NDCG@5: 0.1792


In [None]:
# Load test set
test_df = pd.read_parquet('test_set_processed.parquet')

# Filter to only props included in training similarity matrix
test_df = test_df[test_df['prop_id'].isin(similarity_matrix.index)]

# Score function using precomputed_neighbors, click_rate, and book_rate
def score_with_knn_test(test_df, precomputed_neighbors):
    results = []
    for srch_id, group in test_df.groupby('srch_id'):
        props = group['prop_id'].values
        scores = {}
        for prop in props:
            if prop not in precomputed_neighbors:
                scores[prop] = 0
                continue
            top_k = precomputed_neighbors.get(prop, [])
            score = 0
            for neighbor, sim in top_k:
                cr = click_rate.get(neighbor, 0)
                br = book_rate.get(neighbor, 0)
                score += sim * (br * 5 + cr)
            scores[prop] = score
        ranked_props = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for rank, (prop, score) in enumerate(ranked_props):
            results.append({'srch_id': srch_id, 'prop_id': prop, 'rank': rank})
    return pd.DataFrame(results)

# Score and export
ranked_test = score_with_knn_test(test_df, precomputed_neighbors)
ranked_test.sort_values(['srch_id', 'rank'])[['srch_id', 'prop_id']].to_csv('submission5.csv', index=False)


✅ submission.csv created successfully.
