In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from transformers import AutoModel
import lightgbm as lgb


This notebook uses the LightGBM models we created as an ensemble to perform inference. It generates the necessary CSV file for Kaggle submission and saves it.

### Load data

In [None]:
# Feature tables for the test users and the candidate reviews.
test_users_features, test_reviews_features = (
    pd.read_parquet(path)
    for path in ('test_users_features.parquet', 'test_reviews_features.parquet')
)

# Token tables; their `input_ids` / `attention_mask` columns are consumed by encode_pairwise.
test_users_tokens, test_reviews_tokens = (
    pd.read_parquet(path)
    for path in ('test_users_tokens.parquet', 'test_reviews_tokens.parquet')
)

### Functions and classes used for inference

In [None]:
def create_subsets(users_features, users_tokens, reviews_features, reviews_tokens, matches, total_parts, part):
    """
    Restrict every input frame to one contiguous slice of accommodations.

    The distinct accommodation ids found in ``matches`` are sorted and cut into
    ``total_parts`` consecutive slices whose sizes differ by at most one (the
    first ``len(ids) % total_parts`` slices receive the extra id). Each frame is
    then filtered to the accommodations of slice number ``part``.

    Args:
        users_features: DataFrame with user features
        users_tokens: DataFrame with user tokens
        reviews_features: DataFrame with review features
        reviews_tokens: DataFrame with review tokens
        matches: DataFrame with matches
        total_parts: total number of parts to split the data into
        part: zero-based part to select
    Returns:
        Tuple of DataFrames with user features, user tokens, review features, review tokens, and matches
    """
    ordered_ids = sorted(matches['accommodation_id'].unique().tolist())

    base_size, extra = divmod(len(ordered_ids), total_parts)

    # Slices before `part` that received an extra id shift the window to the right.
    lo = part * base_size + min(part, extra)
    hi = (part + 1) * base_size + min(part + 1, extra)
    chosen_ids = ordered_ids[lo:hi]

    def keep_chosen(frame):
        return frame[frame['accommodation_id'].isin(chosen_ids)]

    return (
        keep_chosen(users_features),
        keep_chosen(users_tokens),
        keep_chosen(reviews_features),
        keep_chosen(reviews_tokens),
        keep_chosen(matches),
    )

class TwoTowersNetwork(nn.Module):
    """
    Two separately-held encoders: one for the user side, one for the review side.

    Both towers are instantiated from the same pretrained ``model_id``. The
    token embeddings of each tower are reduced to a single vector per example by
    an attention-masked mean.

    Args:
        model_id (str): Model identifier.
    """
    def __init__(self, model_id):
        super().__init__()
        # The attribute names are part of the state_dict keys; keep them stable.
        self.bert1 = AutoModel.from_pretrained(model_id)
        self.bert2 = AutoModel.from_pretrained(model_id)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings (``model_output[0]``) over positions where the mask is set."""
        hidden_states = model_output[0]
        weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        summed = torch.sum(hidden_states * weights, 1)
        # The clamp avoids a division by zero for rows whose mask is all zeros.
        counts = torch.clamp(weights.sum(1), min=1e-9)
        return summed / counts

    def forward(self, user_ids, user_mask, review_ids, review_mask):
        """Return ``(context_embed, review_embed)``: pooled outputs of the user and review towers."""
        context_embed = self.mean_pooling(
            self.bert1(user_ids, attention_mask=user_mask), user_mask
        )
        review_embed = self.mean_pooling(
            self.bert2(review_ids, attention_mask=review_mask), review_mask
        )
        return context_embed, review_embed

def pad_sequence(sequences, batch_first=False, padding_value=0):
    """
    Pad a list of variable length sequences with padding_value.

    Each sequence is extended along its first dimension up to the length of the
    longest one; any trailing dimensions must agree between sequences. The
    padding is created with the dtype and device of the sequence it extends.

    Args:
        sequences: Non-empty iterable of tensors with at least one dimension.
        batch_first (bool): If True the result is (batch, length, ...),
            otherwise (length, batch, ...).
        padding_value: Fill value for the padded positions.
    Returns:
        Tensor stacking all padded sequences.
    Raises:
        ValueError: If no sequences are given.
    """
    sequences = list(sequences)
    if not sequences:
        raise ValueError('pad_sequence() received no sequences to pad')

    max_len = max(len(seq) for seq in sequences)
    padded_sequences = []

    for seq in sequences:
        # new_full keeps dtype *and* device of `seq`, so torch.cat below never
        # mixes devices; the trailing dims let (length, dim) sequences pad too.
        pad_shape = (max_len - len(seq),) + tuple(seq.shape[1:])
        padding = seq.new_full(pad_shape, padding_value)
        padded_sequences.append(torch.cat((seq, padding)))

    stacked = torch.stack(padded_sequences)
    return stacked if batch_first else stacked.transpose(0, 1)

def _encode_tower(tokens, encoder, pool, label, batch_size, device):
    """Run one tower over ``tokens`` in batches and return one pooled vector per row."""
    pooled_rows = []
    total = len(tokens)

    for start in range(0, total, batch_size):
        batch = tokens.iloc[start:start + batch_size]

        ids = pad_sequence([torch.tensor(x) for x in batch['input_ids']], batch_first=True).to(device)
        mask = pad_sequence([torch.tensor(x) for x in batch['attention_mask']], batch_first=True).to(device)

        with torch.no_grad():
            pooled = pool(encoder(ids, attention_mask=mask), mask)
        pooled_rows.extend(pooled.cpu().numpy())

        if (start // batch_size) % 500 == 0:
            print(f'Processed {start}/{total} {label} samples', flush=True)

    return pooled_rows


def _embeddings_frame(rows, prefix):
    """Turn a list of equally sized vectors into a DataFrame with ``{prefix}_{i}`` columns."""
    matrix = np.asarray(rows)
    if matrix.ndim != 2:
        # Nothing was encoded, so the embedding width is unknown.
        return pd.DataFrame()
    return pd.DataFrame(matrix, columns=[f'{prefix}_{i}' for i in range(matrix.shape[1])])


def encode_pairwise(train_users_tokens, train_reviews_tokens, model, batch_size=64, device='cuda'):
    """
    Encode user and review tokens using the model.

    The two towers are run independently, so the user and review frames may
    have different numbers of rows: every user row and every review row gets
    exactly one embedding, in input order.

    Args:
        train_users_tokens (DataFrame): DataFrame with user tokens
            (columns ``input_ids`` and ``attention_mask``)
        train_reviews_tokens (DataFrame): DataFrame with review tokens
            (columns ``input_ids`` and ``attention_mask``)
        model (nn.Module): Two-tower model exposing ``bert1`` (user encoder),
            ``bert2`` (review encoder) and ``mean_pooling``
        batch_size (int): Batch size
        device (str): Device to use
    Returns:
        Tuple of DataFrames with user and review embeddings. Columns are named
        ``user_embedding_{i}`` / ``review_embedding_{i}``; an empty input frame
        yields an empty DataFrame.
    """
    model = model.to(device)
    model.eval()

    user_embeddings = _encode_tower(
        train_users_tokens, model.bert1, model.mean_pooling, 'user', batch_size, device
    )
    review_embeddings = _encode_tower(
        train_reviews_tokens, model.bert2, model.mean_pooling, 'review', batch_size, device
    )
    print('', flush=True)

    user_embeddings_df = _embeddings_frame(user_embeddings, 'user_embedding')
    review_embeddings_df = _embeddings_frame(review_embeddings, 'review_embedding')

    return user_embeddings_df, review_embeddings_df

def embed_data(users_tokens, reviews_tokens, model_path, batch_size=64, device='cuda',
               model_id="sentence-transformers/all-MiniLM-L12-v2"):
    """
    Load the trained two-tower model and encode user and review tokens with it.
    Args:
        users_tokens (DataFrame): DataFrame with user tokens
        reviews_tokens (DataFrame): DataFrame with review tokens
        model_path (str): Path to the saved state dict of a TwoTowersNetwork
        batch_size (int): Batch size
        device (str): Device to use
        model_id (str): Pretrained backbone the towers are built from; it must
            match the architecture the state dict was saved for
    Returns:
        Tuple of DataFrames with user and review embeddings
    """
    model = TwoTowersNetwork(model_id).to(device)

    # map_location makes a checkpoint saved on one device loadable on another
    # (e.g. GPU-trained weights on a CPU-only machine).
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)

    user_embeddings, review_embeddings = encode_pairwise(
        users_tokens, reviews_tokens, model, batch_size=batch_size, device=device
    )

    return user_embeddings, review_embeddings

def generate_user_review_pairs(val_users, val_reviews, accommodation_reviews, sample_fraction=0.1):
    """
    Generate user-review pairs for evaluation.

    Every user row is paired with every review listed for the user's
    accommodation in ``accommodation_reviews``; review ids that do not occur in
    ``val_reviews`` are skipped.

    Args:
        val_users (DataFrame): DataFrame with user features; must contain
            ``user_id`` and ``accommodation_id``
        val_reviews (DataFrame): DataFrame with review features; must contain
            ``review_id`` and ``accommodation_id``
        accommodation_reviews (dict): Maps accommodation id to a list of review ids
        sample_fraction (float): Currently unused; kept so existing calls keep
            working. No sampling is performed.
    Returns:
        List of tuples with user ID, review ID, and combined features (a Series
        holding the user's features followed by the review's features)
    """
    val_reviews_indexed = val_reviews.set_index('review_id')
    known_review_ids = val_reviews_indexed.index
    # Review features do not depend on the user, so look each review up once.
    review_feature_cache = {}
    user_review_pairs = []
    total_users = len(val_users)

    # Count rows with enumerate instead of trusting the index labels, which
    # need not be consecutive integers (or integers at all).
    for position, (_, user_row) in enumerate(val_users.iterrows(), start=1):
        user_id = user_row['user_id']
        acc_id = user_row['accommodation_id']
        user_features = user_row.drop(['user_id', 'accommodation_id'])

        for review_id in accommodation_reviews.get(acc_id, []):
            if review_id not in review_feature_cache:
                if review_id not in known_review_ids:
                    continue
                review_feature_cache[review_id] = (
                    val_reviews_indexed.loc[review_id].drop(['accommodation_id'])
                )

            combined_features = pd.concat([user_features, review_feature_cache[review_id]])
            user_review_pairs.append((user_id, review_id, combined_features))

        if position % 1000 == 0:
            print(f'Processed {position}/{total_users} users', flush=True)

    return user_review_pairs

def predict_and_rank(user_review_pairs, model, batch_size=10000):
    """
    Score every user-review pair with ``model`` and rank the reviews per user.
    Args:
        user_review_pairs (list): List of tuples with user ID, review ID, and combined features
        model: Object whose ``predict`` method maps a 2-D feature array to one score per row
        batch_size (int): Number of pairs passed to ``model.predict`` at a time
    Returns:
        DataFrame with user ID, review ID, score, and rank (1 = highest score
        for that user; ties are broken by order of appearance)
    """
    # Transpose the list of triples into three parallel sequences.
    transposed = list(zip(*user_review_pairs))
    if transposed:
        user_ids, review_ids, feature_series = (list(column) for column in transposed)
    else:
        user_ids, review_ids, feature_series = [], [], []

    features_array = np.array([series.values for series in feature_series])
    total = len(features_array)

    scores = []
    for start in range(0, total, batch_size):
        chunk = features_array[start:start + batch_size]
        scores.extend(model.predict(chunk))
        print(f'Predicted {start + len(chunk)}/{total} samples', flush=True)
    print('', flush=True)

    predictions_df = pd.DataFrame({
        'user_id': user_ids,
        'review_id': review_ids,
        'score': scores
    })

    per_user_scores = predictions_df.groupby('user_id')['score']
    predictions_df['rank'] = per_user_scores.rank(ascending=False, method='first')

    return predictions_df

def aggregate_review_scores(*dfs):
    """
    Aggregate review scores from multiple models.

    The per-model scores are joined on (user_id, review_id) with an outer
    merge and averaged; a pair missing from some model is averaged over the
    models that did score it. Reviews are then re-ranked per user by the mean
    score.

    Args:
        dfs: One or more DataFrames with user ID, review ID, score, and rank
    Returns:
        DataFrame with user ID, review ID, aggregated score, and rank, sorted
        by user and rank. Ranks are unique within a user: ties are broken by
        row order, as in predict_and_rank, so ``rank <= k`` never selects more
        than k reviews.
    Raises:
        ValueError: If no DataFrame is given.
    """
    if not dfs:
        raise ValueError('aggregate_review_scores() needs at least one predictions DataFrame')

    key_columns = ['user_id', 'review_id']

    final_df = None
    for i, df in enumerate(dfs, start=1):
        # Address the score column by name rather than by position.
        model_scores = df[key_columns + ['score']].rename(columns={'score': f'score_{i}'})
        if final_df is None:
            final_df = model_scores
        else:
            final_df = final_df.merge(model_scores, on=key_columns, how='outer')

    score_columns = [col for col in final_df.columns if col.startswith('score_')]
    final_df['aggregated_score'] = final_df[score_columns].mean(axis=1)

    # method='first' (not 'min'): equal scores must not share a rank, otherwise
    # a top-10 cut on the rank can return more than ten reviews for a user.
    final_df['new_rank'] = final_df.groupby('user_id')['aggregated_score'].rank(
        ascending=False,
        method='first'
    )

    final_df = final_df[key_columns + ['aggregated_score', 'new_rank']].sort_values(['user_id', 'new_rank'])
    final_df = final_df.rename(columns={'aggregated_score': 'score', 'new_rank': 'rank'})

    return final_df

def create_top_10_reviews_df(predictions_df, user_to_accommodation):
    """
    Create DataFrame with top 10 reviews for each user.

    Only users that have at least one review with rank <= 10 appear in the
    result. Users with fewer than ten such reviews are padded with None; if
    tied ranks let more than ten reviews through, only the first ten (by rank)
    are kept so the output always has exactly ten review columns.

    Args:
        predictions_df (DataFrame): DataFrame with user ID, review ID, score, and rank
        user_to_accommodation (dict): Dictionary with user to accommodation mapping
    Returns:
        DataFrame with columns accommodation_id, user_id, review_1 ... review_10
        (these columns are present even when there are no rows)
    """
    top_k = 10
    review_columns = [f'review_{i}' for i in range(1, top_k + 1)]

    top_df = predictions_df[predictions_df['rank'] <= top_k].sort_values(['user_id', 'rank'])

    result_rows = []
    for user_id, group in top_df.groupby('user_id'):
        # Tied ranks can yield more than top_k candidates; cap before padding.
        user_reviews = group['review_id'].tolist()[:top_k]
        user_reviews.extend([None] * (top_k - len(user_reviews)))

        row = {
            'accommodation_id': user_to_accommodation.get(user_id),
            'user_id': user_id,
        }
        row.update(zip(review_columns, user_reviews))
        result_rows.append(row)

    # Explicit columns keep the layout fixed even for an empty result.
    return pd.DataFrame(result_rows, columns=['accommodation_id', 'user_id'] + review_columns)

def get_partition(accommodation_ids, index, total_partitions=5):
    """
    Return one contiguous slice of ``accommodation_ids``.

    The list is cut into ``total_partitions`` consecutive slices whose sizes
    differ by at most one; the first ``len(ids) % total_partitions`` slices hold
    the extra element. Diagnostic information about the slice is printed.

    Args:
        accommodation_ids (list): List of accommodation IDs
        index (int): Zero-based index of the partition
        total_partitions (int): Total number of partitions
    Returns:
        List of accommodation IDs
    """
    partition_size, remainder = divmod(len(accommodation_ids), total_partitions)

    print(f"Part {index} out of {total_partitions - 1}\n", flush=True)
    print(f"Partition size: {partition_size}", flush=True)
    print(f"Remainder: {remainder}", flush=True)

    # Earlier partitions that took an extra element push this one to the right.
    start_idx = index * partition_size + min(index, remainder)
    takes_extra = index < remainder
    end_idx = start_idx + partition_size + int(takes_extra)

    print(f"Start index: {start_idx}", flush=True)
    print(f"End index: {end_idx}\n", flush=True)

    return accommodation_ids[start_idx:end_idx]


### Get a dataset partition so the test set fits in memory

In [None]:
total_parts = 200
part = 0

# Accommodation ids handled by this run, taken in order of first appearance among the test users.
accommodation_ids = get_partition(
    test_users_features['accommodation_id'].unique().tolist(), part, total_parts
)

# Restrict all four test frames to the selected accommodations.
test_users_features, test_reviews_features, test_users_tokens, test_reviews_tokens = (
    frame[frame['accommodation_id'].isin(accommodation_ids)]
    for frame in (test_users_features, test_reviews_features, test_users_tokens, test_reviews_tokens)
)

### Embed the test set with our trained embedding model

In [None]:
model_path = 'embedding_model.pt'
test_user_embeddings, test_review_embeddings = embed_data(test_users_tokens, test_reviews_tokens, model_path)

# Give features and embeddings the same 0..n-1 index so the column-wise concat lines rows up by position.
test_users_features = test_users_features.reset_index(drop=True)
test_reviews_features = test_reviews_features.reset_index(drop=True)
test_user_embeddings = test_user_embeddings.reset_index(drop=True)
test_review_embeddings = test_review_embeddings.reset_index(drop=True)

test_users = pd.concat([test_users_features, test_user_embeddings], axis='columns')
test_reviews = pd.concat([test_reviews_features, test_review_embeddings], axis='columns')

### Generate a user-review pair for every user and every review of their accommodation

In [None]:
# accommodation_id -> list of the review ids that belong to it
review_ids_by_accommodation = test_reviews.groupby('accommodation_id')['review_id'].apply(list)
accommodation_reviews = review_ids_by_accommodation.to_dict()

user_review_pairs = generate_user_review_pairs(
    test_users,
    test_reviews,
    accommodation_reviews,
)

### Load our ensemble and predict the top 10 reviews for each user

In [None]:
models_paths = ['model1.txt', 'model2.txt', 'model3.txt', 'model4.txt', 'model5.txt']

# Load every booster first, then score the same pairs with each of them.
models = list(map(lambda path: lgb.Booster(model_file=path), models_paths))
predictions_dfs = list(map(lambda booster: predict_and_rank(user_review_pairs, booster), models))

### Aggregates results by getting mean score for each user-review pair and ranking reviews based on mean score. Then saves aggregated results.

In [None]:
aggregate_df = aggregate_review_scores(*predictions_dfs)

# user_id -> accommodation_id lookup used to label each output row
user_accommodation_lookup = test_users_features.set_index('user_id')['accommodation_id'].to_dict()
result_df = create_top_10_reviews_df(aggregate_df, user_accommodation_lookup)

# One output file per partition; `part` is the partition index chosen above.
result_df.to_csv(f'results_{part}', index=False)


### After running inference on the entire test set, we need to combine all the results