In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from transformers import AutoModel
from transformers import get_cosine_schedule_with_warmup
import lightgbm as lgb

import warnings
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so deprecation and data-quality warnings
# raised by pandas / torch / lightgbm in later cells will not be shown either.
warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


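This notebook uses the two-tower embedding model we trained earlier to embed one partition of the training data at a time. It sorts the matched (user, review) pairs by accommodation and creates hard negatives by rotating the review columns down by one row, so most users end up paired with a different review of the same accommodation (rows at a group boundary receive a review from the neighbouring accommodation). We then train a LightGBM classifier on the positive pairs plus these negatives.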

### Load data

In [None]:
# Feature tables for the user side and the review side of each match.
train_users_features, train_reviews_features = (
    pd.read_parquet('train_users_features.parquet'),
    pd.read_parquet('train_reviews_features.parquet'),
)

# Ground-truth pairs; later cells join on `user_id` / `review_id` and
# partition on `accommodation_id`.
train_matches = pd.read_csv('train_matches.csv')

# Token tables; later cells read their `input_ids` / `attention_mask`
# columns when computing the text embeddings.
train_users_tokens, train_reviews_tokens = (
    pd.read_parquet('train_users_tokens.parquet'),
    pd.read_parquet('train_reviews_tokens.parquet'),
)

### Functions and classes for training

In [None]:
def create_subsets(users_features, users_tokens, reviews_features, reviews_tokens, matches, total_parts, part):
    """
    Select one accommodation-based partition of the data so that it fits into memory.

    The unique `accommodation_id` values found in `matches` are sorted and cut into
    `total_parts` contiguous chunks whose sizes differ by at most one (the first
    `len(ids) % total_parts` chunks get the extra id). Every input frame is then
    filtered to the rows whose `accommodation_id` belongs to chunk number `part`.

    Args:
        users_features: DataFrame with user features (needs an `accommodation_id` column)
        users_tokens: DataFrame with user tokens (needs an `accommodation_id` column)
        reviews_features: DataFrame with review features (needs an `accommodation_id` column)
        reviews_tokens: DataFrame with review tokens (needs an `accommodation_id` column)
        matches: DataFrame with matches (needs an `accommodation_id` column)
        total_parts: total number of parts to split the data into (>= 1)
        part: zero-based index of the part to select (0 <= part < total_parts)
    Returns:
        Tuple of DataFrames with user features, user tokens, review features, review tokens, and matches
    Raises:
        ValueError: if `total_parts` is not positive or `part` is out of range.
    """
    # Fail loudly instead of dividing by zero or silently returning empty frames.
    if total_parts < 1:
        raise ValueError(f"total_parts must be a positive integer, got {total_parts}")
    if not 0 <= part < total_parts:
        raise ValueError(f"part must be in [0, {total_parts - 1}], got {part}")

    # Sorting makes the partition boundaries independent of the row order in `matches`.
    accommodation_ids = sorted(matches['accommodation_id'].unique().tolist())

    subset_size, remainder = divmod(len(accommodation_ids), total_parts)

    # The first `remainder` parts each take one extra id, hence the min(...) offsets.
    start_idx = part * subset_size + min(part, remainder)
    end_idx = (part + 1) * subset_size + min(part + 1, remainder)
    selected_accommodation_ids = accommodation_ids[start_idx:end_idx]

    def _restrict(frame):
        # Keep only the rows that belong to the selected accommodations.
        return frame[frame['accommodation_id'].isin(selected_accommodation_ids)]

    return (
        _restrict(users_features),
        _restrict(users_tokens),
        _restrict(reviews_features),
        _restrict(reviews_tokens),
        _restrict(matches),
    )

class TwoTowersNetwork(nn.Module):
    """
    Two independent encoders loaded from the same pretrained checkpoint:
    `bert1` encodes the user (context) tokens and `bert2` encodes the review tokens.
    Each tower's output is reduced to one vector per row by masked mean pooling.

    Args:
        model_id (str): Model identifier passed to `AutoModel.from_pretrained`.
    """
    def __init__(self, model_id):
        super().__init__()
        # Two separate `from_pretrained` calls, so each tower owns its own module.
        self.bert1 = AutoModel.from_pretrained(model_id)
        self.bert2 = AutoModel.from_pretrained(model_id)

    def mean_pooling(self, model_output, attention_mask):
        """
        Average the first element of `model_output` over dimension 1, counting
        only the positions where `attention_mask` is non-zero.
        """
        # presumably the per-token hidden states of the encoder — verify against the model
        hidden_states = model_output[0]
        weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        summed = (hidden_states * weights).sum(dim=1)
        # The clamp keeps the division finite for rows whose mask is all zeros.
        counts = weights.sum(dim=1).clamp(min=1e-9)
        return summed / counts

    def forward(self, user_ids, user_mask, review_ids, review_mask):
        """
        Encode the user batch with `bert1` and the review batch with `bert2`.

        Returns:
            Tuple (user/context embedding, review embedding), both mean-pooled.
        """
        user_out = self.bert1(user_ids, attention_mask=user_mask)
        review_out = self.bert2(review_ids, attention_mask=review_mask)

        return (
            self.mean_pooling(user_out, user_mask),
            self.mean_pooling(review_out, review_mask),
        )

def pad_sequence(sequences, batch_first=False, padding_value=0):
    """
    Pad a collection of variable-length 1-D tensors with `padding_value` and stack them.

    Args:
        sequences: iterable of 1-D tensors (a list or a one-shot iterator)
        batch_first (bool): if True the result is (num_sequences, max_len),
            otherwise it is transposed to (max_len, num_sequences)
        padding_value: value written into the padded positions
    Returns:
        Tensor containing all sequences, right-padded to the longest length
    Raises:
        ValueError: if `sequences` is empty.
    """
    # Materialise once: the input is traversed twice (for the max length and for padding).
    sequences = list(sequences)
    if not sequences:
        raise ValueError("pad_sequence() requires at least one sequence")

    max_len = max(len(seq) for seq in sequences)

    # `new_full` creates the padding with the dtype *and device* of each sequence,
    # so concatenation also works for tensors that do not live on the CPU.
    padded_sequences = [
        torch.cat((seq, seq.new_full((max_len - len(seq),), padding_value)))
        for seq in sequences
    ]

    stacked = torch.stack(padded_sequences)
    return stacked if batch_first else stacked.transpose(0, 1)

def encode_pairwise(train_users_tokens, train_reviews_tokens, model, batch_size=64, device='cuda'):
    """
    Encode user and review tokens using the model.

    Rows are processed in aligned batches: batch k of the users frame is fed to the
    model together with batch k of the reviews frame, so both frames must have the
    same number of rows.

    Args:
        train_users_tokens (DataFrame): DataFrame with user tokens
            (`input_ids` and `attention_mask` columns)
        train_reviews_tokens (DataFrame): DataFrame with review tokens
            (`input_ids` and `attention_mask` columns)
        model (nn.Module): Model to use for encoding; called as
            `model(user_ids, user_masks, review_ids, review_masks)` and expected to
            return a (user_embedding, review_embedding) pair
        batch_size (int): Batch size
        device (str): Device to use
    Returns:
        Tuple of DataFrames with user and review embeddings (columns
        `user_embedding_{i}` / `review_embedding_{i}`); two empty DataFrames
        when the inputs have no rows
    Raises:
        ValueError: if the two token frames have different lengths.
    """
    n_rows = len(train_users_tokens)
    if len(train_reviews_tokens) != n_rows:
        # The loop below is driven by the users frame only; a mismatch would
        # otherwise truncate the reviews or fail deep inside the padding code.
        raise ValueError(
            f"users and reviews token frames must have the same length, "
            f"got {n_rows} and {len(train_reviews_tokens)}"
        )
    if n_rows == 0:
        # Nothing to encode; the embedding width is unknown, so no columns are created.
        return pd.DataFrame(), pd.DataFrame()

    model = model.to(device)
    model.eval()

    def _to_padded_tensor(column):
        # Right-pad every row to the longest one in the batch and move it to `device`.
        return pad_sequence([torch.tensor(x) for x in column], batch_first=True).to(device)

    user_embeddings, review_embeddings = [], []

    # Inference only: no autograd graph is needed for any batch.
    with torch.no_grad():
        for i in range(0, n_rows, batch_size):
            user_batch = train_users_tokens.iloc[i:i+batch_size]
            review_batch = train_reviews_tokens.iloc[i:i+batch_size]

            user_ids = _to_padded_tensor(user_batch['input_ids'])
            user_masks = _to_padded_tensor(user_batch['attention_mask'])
            review_ids = _to_padded_tensor(review_batch['input_ids'])
            review_masks = _to_padded_tensor(review_batch['attention_mask'])

            user_embed, review_embed = model(user_ids, user_masks, review_ids, review_masks)
            # `extend` over a 2-D array appends one 1-D embedding per input row.
            user_embeddings.extend(user_embed.cpu().numpy())
            review_embeddings.extend(review_embed.cpu().numpy())

            if (i // batch_size) % 500 == 0:
                print(f'Processed {i}/{len(train_users_tokens)} samples', flush=True)
    print('', flush=True)

    # Take the widths from the collected rows rather than from a leaked loop variable.
    user_dim = len(user_embeddings[0])
    review_dim = len(review_embeddings[0])
    user_embeddings_df = pd.DataFrame(user_embeddings, columns=[f'user_embedding_{j}' for j in range(user_dim)])
    review_embeddings_df = pd.DataFrame(review_embeddings, columns=[f'review_embedding_{j}' for j in range(review_dim)])

    return user_embeddings_df, review_embeddings_df

def embed_data(users_tokens, reviews_tokens, model_path, batch_size=64, device='cuda',
               model_id="sentence-transformers/all-MiniLM-L12-v2"):
    """
    Load the trained two-tower model and encode user and review tokens with it.

    Args:
        users_tokens (DataFrame): DataFrame with user tokens
        reviews_tokens (DataFrame): DataFrame with review tokens
        model_path (str): Path to the saved state dict of a `TwoTowersNetwork`
        batch_size (int): Batch size
        device (str): Device to use
        model_id (str): Pretrained backbone used to build both towers; must be the
            architecture the checkpoint at `model_path` was trained with
    Returns:
        Tuple of DataFrames with user and review embeddings
    """
    model = TwoTowersNetwork(model_id).to(device)

    # `map_location` remaps the stored tensors onto the requested device, so a
    # checkpoint saved on a GPU can still be loaded on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)

    return encode_pairwise(users_tokens, reviews_tokens, model, batch_size=batch_size, device=device)

def generate_negative_samples_shift(df, split_idx):
    """
    Generate negative samples by shifting the right part of the dataframe.

    The frame is sorted by `accommodation_id` and split column-wise at `split_idx`.
    The right-hand columns are rotated down by one row (the last row wraps to the
    top) and glued back onto the untouched left-hand columns, so every row is
    paired with the right-hand values of its predecessor in the sorted order.
    Neighbouring rows usually share an accommodation; at a group boundary the
    right-hand values come from the adjacent accommodation instead.

    Args:
        df (DataFrame): DataFrame with positive samples (needs an `accommodation_id` column)
        split_idx (int): Positional column index at which to split the dataframe;
            columns before it stay in place, columns from it onwards are shifted
    Returns:
        DataFrame with negative samples (`label` set to 0). It is empty when `df`
        has fewer than two rows, because no row can be paired with a different one.
    """
    # A stable sort keeps the input order inside each accommodation, which makes
    # the generated negatives reproducible from run to run.
    df_sorted = df.sort_values('accommodation_id', kind='mergesort').reset_index(drop=True)

    if len(df_sorted) < 2:
        # Rotating zero or one row is a no-op: it would hand back the positive
        # pair itself, labelled as a negative.
        df_negative = df_sorted.iloc[0:0].copy()
        df_negative['label'] = 0
        return df_negative

    left_part = df_sorted.iloc[:, :split_idx]
    right_part = df_sorted.iloc[:, split_idx:]

    # Circular shift by one row: last row first, then everything else.
    right_part_shifted = pd.concat([right_part.iloc[-1:], right_part.iloc[:-1]]).reset_index(drop=True)

    # Both halves carry a fresh 0..n-1 index, so the column-wise concat aligns row by row.
    df_negative = pd.concat([left_part, right_part_shifted], axis=1)
    df_negative['label'] = 0

    return df_negative


### Create a partition of the data so that it fits into memory

In [None]:
# Number of accommodation-based partitions, and the zero-based index of the one to process.
total_parts = 10
part = 0

# NOTE: the filtered frames are bound to the same names as the full frames, so this
# cell is not idempotent — reload the data before running it again with another `part`.
(
    train_users_features,
    train_users_tokens,
    train_reviews_features,
    train_reviews_tokens,
    train_matches,
) = create_subsets(
    users_features=train_users_features,
    users_tokens=train_users_tokens,
    reviews_features=train_reviews_features,
    reviews_tokens=train_reviews_tokens,
    matches=train_matches,
    total_parts=total_parts,
    part=part,
)

### Create the text embeddings and merge the user and review datasets; these are the positive samples

In [None]:
model_path = 'model.pt'
train_user_embeddings, train_review_embeddings = embed_data(train_users_tokens, train_reviews_tokens, model_path, batch_size=64, device=device)

# The feature and embedding frames are glued together purely by row position below,
# so a row-count mismatch would silently create misaligned / NaN-padded rows.
# NOTE(review): this also assumes the *_features and *_tokens frames list their
# rows in the same order — confirm against the preprocessing that produced them.
assert len(train_user_embeddings) == len(train_users_features), "user embeddings and user features differ in length"
assert len(train_review_embeddings) == len(train_reviews_features), "review embeddings and review features differ in length"

# Give every frame a fresh 0..n-1 index so the column-wise concat aligns by position.
train_users_features = train_users_features.reset_index(drop=True)
train_reviews_features = train_reviews_features.reset_index(drop=True)
train_user_embeddings = train_user_embeddings.reset_index(drop=True)
train_review_embeddings = train_review_embeddings.reset_index(drop=True)

train_users = pd.concat([train_users_features, train_user_embeddings], axis=1)
train_reviews = pd.concat([train_reviews_features, train_review_embeddings], axis=1)

# Attach the user columns, then the review columns, to every ground-truth match.
merged_users = train_matches.merge(train_users, on='user_id', how='left')
final_train = merged_users.merge(train_reviews, on='review_id', how='left')

# Each of the three sources carries an accommodation id: the first merge suffixes the
# match/user copies with _x/_y and the second merge adds the review copy unsuffixed.
# Keep only the one from the matches table.
final_train = (
    final_train
    .drop(columns=['accommodation_id_y', 'accommodation_id'])
    .rename(columns={'accommodation_id_x': 'accommodation_id'})
)
final_train['label'] = 1

### Create negative samples by shifting the data - most negatives pair a user with another review of the same accommodation, so the model has to learn meaningful differences between reviews

In [None]:
# Seed for the row shuffle so the training frame is reproducible across runs.
SHUFFLE_SEED = 42

# Column position where the review-side columns start. The user embeddings are the
# last user-side columns of `final_train` (match columns, user features, user
# embeddings, then the review columns), so the split sits right after the last
# user embedding. The notebook previously hard-coded 488 here, which silently
# mis-splits as soon as the feature set or the embedding width changes.
split_idx = final_train.columns.get_loc(train_user_embeddings.columns[-1]) + 1

# `generate_negative_samples_shift` already sets `label` to 0 on its result.
negative_samples = generate_negative_samples_shift(final_train, split_idx)

train = pd.concat([final_train, negative_samples], ignore_index=True)
# Shuffle so positives and negatives are interleaved.
train = train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

### Train the model on this subset of the data. We ran this notebook several times (once per partition) and ran inference with an ensemble of 5 models

In [None]:
# Identifier columns and the target are not features.
X = train.drop(columns=['user_id', 'accommodation_id', 'review_id', 'label'])
y = train['label']
train_data = lgb.Dataset(X, label=y)

# Number of boosting iterations. The notebook used to pass both `'n_estimators': 100`
# inside `params` and `num_boost_round=1000` to `lgb.train`; LightGBM treats
# `n_estimators` as an alias for the iteration count and lets the params entry win,
# so 100 rounds is what was actually trained. It is now stated in exactly one place.
NUM_BOOST_ROUND = 100

params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

model = lgb.train(params, train_data, num_boost_round=NUM_BOOST_ROUND)
# One model file per data partition.
model.save_model(f'model_{part}.txt')