In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader, Sampler

import warnings
# Blanket filter: silences every warning for the rest of the session,
# including deprecation notices from pandas / torch.
warnings.filterwarnings('ignore')

# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Raise pandas' column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', 5000)

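This notebook precomputes and pickles the lookup objects needed to build the training datasets (the user and review token dictionaries and the list of positive user–review pairs), so that they do not have to be rebuilt from the raw tables on every training run.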

## Load Data

In [None]:
# Tokenized user table; later cells read its 'user_id', 'input_ids' and 'attention_mask' columns.
train_users = pd.read_parquet('train_users_tokens.parquet')
# Tokenized review table; later cells read its 'review_id', 'input_ids' and 'attention_mask' columns.
train_reviews = pd.read_parquet('train_reviews_tokens.parquet')
# Match table; later cells read its 'user_id', 'review_id' and 'accommodation_id' columns.
train_matches = pd.read_csv('train_matches.csv')

## Create objects

In [None]:
def _build_token_dict(frame, id_column, desc):
    """
    Map each id in ``frame[id_column]`` to its ``(input_ids, attention_mask)`` pair.

    The needed columns are iterated directly instead of using
    ``DataFrame.iterrows()``: ``iterrows`` builds a Series per row (slow) and
    coerces all values of a row to one common dtype, which can silently turn
    integer ids into floats. If an id occurs more than once, the last row wins.

    Args:
        frame (pd.DataFrame): Table with ``id_column``, 'input_ids' and 'attention_mask' columns.
        id_column (str): Name of the column whose values become the dictionary keys.
        desc (str): Label shown on the progress bar.

    Returns:
        dict: id -> (input_ids, attention_mask).
    """
    rows = zip(frame[id_column], frame['input_ids'], frame['attention_mask'])
    return {
        key: (input_ids, attention_mask)
        for key, input_ids, attention_mask in tqdm(rows, total=len(frame), desc=desc)
    }


users_dict = _build_token_dict(train_users, 'user_id', "Creating users dict")
reviews_dict = _build_token_dict(train_reviews, 'review_id', "Creating reviews dict")

# One (user_id, review_id) tuple per row of train_matches, in row order, so that
# position i in this list corresponds to row i of train_matches.
positive_pairs = list(
    tqdm(
        zip(train_matches['user_id'], train_matches['review_id']),
        total=len(train_matches),
        desc="Creating positive pairs",
    )
)

### Save 

In [None]:
# Write each lookup object to its own pickle file, in the same order as they were built.
for pickle_path, payload in (
    ('users_dict.pkl', users_dict),
    ('reviews_dict.pkl', reviews_dict),
    ('positive_pairs.pkl', positive_pairs),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

### Example of loading these objects and creating datasets

In [None]:
def pad_sequence(sequences, batch_first=False, padding_value=0):
    """
    Pad a list of variable-length tensors to a common length and stack them.

    Every tensor is right-padded along dimension 0 with ``padding_value`` up to
    the length of the longest tensor. Trailing dimensions (if any) must match
    across tensors and are left untouched.

    Args:
        sequences (list[torch.Tensor]): Tensors of shape ``(L_i, *)``.
        batch_first (bool): If True the result has shape ``(B, T, *)``,
            otherwise ``(T, B, *)``, where ``T`` is the longest length.
        padding_value (number): Value written into the padded positions.

    Returns:
        torch.Tensor: The stacked, padded tensor.

    Raises:
        ValueError: If ``sequences`` is empty.
    """
    sequences = list(sequences)
    if not sequences:
        raise ValueError("pad_sequence() requires at least one sequence")

    max_len = max(len(seq) for seq in sequences)
    padded_sequences = []

    for seq in sequences:
        # new_full inherits dtype AND device from seq, so the concatenation also
        # works for tensors that do not live on the default device.
        padding = seq.new_full((max_len - len(seq), *seq.shape[1:]), padding_value)
        padded_sequences.append(torch.cat((seq, padding)))

    stacked = torch.stack(padded_sequences)
    return stacked if batch_first else stacked.transpose(0, 1)
    
def collate_fn(batch):
    """
    Merge a list of per-sample dicts into a single dict of padded batch tensors.

    Each sample must provide the four token fields listed below. For every
    field the per-sample tensors are collected and handed to ``pad_sequence``
    with ``batch_first=True`` and a padding value of 0.

    Args:
        batch (list[dict]): Samples as produced by the dataset's ``__getitem__``.

    Returns:
        dict: Same four keys, each mapped to the padded batch tensor.
    """
    field_names = (
        'user_input_ids',
        'user_attention_mask',
        'review_input_ids',
        'review_attention_mask',
    )

    # Collect every field across the whole batch first, then pad field by field.
    gathered = {name: [sample[name] for sample in batch] for name in field_names}

    return {
        name: pad_sequence(tensors, batch_first=True, padding_value=0)
        for name, tensors in gathered.items()
    }

class TrainDataset(Dataset):
    """Dataset over positive (user, review) pairs that returns their tokens as tensors."""

    def __init__(self, users_dict, reviews_dict, positive_pairs, group_to_indices):
        """
        Args:
            users_dict (dict): user_id -> (input_ids, attention_mask).
            reviews_dict (dict): review_id -> (input_ids, attention_mask).
            positive_pairs (list): (user_id, review_id) tuples; one dataset item each.
            group_to_indices (dict): group_id -> list of indices. Stored together with
                its keys (``self.groups``) but not otherwise used by this class.
        """
        self.users_dict = users_dict
        self.reviews_dict = reviews_dict
        self.positive_pairs = positive_pairs
        self.group_to_indices = group_to_indices
        self.groups = list(group_to_indices.keys())

    def __len__(self):
        """Number of positive pairs."""
        return len(self.positive_pairs)

    def __getitem__(self, idx):
        """Look up the pair at ``idx`` and return its four token sequences as tensors."""
        user_id, review_id = self.positive_pairs[idx]
        user_ids, user_mask = self.users_dict[user_id]
        review_ids, review_mask = self.reviews_dict[review_id]

        # torch.tensor copies the stored sequences into fresh tensors on every access.
        return {
            'user_input_ids': torch.tensor(user_ids),
            'user_attention_mask': torch.tensor(user_mask),
            'review_input_ids': torch.tensor(review_ids),
            'review_attention_mask': torch.tensor(review_mask),
        }

class GroupBatchSampler(Sampler):
    """
    Batch sampler whose batches never mix indices from different groups.

    Groups with more than ``batch_size`` indices are shuffled and cut into
    chunks of ``batch_size``. Groups with at most ``batch_size`` indices are
    emitted as one (possibly short) batch. The order of all batches is shuffled
    on every iteration. Randomness comes from NumPy's global RNG
    (``np.random``), so seed it with ``np.random.seed`` for reproducible runs.
    """

    def __init__(self, group_to_indices, batch_size, drop_last=False):
        """
        Args:
            group_to_indices (dict): Dictionary with group_id as key and list of indices as value.
            batch_size (int): Maximum size of a mini-batch; must be positive.
            drop_last (bool): If True, the incomplete trailing chunk of a group that is
                LARGER than batch_size is dropped. Groups with at most batch_size
                indices are always kept as a single batch, even if drop_last is True.

        Raises:
            ValueError: If batch_size is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        self.group_to_indices = group_to_indices
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.groups = list(group_to_indices.keys())

    def _num_batches(self, group_size):
        """Number of batches a group of ``group_size`` indices contributes."""
        if group_size == 0:
            # Empty groups contribute nothing: an empty batch cannot be collated.
            return 0
        if group_size <= self.batch_size:
            return 1
        n_full, leftover = divmod(group_size, self.batch_size)
        if leftover and not self.drop_last:
            return n_full + 1
        return n_full

    def __iter__(self):
        all_batches = []

        for group_id in self.groups:
            indices = self.group_to_indices[group_id]
            group_size = len(indices)

            if group_size == 0:
                continue

            if group_size <= self.batch_size:
                # Copy so consumers never receive (and cannot mutate) the stored list.
                all_batches.append(list(indices))
                continue

            shuffled = np.array(indices)
            np.random.shuffle(shuffled)

            n_full = group_size // self.batch_size
            for start in range(0, n_full * self.batch_size, self.batch_size):
                all_batches.append(shuffled[start:start + self.batch_size].tolist())

            if group_size % self.batch_size and not self.drop_last:
                all_batches.append(shuffled[n_full * self.batch_size:].tolist())

        # In-place shuffle of the list of batches (batches may differ in length).
        np.random.shuffle(all_batches)
        return iter(all_batches)

    def __len__(self):
        """Number of batches one pass of ``__iter__`` yields."""
        return sum(
            self._num_batches(len(self.group_to_indices[group_id]))
            for group_id in self.groups
        )

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('users_dict.pkl', 'rb') as f:
    users_dict = pickle.load(f)

with open('reviews_dict.pkl', 'rb') as f:
    reviews_dict = pickle.load(f)

with open('positive_pairs.pkl', 'rb') as f:
    positive_pairs = pickle.load(f)

# Group definition that is not created in this notebook; it is expected to map
# group_id -> list of indices into positive_pairs.
with open('kmeans_groups.pkl', 'rb') as f:
    kmeans_groups = pickle.load(f)

# Single place to tune the mini-batch size used by both example loaders.
BATCH_SIZE = 32

kmeans_train_dataset = TrainDataset(users_dict, reviews_dict, positive_pairs, kmeans_groups)
kmeans_sampler = GroupBatchSampler(kmeans_groups, batch_size=BATCH_SIZE, drop_last=True)
kmeans_loader = DataLoader(kmeans_train_dataset, batch_sampler=kmeans_sampler, collate_fn=collate_fn)

# positive_pairs was built row by row from train_matches, so the sampler needs row
# POSITIONS. GroupBy.indices returns positions (not index labels), which stays
# correct even if train_matches does not have a default RangeIndex, and avoids
# the slow groupby.apply path.
accomodation_groups = {
    accommodation_id: positions.tolist()
    for accommodation_id, positions in train_matches.groupby('accommodation_id').indices.items()
}
accomodation_train_dataset = TrainDataset(users_dict, reviews_dict, positive_pairs, accomodation_groups)
accomodation_sampler = GroupBatchSampler(accomodation_groups, batch_size=BATCH_SIZE, drop_last=True)
accomodation_loader = DataLoader(accomodation_train_dataset, batch_sampler=accomodation_sampler, collate_fn=collate_fn)