In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
import random

from textblob import TextBlob
tqdm.pandas()

import torch

import warnings
warnings.filterwarnings('ignore')
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
import os

# Raise the column display limit so wide feature frames are shown without truncation.
pd.set_option('display.max_columns', 5060)

def seed_everything(seed=42, torch_stuff=True):
    """
    Seed the random number generators used in this notebook
    Args:
        seed (int, optional): Seed applied to every generator. Defaults to 42.
        torch_stuff (bool, optional): Also seed PyTorch and enable its deterministic mode. Defaults to True.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
    # here is expected to matter only for processes launched afterwards - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

    if not torch_stuff:
        return

    torch.manual_seed(seed)
    torch.use_deterministic_algorithms(True)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

# Seed everything with the defaults (seed=42, PyTorch included) before any data or model is loaded.
seed_everything()

## Load Data

In [4]:
# Training split: users, reviews and matches tables, read from the working directory.
train_users = pd.read_csv('train_users.csv')
train_reviews = pd.read_csv('train_reviews.csv')
train_matches = pd.read_csv('train_matches.csv')

# Validation split: same three tables.
val_users = pd.read_csv('val_users.csv')
val_reviews = pd.read_csv('val_reviews.csv')
val_matches = pd.read_csv('val_matches.csv')

# Test split: only users and reviews are loaded (no matches file is read here).
test_users = pd.read_csv('test_users.csv')
test_reviews = pd.read_csv('test_reviews.csv')

## Load model, both for embedding and tokenizing

In [None]:
# One checkpoint id shared by the tokenizer, the raw transformer and the sentence encoder.
model_id = "sentence-transformers/all-MiniLM-L12-v2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): `model` is not referenced by any code visible in this part of the
# notebook - confirm it is needed by a later cell.
model = AutoModel.from_pretrained(model_id)

# SentenceTransformer wrapper; passed to process_users as `embedding_model`.
embedding_model = SentenceTransformer(model_id)

## Functions for users processing

In [6]:
# Maps `guest_country` codes to lower-case country names.
# NOTE(review): three keys are repeated in this literal and Python keeps only the LAST value:
#   'Naz'    -> 'china' twice (harmless),
#   'Vohomi' -> 'turkey', later overwritten by 'colombia',
#   'Vezuz'  -> 'iran', later overwritten by 'czech republic'.
# As written, 'turkey' and 'iran' are unreachable; the intended codes must be confirmed against the data.
guest_country_dict = {
    'Zuc': 'australia', 'Mejok': 'united kingdom', 'Nen': 'india', 'Tig': 'philippines', 'Nukeye': 'united states of america', 'Jof': 'canada',
    'Fuxa': 'malaysia', 'Pule': 'romania', 'Wiz': 'brazil', 'Naz': 'china', 'Toq': 'japan', 'Zot': 'france',
    'Rawabe': 'italy', 'Pes': 'russia', 'Xopag': 'korea, south', 'Guroz': 'spain', 'Heyuru': 'indonesia', 'Vohomi': 'turkey',
    'Dawal': 'netherlands', 'Pihay': 'saudi arabia', 'Vig': 'switzerland', 'Fanir': 'poland', 'Modey': 'taiwan', 'Rixo': 'belgium',
    'Cibi': 'sweden', 'Kiy': 'argentina', 'Gobuf': 'ireland', 'Cej': 'united arab emirates', 'Bemil': 'austria', 'Pikune': 'singapore',
    'Qebuf': 'thailand', 'Vema': 'israel', 'Qehoj': 'germany', 'Naz': 'china', 'Gifop': 'norway', 'Diwej': 'bangladesh',
    'Vezuz': 'iran', 'Vohomi': 'colombia', 'Zic': 'denmark', 'Pimes': 'greece', 'Xazas': 'new zealand', 'Wapef': 'portugal',
    'Made': 'south africa', 'Cutip': 'croatia', 'Noliqo': 'morocco', 'Vey': 'egypt', 'Vezuz': 'czech republic', 'Qop': 'iceland',
    'Gugen': 'sri lanka', 'Nolita': 'hungary', 'May': 'slovenia', 'Tuleho': 'bulgaria', 'Nol': 'cyprus', 'Jeqe': 'georgia',
    'Buzi': 'mexico', 'Xogem': 'kazakhstan', 'Quduja': 'kosovo', 'Tuhi': 'finland', 'Nis': 'moldova', 'Kudefa': 'estonia',
    'Tucit': 'latvia', 'Bemab': 'lithuania', 'Tomu': 'ukraine', 'Mal': 'costa rica'
}

# Month number (1-12) -> English month name; get_row_text takes a mapping of this shape.
month_dict = dict(enumerate(
    [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ],
    start=1,
))

def process_guest_type(df):
    """
    Replace 'guest_type' with integer indicator columns
    Args:
        df (pd.DataFrame): DataFrame that has a 'guest_type' column
    Returns:
        pd.DataFrame: New DataFrame without 'guest_type', followed by one 0/1
            'guest_type_<value>' column per distinct value
    """
    indicators = pd.get_dummies(df['guest_type'], prefix='guest_type').astype(int)
    remaining = df.drop('guest_type', axis=1)
    return pd.concat([remaining, indicators], axis=1)

def process_guest_country(df, guest_country_dict, model, pca_components=16):
    """
    Replace 'guest_country' with PCA-reduced sentence embeddings of the country name
    Args:
        df (pd.DataFrame): DataFrame containing 'guest_country' column
        guest_country_dict (dict): Dictionary mapping guest country codes to country names
        model: Object with an ``encode(list_of_str)`` method returning one vector per
            string (the notebook passes a SentenceTransformer)
        pca_components (int, optional): Number of PCA components. Defaults to 16.
    Returns:
        pd.DataFrame: New DataFrame without 'guest_country' and with
            'guest_country_embedding_<i>' columns appended

    NOTE(review): a new PCA is fitted on every call, so frames processed in separate
    calls (e.g. train / val / test) are projected onto different components.
    """
    unknown = 'Unknown Country'

    # Encode every distinct country name once; rows then only need a lookup.
    country_embedding_dict = {
        country: model.encode([country])[0]
        for country in set(guest_country_dict.values())
    }
    country_embedding_dict[unknown] = model.encode([unknown])[0]

    # Codes missing from the mapping (including NaN) fall back to the unknown bucket,
    # so every name produced here is guaranteed to have an embedding.
    real_countries = df['guest_country'].apply(lambda code: guest_country_dict.get(code, unknown))
    embeddings = [country_embedding_dict[name] for name in real_countries]

    pca = PCA(n_components=pca_components)
    reduced_embeddings = pca.fit_transform(np.vstack(embeddings))

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh RangeIndex
    # would misalign rows whenever df does not carry the default 0..n-1 index.
    embedding_df = pd.DataFrame(
        reduced_embeddings,
        columns=[f'guest_country_embedding_{i}' for i in range(reduced_embeddings.shape[1])],
        index=df.index,
    )

    return pd.concat([df.drop(columns=['guest_country']), embedding_df], axis=1)

def process_room_nights(df):
    """
    Binning 'room_nights' column and one-hot encoding the bins
    Args:
        df (pd.DataFrame): DataFrame containing 'room_nights' column
    Returns:
        pd.DataFrame: New DataFrame without 'room_nights' and with one 0/1
            'room_nights_<label>' column per bin (all bins are always present).
            The input DataFrame is not modified.
    """
    bins = [0, 1, 2, 3, 4, 5, 6, 10, 16, 30, 100, float('inf')]
    labels = ['1', '2', '3', '4', '5', '6', '7-10', '11-16', '17-30', '31-100', '100+']

    # Right-closed intervals - (0, 1], (1, 2], ..., (6, 10], ..., (100, inf) - are what
    # the labels describe. With left-closed bins a 1-night stay was labelled '2' and a
    # 6-night stay '7-10'.
    binned = pd.cut(df['room_nights'], bins=bins, labels=labels, right=True)
    indicators = pd.get_dummies(binned, prefix='room_nights').astype(int)

    return pd.concat([df.drop(columns=['room_nights']), indicators], axis=1)

def process_seasons(df):
    """
    Map 'month' to a season and one-hot encode the season
    Args:
        df (pd.DataFrame): DataFrame containing 'month' column
    Returns:
        pd.DataFrame: New DataFrame with 'Season_Fall', 'Season_Spring', 'Season_Summer'
            and 'Season_Winter' 0/1 columns appended. 'month' is kept for the later
            steps, and the input DataFrame is not modified.
    """
    season_dict = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall'
    }
    # Fixed categories keep the four columns (alphabetical, as get_dummies would order
    # them) present even when a split does not contain every season.
    seasons = pd.Categorical(
        df['month'].map(season_dict),
        categories=['Fall', 'Spring', 'Summer', 'Winter'],
    )
    indicators = pd.get_dummies(pd.Series(seasons, index=df.index), prefix='Season').astype(int)

    return pd.concat([df, indicators], axis=1)

def process_holidays(df):
    """
    Add one indicator column per holiday, derived from 'month'
    Args:
        df (pd.DataFrame): DataFrame containing 'month' column
    Returns:
        pd.DataFrame: New DataFrame with a 0/1 'Holiday_<name>' column for every holiday
            plus 'Holiday_No Holiday' for months without any. 'month' is kept, and
            the input DataFrame is not modified.
    """
    holidays_dict = {
        "New Year": 12,
        "Valentines Day": 2,
        "Easter": 4,
        "Halloween": 10,
        "Christmas": 12,
        "Chinese New Year": 1,
    }
    month = df['month']

    # Build the indicators per holiday. Inverting the mapping to month -> holiday
    # keeps only one name per month, which silently dropped "New Year" because it
    # shares month 12 with "Christmas".
    indicators = {
        f'Holiday_{name}': (month == holiday_month).astype(int)
        for name, holiday_month in holidays_dict.items()
    }
    indicators['Holiday_No Holiday'] = (~month.isin(list(holidays_dict.values()))).astype(int)

    holiday_df = pd.DataFrame(indicators, index=df.index)
    # Alphabetical order mirrors what get_dummies produced for the existing columns.
    holiday_df = holiday_df[sorted(holiday_df.columns)]

    return pd.concat([df, holiday_df], axis=1)

def process_month(df):
    """
    Replace 'month' with integer indicator columns
    Args:
        df (pd.DataFrame): DataFrame that has a 'month' column
    Returns:
        pd.DataFrame: New DataFrame without 'month', followed by one 0/1
            'month_<value>' column per distinct month value
    """
    month_flags = pd.get_dummies(df['month'], prefix='month').astype(int)
    without_month = df.drop('month', axis=1)
    return pd.concat([without_month, month_flags], axis=1)

def process_accommodation_type(df):
    """
    Replace 'accommodation_type' with integer indicator columns
    Args:
        df (pd.DataFrame): DataFrame that has an 'accommodation_type' column
    Returns:
        pd.DataFrame: New DataFrame without 'accommodation_type', followed by one 0/1
            'accommodation_type_<value>' column per distinct value
    """
    type_flags = pd.get_dummies(df['accommodation_type'], prefix='accommodation_type').astype(int)
    without_type = df.drop('accommodation_type', axis=1)
    return pd.concat([without_type, type_flags], axis=1)

def process_accommodation_country(df, model, pca_components=16):
    """
    Replace 'accommodation_country' with PCA-reduced sentence embeddings of the country name
    Args:
        df (pd.DataFrame): DataFrame containing 'accommodation_country' column
        model: Object with an ``encode(list_of_str)`` method returning one vector per
            string (the notebook passes a SentenceTransformer)
        pca_components (int, optional): Number of PCA components. Defaults to 16.
    Returns:
        pd.DataFrame: New DataFrame without 'accommodation_country' and with
            'accommodation_country_embedding_<i>' columns appended

    NOTE(review): a new PCA is fitted on every call, so frames processed in separate
    calls (e.g. train / val / test) are projected onto different components.
    """
    unknown = 'Unknown Country'

    # Missing values are routed to the unknown bucket before encoding. Previously a
    # NaN was handed to model.encode and the 'Unknown Country' fallback was unreachable.
    countries = df['accommodation_country'].fillna(unknown).tolist()

    # Encode every distinct name once; rows then only need a lookup.
    country_embedding_dict = {country: model.encode([country])[0] for country in set(countries)}
    embeddings = [country_embedding_dict[country] for country in countries]

    pca = PCA(n_components=pca_components)
    reduced_embeddings = pca.fit_transform(np.vstack(embeddings))

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh RangeIndex
    # would misalign rows whenever df does not carry the default 0..n-1 index.
    embedding_df = pd.DataFrame(
        reduced_embeddings,
        columns=[f'accommodation_country_embedding_{i}' for i in range(reduced_embeddings.shape[1])],
        index=df.index,
    )

    return pd.concat([df.drop(columns=['accommodation_country']), embedding_df], axis=1)

def process_accommodation_score(df, scaler=None):
    """
    Scale the 'accommodation_score' column in place
    Args:
        df (pd.DataFrame): DataFrame containing 'accommodation_score' column
        scaler (optional): Already fitted scaler exposing ``transform``. When None, a
            MinMaxScaler with range (0, 1) is fitted on ``df``. Defaults to None.
    Returns:
        tuple | pd.DataFrame: ``(df, scaler)`` when the scaler was fitted here,
            otherwise only ``df``. Callers must handle both shapes.
    """
    column = 'accommodation_score'
    fitted_here = scaler is None

    if fitted_here:
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaler.fit(df[[column]])

    df[column] = scaler.transform(df[[column]])

    return (df, scaler) if fitted_here else df
    
def process_accommodation_star_rating(df, scaler=None):
    """
    Scale the 'accommodation_star_rating' column in place
    Args:
        df (pd.DataFrame): DataFrame containing 'accommodation_star_rating' column
        scaler (optional): Already fitted scaler exposing ``transform``. When None, a
            MinMaxScaler with range (0, 1) is fitted on ``df``. Defaults to None.
    Returns:
        tuple | pd.DataFrame: ``(df, scaler)`` when the scaler was fitted here,
            otherwise only ``df``. Callers must handle both shapes.
    """
    column = 'accommodation_star_rating'
    fitted_here = scaler is None

    if fitted_here:
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaler.fit(df[[column]])

    df[column] = scaler.transform(df[[column]])

    return (df, scaler) if fitted_here else df

def get_row_text(row, guest_country_dict, month_dict):
    """
    Describe one stay as a short English paragraph
    Args:
        row (pd.Series): Row of the users DataFrame (any mapping with the same keys works)
        guest_country_dict (dict): Guest country code -> country name; unknown codes are used as-is
        month_dict (dict): Month number -> month name; unknown months are rendered with str()
    Returns:
        str: Sentence(s) describing the guest, the stay and the accommodation
    """
    raw_country = row['guest_country']
    raw_month = row['month']
    guest_country = guest_country_dict.get(raw_country, raw_country)
    month_name = month_dict.get(raw_month, str(raw_month))

    pieces = [
        f"A {row['guest_type']} from {guest_country} stayed at a {row['accommodation_type']} ",
        f"in {row['accommodation_country']} for {row['room_nights']} nights in {month_name}. ",
        f"The {row['accommodation_type']} has a score of {row['accommodation_score']} and a rating of {row['accommodation_star_rating']}.",
    ]

    # Each truthy location flag appends its own sentence, in this fixed order.
    location_notes = (
        ('location_is_ski', " The location is at a ski resort."),
        ('location_is_beach', " The location is at the beach."),
        ('location_is_city_center', " The location is in the city center."),
    )
    for flag, note in location_notes:
        if row[flag]:
            pieces.append(note)

    return ''.join(pieces)

def get_tokens(df, tokenizer, batch_size=128, max_length=512):
    """
    Tokenize the text data
    Args:
        df (pd.DataFrame): DataFrame containing a 'text' column
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to be used
        batch_size (int, optional): Batch size for tokenization. Defaults to 128.
        max_length (int, optional): Maximum length of the tokenized sequence. Defaults to 512.
    Returns:
        pd.DataFrame: New DataFrame without 'text' and with 'input_ids' and
            'attention_mask' columns (one list per row). The input DataFrame is not modified.
    """
    texts = df["text"].tolist()
    input_ids = []
    attention_mask = []

    # range() has a length, so tqdm derives the number of batches itself. The previous
    # explicit total used floor division and undercounted a trailing partial batch.
    for start_idx in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
        batch_texts = texts[start_idx:start_idx + batch_size]

        # padding=True pads to the longest sequence of the current batch only.
        tokenized = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

        input_ids.extend(tokenized["input_ids"].tolist())
        attention_mask.extend(tokenized["attention_mask"].tolist())

    # drop + assign build a new frame instead of adding columns to the caller's one.
    return df.drop(columns=["text"]).assign(input_ids=input_ids, attention_mask=attention_mask)

def process_users(df, embedding_model, tokenizer, scaler_accommodation_score=None, scaler_accommodation_star_rating=None):
    """
    Processes user data, turning the raw users table into tabular features and tokens.

    Steps applied, in order:
    - Build a descriptive sentence per row (needs the raw columns, so it runs first)
    - One-hot encode guest type; embed guest country
    - Bin room nights; add season, holiday and month indicators
    - One-hot encode accommodation type; embed accommodation country
    - Scale accommodation score and star rating
    - Tokenize the descriptive sentence

    Args:
        df (pd.DataFrame): The dataframe containing user data to be processed. It is not modified.
        embedding_model: Sentence encoder with an ``encode`` method, used for the country embeddings.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used to tokenize the row text.
        scaler_accommodation_score (optional): Fitted scaler for accommodation scores. When None,
            a new one is fitted on ``df``. Defaults to None.
        scaler_accommodation_star_rating (optional): Fitted scaler for star ratings. When None,
            a new one is fitted on ``df``. Defaults to None.

    Returns:
        tuple: A tuple containing:
            - tabular_features (pd.DataFrame): DataFrame of processed tabular features, excluding tokenization columns.
            - tokens (pd.DataFrame): 'user_id', 'accommodation_id', 'input_ids', and 'attention_mask'.
            - scaler_accommodation_score: The scaler applied to accommodation scores (passed in or newly fitted).
            - scaler_accommodation_star_rating: The scaler applied to star ratings (passed in or newly fitted).
    """
    # Work on a copy so the caller's raw frame does not gain a 'text' column.
    df = df.copy()

    # get_row_text needs both lookup tables; they come from the notebook-level dictionaries.
    df['text'] = df.progress_apply(
        lambda row: get_row_text(row, guest_country_dict, month_dict),
        axis=1,
    )
    df = process_guest_type(df)
    df = process_guest_country(df, guest_country_dict, embedding_model)
    df = process_room_nights(df)
    df = process_seasons(df)
    df = process_holidays(df)
    df = process_month(df)
    df = process_accommodation_type(df)
    df = process_accommodation_country(df, embedding_model)

    # Reuse the supplied scalers so val/test are scaled with the training statistics.
    # The helpers return (df, scaler) only when they fit a new scaler, hence the branches.
    if scaler_accommodation_score is None:
        df, scaler_accommodation_score = process_accommodation_score(df)
    else:
        df = process_accommodation_score(df, scaler_accommodation_score)

    if scaler_accommodation_star_rating is None:
        df, scaler_accommodation_star_rating = process_accommodation_star_rating(df)
    else:
        df = process_accommodation_star_rating(df, scaler_accommodation_star_rating)

    df = get_tokens(df, tokenizer)

    tabular_features = df.drop(columns=['input_ids', 'attention_mask'])
    tokens = df[['user_id', 'accommodation_id', 'input_ids', 'attention_mask']]

    return tabular_features, tokens, scaler_accommodation_score, scaler_accommodation_star_rating

## Create processed data

In [None]:
# The scalers returned for the training split are passed on to the validation and test calls.
train_user_features, train_user_tokens, scaler_accommodation_score, scaler_accommodation_star_rating = process_users(train_users, embedding_model, tokenizer)
val_user_features, val_user_tokens, _, _ = process_users(val_users, embedding_model, tokenizer, scaler_accommodation_score, scaler_accommodation_star_rating)
test_user_features, test_user_tokens, _, _ = process_users(test_users, embedding_model, tokenizer, scaler_accommodation_score, scaler_accommodation_star_rating)

def equalize_columns(df1, df2):
    """
    Equalize columns between two DataFrames by adding missing columns with 0 values.
    Both frames are modified in place and also returned.
    Args:
        df1 (pd.DataFrame): First DataFrame
        df2 (pd.DataFrame): Second DataFrame
    Returns:
        tuple: A tuple containing:
            - df1 (pd.DataFrame): First DataFrame with missing columns added
            - df2 (pd.DataFrame): Second DataFrame with missing columns added
    """
    # Work out both gaps before touching either frame. Lists (not sets) keep the added
    # columns in the order they have in the other frame, so the resulting column
    # order is the same on every run.
    missing_in_df1 = [col for col in df2.columns if col not in df1.columns]
    missing_in_df2 = [col for col in df1.columns if col not in df2.columns]

    for col in missing_in_df1:
        df1[col] = 0
    for col in missing_in_df2:
        df2[col] = 0

    return df1, df2

# Pairwise equalisation: after these three calls every split has the same set of columns.
train_user_features, val_user_features = equalize_columns(train_user_features, val_user_features)
train_user_features, test_user_features = equalize_columns(train_user_features, test_user_features)
val_user_features, test_user_features = equalize_columns(val_user_features, test_user_features)

# Reorder validation and test columns to the training order. Note the result names drop
# the "_user" infix (val_features / test_features); these are the frames saved below.
train_user_features_columns = train_user_features.columns
val_features = val_user_features[train_user_features_columns]
test_features = test_user_features[train_user_features_columns]

## Save 

In [None]:
# Persist features and tokens per split as parquet files in the working directory.
train_user_features.to_parquet('train_users_features.parquet')
train_user_tokens.to_parquet('train_users_tokens.parquet')

# val_features / test_features are the column-reordered frames, not val_user_features / test_user_features.
val_features.to_parquet('val_users_features.parquet')
val_user_tokens.to_parquet('val_users_tokens.parquet')

test_features.to_parquet('test_users_features.parquet')
test_user_tokens.to_parquet('test_users_tokens.parquet')

## Functions for reviews processing

In [12]:
def create_review_text(row):
    """
    Combine title, positive and negative parts of a review into one string
    Args:
        row (pd.Series): Row of the reviews DataFrame (any mapping with the same keys works)
    Returns:
        str: Review text; missing parts are rendered as empty strings
    """
    def as_text(field):
        # Missing values become '' instead of the literal "nan".
        return str(row[field]) if pd.notna(row[field]) else ''

    title = as_text('review_title')
    positive = as_text('review_positive')
    negative = as_text('review_negative')

    return f"Title: {title}, What I liked about my stay: {positive}, What I didn't like: {negative}"

def add_sentiment_score(df):
    """
    Add TextBlob polarity columns for the three review text fields (in place)
    Args:
        df (pd.DataFrame): DataFrame containing 'review_title', 'review_positive'
            and 'review_negative' columns
    """
    def polarity_or_zero(text):
        # Missing text is scored as 0.
        return 0 if pd.isna(text) else TextBlob(text).sentiment.polarity

    column_pairs = (
        ('title_sentiment', 'review_title'),
        ('positive_sentiment', 'review_positive'),
        ('negative_sentiment', 'review_negative'),
    )
    for target, source in column_pairs:
        df[target] = df[source].progress_apply(polarity_or_zero)

def add_lengths(df):
    """
    Add character-length columns for the three review text fields (in place)
    Args:
        df (pd.DataFrame): DataFrame containing 'review_title', 'review_positive'
            and 'review_negative' columns; missing text counts as length 0
    """
    column_pairs = (
        ('title_length', 'review_title'),
        ('positive_length', 'review_positive'),
        ('negative_length', 'review_negative'),
    )
    for target, source in column_pairs:
        df[target] = df[source].fillna('').str.len()

def normalize_review_score(df, scaler=None):
    """
    Scale the 'review_score' column in place
    Args:
        df (pd.DataFrame): DataFrame containing 'review_score' column
        scaler (optional): Already fitted scaler exposing ``transform``. When None, a
            MinMaxScaler is fitted on ``df``. Defaults to None.
    Returns:
        pd.DataFrame: The same DataFrame with the scaled 'review_score' column
        scaler: The scaler that was applied (passed in or newly fitted)
    """
    scores = df[['review_score']]

    if scaler is None:
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(scores)
    else:
        scaled = scaler.transform(scores)

    df['review_score'] = scaled
    return df, scaler

def standardize_review_helpful_votes(df, scaler=None):
    """
    Standardize the 'review_helpful_votes' column in place
    Args:
        df (pd.DataFrame): DataFrame containing 'review_helpful_votes' column
        scaler (optional): Already fitted scaler exposing ``transform``. When None, a
            StandardScaler is fitted on ``df``. Defaults to None.
    Returns:
        pd.DataFrame: The same DataFrame with the standardized 'review_helpful_votes' column
        scaler: The scaler that was applied (passed in or newly fitted)
    """
    votes = df[['review_helpful_votes']]

    if scaler is None:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(votes)
    else:
        scaled = scaler.transform(votes)

    df['review_helpful_votes'] = scaled
    return df, scaler

def standardize_sentiments(df, title_scaler=None, positive_scaler=None, negative_scaler=None):
    """
    Standardize the three sentiment columns in place
    Args:
        df (pd.DataFrame): DataFrame containing 'title_sentiment', 'positive_sentiment'
            and 'negative_sentiment' columns
        title_scaler (optional): Fitted scaler for title sentiment. When None, all three
            scalers are replaced by newly fitted StandardScalers. Defaults to None.
        positive_scaler (optional): Fitted scaler for positive sentiment. Defaults to None.
        negative_scaler (optional): Fitted scaler for negative sentiment. Defaults to None.
    Returns:
        pd.DataFrame: The same DataFrame with standardized sentiment columns
        scaler: Scaler applied to title sentiment
        scaler: Scaler applied to positive sentiment
        scaler: Scaler applied to negative sentiment
    """
    # Only title_scaler decides between fitting and reusing.
    fit_new = title_scaler is None
    if fit_new:
        title_scaler = StandardScaler()
        positive_scaler = StandardScaler()
        negative_scaler = StandardScaler()

    plan = (
        ('title_sentiment', title_scaler),
        ('positive_sentiment', positive_scaler),
        ('negative_sentiment', negative_scaler),
    )
    for column, scaler in plan:
        values = df[[column]]
        df[column] = scaler.fit_transform(values) if fit_new else scaler.transform(values)

    return df, title_scaler, positive_scaler, negative_scaler

def standardize_lengths(df, title_scaler=None, positive_scaler=None, negative_scaler=None):
    """
    Standardize the three text-length columns in place
    Args:
        df (pd.DataFrame): DataFrame containing 'title_length', 'positive_length'
            and 'negative_length' columns
        title_scaler (optional): Fitted scaler for title length. When None, all three
            scalers are replaced by newly fitted StandardScalers. Defaults to None.
        positive_scaler (optional): Fitted scaler for positive length. Defaults to None.
        negative_scaler (optional): Fitted scaler for negative length. Defaults to None.
    Returns:
        pd.DataFrame: The same DataFrame with standardized length columns
        scaler: Scaler applied to title length
        scaler: Scaler applied to positive length
        scaler: Scaler applied to negative length
    """
    # Only title_scaler decides between fitting and reusing.
    fit_new = title_scaler is None
    if fit_new:
        title_scaler = StandardScaler()
        positive_scaler = StandardScaler()
        negative_scaler = StandardScaler()

    plan = (
        ('title_length', title_scaler),
        ('positive_length', positive_scaler),
        ('negative_length', negative_scaler),
    )
    for column, scaler in plan:
        values = df[[column]]
        df[column] = scaler.fit_transform(values) if fit_new else scaler.transform(values)

    return df, title_scaler, positive_scaler, negative_scaler

def add_holiday_columns(df):
    """
    Add one 0/1 column per holiday, set when the review text mentions it (in place)
    Args:
        df (pd.DataFrame): DataFrame containing a 'text' column
    Returns:
        pd.DataFrame: The same DataFrame with one column per holiday name
    """
    holiday_words = {
        "New Year": ["nye", "new year", "new years", "new year's", "december 31", "january 1", "31st december", "1st january", "sylvester"],
        "Valentines Day": ["valentine", "valentines", "cupid", "lovebirds", "proposal"],
        "Easter": ["easter", "egghunt", "easteregg", "chocolateegg", "easterbasket", "resurrection", "goodfriday", "holyweek", "churchservice"],
        "Halloween": ["halloween", "trickortreat", "trick or treat"],
        "Christmas": ["christmas", "xmas", "santa claus", "december 25", "dec 25"],
        "Chinese New Year": ["chinesenewyear", "lunar new", "chun jie", "liondance", "year of the", "yearofthe", "chinese zodiac"],
    }

    def mentions_any(text, keywords):
        # Case-insensitive substring match against any keyword.
        lowered = text.lower()
        return any(keyword in lowered for keyword in keywords)

    for holiday, words in holiday_words.items():
        flags = df['text'].apply(mentions_any, args=(words,))
        df[holiday] = flags.astype(int)

    return df

def is_family(df):
    """
    Add an 'is_family' 0/1 column, set when the review text contains a family-related phrase (in place)
    Args:
        df (pd.DataFrame): DataFrame containing a 'text' column
    Returns:
        pd.DataFrame: The same DataFrame with the 'is_family' column
    """
    family_phrases = ["we traveled as a family","perfect for families","great for families","family-friendly",
                  "ideal for a family trip","as a family","family vacation","traveled with kids","perfect for family stays",
                  "accommodated our family perfectly","great for children","safe for families","spacious enough for our family",
                  "kid-friendly","family getaway","we were a family of", "children", " kids", "child friendly", "family", "childcare",
                  "baby-friendly", "toddler",  "child amenities", "child safety"
    ]
    lowered_phrases = [phrase.lower() for phrase in family_phrases]

    def check_family(text):
        # Case-insensitive substring match; the first hit is enough.
        haystack = text.lower()
        for needle in lowered_phrases:
            if needle in haystack:
                return True
        return False

    family_flags = df['text'].apply(check_family)
    df['is_family'] = family_flags.astype(int)

    return df

def review_to_text(row):
    """
    Generate review text from the review data
    Args:
        row (pd.Series): Row of the reviews DataFrame (any mapping with the same keys works)
    Returns:
        str: Review text made of title, liked / disliked parts, score and helpful-votes sentence
    """
    def text_or(field, fallback):
        value = row[field]
        return value if pd.notna(value) else fallback

    review_title = text_or('review_title', "Review")
    # A missing positive part used to be rendered as the literal "nan"; it now gets
    # the same neutral fallback as the negative part.
    review_positive = text_or('review_positive', "Nothing in particular")
    review_negative = text_or('review_negative', "Nothing in particular")

    # A NaN vote count compares False against 0 and therefore takes the "No one" branch.
    helpful_text = (
        f"{row['review_helpful_votes']} people found it helpful."
        if row['review_helpful_votes'] > 0
        else "No one found this review helpful."
    )

    text = (
        f"{review_title}: "
        f"What I liked about my stay: {review_positive} "
        f"What I didn't like: {review_negative} "
        f"I gave a score of {row['review_score']}. "
        f"{helpful_text}"
    )
    return text

def process_reviews(df, tokenizer, scaler_review_score=None, scaler_review_helpful_votes=None, title_length_scaler=None, positive_length_scaler=None, negative_length_scaler=None):
    """
    Processes review data, turning the raw reviews table into features and tokens.
    Args:
        df (pd.DataFrame): DataFrame containing review data to be processed. It is not modified.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used to tokenize review text.
        scaler_review_score (optional): Fitted scaler for review scores. When None, a new one is fitted on ``df``. Defaults to None.
        scaler_review_helpful_votes (optional): Fitted scaler for helpful votes. When None, a new one is fitted on ``df``. Defaults to None.
        title_length_scaler (optional): Fitted scaler for title lengths. When None, all three length scalers are fitted on ``df``. Defaults to None.
        positive_length_scaler (optional): Fitted scaler for positive review lengths. Defaults to None.
        negative_length_scaler (optional): Fitted scaler for negative review lengths. Defaults to None.
    Returns:
        tuple: A tuple containing:
            - features (pd.DataFrame): DataFrame of processed features, excluding tokenization columns.
            - tokens (pd.DataFrame): 'review_id', 'accommodation_id', 'input_ids', and 'attention_mask'.
            - scaler_review_score: The scaler applied to review scores (passed in or newly fitted).
            - scaler_review_helpful_votes: The scaler applied to helpful votes (passed in or newly fitted).
            - title_length_scaler: The scaler applied to title lengths.
            - positive_length_scaler: The scaler applied to positive review lengths.
            - negative_length_scaler: The scaler applied to negative review lengths.
    """
    df = df.copy()

    # The text is built from the raw score / vote values, so it must precede the scaling steps.
    df['text'] = df.progress_apply(review_to_text, axis=1)
    add_lengths(df)
    df = df.drop(columns=['review_title', 'review_positive', 'review_negative'])

    # Forward the supplied scalers so val/test are scaled with the training statistics.
    # They were previously ignored, which re-fitted both scalers on every split.
    df, scaler_review_score = normalize_review_score(df, scaler_review_score)
    df, scaler_review_helpful_votes = standardize_review_helpful_votes(df, scaler_review_helpful_votes)
    df, title_length_scaler, positive_length_scaler, negative_length_scaler = standardize_lengths(df, title_length_scaler, positive_length_scaler, negative_length_scaler)

    df = add_holiday_columns(df)
    df = is_family(df)
    df = get_tokens(df, tokenizer)

    features = df.drop(columns=['input_ids', 'attention_mask'])
    tokens = df[['review_id', 'accommodation_id', 'input_ids', 'attention_mask']]

    return features, tokens, scaler_review_score, scaler_review_helpful_votes, title_length_scaler, positive_length_scaler, negative_length_scaler

In [None]:
# Training split: called without scalers; the returned scalers are passed to the val/test calls below.
train_reviews_features, train_reviews_tokens, scaler_review_score, scaler_review_helpful_votes, title_length_scaler, positive_length_scaler, negative_length_scaler = process_reviews(train_reviews, tokenizer)
train_reviews_features.to_parquet('train_reviews_features.parquet')
train_reviews_tokens.to_parquet('train_reviews_tokens.parquet')

# Validation split: the scalers returned by this call are discarded.
val_reviews_features, val_reviews_tokens, _, _, _, _, _ = process_reviews(val_reviews, tokenizer, scaler_review_score, scaler_review_helpful_votes, title_length_scaler, positive_length_scaler, negative_length_scaler)
val_reviews_features.to_parquet('val_reviews_features.parquet')
val_reviews_tokens.to_parquet('val_reviews_tokens.parquet')

# Test split: same arguments as the validation call.
test_reviews_features, test_reviews_tokens, _, _, _, _, _ = process_reviews(test_reviews, tokenizer, scaler_review_score, scaler_review_helpful_votes, title_length_scaler, positive_length_scaler, negative_length_scaler)
test_reviews_features.to_parquet('test_reviews_features.parquet')
test_reviews_tokens.to_parquet('test_reviews_tokens.parquet')