In [1]:
import polars as pl
import numpy as np
import pandas as pd

In [3]:
# Load the raw training set. The file appears to encode missing values as the
# literal text "NULL" (the feature-cleaning step further down checks for that
# string). Declaring it here makes polars parse those cells as real nulls, so
# that `is_null()` / `is_not_null()` based logic sees them and numeric columns
# are not loaded as strings.
df = pl.read_csv('../data/raw/training_set_VU_DM.csv', null_values='NULL')

## Creating hotel_quality feature

In [24]:
# 1. Add 0/1 flags recording, per row, whether each visitor-history field
# holds a value (1) or is null (0).
history_flags = {
    'has_hist_starrating': 'visitor_hist_starrating',
    'has_hist_adr_usd': 'visitor_hist_adr_usd',
}
df = df.with_columns([
    pl.col(source_name).is_not_null().cast(pl.UInt8).alias(flag_name)
    for flag_name, source_name in history_flags.items()
])

# 2. Remove every column whose share of null entries exceeds 90%.
# The null share of a column does not depend on which other columns are
# present, so the offenders are collected first and dropped in one call.
row_count = df.height
sparse_columns = [
    name
    for name in df.columns
    if df[name].null_count() / row_count > 0.9
]
df = df.drop(sparse_columns)

# 3. Normalize features within each 'srch_id' group
features_to_normalize = ['prop_starrating', 'prop_location_score1', 'prop_location_score2', 'prop_review_score']

# First coerce every feature to Float32, with missing values replaced by 0.
# This is done with native polars expressions rather than a per-row Python
# callback (`map_elements`), which is slow on large frames and cannot reliably
# return strings under a Float32 return dtype.
cleaned_features = []
for feature in features_to_normalize:
    raw = pl.col(feature)
    if df.schema[feature] == pl.Utf8:
        # Text column: the literal "NULL" marks a missing value. Any other
        # non-numeric text still raises in the strict cast below.
        raw = (
            pl.when(raw == "NULL")
            .then(pl.lit(None, dtype=pl.Utf8))
            .otherwise(raw)
        )
    cleaned_features.append(raw.cast(pl.Float32).fill_null(0).alias(feature))

df = df.with_columns(cleaned_features)

# Z-score each feature within its search ('srch_id'):
#     normalized = (value - group mean) / group std
# Window expressions (`.over`) compute the per-group statistics in place, so
# no temporary columns or join back onto the frame are needed and the row
# order is left untouched.
normalized_columns = []
for feature in features_to_normalize:
    value = pl.col(feature)
    group_mean = value.mean().over('srch_id')
    group_std = value.std().over('srch_id')

    normalized_columns.append(
        # The sample std is null for a single-row group and 0 when all values
        # in the group are equal. In both cases the z-score is undefined, so
        # fall back to 0 (a null condition also takes the `otherwise` branch).
        pl.when(group_std > 0)
        .then((value - group_mean) / group_std)
        .otherwise(0.0)
        .fill_nan(0)  # Covers NaN values already present in the feature itself
        .alias(f'normalized_{feature}')
    )

df = df.with_columns(normalized_columns)

# 4. Combine the normalized features into a single 'hotel_quality' score:
# each normalized column is multiplied by its weight and the products are added.
weights = dict(
    normalized_prop_starrating=0.2,
    normalized_prop_location_score1=0.25,
    normalized_prop_location_score2=0.3,
    normalized_prop_review_score=0.25,
)

weighted_features = []
for column_name, weight in weights.items():
    weighted_features.append(pl.col(column_name) * weight)

quality_expr = sum(weighted_features)
df = df.with_columns(quality_expr.alias('hotel_quality'))

# 5. Min-max scale 'hotel_quality' into 'normalized_hotel_quality':
#    (value - min) / (max - min), using the column-wide min and max.
quality = df['hotel_quality']
lowest = quality.min()
spread = quality.max() - lowest
df = df.with_columns(
    ((quality - lowest) / spread).alias('normalized_hotel_quality')
)

# Write the frame, including the engineered columns, to the preprocessed data folder.
output_path = '../data/preprocessed/engineered_training_set.csv'
df.write_csv(output_path)