In [32]:
import polars as pl
import numpy as np
import pandas as pd

In [33]:
df = pl.read_csv('../data/raw/training_set_VU_DM.csv')

# Handling outliers in price_usd based on maximum value of visitor_hist_adr_usd
df = df.filter(pl.col('price_usd') < 2000)

# Create binary indicators for historical data to denote whether the historical booking data is available for a user.
df = df.with_columns([
    df['visitor_hist_starrating'].is_not_null().cast(pl.UInt8).alias('has_hist_starrating'),
    df['visitor_hist_adr_usd'].is_not_null().cast(pl.UInt8).alias('has_hist_adr_usd')
])

# Drop columns with more than 90% missing data
for col in df.columns:
    if df[col].is_null().sum() / df.height > 0.9:
        df = df.drop(col)

## Creating hotel_quality feature

In [34]:
# Create a score difference between prop_location_score2 and prop_location_score1
df = df.with_columns([
    df["prop_location_score2"].map_elements(lambda x: None if x == "NULL" else x, return_dtype=pl.Float64).cast(pl.Float64).fill_nan(0).alias("prop_location_score2"),
    df["prop_location_score1"].map_elements(lambda x: None if x == "NULL" else x, return_dtype=pl.Float64).cast(pl.Float64).fill_nan(0).alias("prop_location_score1")
])
df = df.with_columns(
    ((df["prop_location_score2"] + 0.0001) / (df["prop_location_score1"] + 0.0001)).alias("score1d2")
)

# 3. Normalize features within each 'srch_id' group
features_to_normalize = ['prop_starrating', 'score1d2', 'prop_review_score']
for feature in features_to_normalize:
    df = df.with_columns(
        df[feature].map_elements(lambda x: None if x == "NULL" else x, return_dtype=pl.Float32)
             .cast(pl.Float32)
             .fill_null(0) 
             .alias(feature)
    )

for feature in features_to_normalize:
    temp_mean_name = f'{feature}_mean_temp'
    temp_std_name = f'{feature}_std_temp'
    group_stats = df.group_by('srch_id').agg([
        pl.col(feature).mean().alias(temp_mean_name),
        pl.col(feature).std().alias(temp_std_name)
    ])
    df = df.join(group_stats, on='srch_id')

    # Perform normalization and create new column
    df = df.with_columns(
        ((pl.col(feature) - pl.col(temp_mean_name)) / pl.col(temp_std_name))
        .fill_nan(0)  # Handle division by zero or missing std dev
        .alias(f'normalized_{feature}')
    )

    # Drop temporary columns to prevent duplicates
    df = df.drop([temp_mean_name, temp_std_name])

# 4. Engineer the 'hotel_quality' feature using a weighted sum of normalized features
weights = {
    'normalized_prop_starrating': 0.3,
    'normalized_score1d2': 0.4,
    'normalized_prop_review_score': 0.3
}

weighted_features = [pl.col(feature) * weight for feature, weight in weights.items()]
df = df.with_columns(
    sum(weighted_features).alias('hotel_quality')
)

# 5. Normalize the 'hotel_quality' score to range between 0 and 1
df = df.with_columns(
    ((df['hotel_quality'] - df['hotel_quality'].min()) / (df['hotel_quality'].max() - df['hotel_quality'].min())).alias('normalized_hotel_quality')
)

## Create price features

In [35]:
# 1. Price per Person
df = df.with_columns(
    ((df["price_usd"] * df["srch_room_count"]) / 
     (df["srch_adults_count"] + df["srch_children_count"]).fill_null(1)).alias("price_per_person")
)

# 2. Average Price per Day
df = df.with_columns(
    (df["price_usd"] / df["srch_length_of_stay"]).alias("avg_price_per_day")
)

# 3. UMP (User Margin Price)
df = df.with_columns(
    (pl.col("prop_log_historical_price").exp() - pl.col("price_usd")).alias("ump")
)

df = df.with_columns([
    pl.col("visitor_hist_adr_usd").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("visitor_hist_adr_usd"),
    pl.col("price_usd").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("price_usd"),
    pl.col("visitor_hist_starrating").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("visitor_hist_starrating"),
    pl.col("prop_starrating").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("prop_starrating"),
    pl.col("srch_room_count").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("srch_room_count"),
    pl.col("srch_query_affinity_score").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("srch_query_affinity_score")
])

df = df.with_columns([
    # 4. Price Difference
    pl.when(pl.col("visitor_hist_adr_usd").is_null())
       .then(None)
       .otherwise(pl.col("visitor_hist_adr_usd") - pl.col("price_usd")).alias("price_diff"),
    # 5. Star Rating Difference
    pl.when(pl.col("visitor_hist_starrating").is_null())
       .then(None)
       .otherwise(pl.col("visitor_hist_starrating") - pl.col("prop_starrating")).alias("starrating_diff")
])

# 6. Total Price
df = df.with_columns(
    (df["price_usd"] * df["srch_room_count"]).alias("total_price")
)

# 7. Score2MA
df = df.with_columns(
    (df["prop_location_score2"] * df["srch_query_affinity_score"]).alias("score2ma")
)

In [40]:
columns_to_drop = [
    'prop_location_score2',
    'prop_location_score1',
    'prop_starrating', 
    'prop_review_score',
    'price_usd',
    'srch_room_count',
    'prop_log_historical_price',
    'srch_length_of_stay',
    'srch_children_count',
    'srch_adults_count',
    'room_count',
    'srch_query_affinity_score'
]

df = df.drop(columns_to_drop)

def replace_with_nan(value):
    if value == "NULL" or value == "":
        return None 
    return value

for col in df.columns:
    if df[col].dtype == pl.Utf8:  # Apply only to string columns
        df = df.with_columns(
            df[col].map_elements(replace_with_nan, return_dtype=pl.Float64).cast(pl.Float64).alias(col)
        )
    # If it's already a numeric type but might contain "NULL" or "", handle accordingly
    elif df[col].dtype in [pl.Float64, pl.Float64]:
        df = df.with_columns(
            df[col].map_elements(lambda x: None if x in ["NULL", ""] else x, return_dtype=pl.Float64).cast(pl.Float64).alias(col)
        )
df.write_csv('../data/preprocessed/engineered_training_set.csv')

In [42]:
engineered_data = pl.read_csv('../data/preprocessed/engineered_training_set.csv')
engineered_data.describe()

statistic,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_brand_bool,position,promotion_flag,srch_destination_id,srch_booking_window,srch_saturday_night_bool,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,has_hist_starrating,has_hist_adr_usd,score1d2,normalized_prop_starrating,normalized_score1d2,normalized_prop_review_score,hotel_quality,normalized_hotel_quality,price_per_person,avg_price_per_day,ump,price_diff,starrating_diff,total_price,score2ma
str,f64,str,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,str
"""count""",4953228.0,"""0""",4953228.0,4953228.0,"""251792""","""252914""",4953228.0,4953228.0,4953228.0,4953228.0,4953228.0,4953228.0,4953228.0,4953228.0,"""0""",4953228.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""",4953228.0,"""0""",4953228.0,4953228.0,4953228.0,4953228.0,4953224.0,4953224.0,4953224.0,4953224.0,4953224.0,4953228.0,4953228.0,4953228.0,"""252914""","""251792""",4953228.0,"""0"""
"""null_count""",0.0,"""4953228""",0.0,0.0,"""4701436""","""4700314""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""4953228""",0.0,"""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""","""4953228""",0.0,"""4953228""",0.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,4.0,0.0,0.0,0.0,"""4700314""","""4701436""",0.0,"""4953228"""
"""mean""",166367.108221,,9.953445,175.369853,,,173.999283,70077.122916,0.634832,16.857549,0.215576,14042.200748,37.473272,0.502294,,0.295835,,,,,,,,,,,,,,,,,,,,,,,,,0.044751,,0.027916,1.0,1.0,0.0,1.3289e-09,0.0,5.8107e-09,2.1778e-09,0.57336,85.293109,92.34218,-0.981179,,,170.66645,
"""std""",96110.062236,,7.646976,65.904102,,,68.334449,40609.937868,0.481477,10.426479,0.411222,8111.447094,51.993395,0.499995,,0.456417,,,,,,,,,,,,,,,,,,,,,,,,,0.206756,,0.164733,0.0,0.0,0.0,0.970969,0.0,0.979434,0.484224,0.088648,82.613894,82.644123,107.787853,,,152.173464,
"""min""",1.0,,1.0,1.0,"""1.41""","""0.0""",1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,1.0,1.0,0.0,-5.833333,0.0,-5.43742,-3.131872,0.0,0.0,0.0,-1998.16,"""-0.009999999999990905""","""-0.009999999999999787""",0.0,
"""25%""",82936.0,,5.0,100.0,,,100.0,35010.0,0.0,8.0,0.0,7101.0,4.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,1.0,1.0,0.0,-0.645497,0.0,-0.466997,-0.294509,0.5194433,40.0,40.25,-24.529985,,,89.0,
"""50%""",166503.0,,5.0,219.0,,,219.0,69630.0,1.0,16.0,0.0,13541.0,17.0,1.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,1.0,1.0,0.0,0.040506,0.0,0.222286,0.057451,0.583878,63.04,71.486667,17.834287,,,129.0,
"""75%""",249721.0,,14.0,219.0,,,219.0,105165.0,1.0,26.0,0.0,21084.0,48.0,1.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,1.0,1.0,0.0,0.6561673,0.0,0.662652,0.345082,0.6365348,103.0,118.63,50.295562,,,199.64,
"""max""",332785.0,,34.0,231.0,"""5.0""","""999.81""",230.0,140821.0,1.0,40.0,1.0,28416.0,492.0,1.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,1.0,1.0,1.0,0.0,5.833334,0.0,5.294649,2.3304434,1.0,9150.0,1999.0,497.681251,"""997.0200000000001""","""5.0""",9150.0,
