In [1]:
import polars as pl
import numpy as np
import pandas as pd

In [2]:
# df = pl.read_csv('../data/raw/training_set_VU_DM.csv')
df = pl.read_csv('../data/raw/test_set_VU_DM.csv')

# Handling outliers in price_usd based on maximum value of visitor_hist_adr_usd
# df = df.filter(pl.col('price_usd') < 2000)

# Create binary indicators for historical data to denote whether the historical booking data is available for a user.
df = df.with_columns([
    df['visitor_hist_starrating'].is_not_null().cast(pl.UInt8).alias('has_hist_starrating'),
    df['visitor_hist_adr_usd'].is_not_null().cast(pl.UInt8).alias('has_hist_adr_usd')
])

# Drop columns with more than 90% missing data
for col in df.columns:
    if df[col].is_null().sum() / df.height > 0.9:
        df = df.drop(col)

## Creating hotel_quality feature

In [3]:
# Create a score difference between prop_location_score2 and prop_location_score1
df = df.with_columns([
    df["prop_location_score2"].map_elements(lambda x: None if x == "NULL" else x, return_dtype=pl.Float64).cast(pl.Float64).fill_nan(0).alias("prop_location_score2"),
    df["prop_location_score1"].map_elements(lambda x: None if x == "NULL" else x, return_dtype=pl.Float64).cast(pl.Float64).fill_nan(0).alias("prop_location_score1")
])
df = df.with_columns(
    ((df["prop_location_score2"] + 0.0001) / (df["prop_location_score1"] + 0.0001)).alias("score1d2")
)

# 3. Normalize features within each 'srch_id' group
features_to_normalize = ['prop_starrating', 'score1d2', 'prop_review_score']
for feature in features_to_normalize:
    df = df.with_columns(
        df[feature].map_elements(lambda x: None if x == "NULL" else x, return_dtype=pl.Float32)
             .cast(pl.Float32)
             .fill_null(0) 
             .alias(feature)
    )

for feature in features_to_normalize:
    temp_mean_name = f'{feature}_mean_temp'
    temp_std_name = f'{feature}_std_temp'
    group_stats = df.group_by('srch_id').agg([
        pl.col(feature).mean().alias(temp_mean_name),
        pl.col(feature).std().alias(temp_std_name)
    ])
    df = df.join(group_stats, on='srch_id')

    # Perform normalization and create new column
    df = df.with_columns(
        ((pl.col(feature) - pl.col(temp_mean_name)) / pl.col(temp_std_name))
        .fill_nan(0)  # Handle division by zero or missing std dev
        .alias(f'normalized_{feature}')
    )

    # Drop temporary columns to prevent duplicates
    df = df.drop([temp_mean_name, temp_std_name])

# 4. Engineer the 'hotel_quality' feature using a weighted sum of normalized features
weights = {
    'normalized_prop_starrating': 0.3,
    'normalized_score1d2': 0.4,
    'normalized_prop_review_score': 0.3
}

weighted_features = [pl.col(feature) * weight for feature, weight in weights.items()]
df = df.with_columns(
    sum(weighted_features).alias('hotel_quality')
)

# 5. Normalize the 'hotel_quality' score to range between 0 and 1
df = df.with_columns(
    ((df['hotel_quality'] - df['hotel_quality'].min()) / (df['hotel_quality'].max() - df['hotel_quality'].min())).alias('normalized_hotel_quality')
)

## Create price features

In [4]:
# 1. Price per Person
df = df.with_columns(
    ((df["price_usd"] * df["srch_room_count"]) / 
     (df["srch_adults_count"] + df["srch_children_count"]).fill_null(1)).alias("price_per_person")
)

# 2. Average Price per Day
df = df.with_columns(
    (df["price_usd"] / df["srch_length_of_stay"]).alias("avg_price_per_day")
)

# 3. UMP (User Margin Price)
df = df.with_columns(
    (pl.col("prop_log_historical_price").exp() - pl.col("price_usd")).alias("ump")
)

df = df.with_columns([
    pl.col("visitor_hist_adr_usd").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("visitor_hist_adr_usd"),
    pl.col("price_usd").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("price_usd"),
    pl.col("visitor_hist_starrating").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("visitor_hist_starrating"),
    pl.col("prop_starrating").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("prop_starrating"),
    pl.col("srch_room_count").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("srch_room_count"),
    pl.col("srch_query_affinity_score").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("srch_query_affinity_score")
])

df = df.with_columns([
    # 4. Price Difference
    pl.when(pl.col("visitor_hist_adr_usd").is_null())
       .then(None)
       .otherwise(pl.col("visitor_hist_adr_usd") - pl.col("price_usd")).alias("price_diff"),
    # 5. Star Rating Difference
    pl.when(pl.col("visitor_hist_starrating").is_null())
       .then(None)
       .otherwise(pl.col("visitor_hist_starrating") - pl.col("prop_starrating")).alias("starrating_diff")
])

# 6. Total Price
df = df.with_columns(
    (df["price_usd"] * df["srch_room_count"]).alias("total_price")
)

# 7. Score2MA
df = df.with_columns(
    (df["prop_location_score2"] * df["srch_query_affinity_score"]).alias("score2ma")
)

## Promotion feature

In [5]:
# Count how many promotions are flagged within each search
promotion_agg = df.group_by("srch_id").agg([
    pl.col("promotion_flag").sum().alias("promotion_count")
])
df = df.join(promotion_agg, on="srch_id", how="left")

# Check if any property was on promotion per srch_id
promotion_any = df.group_by("srch_id").agg([
    (pl.col("promotion_flag").max() > 0).cast(pl.UInt8).alias("promotion_any")
])

df = df.join(promotion_any, on="srch_id", how="left")

In [6]:
columns_to_drop = [
    'prop_location_score2',
    'prop_location_score1',
    'prop_starrating', 
    'prop_review_score',
    'price_usd',
    'srch_room_count',
    'prop_log_historical_price',
    'srch_length_of_stay',
    'srch_children_count',
    'srch_adults_count',
    'room_count',
    'srch_query_affinity_score',
    'promotion_flag'
]

df = df.drop(columns_to_drop)

drop_competitors = [col for col in df.columns if col.startswith("comp") and 
                   ("rate" in col or "inv" in col or "percent_diff" in col)]

df = df.drop(drop_competitors)

def replace_with_nan(value):
    try:
        return float(value)
    except ValueError:
        return None

for col in df.columns:
    if df[col].dtype == pl.Utf8:
        # Convert string columns, replacing non-convertible strings with NaN
        df = df.with_columns(
            df[col].map_elements(replace_with_nan, return_dtype=pl.Float64).alias(col)
        )
    elif df[col].dtype == pl.Float64:
        # For numeric columns, just ensure that they do not contain invalid strings
        df = df.with_columns(
            df[col].map_elements(lambda x: x if isinstance(x, float) else None, return_dtype=pl.Float64).alias(col)
        )


# df.write_csv('../data/preprocessed/engineered_training_set.csv')
df.write_csv('../data/preprocessed/engineered_test_set.csv')

In [7]:
engineered_data = pl.read_csv('../data/preprocessed/engineered_test_set.csv')
engineered_data.describe()

statistic,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_brand_bool,srch_destination_id,srch_booking_window,srch_saturday_night_bool,orig_destination_distance,random_bool,has_hist_starrating,has_hist_adr_usd,score1d2,normalized_prop_starrating,normalized_score1d2,normalized_prop_review_score,hotel_quality,normalized_hotel_quality,price_per_person,avg_price_per_day,ump,price_diff,starrating_diff,total_price,score2ma,promotion_count,promotion_any
str,f64,str,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,str,f64,f64
"""count""",4959183.0,"""0""",4959183.0,4959183.0,"""253431""","""254624""",4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,3350504.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,"""254624""","""253431""",4959183.0,"""0""",4959183.0,4959183.0
"""null_count""",0.0,"""4959183""",0.0,0.0,"""4705752""","""4704559""",0.0,0.0,0.0,0.0,0.0,0.0,1608679.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""4704559""","""4705752""",0.0,"""4959183""",0.0,0.0
"""mean""",166646.023925,,9.979214,175.250352,,,173.837066,70081.112228,0.633909,14069.670932,37.773391,0.4983,1312.801653,0.296919,1.0,1.0,0.0,1.5765e-09,0.0,6.2023e-09,2.3341e-09,0.557897,120.166354,132.812844,-77.031889,,,255.960239,,6.291075,0.900008
"""std""",96149.918406,,7.667722,65.902836,,,68.350058,40613.628746,0.481735,8118.397545,52.221874,0.499997,2036.578173,0.456901,0.0,0.0,0.0,0.970889,0.0,0.979469,0.484133,0.089027,5281.944137,6299.901568,12463.162002,,,14121.362027,,5.904977,0.29999
"""min""",1.0,,1.0,1.0,"""1.0""","""0.0""",1.0,1.0,0.0,2.0,0.0,0.0,0.01,0.0,1.0,1.0,0.0,-5.747049,0.0,-5.382674,-3.033857,0.0,0.0,0.0,-9661000.0,"""-0.009999999999990905""","""-0.009999999999999787""",0.0,,0.0,0.0
"""25%""",83312.0,,5.0,100.0,,,100.0,35021.0,0.0,7100.0,4.0,0.0,142.28,0.0,1.0,1.0,0.0,-0.644389,0.0,-0.465098,-0.2947,0.5037048,40.0,40.12,-25.367592,,,89.0,,2.0,1.0
"""50%""",167095.0,,5.0,219.0,,,219.0,69607.0,1.0,13609.0,17.0,0.0,387.85,0.0,1.0,1.0,0.0,0.042003,0.0,0.221939,0.057866,0.568538,63.425,71.5,17.651304,,,129.0,,5.0,1.0
"""75%""",249966.0,,14.0,219.0,,,219.0,105179.0,1.0,21160.0,49.0,1.0,1517.56,1.0,1.0,1.0,0.0,0.6561673,0.0,0.663139,0.344956,0.6213313,103.21,118.95,50.024202,,,200.0,,9.0,1.0
"""max""",332787.0,,34.0,231.0,"""5.0""","""99.98""",230.0,140821.0,1.0,28416.0,498.0,1.0,11692.98,1.0,1.0,1.0,0.0,5.7470493,0.0,5.4800777,2.4041638,1.0,3220400.0,4830670.0,497.261251,"""999.88""","""5.0""",9661340.0,,36.0,1.0


## Convert str datatype to float

In [8]:
for col_name in engineered_data.columns:
    if engineered_data[col_name].dtype == pl.Utf8:
        engineered_data = engineered_data.with_columns(
            engineered_data[col_name]
            .str.replace("NULL", "NaN") 
            .str.replace("N/A", "NaN")   
            .cast(pl.Float64)          
            .alias(col_name)
        )

engineered_data.write_csv("../data/preprocessed/engineered_test_set.csv")
engineered_data.describe()

statistic,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_brand_bool,srch_destination_id,srch_booking_window,srch_saturday_night_bool,orig_destination_distance,random_bool,has_hist_starrating,has_hist_adr_usd,score1d2,normalized_prop_starrating,normalized_score1d2,normalized_prop_review_score,hotel_quality,normalized_hotel_quality,price_per_person,avg_price_per_day,ump,price_diff,starrating_diff,total_price,score2ma,promotion_count,promotion_any
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",4959183.0,0.0,4959183.0,4959183.0,253431.0,254624.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,3350504.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,254624.0,253431.0,4959183.0,0.0,4959183.0,4959183.0
"""null_count""",0.0,4959183.0,0.0,0.0,4705752.0,4704559.0,0.0,0.0,0.0,0.0,0.0,0.0,1608679.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4704559.0,4705752.0,0.0,4959183.0,0.0,0.0
"""mean""",166646.023925,,9.979214,175.250352,3.374933,177.15073,173.837066,70081.112228,0.633909,14069.670932,37.773391,0.4983,1312.801653,0.296919,1.0,1.0,0.0,1.5765e-09,0.0,6.2023e-09,2.3341e-09,0.557897,120.166354,132.812844,-77.031889,22.872424,0.114207,255.960239,,6.291075,0.900008
"""std""",96149.918406,,7.667722,65.902836,0.69192,109.592336,68.350058,40613.628746,0.481735,8118.397545,52.221874,0.499997,2036.578173,0.456901,0.0,0.0,0.0,0.970889,0.0,0.979469,0.484133,0.089027,5281.944137,6299.901568,12463.162002,154.52387,1.107724,14121.362027,,5.904977,0.29999
"""min""",1.0,,1.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.01,0.0,1.0,1.0,0.0,-5.747049,0.0,-5.382674,-3.033857,0.0,0.0,0.0,-9661000.0,-13946.72,-4.0,0.0,,0.0,0.0
"""25%""",83312.0,,5.0,100.0,2.93,111.01,100.0,35021.0,0.0,7100.0,4.0,0.0,142.28,0.0,1.0,1.0,0.0,-0.644389,0.0,-0.465098,-0.2947,0.5037048,40.0,40.12,-25.367592,-32.09,-0.57,89.0,,2.0,1.0
"""50%""",167095.0,,5.0,219.0,3.43,151.39,219.0,69607.0,1.0,13609.0,17.0,0.0,387.85,0.0,1.0,1.0,0.0,0.042003,0.0,0.221939,0.057866,0.568538,63.425,71.5,17.651304,23.34,0.0,129.0,,5.0,1.0
"""75%""",249966.0,,14.0,219.0,3.95,215.38,219.0,105179.0,1.0,21160.0,49.0,1.0,1517.56,1.0,1.0,1.0,0.0,0.6561673,0.0,0.663139,0.344956,0.6213313,103.21,118.95,50.024202,81.32,0.72,200.0,,9.0,1.0
"""max""",332787.0,,34.0,231.0,5.0,2768.93,230.0,140821.0,1.0,28416.0,498.0,1.0,11692.98,1.0,1.0,1.0,0.0,5.7470493,0.0,5.4800777,2.4041638,1.0,3220400.0,4830670.0,497.261251,2711.51,5.0,9661340.0,,36.0,1.0
