In [1]:
import polars as pl
import numpy as np
import pandas as pd

In [2]:
raw_filepath = '../data/raw/test_set_VU_DM.csv'
engineered_filepath = "../data/preprocessed/engineered_test_set.csv"

In [3]:
df = pl.read_csv(raw_filepath)

# Convert NULL to None so polars can handle
df_pandas = df.to_pandas()
df_pandas = df_pandas.replace('NULL', pd.NA)
df = pl.from_pandas(df_pandas)

# Handling outliers in price_usd based on maximum value of visitor_hist_adr_usd
df = df.filter(pl.col('price_usd') < 2000)

# Create binary indicators for historical data to denote whether the historical booking data is available for a user.
df = df.with_columns([
    df['visitor_hist_starrating'].is_not_null().cast(pl.UInt8).alias('has_hist_starrating'),
    df['visitor_hist_adr_usd'].is_not_null().cast(pl.UInt8).alias('has_hist_adr_usd')
])

# Drop columns with more than 90% missing data
threshold = 0.9 * df.height

cols_to_drop = [col for col in df.columns if df[col].is_null().sum() > threshold]
cols_to_keep = ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'srch_query_affinity_score', 'gross_bookings_usd']
final_columns_to_drop = list(set(cols_to_drop) - set(cols_to_keep))
print(final_columns_to_drop)
df = df.drop(final_columns_to_drop)

['comp7_rate', 'comp7_rate_percent_diff', 'comp6_inv', 'comp1_inv', 'comp4_rate', 'comp4_inv', 'comp1_rate', 'comp6_rate_percent_diff', 'comp4_rate_percent_diff', 'comp1_rate_percent_diff', 'comp7_inv', 'comp6_rate', 'comp3_rate_percent_diff']


## Handling missing values in prop_location_score2 and prop_location_score1

In [4]:
df = df.with_columns(
    pl.col('prop_location_score2').map_elements(lambda x: None if x == 'NULL' else float(x), return_dtype=pl.Float64).alias('prop_location_score2')
)

# Calculate the first quartile of prop_location_score2 for each country
first_quartiles = df.group_by('prop_country_id').agg([
    pl.col('prop_location_score2').quantile(0.25).alias('first_quartile')
])

# Join this data back to the original dataframe
df = df.join(first_quartiles, on='prop_country_id', how='left')

# Fill missing prop_location_score2 values with the first quartile value for the respective country
df = df.with_columns(
    pl.when(pl.col('prop_location_score2').is_null())
    .then(pl.col('first_quartile'))
    .otherwise(pl.col('prop_location_score2'))
    .alias('prop_location_score2')
)

# Optionally, remove the temporary 'first_quartile' column if it's no longer needed
df = df.drop('first_quartile')



In [5]:
df = df.with_columns(
    pl.col('prop_location_score1').map_elements(lambda x: None if x == 'NULL' else float(x), return_dtype=pl.Float64).alias('prop_location_score1')
)

# Calculate the first quartile of prop_location_score1 for each country
first_quartiles = df.group_by('prop_country_id').agg([
    pl.col('prop_location_score1').quantile(0.25).alias('first_quartile')
])

# Join this data back to the original dataframe
df = df.join(first_quartiles, on='prop_country_id', how='left')

# Fill missing prop_location_score1 values with the first quartile value for the respective country
df = df.with_columns(
    pl.when(pl.col('prop_location_score1').is_null())
    .then(pl.col('first_quartile'))
    .otherwise(pl.col('prop_location_score1'))
    .alias('prop_location_score1')
)

# Optionally, remove the temporary 'first_quartile' column if it's no longer needed
df = df.drop('first_quartile')

## Creating hotel_quality feature

In [6]:
# Create a score difference between prop_location_score2 and prop_location_score1
df = df.with_columns([
    df["prop_location_score2"].map_elements(lambda x: None if x == "NULL" else x, return_dtype=pl.Float64).cast(pl.Float64).fill_nan(0).alias("prop_location_score2"),
    df["prop_location_score1"].map_elements(lambda x: None if x == "NULL" else x, return_dtype=pl.Float64).cast(pl.Float64).fill_nan(0).alias("prop_location_score1")
])
df = df.with_columns(
    ((df["prop_location_score2"] + 0.0001) / (df["prop_location_score1"] + 0.0001)).alias("score1d2")
)

# 3. Normalize features within each 'srch_id' group
features_to_normalize = ['prop_starrating', 'score1d2', 'prop_review_score']
for feature in features_to_normalize:
    df = df.with_columns(
        df[feature].map_elements(lambda x: None if x == "NULL" else x, return_dtype=pl.Float64)
             .cast(pl.Float64)
             .fill_null(0) 
             .alias(feature)
    )

for feature in features_to_normalize:
    temp_mean_name = f'{feature}_mean_temp'
    temp_std_name = f'{feature}_std_temp'
    group_stats = df.group_by('srch_id').agg([
        pl.col(feature).mean().alias(temp_mean_name),
        pl.col(feature).std().alias(temp_std_name)
    ])
    df = df.join(group_stats, on='srch_id')

    # Perform normalization and create new column
    df = df.with_columns(
        ((pl.col(feature) - pl.col(temp_mean_name)) / (pl.col(temp_std_name) + 0.00001))
        .fill_nan(0)  # Handle division by zero or missing std dev
        .alias(f'normalized_{feature}')
    )

    # Drop temporary columns to prevent duplicates
    df = df.drop([temp_mean_name, temp_std_name])

# 4. Engineer the 'hotel_quality' feature using a weighted sum of normalized features
weights = {
    'normalized_prop_starrating': 0.3,
    'normalized_score1d2': 0.4,
    'normalized_prop_review_score': 0.3
}

weighted_features = [pl.col(feature) * weight for feature, weight in weights.items()]
df = df.with_columns(
    sum(weighted_features).alias('hotel_quality')
)

# 5. Normalize the 'hotel_quality' score to range between 0 and 1
df = df.with_columns(
    ((df['hotel_quality'] - df['hotel_quality'].min()) / (df['hotel_quality'].max() - df['hotel_quality'].min())).alias('normalized_hotel_quality')
)

## Create price features

In [7]:
# 1. Price per Person
df = df.with_columns(
    ((df["price_usd"] * df["srch_room_count"]) / 
     (df["srch_adults_count"] + df["srch_children_count"]).fill_null(1)).alias("price_per_person")
)

# 2. Average Price per Day
df = df.with_columns(
    (df["price_usd"] / df["srch_length_of_stay"]).alias("avg_price_per_day")
)

# 3. UMP (User Margin Price)
df = df.with_columns(
    (pl.col("prop_log_historical_price").exp() - pl.col("price_usd")).alias("ump")
)

df = df.with_columns([
    pl.col("visitor_hist_adr_usd").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("visitor_hist_adr_usd"),
    pl.col("price_usd").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("price_usd"),
    pl.col("visitor_hist_starrating").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("visitor_hist_starrating"),
    pl.col("prop_starrating").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("prop_starrating"),
    pl.col("srch_room_count").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("srch_room_count"),
    pl.col("srch_query_affinity_score").map_elements(lambda x: None if x in ["NULL", ""] else float(x), return_dtype=pl.Float64).alias("srch_query_affinity_score")
])

df = df.with_columns([
    # 4. Price Difference
    pl.when(pl.col("visitor_hist_adr_usd").is_null())
       .then(None)
       .otherwise(pl.col("visitor_hist_adr_usd") - pl.col("price_usd")).alias("price_diff"),
    # 5. Star Rating Difference
    pl.when(pl.col("visitor_hist_starrating").is_null())
       .then(None)
       .otherwise(pl.col("visitor_hist_starrating") - pl.col("prop_starrating")).alias("starrating_diff")
])

# 6. Total Price
df = df.with_columns(
    (df["price_usd"] * df["srch_room_count"]).alias("total_price")
)

## Promotion feature

In [8]:
# Count how many promotions are flagged within each search
promotion_agg = df.group_by("srch_id").agg([
    pl.col("promotion_flag").sum().alias("promotion_count")
])
df = df.join(promotion_agg, on="srch_id", how="left")

# Check if any property was on promotion per srch_id
promotion_any = df.group_by("srch_id").agg([
    (pl.col("promotion_flag").max() > 0).cast(pl.UInt8).alias("promotion_any")
])

df = df.join(promotion_any, on="srch_id", how="left")

## Ranking features

In [9]:
# Group by 'srch_id' and rank 'price_usd' in ascending order (lowest to highest price)
df = df.with_columns([
    pl.col("price_usd").rank("dense", descending=False).over("srch_id").alias("price_rank")
])

# Group by 'srch_id' and rank 'prop_starrating' in descending order (highest to lowest rating)
df = df.with_columns([
    pl.col("prop_starrating").rank("dense", descending=True).over("srch_id").alias("star_rank")
])

In [10]:
columns_to_drop = [
    'prop_location_score1',
    'prop_starrating', 
    'prop_review_score',
    'price_usd',
    'srch_room_count',
    'prop_log_historical_price',
    'srch_length_of_stay',
    'srch_children_count',
    'srch_adults_count',
    'room_count',
    'promotion_flag'
]

df = df.drop(columns_to_drop)

drop_competitors = [col for col in df.columns if col.startswith("comp") and 
                   ("rate" in col or "inv" in col or "percent_diff" in col)]

df = df.drop(drop_competitors)

def replace_with_nan(value):
    try:
        return float(value)
    except ValueError:
        return None

for col in df.columns:
    if df[col].dtype == pl.Utf8:
        # Convert string columns, replacing non-convertible strings with NaN
        df = df.with_columns(
            df[col].map_elements(replace_with_nan, return_dtype=pl.Float64).alias(col)
        )
    elif df[col].dtype == pl.Float64:
        # For numeric columns, just ensure that they do not contain invalid strings
        df = df.with_columns(
            df[col].map_elements(lambda x: x if isinstance(x, float) else None, return_dtype=pl.Float64).alias(col)
        )


df.write_csv(engineered_filepath)

In [11]:
engineered_data = pl.read_csv(engineered_filepath)
engineered_data.describe()

statistic,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_brand_bool,prop_location_score2,srch_destination_id,srch_booking_window,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,has_hist_starrating,has_hist_adr_usd,score1d2,normalized_prop_starrating,normalized_score1d2,normalized_prop_review_score,hotel_quality,normalized_hotel_quality,price_per_person,avg_price_per_day,ump,price_diff,starrating_diff,total_price,promotion_count,promotion_any,price_rank,star_rank
str,f64,str,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64
"""count""",4953797.0,"""0""",4953797.0,4953797.0,"""253366""","""254558""",4953797.0,4953797.0,4953797.0,4953792.0,4953797.0,4953797.0,4953797.0,"""318039""",3348504.0,4953797.0,4953797.0,4953797.0,4953797.0,4953789.0,4953789.0,4953789.0,4953789.0,4953789.0,4953797.0,4953797.0,4953797.0,"""254558""","""253366""",4953797.0,4953797.0,4953797.0,4953797.0,4953797.0
"""null_count""",0.0,"""4953797""",0.0,0.0,"""4700431""","""4699239""",0.0,0.0,0.0,5.0,0.0,0.0,0.0,"""4635758""",1605293.0,0.0,0.0,0.0,0.0,8.0,8.0,8.0,8.0,8.0,0.0,0.0,0.0,"""4699239""","""4700431""",0.0,0.0,0.0,0.0,0.0
"""mean""",166650.050685,,9.979691,175.288132,,,173.863529,70079.371657,0.634046,0.106352,14068.943352,37.773908,0.49836,,1312.250238,0.296867,0.051146,0.051386,14.55101,-1.9346e-17,-6.2537e-17,8.9503e-19,-2.1785e-17,0.512731,85.409712,92.329614,-1.050751,,,170.782663,6.287903,0.899962,12.118226,2.160134
"""std""",96152.186935,,7.668162,65.887558,,,68.33938,40613.125643,0.481697,0.148208,8118.386941,52.226439,0.499997,,2036.052064,0.456878,0.220295,0.220785,148.354285,0.970866,0.979175,0.979443,0.634607,0.076053,83.267017,83.261527,107.82569,,,153.171551,5.904002,0.300051,7.403705,0.861888
"""min""",1.0,,1.0,1.0,"""1.0""","""0.0""",1.0,1.0,0.0,0.0,2.0,0.0,0.0,"""-10.0""",0.01,0.0,0.0,0.0,0.0,-5.746709,-5.511471,-5.382605,-4.278366,0.0,0.0,0.0,-1994.0,"""-0.009999999999990905""","""-0.009999999999999787""",0.0,0.0,0.0,1.0,1.0
"""25%""",83302.0,,5.0,100.0,,,100.0,35020.0,0.0,0.0182,7100.0,4.0,0.0,,142.22,0.0,0.0,0.0,0.007596,-0.644377,-0.37811,-0.465259,-0.3838,0.466735,40.0,40.04,-25.0,,,89.0,2.0,1.0,6.0,2.0
"""50%""",167108.0,,5.0,219.0,,,219.0,69607.0,1.0,0.0379,13609.0,17.0,0.0,,387.61,0.0,0.0,0.0,0.019333,0.041898,-0.220267,0.221822,0.016396,0.514696,63.33,71.45,17.701184,,,129.0,5.0,1.0,11.0,2.0
"""75%""",249977.0,,14.0,219.0,,,219.0,105179.0,1.0,0.138,21160.0,49.0,1.0,,1516.76,1.0,0.0,0.0,0.054735,0.656159,0.063656,0.663191,0.356175,0.555416,103.0,118.56,50.094058,,,199.65,9.0,1.0,18.0,3.0
"""max""",332787.0,,34.0,231.0,"""5.0""","""99.98""",230.0,140821.0,1.0,1.0,28416.0,498.0,1.0,"""-99.9883""",11692.98,1.0,1.0,1.0,9895.0,5.746709,6.002114,5.479458,4.065903,1.0,13162.08,1999.9,497.261251,"""999.88""","""5.0""",13162.08,36.0,1.0,35.0,6.0


## Convert str datatype to float

In [12]:
for col_name in engineered_data.columns:
    if engineered_data[col_name].dtype == pl.Utf8:
        engineered_data = engineered_data.with_columns(
            engineered_data[col_name]
            .str.replace("NULL", "NaN") 
            .str.replace("N/A", "NaN")   
            .cast(pl.Float64)          
            .alias(col_name)
        )

engineered_data.write_csv(engineered_filepath)
engineered_data.describe()

statistic,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_brand_bool,prop_location_score2,srch_destination_id,srch_booking_window,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,has_hist_starrating,has_hist_adr_usd,score1d2,normalized_prop_starrating,normalized_score1d2,normalized_prop_review_score,hotel_quality,normalized_hotel_quality,price_per_person,avg_price_per_day,ump,price_diff,starrating_diff,total_price,promotion_count,promotion_any,price_rank,star_rank
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",4953797.0,0.0,4953797.0,4953797.0,253366.0,254558.0,4953797.0,4953797.0,4953797.0,4953792.0,4953797.0,4953797.0,4953797.0,318039.0,3348504.0,4953797.0,4953797.0,4953797.0,4953797.0,4953789.0,4953789.0,4953789.0,4953789.0,4953789.0,4953797.0,4953797.0,4953797.0,254558.0,253366.0,4953797.0,4953797.0,4953797.0,4953797.0,4953797.0
"""null_count""",0.0,4953797.0,0.0,0.0,4700431.0,4699239.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,4635758.0,1605293.0,0.0,0.0,0.0,0.0,8.0,8.0,8.0,8.0,8.0,0.0,0.0,0.0,4699239.0,4700431.0,0.0,0.0,0.0,0.0,0.0
"""mean""",166650.050685,,9.979691,175.288132,3.374849,177.138355,173.863529,70079.371657,0.634046,0.106352,14068.943352,37.773908,0.49836,-24.460135,1312.250238,0.296867,0.051146,0.051386,14.55101,-1.9346e-17,-6.2537e-17,8.9503e-19,-2.1785e-17,0.512731,85.409712,92.329614,-1.050751,23.699542,0.11442,170.782663,6.287903,0.899962,12.118226,2.160134
"""std""",96152.186935,,7.668162,65.887558,0.691904,109.586905,68.33938,40613.125643,0.481697,0.148208,8118.386941,52.226439,0.499997,15.869085,2036.052064,0.456878,0.220295,0.220785,148.354285,0.970866,0.979175,0.979443,0.634607,0.076053,83.267017,83.261527,107.82569,141.427106,1.107542,153.171551,5.904002,0.300051,7.403705,0.861888
"""min""",1.0,,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,-250.7864,0.01,0.0,0.0,0.0,0.0,-5.746709,-5.511471,-5.382605,-4.278366,0.0,0.0,0.0,-1994.0,-1829.3,-4.0,0.0,0.0,0.0,1.0,1.0
"""25%""",83302.0,,5.0,100.0,2.93,111.01,100.0,35020.0,0.0,0.0182,7100.0,4.0,0.0,-31.3713,142.22,0.0,0.0,0.0,0.007596,-0.644377,-0.37811,-0.465259,-0.3838,0.466735,40.0,40.04,-25.0,-32.01,-0.57,89.0,2.0,1.0,6.0,2.0
"""50%""",167108.0,,5.0,219.0,3.43,151.38,219.0,69607.0,1.0,0.0379,13609.0,17.0,0.0,-20.5766,387.61,0.0,0.0,0.0,0.019333,0.041898,-0.220267,0.221822,0.016396,0.514696,63.33,71.45,17.701184,23.37,0.0,129.0,5.0,1.0,11.0,2.0
"""75%""",249977.0,,14.0,219.0,3.95,215.37,219.0,105179.0,1.0,0.138,21160.0,49.0,1.0,-13.4608,1516.76,1.0,0.0,0.0,0.054735,0.656159,0.063656,0.663191,0.356175,0.555416,103.0,118.56,50.094058,81.35,0.72,199.65,9.0,1.0,18.0,3.0
"""max""",332787.0,,34.0,231.0,5.0,2768.93,230.0,140821.0,1.0,1.0,28416.0,498.0,1.0,-2.4941,11692.98,1.0,1.0,1.0,9895.0,5.746709,6.002114,5.479458,4.065903,1.0,13162.08,1999.9,497.261251,2711.51,5.0,13162.08,36.0,1.0,35.0,6.0
