In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans # for clustering routes
import statsmodels.api as sm # useful for regression diagnostics

from econml.dml import LinearDML, CausalForestDML
from econml.cate_interpreter import SingleTreeCateInterpreter

# --- Plotting Configuration ---
# Notebook-wide plot appearance, set once so every later figure looks the same.
# Select the 'seaborn-v0_8-whitegrid' style sheet by name through matplotlib.
plt.style.use('seaborn-v0_8-whitegrid')
# Use seaborn's "notebook" plotting context with font sizes scaled by 1.2.
sns.set_context("notebook", font_scale=1.2)
%matplotlib inline

In [60]:
def clean_numeric_col(series, decimal=',', thousands=None):
    """Convert a column of formatted number strings into a numeric Series.

    Text columns are normalised before parsing: percent signs are removed,
    an optional thousands separator is deleted, and the decimal mark is
    rewritten to '.'. Non-text columns are handed to ``pd.to_numeric``
    unchanged.

    Parameters
    ----------
    series : pandas.Series
        Column to convert.
    decimal : str, default ','
        Character used as the decimal mark in the text. The default keeps
        the historical behaviour of this helper ('12,5' -> 12.5).
    thousands : str or None, default None
        Character used as a thousands separator; it is removed before the
        decimal mark is handled. Pass ``decimal='.', thousands=','`` for
        values such as '32,575' or '1,234.5'.

    Returns
    -------
    pandas.Series
        Numeric series; values that cannot be parsed become NaN.

    Raises
    ------
    ValueError
        If ``thousands`` and ``decimal`` are the same character, because the
        two roles could not be told apart.
    """
    if thousands is not None and thousands == decimal:
        raise ValueError("'thousands' and 'decimal' must be different characters")

    # Text may arrive as classic object dtype or as a pandas string dtype;
    # checking only for 'object' would silently skip cleaning for the latter.
    is_text = series.dtype == 'object' or isinstance(series.dtype, pd.StringDtype)
    if is_text:
        series = series.str.replace('%', '', regex=False)
        # Thousands separators must go first: once the decimal mark has been
        # rewritten to '.', a ',' separator could no longer be distinguished.
        if thousands:
            series = series.str.replace(thousands, '', regex=False)
        if decimal != '.':
            series = series.str.replace(decimal, '.', regex=False)
    return pd.to_numeric(series, errors='coerce')

In [61]:
# Read both processed tables with identical parser settings
# (comma-delimited fields, '.' as the decimal mark).
df_t1, df_t7 = (
    pd.read_csv(csv_path, sep=',', decimal='.')
    for csv_path in (
        '../datasets/processed/df1.csv',
        '../datasets/processed/df7.csv',
    )
)

In [62]:
# Purpose of this cell: turn the city-market id columns into nullable integers,
# then attach the per-city statistics from df_t7 to every route in df_t1, once
# for the origin city (citymarketid_1) and once for the destination city
# (citymarketid_2). The result is stored in `df`.


def _strip_thousands_commas(series):
    """Return `series` as whitespace-stripped strings with every ',' removed.

    The ids are stored as text such as '32,575', where the comma is only a
    thousands separator and would make numeric parsing fail.
    """
    return series.astype(str).str.strip().str.replace(',', '', regex=False)


def _to_nullable_int(series):
    """Parse `series` to pandas' nullable Int64; unparseable values become <NA>."""
    return pd.to_numeric(series, errors='coerce').astype('Int64')


def _to_market_id(series):
    """Convert an id column to Int64, stripping thousands commas from text input.

    Columns that are already numeric are converted directly so their values
    are not round-tripped through strings.
    """
    if pd.api.types.is_numeric_dtype(series):
        return _to_nullable_int(series)
    return _to_nullable_int(_strip_thousands_commas(series))


print("--- Inspecting df_t1['citymarketid_1'] before cleaning/conversion ---")
print("Data Type:", df_t1['citymarketid_1'].dtype)
print("Non-Null Count:", df_t1['citymarketid_1'].notna().sum())
print("Unique values (sample):")
unique_vals = df_t1['citymarketid_1'].unique()
print(unique_vals[:50]) # shows values like '32,575'


# --- Clean the string column BEFORE converting ---
print("\n--- Applying cleaning steps to citymarketid_1 ---")
print("Removing commas...")
df_t1['citymarketid_1'] = _strip_thousands_commas(df_t1['citymarketid_1'])

# Show the cleaned strings so the effect of the comma removal can be verified.
print("Unique values (sample) AFTER cleaning:")
unique_vals_cleaned = df_t1['citymarketid_1'].unique()
print(unique_vals_cleaned[:50])

# --- Convert to numeric AFTER cleaning ---
print("\n--- Attempting Conversion to Int64 ---")
df_t1['citymarketid_1'] = _to_nullable_int(df_t1['citymarketid_1'])
# The other two id columns get the same comma handling; without it any value
# formatted like '32,575' would be coerced to <NA> and could never match.
df_t1['citymarketid_2'] = _to_market_id(df_t1['citymarketid_2'])
df_t7['citymarketid'] = _to_market_id(df_t7['citymarketid'])


# --- Check conversion result ---
print("\n--- Checking df_t1['citymarketid_1'] AFTER conversion ---")
print("Data Type:", df_t1['citymarketid_1'].dtype)
print("Non-Null Count:", df_t1['citymarketid_1'].notna().sum())
print("Null Count:", df_t1['citymarketid_1'].isna().sum())


# --- Verify types before merge ---
print("\n--- Verifying types before merge ---")
print(f"df_t1 citymarketid_1 dtype: {df_t1['citymarketid_1'].dtype}, Non-Nulls: {df_t1['citymarketid_1'].notna().sum()}")
print(f"df_t1 citymarketid_2 dtype: {df_t1['citymarketid_2'].dtype}, Non-Nulls: {df_t1['citymarketid_2'].notna().sum()}")
print(f"df_t7 citymarketid dtype: {df_t7['citymarketid'].dtype}, Non-Nulls: {df_t7['citymarketid'].notna().sum()}")


# --- Proceed with merge only if conversion was successful ---
# Everything that depends on the converted ids lives inside this branch, so a
# failed conversion reports the problem instead of hitting undefined names, and
# the merge runs whether or not df_t7 happens to contain duplicate keys.
if df_t1['citymarketid_1'].notna().sum() > 0:
    print("\nConversion appears successful, proceeding with merge...")

    # Ensure 'Year', 'quarter' have matching types on both sides of the join.
    for frame in (df_t1, df_t7):
        for col in ('Year', 'quarter'):
            frame[col] = _to_nullable_int(frame[col])

    key_cols = ['Year', 'quarter', 'citymarketid']

    # Keep only the T7 columns needed and give them neutral "city_" names.
    df_t7_clean = df_t7[key_cols + ['TotalPerPrem', 'TotalPerLFMkts']].rename(
        columns={'TotalPerPrem': 'city_fare_premium',
                 'TotalPerLFMkts': 'city_perc_lcc_pax'})
    # pandas matches missing join keys to each other, so rows without a
    # complete key are dropped rather than being attached to unrelated routes.
    df_t7_final = df_t7_clean.dropna(subset=key_cols)

    print(f"Number of rows in df_t7_final before duplicate check: {len(df_t7_final)}")

    # --- Check for duplicates based on merge keys ---
    num_duplicates = df_t7_final.duplicated(subset=key_cols).sum()
    print(f"Number of duplicate key sets found in df_t7_final: {num_duplicates}")

    if num_duplicates > 0:
        print("Handling duplicates by averaging values...")
        value_cols = ['city_fare_premium', 'city_perc_lcc_pax']
        # One row per (Year, quarter, citymarketid), holding the mean of each value.
        df_t7_final = df_t7_final.groupby(key_cols, as_index=False)[value_cols].mean()
        print(f"Number of rows in df_t7_final after averaging duplicates: {len(df_t7_final)}")

    # Attach the city statistics twice: first for the origin, then for the
    # destination. Renaming the lookup columns up front avoids suffix juggling
    # and leaves no helper key columns to drop afterwards.
    df_merged = df_t1
    for id_col, prefix in (('citymarketid_1', 'origin'), ('citymarketid_2', 'dest')):
        city_stats = df_t7_final.rename(columns={
            'citymarketid': id_col,
            'city_fare_premium': f'{prefix}_fare_premium',
            'city_perc_lcc_pax': f'{prefix}_perc_lcc_pax',
        })
        # validate= makes pandas raise if the lookup keys are not unique,
        # which would otherwise silently duplicate route rows.
        df_merged = df_merged.merge(city_stats,
                                    on=['Year', 'quarter', id_col],
                                    how='left',
                                    validate='many_to_one')
    assert len(df_merged) == len(df_t1), "left merges must not change the row count"

    # Create route_id
    df_merged['route_id'] = df_merged['citymarketid_1'].astype(str) + '_' + df_merged['citymarketid_2'].astype(str)

    # Assign to df and check results
    df = df_merged.copy()
    print("\n--- Final merged DataFrame info: ---")
    df.info()
    print("\n--- Final missing values AFTER MERGE: ---")
    print(df.isnull().sum())

    # Handle remaining NAs (from non-matches)
    # NOTE(review): after this step a route with no matching city is
    # indistinguishable from a genuine value of 0 - confirm that is intended.
    fill_cols = ['origin_fare_premium', 'origin_perc_lcc_pax', 'dest_fare_premium', 'dest_perc_lcc_pax']
    df[fill_cols] = df[fill_cols].fillna(0) # fill with 0 for no matching city market ids

    print("\n--- Final missing values AFTER HANDLING: ---")
    print(df.isnull().sum())

else:
    print("\nMerge skipped: df_t1['citymarketid_1'] conversion still failed after cleaning.")
    print("If commas were removed and it still fails, re-inspect unique values for other issues.")

--- Inspecting df_t1['citymarketid_1'] before cleaning/conversion ---
Data Type: object
Non-Null Count: 115032
Unique values (sample):
['32,575' '32,467' '31,703' '30,977' '30,397' '32,211' '30,721' '32,457'
 '30,194' '33,570' '30,325' '31,454' '31,453' '31,057' '31,295' '30,693'
 '30,466' '34,057' '31,650' '30,559' '30,423' '33,195' '33,192' '31,714'
 '30,994' '33,495' '31,136' '30,792' '34,614' '32,337' '31,066' '30,647'
 '34,100' '31,135' '33,105' '30,529' '31,123' '33,198' '30,713' '33,342'
 '33,214' '30,852' '34,492' '30,140' '34,321' '30,154' '30,198' '33,044'
 '31,995' '33,244']

--- Applying cleaning steps to citymarketid_1 ---
Removing commas...
Unique values (sample) AFTER cleaning:
['32575' '32467' '31703' '30977' '30397' '32211' '30721' '32457' '30194'
 '33570' '30325' '31454' '31453' '31057' '31295' '30693' '30466' '34057'
 '31650' '30559' '30423' '33195' '33192' '31714' '30994' '33495' '31136'
 '30792' '34614' '32337' '31066' '30647' '34100' '31135' '33105' '30529'
 '3112