In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Data preprocessing and feature engineering
def load_and_preprocess_data(df, is_training=True, brand_map=None, model_map=None, encoders=None,
                             current_year=2024):
    """Clean a raw listings frame and add the engineered/encoded feature columns.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:
      1. Fill missing ``clean_title`` / ``accident`` with ``'Unknown'`` and
         missing ``fuel_type`` with that column's most frequent value.
      2. Training only: clip ``price`` at its 99th percentile.
      3. Target-encode ``brand`` and ``model`` as the mean ``price`` per
         category (``brand_encoded`` / ``model_encoded``).
      4. Parse ``horsepower``, ``displacement`` and ``cylinders`` out of the
         free-text ``engine`` column; fill gaps with the column median.
      5. Derive ``car_age``, ``mileage_per_year`` and ``hp_per_liter``.
      6. Label-encode the remaining categoricals into ``<col>_encoded``.

    Args:
        df: Frame with the columns ``brand``, ``model``, ``model_year``,
            ``milage``, ``fuel_type``, ``engine``, ``transmission``,
            ``ext_col``, ``int_col``, ``clean_title`` and ``accident``
            (plus ``price`` when ``is_training`` is true).
        is_training: When true, the encoding maps and label encoders are
            fitted on ``df``; when false, the supplied ones are applied.
        brand_map: Mapping brand -> mean price. Required when not training.
        model_map: Mapping model -> mean price. Required when not training.
        encoders: Mapping column name -> fitted encoder exposing ``classes_``
            and ``transform``. Required when not training.
        current_year: Reference year used to compute ``car_age``.

    Returns:
        Tuple ``(df, brand_map, model_map, encoders)``: the processed copy
        and the fitted (training) or passed-through (inference) artefacts.

    Raises:
        ValueError: If a required map/encoder is missing in inference mode,
            or ``price`` is missing in training mode.

    Note:
        Engine-feature medians (step 4) are computed from the frame being
        processed, so at inference time they come from the inference data
        rather than from the training data.
    """
    if is_training:
        if 'price' not in df.columns:
            raise ValueError("training mode requires a 'price' column")
    else:
        required = (('brand_map', brand_map), ('model_map', model_map), ('encoders', encoders))
        missing = [name for name, value in required if value is None]
        if missing:
            raise ValueError(
                f"is_training=False requires fitted values for: {', '.join(missing)}"
            )

    df = df.copy()

    # handle missing data intelligently
    df['clean_title'] = df['clean_title'].fillna('Unknown')
    df['accident'] = df['accident'].fillna('Unknown')
    if df['fuel_type'].isna().any():
        fuel_modes = df['fuel_type'].mode()
        # mode() is empty when every value is missing; fall back to the same
        # placeholder used for the other categorical columns.
        fuel_fill = fuel_modes.iloc[0] if not fuel_modes.empty else 'Unknown'
        df['fuel_type'] = df['fuel_type'].fillna(fuel_fill)

    # cap price outliers (training only)
    if is_training:
        price_cap = df['price'].quantile(0.99)
        df['price'] = df['price'].clip(upper=price_cap)
        print(f"price capped at ${price_cap:,.0f}")

    # target encoding for high-cardinality categoricals
    if is_training:
        # create encoding maps from training data (uses the capped price)
        brand_map = df.groupby('brand')['price'].mean().to_dict()
        model_map = df.groupby('model')['price'].mean().to_dict()

        df['brand_encoded'] = df['brand'].map(brand_map)
        df['model_encoded'] = df['model'].map(model_map)
    else:
        # apply training maps; unseen categories get the mean of the
        # per-category means stored in the map
        global_brand_mean = np.mean(list(brand_map.values()))
        global_model_mean = np.mean(list(model_map.values()))

        df['brand_encoded'] = df['brand'].map(brand_map).fillna(global_brand_mean)
        df['model_encoded'] = df['model'].map(model_map).fillna(global_model_mean)

    # parse engine features from text; rows without a match are filled with
    # the median of the values that did parse
    engine_patterns = {
        'horsepower': r'(\d+\.?\d*)HP',
        'displacement': r'(\d+\.?\d*)L',
        'cylinders': r'(\d+) Cylinder',
    }
    for feature, pattern in engine_patterns.items():
        # expand=False yields a Series instead of a one-column DataFrame
        parsed = df['engine'].str.extract(pattern, expand=False).astype(float)
        df[feature] = parsed.fillna(parsed.median())

    # create additional engineered features
    df['car_age'] = current_year - df['model_year']
    # Floor the denominator at 1 so a model_year later than current_year
    # cannot produce a zero or negative divisor (inf / sign-flipped values).
    years_in_use = (df['car_age'] + 1).clip(lower=1)
    df['mileage_per_year'] = df['milage'] / years_in_use
    df['hp_per_liter'] = df['horsepower'] / (df['displacement'] + 0.1)  # avoid division by zero

    # encode remaining categorical variables
    categorical_cols = ['fuel_type', 'transmission', 'ext_col', 'int_col', 'clean_title', 'accident']

    if is_training:
        # create encoders from training data
        encoders = {}
        for col in categorical_cols:
            le = LabelEncoder()
            df[f'{col}_encoded'] = le.fit_transform(df[col])
            encoders[col] = le
    else:
        # apply training encoders to test data
        for col in categorical_cols:
            le = encoders[col]
            # categories the encoder never saw are mapped to its first class
            known = df[col].isin(le.classes_)
            safe_values = df[col].where(known, le.classes_[0])
            df[f'{col}_encoded'] = le.transform(safe_values)

    return df, brand_map, model_map, encoders