# In [None]: (notebook cell prompt left over from export)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

#
def load_and_preprocess_data(df, is_training=True, brand_map=None, model_map=None,
                             encoders=None, current_year=2024):
    """Clean a raw listings frame and add the engineered/encoded feature columns.

    The input frame is copied, never modified. In training mode the target
    encoding maps and label encoders are fitted on ``df``; in inference mode
    the previously fitted ones must be supplied and are only applied.

    Args:
        df: raw frame with the columns read below: ``brand``, ``model``,
            ``model_year``, ``milage``, ``fuel_type``, ``engine``,
            ``transmission``, ``ext_col``, ``int_col``, ``clean_title``,
            ``accident`` and, when training, ``price``.
        is_training: fit maps/encoders (True) or reuse the supplied ones (False).
        brand_map: ``{brand: mean price}`` from training; required for inference.
        model_map: ``{model: mean price}`` from training; required for inference.
        encoders: ``{column: fitted LabelEncoder}``; required for inference.
            Only the ``classes_`` attribute of each encoder is read.
        current_year: reference year used to derive ``car_age``.

    Returns:
        ``(processed_df, brand_map, model_map, encoders)``.

    Raises:
        KeyError: training mode and ``df`` has no ``price`` column.
        ValueError: inference mode and any of ``brand_map`` / ``model_map`` /
            ``encoders`` is None.

    NOTE(review): in inference mode the fuel_type mode and the engine-feature
    medians are still computed from the frame passed in, not from the training
    data, so imputed values can differ between training and prediction.
    """
    categorical_cols = ['fuel_type', 'transmission', 'ext_col', 'int_col', 'clean_title', 'accident']

    if not is_training:
        # fail fast with a clear message instead of an AttributeError deep inside
        missing_args = [
            arg_name
            for arg_name, arg_value in (
                ('brand_map', brand_map),
                ('model_map', model_map),
                ('encoders', encoders),
            )
            if arg_value is None
        ]
        if missing_args:
            raise ValueError(
                "is_training=False requires fitted " + ", ".join(missing_args)
            )

    df = df.copy()

    # handle missing data intelligently
    df['clean_title'] = df['clean_title'].fillna('Unknown')
    df['accident'] = df['accident'].fillna('Unknown')
    if df['fuel_type'].isna().any():
        # mode() is empty when every value is missing (e.g. a single-row frame)
        fuel_modes = df['fuel_type'].mode()
        fuel_fill = fuel_modes.iloc[0] if not fuel_modes.empty else 'Unknown'
        df['fuel_type'] = df['fuel_type'].fillna(fuel_fill)

    if is_training:
        if 'price' not in df.columns:
            raise KeyError("training data must contain a 'price' column")

        # cap price outliers (training only)
        price_cap = df['price'].quantile(0.99)
        df['price'] = df['price'].clip(upper=price_cap)
        print(f"price capped at ${price_cap:,.0f}")

        # target encoding for high-cardinality categoricals, fitted on the capped price
        brand_map = df.groupby('brand')['price'].mean().to_dict()
        model_map = df.groupby('model')['price'].mean().to_dict()

        df['brand_encoded'] = df['brand'].map(brand_map)
        df['model_encoded'] = df['model'].map(model_map)
    else:
        # apply training maps; unseen values fall back to the mean of the group means
        global_brand_mean = np.mean(list(brand_map.values()))
        global_model_mean = np.mean(list(model_map.values()))

        df['brand_encoded'] = df['brand'].map(brand_map).fillna(global_brand_mean)
        df['model_encoded'] = df['model'].map(model_map).fillna(global_model_mean)

    # parse engine features from text
    engine_text = df['engine']
    df['horsepower'] = engine_text.str.extract(r'(\d+\.?\d*)HP', expand=False).astype(float)
    df['displacement'] = engine_text.str.extract(r'(\d+\.?\d*)L', expand=False).astype(float)
    df['cylinders'] = engine_text.str.extract(r'(\d+) Cylinder', expand=False).astype(float)

    # handle missing engine features (stays NaN if nothing in the frame could be parsed)
    for engine_col in ('horsepower', 'displacement', 'cylinders'):
        df[engine_col] = df[engine_col].fillna(df[engine_col].median())

    # create additional engineered features
    df['car_age'] = current_year - df['model_year']
    df['mileage_per_year'] = df['milage'] / (df['car_age'] + 1)
    df['hp_per_liter'] = df['horsepower'] / (df['displacement'] + 0.1)  # avoid division by zero

    # encode remaining categorical variables
    if is_training:
        # create encoders from training data
        encoders = {}
        for col in categorical_cols:
            le = LabelEncoder()
            df[f'{col}_encoded'] = le.fit_transform(df[col])
            encoders[col] = le
    else:
        # apply training encoders: a label's code is its position in classes_,
        # and anything unseen gets code 0 (i.e. classes_[0]), as before
        for col in categorical_cols:
            class_to_code = {
                label: code for code, label in enumerate(encoders[col].classes_)
            }
            df[f'{col}_encoded'] = df[col].map(class_to_code).fillna(0).astype(int)

    return df, brand_map, model_map, encoders

#
def create_feature_matrix(df):
    """Return the model input columns of a preprocessed frame, in a fixed order.

    Raises KeyError if any expected column is absent from ``df``.
    """
    # grouped by origin; concatenation order is the column order the model sees
    raw_numeric = ['model_year', 'milage']
    target_encoded = ['brand_encoded', 'model_encoded']
    engine_parsed = ['horsepower', 'displacement', 'cylinders']
    derived = ['car_age', 'mileage_per_year', 'hp_per_liter']
    label_encoded = [
        'fuel_type_encoded',
        'transmission_encoded',
        'ext_col_encoded',
        'int_col_encoded',
        'clean_title_encoded',
        'accident_encoded',
    ]

    ordered_columns = raw_numeric + target_encoded + engine_parsed + derived + label_encoded
    return df[ordered_columns]

#
def train_model(X_train, y_train, X_val, y_val):
    """Fit an XGBoost regressor with early stopping on the validation set.

    Args:
        X_train, y_train: features and target used for fitting.
        X_val, y_val: hold-out set monitored for early stopping.

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    import inspect

    early_stopping_rounds = 20
    model_params = dict(
        n_estimators=300,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )
    fit_params = dict(eval_set=[(X_val, y_val)], verbose=False)

    # older xgboost takes early_stopping_rounds in fit(); newer releases
    # deprecated and then removed that keyword in favour of the constructor,
    # so passing it to fit() unconditionally raises TypeError there
    fit_signature = inspect.signature(xgb.XGBRegressor.fit)
    if 'early_stopping_rounds' in fit_signature.parameters:
        fit_params['early_stopping_rounds'] = early_stopping_rounds
    else:
        model_params['early_stopping_rounds'] = early_stopping_rounds

    model = xgb.XGBRegressor(**model_params)
    model.fit(X_train, y_train, **fit_params)

    return model


#
def main(
    train_path='/kaggle/input/hackathon-qualification/archive/train.csv',
    test_path='/kaggle/input/hackathon-qualification/archive/test.csv',
    submission_path='submission.csv',
):
    """
    main execution pipeline for car price prediction
    loads data, preprocesses, trains model, and creates submission

    Args:
        train_path: csv with the training rows (must include 'price').
        test_path: csv with the rows to predict (must include 'id').
        submission_path: where the 'id,price' submission csv is written.
    """
    print("loading training data...")
    train_df = pd.read_csv(train_path)
    print(f"training data shape: {train_df.shape}")

    print("loading test data...")
    test_df = pd.read_csv(test_path)
    print(f"test data shape: {test_df.shape}")

    # preprocess training data
    print("preprocessing training data...")
    train_processed, brand_map, model_map, encoders = load_and_preprocess_data(
        train_df, is_training=True
    )

    # create feature matrices
    X = create_feature_matrix(train_processed)
    y = train_processed['price']

    print(f"feature matrix shape: {X.shape}")
    print(f"features: {list(X.columns)}")

    # train/validation split
    # NOTE(review): the price cap and the brand/model target encoding were
    # fitted on the full training frame above, i.e. before this split, so the
    # validation rows' own prices leak into their features and the metrics
    # below are optimistic
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=23
    )

    # train model
    print("training xgboost model...")
    model = train_model(X_train, y_train, X_val, y_val)

    # evaluate on validation set
    y_pred_val = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    print(f"validation mae: ${mae:,.0f}")
    print(f"validation rmse: ${rmse:,.0f}")

    # feature importance
    feature_importance = sorted(
        zip(X.columns, model.feature_importances_),
        key=lambda x: x[1], reverse=True
    )
    print("\ntop 5 feature importance:")
    for feature, importance in feature_importance[:5]:
        print(f"{feature}: {importance:.3f}")

    # preprocess test data using training mappings
    print("preprocessing test data...")
    test_processed, _, _, _ = load_and_preprocess_data(
        test_df, is_training=False,
        brand_map=brand_map, model_map=model_map, encoders=encoders
    )

    # create test feature matrix
    X_test = create_feature_matrix(test_processed)
    print(f"test feature matrix shape: {X_test.shape}")

    # make predictions
    print("generating predictions...")
    test_predictions = model.predict(X_test)

    # a price cannot be negative, and astype(int) alone would truncate toward
    # zero, so clip at 0 and round to the nearest whole unit before casting
    submission_prices = np.rint(np.clip(test_predictions, 0, None)).astype(int)

    # create submission file
    submission = pd.DataFrame({
        'id': test_df['id'],
        'price': submission_prices
    })

    submission.to_csv(submission_path, index=False)
    print(f"submission file created: {submission_path}")
    print(f"submission shape: {submission.shape}")
    print(f"prediction range: ${submission['price'].min():,} to ${submission['price'].max():,}")

# run the pipeline only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()