In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)


In [9]:

# ============================================================================
# 1. DATA LOADING AND INITIAL EXPLORATION
# ============================================================================

def load_and_explore_data(file_path='data.csv'):
    """
    Load the dataset and perform initial exploration
    """
    print("=" * 60)
    print("1. DATA LOADING AND EXPLORATION")
    print("=" * 60)
    
    # Load data
    df = pd.read_csv(file_path)
    
    # # Based on the sample data, let's define column names
    # column_names = [
    #     'vehicle_type', 'year', 'make', 'model', 'body_style', 'model_group', 
    #     'series', 'vin', 'sale_date', 'odometer', 'run_and_drive', 'title_type',
    #     'damage_primary', 'damage_secondary', 'zip_code', 'sale_price', 'lot_number',
    #     'location_city', 'location_state_full', 'location_state', 'grade', 
    #     'location_country', 'color', 'transmission', 'image_url', 'fuel_type',
    #     'keys', 'notes'
    # ]
    
    # # Assign column names if they don't exist
    # if len(df.columns) == len(column_names):
    #     df.columns = column_names
    
    print(f"Dataset shape: {df.shape}")
    # print(f"Dataset info:")
    # print(df.info())
    
    # Check for missing values
    print(f"\nMissing values:")
    missing_values = df.isnull().sum()
    print(missing_values[missing_values > 0])
    
    # Basic statistics
    print(f"\nBasic statistics for numerical columns:")
    print(df.describe())
    
    return df

df = load_and_explore_data('data.csv')
df

1. DATA LOADING AND EXPLORATION
Dataset shape: (45723, 28)

Missing values:
Drivetrain                           16824
Vehicle Body Style                   16824
Vehicle Engine                       16824
Secondary Damage Type Description    24214
Transmission Type                      650
dtype: int64

Basic statistics for numerical columns:
           Lot Year     Sale Price  Odometer Reading   Yard Number  \
count  45723.000000   45723.000000      4.572300e+04  45723.000000   
mean    2014.656803    3999.724143      1.126040e+05    124.722547   
std        6.106087    5525.663314      9.533528e+04    108.200947   
min     1928.000000       1.000000      0.000000e+00      1.000000   
25%     2012.000000     875.000000      5.719900e+04     37.000000   
50%     2015.000000    2200.000000      1.082800e+05     91.000000   
75%     2019.000000    5000.000000      1.585090e+05    174.000000   
max     2025.000000  184000.000000      9.592488e+06    396.000000   

       LOT_SOLD_RUN   RE

Unnamed: 0,Automobile,Lot Year,Lot Make,Lot Model,Drivetrain,Vehicle Body Style,Vehicle Engine,VIN,Invoice Date,Sale Price,...,Yard city,Yard State,Title Type,Title State,Lot Color,Transmission Type,Lot Link,Lot Fuel Type,LOT_SOLD_RUN,RERUN_COUNT
0,AUTOMOBILE ...,2024,NISS,VERSA S,,,,3N1CN8DV8RL867765,05/26/2025,3800,...,SEAFORD,DE,S1,MD,GRAY,Automatic,https://www.copart.com/lot/49239315,GAS ...,3,0
1,AUTOMOBILE ...,2022,HYUN,ELANTRA N,,,,KMHLW4AK9NU008872,05/27/2025,7400,...,COLUMBUS,OH,ST,OH,BLACK,Automatic,https://www.copart.com/lot/54318285,GAS ...,1,0
2,AUTOMOBILE ...,2025,DODG,CHARGER DA,,,,2C3CDBCK9SR543225,05/27/2025,28500,...,CHICAGO HEIGHTS,IL,CT,MI,BLACK,Automatic,https://www.copart.com/lot/49432685,ELECTRIC ...,1,0
3,AUTOMOBILE ...,2018,FORD,FOCUS SEL,,,,1FADP3H24JL294433,05/26/2025,600,...,GRAHAM,WA,BS,WA,WHITE,Automatic,https://www.copart.com/lot/38223554,GAS ...,2,1
4,VANS/MINIVANS ...,2013,TOYT,SIENNA LE,FWD,SPORTS VAN,3.5L 6,5TDKK3DC0DS353348,05/27/2025,2150,...,PENNSBURG,PA,SC,PA,SILVR,Automatic,https://www.copart.com/lot/57671325,GAS ...,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45718,AUTOMOBILE ...,2018,NISS,KICKS S,,,,3N1CP5CU5JL542657,05/30/2025,2650,...,DES MOINES,IA,ST,IA,WHITE,Automatic,https://www.copart.com/lot/42538725,GAS ...,1,0
45719,AUTOMOBILE ...,2007,TOYT,COROLLA CE,FWD,SEDAN 4DR,1.8L 4,1NXBR30E47Z815523,05/28/2025,1550,...,PORTLAND,OR,SC,OR,SILVR,Automatic,https://www.copart.com/lot/51857845,GAS ...,1,0
45720,SUV'S ...,2023,MITS,OUTLANDER,,,,JA4J4UA83PZ011708,05/28/2025,8100,...,COLORADO SPRINGS,CO,ST,CO,BLACK,Automatic,https://www.copart.com/lot/49513265,GAS ...,1,0
45721,AUTOMOBILE ...,2022,HYUN,ELANTRA SE,,,,KMHLN4AGXNU227404,05/29/2025,6400,...,COOKSTOWN,ON,ST,ON,RED,Automatic,https://www.copart.com/lot/49952685,GAS ...,1,0


In [7]:
# ============================================================================
# 2. DATA PREPROCESSING AND CLEANING
# ============================================================================

def preprocess_data(df):
    """
    Clean and preprocess the data
    """
    print("\n" + "=" * 60)
    print("2. DATA PREPROCESSING AND CLEANING")
    print("=" * 60)
    
    # Create a copy to avoid modifying original data
    df_processed = df.copy()
    
    # Clean sale_price (target variable)
    # Remove rows where sale_price is null or zero
    initial_rows = len(df_processed)
    df_processed = df_processed.dropna(subset=['Sale Price'])
    df_processed = df_processed[df_processed['Sale Price'] > 0]
    print(f"Removed {initial_rows - len(df_processed)} rows with invalid sale prices")
    
    # Clean year column
    df_processed['Lot Year'] = pd.to_numeric(df_processed['Lot Year'], errors='coerce')
    df_processed = df_processed[df_processed['Lot Year'].between(1980, 2025)]
    
    # Clean odometer readings
    df_processed['Odometer Reading'] = pd.to_numeric(df_processed['Odometer Reading'], errors='coerce')
    # Replace unrealistic odometer readings
    df_processed.loc[df_processed['Odometer Reading'] > 500000, 'Odometer Reading'] = np.nan
    
    # Clean and standardize categorical variables
    categorical_columns = ['make', 'model', 'color', 'transmission', 'fuel_type', 
                          'run_and_drive', 'title_type', 'damage_primary', 'location_state']
    
    for col in categorical_columns:
        if col in df_processed.columns:
            df_processed[col] = df_processed[col].astype(str).str.strip().str.upper()
    
    # Create vehicle age feature
    df_processed['vehicle_age'] = 2025 - df_processed['year']
    
    # Extract useful features from VIN (if available)
    if 'vin' in df_processed.columns:
        df_processed['vin_length'] = df_processed['vin'].astype(str).str.len()
    
    # Create damage severity feature
    if 'damage_primary' in df_processed.columns:
        df_processed['has_damage'] = df_processed['damage_primary'].notna().astype(int)
    
    # Clean location data
    if 'location_state' in df_processed.columns:
        df_processed['location_state'] = df_processed['location_state'].fillna('UNKNOWN')
    
    print(f"Final dataset shape after preprocessing: {df_processed.shape}")
    
    return df_processed

df_processed = preprocess_data(df)
df_processed


2. DATA PREPROCESSING AND CLEANING


KeyError: ['sale_price']

In [None]:


# ============================================================================
# 3. EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================================

def perform_eda(df):
    """
    Perform exploratory data analysis
    """
    print("\n" + "=" * 60)
    print("3. EXPLORATORY DATA ANALYSIS")
    print("=" * 60)
    
    # Set up the plotting style
    plt.style.use('default')
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    
    # 1. Distribution of target variable (sale_price)
    axes[0, 0].hist(df['sale_price'], bins=50, alpha=0.7, color='skyblue')
    axes[0, 0].set_title('Distribution of Sale Prices')
    axes[0, 0].set_xlabel('Sale Price ($)')
    axes[0, 0].set_ylabel('Frequency')
    
    # 2. Sale price vs Vehicle Age
    if 'vehicle_age' in df.columns:
        axes[0, 1].scatter(df['vehicle_age'], df['sale_price'], alpha=0.5, color='green')
        axes[0, 1].set_title('Sale Price vs Vehicle Age')
        axes[0, 1].set_xlabel('Vehicle Age (years)')
        axes[0, 1].set_ylabel('Sale Price ($)')
    
    # 3. Sale price vs Odometer
    if 'odometer' in df.columns:
        valid_odometer = df.dropna(subset=['odometer'])
        axes[0, 2].scatter(valid_odometer['odometer'], valid_odometer['sale_price'], 
                          alpha=0.5, color='red')
        axes[0, 2].set_title('Sale Price vs Odometer Reading')
        axes[0, 2].set_xlabel('Odometer (miles)')
        axes[0, 2].set_ylabel('Sale Price ($)')
    
    # 4. Average sale price by make (top 10)
    if 'make' in df.columns:
        make_prices = df.groupby('make')['sale_price'].agg(['mean', 'count']).reset_index()
        make_prices = make_prices[make_prices['count'] >= 10].sort_values('mean', ascending=False).head(10)
        axes[1, 0].bar(range(len(make_prices)), make_prices['mean'], color='orange')
        axes[1, 0].set_title('Average Sale Price by Make (Top 10)')
        axes[1, 0].set_xlabel('Make')
        axes[1, 0].set_ylabel('Average Sale Price ($)')
        axes[1, 0].set_xticks(range(len(make_prices)))
        axes[1, 0].set_xticklabels(make_prices['make'], rotation=45, ha='right')
    
    # 5. Sale price by transmission type
    if 'transmission' in df.columns:
        trans_prices = df.groupby('transmission')['sale_price'].mean().sort_values(ascending=False)
        axes[1, 1].bar(range(len(trans_prices)), trans_prices.values, color='purple')
        axes[1, 1].set_title('Average Sale Price by Transmission')
        axes[1, 1].set_xlabel('Transmission Type')
        axes[1, 1].set_ylabel('Average Sale Price ($)')
        axes[1, 1].set_xticks(range(len(trans_prices)))
        axes[1, 1].set_xticklabels(trans_prices.index, rotation=45, ha='right')
    
    # 6. Correlation heatmap for numerical features
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    correlation_matrix = df[numerical_cols].corr()
    im = axes[1, 2].imshow(correlation_matrix, cmap='coolwarm', aspect='auto')
    axes[1, 2].set_title('Correlation Matrix')
    axes[1, 2].set_xticks(range(len(numerical_cols)))
    axes[1, 2].set_yticks(range(len(numerical_cols)))
    axes[1, 2].set_xticklabels(numerical_cols, rotation=45, ha='right')
    axes[1, 2].set_yticklabels(numerical_cols)
    
    plt.tight_layout()
    plt.show()
    
    # Print key insights
    print(f"\nKey Insights:")
    print(f"- Average sale price: ${df['sale_price'].mean():.2f}")
    print(f"- Median sale price: ${df['sale_price'].median():.2f}")
    print(f"- Price range: ${df['sale_price'].min():.2f} - ${df['sale_price'].max():.2f}")
    
    if 'vehicle_age' in df.columns:
        correlation = df['vehicle_age'].corr(df['sale_price'])
        print(f"- Correlation between vehicle age and price: {correlation:.3f}")
    
    if 'odometer' in df.columns:
        correlation = df['odometer'].corr(df['sale_price'])
        print(f"- Correlation between odometer and price: {correlation:.3f}")

# ============================================================================
# 4. FEATURE ENGINEERING
# ============================================================================

def engineer_features(df):
    """
    Create new features and prepare data for modeling
    """
    print("\n" + "=" * 60)
    print("4. FEATURE ENGINEERING")
    print("=" * 60)
    
    df_features = df.copy()
    
    # 1. Price per mile (if odometer available)
    if 'odometer' in df_features.columns:
        df_features['price_per_mile'] = df_features['sale_price'] / (df_features['odometer'] + 1)
    
    # 2. Luxury brand indicator
    luxury_brands = ['BMW', 'MERCEDES', 'AUDI', 'LEXUS', 'INFINITI', 'ACURA', 'CADILLAC']
    if 'make' in df_features.columns:
        df_features['is_luxury'] = df_features['make'].isin(luxury_brands).astype(int)
    
    # 3. Popular model indicator (models with >50 entries)
    if 'model' in df_features.columns:
        model_counts = df_features['model'].value_counts()
        popular_models = model_counts[model_counts > 50].index
        df_features['is_popular_model'] = df_features['model'].isin(popular_models).astype(int)
    
    # 4. Damage severity score
    if 'damage_primary' in df_features.columns:
        damage_severity = {
            'MINOR': 1, 'MODERATE': 2, 'MAJOR': 3, 'SEVERE': 4,
            'FRONT': 2, 'REAR': 2, 'SIDE': 2, 'ALL OVER': 4
        }
        df_features['damage_severity'] = df_features['damage_primary'].map(damage_severity).fillna(0)
    
    # 5. Age groups
    if 'vehicle_age' in df_features.columns:
        df_features['age_group'] = pd.cut(df_features['vehicle_age'], 
                                         bins=[0, 3, 7, 15, 100], 
                                         labels=['New', 'Recent', 'Older', 'Classic'])
    
    # 6. Mileage groups
    if 'odometer' in df_features.columns:
        df_features['mileage_group'] = pd.cut(df_features['odometer'], 
                                             bins=[0, 30000, 75000, 150000, 500000], 
                                             labels=['Low', 'Medium', 'High', 'Very High'])
    
    print(f"Features engineered. New dataset shape: {df_features.shape}")
    print(f"New features created: price_per_mile, is_luxury, is_popular_model, damage_severity, age_group, mileage_group")
    
    return df_features

# ============================================================================
# 5. MODEL PREPARATION
# ============================================================================

def prepare_model_data(df):
    """
    Prepare data for machine learning models
    """
    print("\n" + "=" * 60)
    print("5. MODEL PREPARATION")
    print("=" * 60)
    
    # Define target variable
    target = 'sale_price'
    
    # Define features to use
    numerical_features = ['year', 'odometer', 'vehicle_age', 'damage_severity']
    categorical_features = ['make', 'transmission', 'fuel_type', 'run_and_drive', 
                           'title_type', 'location_state', 'color']
    
    # Filter features that actually exist in the dataset
    numerical_features = [f for f in numerical_features if f in df.columns]
    categorical_features = [f for f in categorical_features if f in df.columns]
    
    # Add engineered features if they exist
    if 'is_luxury' in df.columns:
        numerical_features.append('is_luxury')
    if 'is_popular_model' in df.columns:
        numerical_features.append('is_popular_model')
    
    all_features = numerical_features + categorical_features
    
    # Prepare X and y
    X = df[all_features].copy()
    y = df[target].copy()
    
    # Remove rows with missing target values
    mask = ~y.isna()
    X = X[mask]
    y = y[mask]
    
    print(f"Features selected:")
    print(f"- Numerical features: {numerical_features}")
    print(f"- Categorical features: {categorical_features}")
    print(f"Final dataset shape: {X.shape}")
    
    return X, y, numerical_features, categorical_features

# ============================================================================
# 6. MODEL TRAINING AND EVALUATION
# ============================================================================

def train_and_evaluate_models(X, y, numerical_features, categorical_features):
    """
    Train multiple models and evaluate their performance
    """
    print("\n" + "=" * 60)
    print("6. MODEL TRAINING AND EVALUATION")
    print("=" * 60)
    
    # Create preprocessing pipelines
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    
    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    # Define models to try
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Lasso Regression': Lasso(alpha=1.0),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'Decision Tree': DecisionTreeRegressor(random_state=42)
    }
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Store results
    model_results = {}
    trained_models = {}
    
    print("Training models...")
    
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        
        # Create pipeline
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        
        # Train model
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_pred_train = pipeline.predict(X_train)
        y_pred_test = pipeline.predict(X_test)
        
        # Calculate metrics
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        train_r2 = r2_score(y_train, y_pred_train)
        test_r2 = r2_score(y_test, y_pred_test)
        test_mae = mean_absolute_error(y_test, y_pred_test)
        
        # Cross-validation score
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, 
                                   scoring='neg_root_mean_squared_error')
        cv_rmse = -cv_scores.mean()
        
        # Store results
        model_results[model_name] = {
            'Train RMSE': train_rmse,
            'Test RMSE': test_rmse,
            'Train R²': train_r2,
            'Test R²': test_r2,
            'Test MAE': test_mae,
            'CV RMSE': cv_rmse
        }
        
        trained_models[model_name] = pipeline
        
        print(f"  Test RMSE: ${test_rmse:.2f}")
        print(f"  Test R²: {test_r2:.4f}")
    
    # Create results DataFrame
    results_df = pd.DataFrame(model_results).T
    print(f"\nModel Comparison:")
    print(results_df.round(2))
    
    # Find best model
    best_model_name = results_df['Test R²'].idxmax()
    best_model = trained_models[best_model_name]
    
    print(f"\nBest model: {best_model_name}")
    print(f"Best model Test R²: {results_df.loc[best_model_name, 'Test R²']:.4f}")
    
    return trained_models, results_df, best_model, best_model_name, X_test, y_test

# ============================================================================
# 7. MODEL INTERPRETATION AND FEATURE IMPORTANCE
# ============================================================================

def analyze_model_performance(best_model, best_model_name, X_test, y_test, numerical_features, categorical_features):
    """
    Analyze the best model's performance and feature importance
    """
    print("\n" + "=" * 60)
    print("7. MODEL INTERPRETATION")
    print("=" * 60)
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    
    # Create visualization
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # 1. Actual vs Predicted
    axes[0, 0].scatter(y_test, y_pred, alpha=0.6)
    axes[0, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    axes[0, 0].set_xlabel('Actual Price')
    axes[0, 0].set_ylabel('Predicted Price')
    axes[0, 0].set_title('Actual vs Predicted Prices')
    
    # 2. Residuals plot
    residuals = y_test - y_pred
    axes[0, 1].scatter(y_pred, residuals, alpha=0.6)
    axes[0, 1].axhline(y=0, color='r', linestyle='--')
    axes[0, 1].set_xlabel('Predicted Price')
    axes[0, 1].set_ylabel('Residuals')
    axes[0, 1].set_title('Residuals Plot')
    
    # 3. Feature importance (if available)
    if hasattr(best_model.named_steps['regressor'], 'feature_importances_'):
        # Get feature names after preprocessing
        preprocessor = best_model.named_steps['preprocessor']
        feature_names = (numerical_features + 
                        list(preprocessor.named_transformers_['cat']
                            .named_steps['onehot'].get_feature_names_out(categorical_features)))
        
        importances = best_model.named_steps['regressor'].feature_importances_
        
        # Get top 15 features
        top_indices = np.argsort(importances)[-15:]
        top_features = [feature_names[i] for i in top_indices]
        top_importances = importances[top_indices]
        
        axes[1, 0].barh(range(len(top_features)), top_importances)
        axes[1, 0].set_yticks(range(len(top_features)))
        axes[1, 0].set_yticklabels(top_features)
        axes[1, 0].set_xlabel('Feature Importance')
        axes[1, 0].set_title('Top 15 Feature Importances')
    
    # 4. Distribution of residuals
    axes[1, 1].hist(residuals, bins=30, alpha=0.7, color='skyblue')
    axes[1, 1].set_xlabel('Residuals')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].set_title('Distribution of Residuals')
    
    plt.tight_layout()
    plt.show()
    
    # Calculate additional metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"\nFinal Model Performance ({best_model_name}):")
    print(f"- RMSE: ${rmse:.2f}")
    print(f"- MAE: ${mae:.2f}")
    print(f"- R² Score: {r2:.4f}")
    print(f"- Mean Actual Price: ${y_test.mean():.2f}")
    print(f"- Mean Predicted Price: ${y_pred.mean():.2f}")

# ============================================================================
# 8. PREDICTION FUNCTION
# ============================================================================

def make_predictions(model, X_sample):
    """
    Make predictions on new data
    """
    print("\n" + "=" * 60)
    print("8. MAKING PREDICTIONS")
    print("=" * 60)
    
    predictions = model.predict(X_sample)
    
    print(f"Predictions for sample data:")
    for i, pred in enumerate(predictions[:5]):  # Show first 5 predictions
        print(f"Sample {i+1}: ${pred:.2f}")
    
    return predictions

# ============================================================================
# 9. MODEL HYPERPARAMETER TUNING
# ============================================================================

def tune_best_model(best_model_name, X_train, y_train, numerical_features, categorical_features):
    """
    Perform hyperparameter tuning on the best model
    """
    print("\n" + "=" * 60)
    print("9. HYPERPARAMETER TUNING")
    print("=" * 60)
    
    # Create preprocessing pipeline
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    # Define parameter grids for different models
    param_grids = {
        'Random Forest': {
            'regressor__n_estimators': [100, 200],
            'regressor__max_depth': [10, 20, None],
            'regressor__min_samples_split': [2, 5],
            'regressor__min_samples_leaf': [1, 2]
        },
        'Gradient Boosting': {
            'regressor__n_estimators': [100, 200],
            'regressor__learning_rate': [0.05, 0.1, 0.2],
            'regressor__max_depth': [3, 5, 7]
        }
    }
    
    if best_model_name in param_grids:
        print(f"Tuning {best_model_name}...")
        
        # Create model
        if best_model_name == 'Random Forest':
            model = RandomForestRegressor(random_state=42)
        elif best_model_name == 'Gradient Boosting':
            model = GradientBoostingRegressor(random_state=42)
        
        # Create pipeline
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        
        # Perform grid search
        grid_search = GridSearchCV(
            pipeline, 
            param_grids[best_model_name], 
            cv=3, 
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        
        grid_search.fit(X_train, y_train)
        
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV score: {-grid_search.best_score_:.2f}")
        
        return grid_search.best_estimator_
    
    else:
        print(f"No hyperparameter tuning defined for {best_model_name}")
        return None

# ============================================================================
# 10. MAIN EXECUTION FUNCTION
# ============================================================================

def main():
    """
    Main function to execute the complete ML pipeline
    """
    try:
        # Step 1: Load and explore data
        df = load_and_explore_data('data.csv')
        
        # Step 2: Preprocess data
        df_processed = preprocess_data(df)
        
        # Step 3: Exploratory Data Analysis
        perform_eda(df_processed)
        
        # Step 4: Feature Engineering
        df_features = engineer_features(df_processed)
        
        # Step 5: Prepare data for modeling
        X, y, numerical_features, categorical_features = prepare_model_data(df_features)
        
        # Step 6: Train and evaluate models
        trained_models, results_df, best_model, best_model_name, X_test, y_test = train_and_evaluate_models(
            X, y, numerical_features, categorical_features
        )
        
        # Step 7: Analyze best model performance
        analyze_model_performance(best_model, best_model_name, X_test, y_test, 
                                 numerical_features, categorical_features)
        
        # Step 8: Make sample predictions
        sample_predictions = make_predictions(best_model, X_test.head(5))
        
        # Step 9: Hyperparameter tuning (optional)
        X_train = X.drop(X_test.index)
        y_train = y.drop(X_test.index)
        tuned_model = tune_best_model(best_model_name, X_train, y_train, 
                                     numerical_features, categorical_features)
        
        print("\n" + "=" * 60)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print("=" * 60)
        
        return {
            'best_model': best_model,
            'tuned_model': tuned_model,
            'results': results_df,
            'preprocessed_data': df_features
        }
        
    except Exception as e:
        print(f"Error in pipeline: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

# ============================================================================
# 11. UTILITY FUNCTIONS FOR PRODUCTION
# ============================================================================

def save_model(model, filename):
    """
    Save the trained model
    """
    import joblib
    joblib.dump(model, filename)
    print(f"Model saved as {filename}")

def load_model(filename):
    """
    Load a saved model
    """
    import joblib
    return joblib.load(filename)

def predict_single_vehicle(model, vehicle_data):
    """
    Predict price for a single vehicle
    
    Example usage:
    vehicle_data = {
        'year': 2020,
        'make': 'TOYOTA',
        'model': 'CAMRY',
        'odometer': 50000,
        'transmission': 'AUTOMATIC',
        'fuel_type': 'GAS',
        'run_and_drive': 'YES',
        'title_type': 'CLEAN',
        'location_state': 'CA',
        'color': 'WHITE'
    }
    """
    # Convert to DataFrame
    df_single = pd.DataFrame([vehicle_data])
    
    # Add derived features
    if 'year' in df_single.columns:
        df_single['vehicle_age'] = 2025 - df_single['year']
    
    # Make prediction
    prediction = model.predict(df_single)
    return prediction[0]

# ============================================================================
# EXECUTION
# ============================================================================

if __name__ == "__main__":
    # Run the complete pipeline
    results = main()
    
    # Example of how to use the trained model
    if results and results['best_model']:
        print("\n" + "=" * 60)
        print("EXAMPLE PREDICTION")
        print("=" * 60)
        
        # Example vehicle data
        example_vehicle = {
            'year': 2020,
            'make': 'TOYOTA',

<!-- # That's an excellent request\! Providing "the complete code" for an ideal sales price prediction problem is a fantastic goal, but it's important to understand a few things up front:

# 1.  **"Complete Code" is Context-Dependent:** A truly "complete" production-ready solution would involve:

#       * Sophisticated data ingestion from various sources (databases, APIs, web).
#       * Robust error handling and logging.
#       * Advanced feature engineering specific to the domain (e.g., real estate, cars, retail goods).
#       * A full-fledged MLOps pipeline for deployment, monitoring, and retraining.
#       * A user interface or API for predictions.
#       * These elements are highly specific to the project's real-world constraints and data.

# 2.  **Focus on the Core ML Pipeline:** For a single Python script, I'll focus on demonstrating the *core machine learning pipeline* from data loading to model evaluation, assuming a clean dataset is readily available.

# 3.  **Illustrative Example:** I'll create a synthetic dataset that mimics some characteristics of real-world sales data (e.g., property sales, car sales) to make the example runnable and understandable without needing to download external files.

# 4.  **Key Libraries:** We'll use standard libraries like `pandas` for data manipulation, `numpy` for numerical operations, `scikit-learn` for machine learning algorithms and preprocessing, and `matplotlib`/`seaborn` for visualization.

# -----

# Here's the Python code covering the essential steps, with detailed comments and explanations within the code itself.

# ```python -->
