# In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import gradio as gr
import traceback

# --- Load Data ---
# The three competition CSVs are read in a fixed order: train, test, sample submission.
csv_paths = (
    '/content/drive/MyDrive/ML/Car price prediction/train.csv',
    '/content/drive/MyDrive/ML/Car price prediction/test.csv',
    '/content/drive/MyDrive/ML/Car price prediction/sample_submission.csv',
)
try:
    train_df, test_df, sample_submission_df = (pd.read_csv(csv_path) for csv_path in csv_paths)
except FileNotFoundError:
    # Give a hint about the expected files, then let the original error propagate.
    print("Ensure train.csv, test.csv, and sample_submission.csv are uploaded to your Colab environment.")
    raise

# --- EDA PLOTS for Coursework Task (LO2) ---
# Detect target column: the first column whose lower-cased name is a known alias.
target_aliases = ['price', 'Price', 'cost', 'target', 'price_usd', 'sale_price', 'value', 'car_price', 'price_in_usd', 'amount']
possible_target_cols = [name for name in train_df.columns if name.lower() in target_aliases]
if possible_target_cols:
    target_col = possible_target_cols[0]
else:
    # Show the available columns before failing so the mismatch is easy to spot.
    print(f"Columns in train_df: {list(train_df.columns)}")
    raise ValueError("No target column found in train_df.")
print(f"\n[EDA] Using target column for plots: {target_col}")

# 1. Price Distribution
price_fig, price_ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df[target_col], bins=50, kde=True, color='orange', ax=price_ax)
# Labels are set after plotting so they replace the ones seaborn derives from the data.
price_ax.set_title('Price Distribution')
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Frequency')
price_ax.grid(True)
price_fig.tight_layout()
plt.show()

# 2. Scatter Plot: Car Age vs Price
# Only drawn when some column is named exactly 'year' (case-insensitive).
if 'year' in [col.lower() for col in train_df.columns]:
    # NOTE: the column used is the first one whose name *contains* 'year'.
    year_col = [col for col in train_df.columns if 'year' in col.lower()][0]

    # Build the derived column on a throwaway frame instead of writing it into
    # train_df: train_df is later passed to advanced_preprocessing() and is the
    # source of original_columns for the UI, so a 'car_age' column added here
    # would show up as if it were a raw input feature.
    age_plot_df = train_df[[target_col]].assign(car_age=2024 - train_df[year_col])

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='car_age', y=target_col, data=age_plot_df, alpha=0.5, color='teal')
    plt.title('Car Age vs Price')
    plt.xlabel('Car Age')
    plt.ylabel('Price')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# --- 1. Enhanced Data Preprocessing ---
def advanced_preprocessing(train_df, test_df):
    """Turn the raw train/test frames into aligned, fully numeric feature frames.

    Train and test rows are stacked so that imputation, derived features and
    categorical encoders are computed over the same columns for both.

    Args:
        train_df: Training frame. Must contain a target column whose
            lower-cased name is one of the known aliases ('price', 'cost', ...).
        test_df: Test frame, optionally with an 'Id' column.

    Returns:
        A tuple ``(X_processed, X_test_processed, y, test_ids, final_features,
        encoders)``:
        - X_processed / X_test_processed: feature frames for train / test rows.
        - y: target values, re-indexed 0..n-1 so they line up with X_processed.
        - test_ids: ``test_df['Id']`` when present, otherwise ``test_df.index``.
        - final_features: ordered list of the feature column names.
        - encoders: ``{column: {'type': 'label' | 'onehot', 'encoder': fitted}}``.

    Raises:
        ValueError: If no target column can be found in ``train_df``.
    """
    target_names = {'price', 'cost', 'target', 'price_usd', 'sale_price', 'value',
                    'car_price', 'price_in_usd', 'amount'}
    possible_target_cols = [col for col in train_df.columns if col.lower() in target_names]
    if not possible_target_cols:
        print(f"Columns in train_df: {list(train_df.columns)}")
        raise ValueError("No target column (e.g., 'price') found.")
    target_col = possible_target_cols[0]
    print(f"Using target column: {target_col}")

    # 'Id' is removed from the test rows below; remove it from the train rows
    # too, otherwise the model is fitted on a row identifier that is
    # median-imputed (i.e. constant) for every test row.
    X = train_df.drop(columns=[target_col, 'Id'], errors='ignore')
    # X is renumbered 0..n-1 by the concat below; give y the same index so
    # boolean masks built from y can be applied to X.
    y = train_df[target_col].reset_index(drop=True)
    has_id = 'Id' in test_df.columns
    test_ids = test_df['Id'] if has_id else test_df.index
    X_test = test_df.drop('Id', axis=1) if has_id else test_df
    n_train = len(X)
    combined_df = pd.concat([X, X_test], ignore_index=True)

    # Fill missing values: 'Unknown' for object columns, the column median otherwise.
    for col in combined_df.columns:
        if combined_df[col].dtype == 'object':
            fill_value = 'Unknown'
        else:
            fill_value = combined_df[col].median()
        combined_df[col] = combined_df[col].fillna(fill_value)

    numerical_features = combined_df.select_dtypes(include=[np.number]).columns

    # Create derived features.
    # NOTE: car_age is only created when a numeric column is named exactly
    # 'year' (case-insensitive), but the column actually used is the first one
    # whose name contains 'year'. preprocess_single_input mirrors this.
    if 'year' in [col.lower() for col in numerical_features]:
        year_col = [col for col in numerical_features if 'year' in col.lower()][0]
        combined_df['car_age'] = 2024 - combined_df[year_col]

    mileage_cols = [col for col in numerical_features
                    if 'mileage' in col.lower() or 'km' in col.lower()]
    if mileage_cols:
        # Falls back to an age of 1 when no car_age column exists.
        combined_df['mileage_per_year'] = (
            combined_df[mileage_cols[0]] / (combined_df.get('car_age', 1) + 1)
        )

    motor_cols = [col for col in numerical_features
                  if 'motor' in col.lower() or 'engine' in col.lower()]
    if motor_cols:
        combined_df['motor_efficiency'] = combined_df[motor_cols[0]] * 1000

    # Update feature lists after creating derived features
    numerical_features = combined_df.select_dtypes(include=[np.number]).columns
    categorical_features = combined_df.select_dtypes(include=['object']).columns

    # Store encoders for later use
    encoders = {}
    encoded_features = []
    onehot_frames = []

    for cat_col in categorical_features:
        if combined_df[cat_col].nunique() > 10:
            # More than 10 distinct values: a single integer-coded column.
            le = LabelEncoder()
            combined_df[f'{cat_col}_encoded'] = le.fit_transform(combined_df[cat_col])
            encoders[cat_col] = {'type': 'label', 'encoder': le}
            encoded_features.append(f'{cat_col}_encoded')
        else:
            # At most 10 distinct values: one indicator column per category.
            encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            encoded_cats = encoder.fit_transform(combined_df[[cat_col]])
            encoded_df = pd.DataFrame(
                encoded_cats,
                columns=encoder.get_feature_names_out([cat_col]),
                index=combined_df.index,
            )
            onehot_frames.append(encoded_df)
            encoders[cat_col] = {'type': 'onehot', 'encoder': encoder}
            encoded_features.extend(encoded_df.columns)

    if onehot_frames:
        # Attach all indicator columns in one concat rather than one per column.
        combined_df = pd.concat([combined_df, *onehot_frames], axis=1)

    final_features = list(numerical_features) + encoded_features
    final_combined_df = combined_df[final_features].fillna(0)
    # The first n_train rows are the training rows, the rest are the test rows.
    X_processed = final_combined_df.iloc[:n_train]
    X_test_processed = final_combined_df.iloc[n_train:]

    return X_processed, X_test_processed, y, test_ids, final_features, encoders

# Apply preprocessing
# Run the shared train/test pipeline once and keep every returned piece at
# module level for the later training, prediction and UI steps.
(
    X_processed,
    X_test_processed,
    y,
    test_ids,
    final_features,
    encoders,
) = advanced_preprocessing(train_df, test_df)

print("Advanced preprocessing completed.")
print(f"Training data shape: {X_processed.shape}")
print(f"Test data shape: {X_test_processed.shape}")
print(f"Number of features: {len(final_features)}")

# --- 2. Outlier Detection and Handling ---
def remove_outliers(X, y, method='iqr'):
    """Drop rows whose target value lies outside the 1.5 * IQR fences.

    Args:
        X: Feature rows (DataFrame or array) in the same row order as ``y``.
        y: Target values as a pandas Series.
        method: Only ``'iqr'`` filters; any other value returns the inputs
            unchanged.

    Returns:
        ``(X_kept, y_kept)`` containing only the rows whose target is within
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (both ends inclusive).
    """
    if method != 'iqr':
        return X, y

    q1 = y.quantile(0.25)
    q3 = y.quantile(0.75)
    iqr = q3 - q1
    # A plain boolean array selects rows by position, so X and y only need the
    # same row order, not the same index labels.
    keep = y.between(q1 - 1.5 * iqr, q3 + 1.5 * iqr).to_numpy()
    return X[keep], y[keep]

# Drop training rows whose target is an outlier and report how many went.
X_processed_clean, y_clean = remove_outliers(X_processed, y)
rows_removed = len(X_processed) - len(X_processed_clean)
print(f"After outlier removal: {X_processed_clean.shape[0]} samples (removed {rows_removed} outliers)")

# --- 3. Feature Scaling ---
# The scaler is fitted on the cleaned training rows only and then reused for
# the test rows (and later for UI inputs).
scaler = StandardScaler()
X_processed_scaled = scaler.fit_transform(X_processed_clean)
X_test_processed_scaled = scaler.transform(X_test_processed)

# --- 4. Enhanced Model Training ---
# Hold out 20% of the cleaned, scaled training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_processed_scaled,
    y_clean,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameters for the validation run; early stopping watches the MAE on
# the eval_set passed to fit().
xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='mae',
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.85,
    colsample_bytree=0.85,
    colsample_bylevel=0.85,
    reg_alpha=0.1,
    reg_lambda=1.0,
    min_child_weight=3,
    gamma=0.1,
    early_stopping_rounds=100,
    random_state=42,
    n_jobs=-1,
)

print("\nTraining enhanced XGBoost model...")
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Score the fitted model on the held-out rows.
y_pred_val = xgb_model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred_val)
r2 = r2_score(y_val, y_pred_val)

print("\nEnhanced XGBoost Validation Results:")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R-squared (R²) Score: {r2:.4f}")

# --- 5. Ensemble Model ---
print("\nTraining ensemble model...")

# Two extra regressors are fitted on the same training split as XGBoost.
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

ridge_model = Ridge(alpha=10.0, random_state=42)
ridge_model.fit(X_train, y_train)

# Validation predictions of each member model.
xgb_pred = xgb_model.predict(X_val)
rf_pred = rf_model.predict(X_val)
ridge_pred = ridge_model.predict(X_val)

# Fixed-weight blend: 60% XGBoost, 30% random forest, 10% ridge.
ensemble_pred = 0.6 * xgb_pred + 0.3 * rf_pred + 0.1 * ridge_pred
ensemble_mae = mean_absolute_error(y_val, ensemble_pred)
ensemble_r2 = r2_score(y_val, ensemble_pred)

print("\nEnsemble Model Validation Results:")
print(f"Mean Absolute Error (MAE): ${ensemble_mae:,.2f}")
print(f"R-squared (R²) Score: {ensemble_r2:.4f}")

# The blend wins only when its validation MAE is strictly lower.
if not ensemble_mae < mae:
    print("\nXGBoost model performs better!")
    best_model_type, best_mae, best_r2 = "xgboost", mae, r2
else:
    print("\nEnsemble model performs better!")
    best_model_type, best_mae, best_r2 = "ensemble", ensemble_mae, ensemble_r2

# --- 6. Final Model Training on Full Dataset ---
print(f"\nRetraining best model ({best_model_type}) on full dataset...")

# best_iteration is the 0-based index of the best boosting round found by
# early stopping, so the number of trees to refit is best_iteration + 1.
# Using the index directly trains one tree too few (and zero trees when the
# best round was the very first). Fall back to 1000 rounds when early stopping
# did not record a best iteration.
best_iteration = getattr(xgb_model, 'best_iteration', None)
final_n_estimators = best_iteration + 1 if best_iteration is not None else 1000

# Same hyper-parameters as the validation run, minus early stopping: there is
# no held-out set when fitting on all cleaned training rows.
xgb_model_final = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=final_n_estimators,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.85,
    colsample_bytree=0.85,
    colsample_bylevel=0.85,
    reg_alpha=0.1,
    reg_lambda=1.0,
    min_child_weight=3,
    gamma=0.1,
    random_state=42,
    n_jobs=-1
)
xgb_model_final.fit(X_processed_scaled, y_clean)

# The other ensemble members are only refitted when the blend was selected;
# otherwise they stay None.
rf_model_final = None
ridge_model_final = None

if best_model_type == "ensemble":
    rf_model_final = RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, random_state=42, n_jobs=-1)
    rf_model_final.fit(X_processed_scaled, y_clean)

    ridge_model_final = Ridge(alpha=10.0, random_state=42)
    ridge_model_final.fit(X_processed_scaled, y_clean)

# --- 7. Final Predictions ---
print("Making final predictions on test set...")

if best_model_type != "ensemble":
    final_predictions = xgb_model_final.predict(X_test_processed_scaled)
else:
    # Same 0.6 / 0.3 / 0.1 blend that was scored on the validation split.
    xgb_final_pred = xgb_model_final.predict(X_test_processed_scaled)
    rf_final_pred = rf_model_final.predict(X_test_processed_scaled)
    ridge_final_pred = ridge_model_final.predict(X_test_processed_scaled)
    final_predictions = 0.6 * xgb_final_pred + 0.3 * rf_final_pred + 0.1 * ridge_final_pred

# Negative predictions are floored at 0 before the file is written.
submission_df = pd.DataFrame({'Id': test_ids, 'Price': final_predictions})
submission_df['Price'] = submission_df['Price'].clip(lower=0)
submission_df.to_csv('submission.csv', index=False)

print("\nFinal Results:")
print(f"Best Model: {best_model_type.title()}")
print(f"Best MAE: ${best_mae:,.2f}")
print(f"Best R²: {best_r2:.4f}")
print("Submission file 'submission.csv' created successfully!")
print("\nSample predictions:")
print(submission_df.head())

# Store column information for UI: every train_df column except the target.
original_columns = [name for name in train_df.columns if name != target_col]

def preprocess_single_input(input_dict, encoders, final_features):
    """Convert one raw input record into a single-row frame of model features.

    Mirrors the feature construction used for training: numeric coercion,
    derived features (car_age, mileage_per_year, motor_efficiency), then the
    stored label / one-hot encoders.

    Args:
        input_dict: Mapping of raw column name to value for one record.
        encoders: ``{column: {'type': 'label' | 'onehot', 'encoder': fitted}}``.
            Columns listed here are treated as categorical; all others are
            coerced to numbers.
        final_features: Ordered feature names the model expects.

    Returns:
        A one-row, all-numeric DataFrame whose columns are exactly
        ``final_features``. Features that cannot be derived from the input are 0.

    Raises:
        Exception: Any unexpected error is logged with its traceback and
            re-raised unchanged.
    """
    try:
        print(f"Input received: {input_dict}")

        # Create DataFrame from input
        input_df = pd.DataFrame([input_dict])
        print(f"Input DataFrame shape: {input_df.shape}")
        print(f"Input DataFrame:\n{input_df}")

        # Handle missing values and convert data types. Whether a column is
        # categorical is decided by the encoders, not by the dtype pandas
        # inferred from a single value: an empty numeric field arrives as None
        # (object dtype) and must still become a number, not 'Unknown'.
        for col in input_df.columns:
            if col in encoders:
                input_df[col] = input_df[col].fillna('Unknown')
            else:
                input_df[col] = pd.to_numeric(input_df[col], errors='coerce').fillna(0)

        print(f"After type conversion:\n{input_df}")

        # Create derived features (same as training)
        numerical_features = [
            col for col in input_df.select_dtypes(include=[np.number]).columns
            if col not in encoders
        ]

        # Car age feature
        if 'year' in [col.lower() for col in numerical_features]:
            year_col = [col for col in numerical_features if 'year' in col.lower()][0]
            input_df['car_age'] = 2024 - input_df[year_col]
            print(f"Created car_age: {input_df['car_age'].iloc[0]}")

        # Mileage per year feature
        mileage_cols = [col for col in numerical_features
                        if 'mileage' in col.lower() or 'km' in col.lower()]
        if mileage_cols:
            # Falls back to an age of 1 when no car_age column exists.
            car_age = input_df['car_age'].iloc[0] if 'car_age' in input_df.columns else 1
            input_df['mileage_per_year'] = input_df[mileage_cols[0]] / (car_age + 1)
            print(f"Created mileage_per_year: {input_df['mileage_per_year'].iloc[0]}")

        # Motor efficiency feature
        motor_cols = [col for col in numerical_features
                      if 'motor' in col.lower() or 'engine' in col.lower()]
        if motor_cols:
            input_df['motor_efficiency'] = input_df[motor_cols[0]] * 1000
            print(f"Created motor_efficiency: {input_df['motor_efficiency'].iloc[0]}")

        # Apply encodings
        categorical_cols = [col for col in input_df.columns if col in encoders]
        for col in categorical_cols:
            encoder = encoders[col]['encoder']
            if encoders[col]['type'] == 'label':
                encoded_name = f'{col}_encoded'
                try:
                    value = input_df[col].iloc[0]
                    if value in encoder.classes_:
                        input_df[encoded_name] = encoder.transform([value])
                    else:
                        # Categories the encoder has never seen map to code 0.
                        input_df[encoded_name] = 0
                    print(f"Label encoded {col}: {input_df[encoded_name].iloc[0]}")
                except Exception as e:
                    # Best effort: an encoding failure degrades to code 0.
                    print(f"Label encoding error for {col}: {e}")
                    input_df[encoded_name] = 0
            else:  # OneHot encoding
                try:
                    encoded_cats = encoder.transform(input_df[[col]])
                    encoded_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out([col]))
                    input_df = pd.concat([input_df, encoded_df], axis=1)
                    print(f"One-hot encoded {col}: {encoded_df.shape[1]} features")
                except Exception as e:
                    print(f"One-hot encoding error for {col}: {e}")
                    # Best effort: fall back to all-zero indicator columns.
                    for feature_name in encoder.get_feature_names_out([col]):
                        input_df[feature_name] = 0

        print(f"After encoding, DataFrame shape: {input_df.shape}")
        print(f"After encoding columns: {list(input_df.columns)}")

        # Select only the features used in training
        missing_features = [feature for feature in final_features if feature not in input_df.columns]
        if missing_features:
            print(f"Missing features filled with 0: {missing_features[:10]}...")  # Show first 10

        # reindex keeps the single input row and adds absent features as 0 in
        # one step. Assigning a scalar column into an empty frame first would
        # create a zero-row column that becomes NaN once a real column arrives.
        final_input = input_df.reindex(columns=final_features, fill_value=0)
        # Guarantee a purely numeric row for the scaler and the models.
        final_input = final_input.apply(pd.to_numeric, errors='coerce').fillna(0)

        print(f"Final input shape: {final_input.shape}")
        print(f"Expected features: {len(final_features)}")

        return final_input

    except Exception as e:
        print(f"Preprocessing error: {str(e)}")
        print(f"Traceback: {traceback.format_exc()}")
        raise

def create_prediction_ui():
    """
    Build the Gradio Blocks app for car price prediction.

    One input widget is created per entry of ``original_columns``; the button
    callback preprocesses the entered values, scales them and returns the
    predicted price (or an error message) as text. Returns the Blocks object
    without launching it.
    """

    def predict_car_price(*args):
        # Callback: positional widget values arrive in original_columns order.
        try:
            print(f"Received {len(args)} arguments")
            print(f"Original columns: {original_columns}")

            # Pair each column with its widget value; surplus items on either
            # side are ignored.
            input_dict = {}
            for col, raw_value in zip(original_columns, args):
                input_dict[col] = raw_value
                print(f"{col}: {raw_value} (type: {type(raw_value)})")

            # Preprocess, then scale with the scaler fitted on the training data.
            processed_input = preprocess_single_input(input_dict, encoders, final_features)
            scaled_input = scaler.transform(processed_input)
            print(f"Scaled input shape: {scaled_input.shape}")

            # Predict with whichever model type was selected during validation.
            if best_model_type != "ensemble":
                prediction = xgb_model_final.predict(scaled_input)[0]
                print(f"XGBoost prediction: {prediction:.2f}")
            else:
                from_xgb = xgb_model_final.predict(scaled_input)[0]
                from_rf = rf_model_final.predict(scaled_input)[0]
                from_ridge = ridge_model_final.predict(scaled_input)[0]
                prediction = 0.6 * from_xgb + 0.3 * from_rf + 0.1 * from_ridge
                print(f"Ensemble predictions - XGB: {from_xgb:.2f}, RF: {from_rf:.2f}, Ridge: {from_ridge:.2f}")

            # Ensure non-negative price
            prediction = max(0, prediction)

            result = f" Predicted Car Price: ${prediction:,.2f}"
            print(f"Final prediction: {result}")
            return result

        except Exception as e:
            # Report the failure in the output box instead of raising into Gradio.
            error_msg = f" Prediction Error: {str(e)}"
            print(f"Error in predict_car_price: {error_msg}")
            print(f"Full traceback: {traceback.format_exc()}")
            return error_msg

    def build_component(col):
        # Choose a widget from the non-null values observed for this column
        # (train plus test when the column exists in both).
        if col in test_df.columns:
            observed = pd.concat([train_df[col], test_df[col]])
        else:
            observed = train_df[col]
        unique_values = observed.dropna().unique()

        if len(unique_values) == 0:
            # Fallback for empty columns
            return gr.Textbox(label=col, value="Unknown")

        if pd.api.types.is_numeric_dtype(unique_values):
            # Numeric column: number box bounded by the observed range,
            # defaulting to the median of the distinct values.
            min_val = float(unique_values.min())
            max_val = float(unique_values.max())
            default_val = float(np.median(unique_values))
            return gr.Number(
                label=col,
                value=default_val,
                minimum=min_val,
                maximum=max_val,
                step=1 if col.lower() in ['year'] else None
            )

        # Categorical column: dropdown of the sorted distinct string values.
        choices = sorted({str(val) for val in unique_values if str(val) != 'nan'})
        if not choices:
            choices = ['Unknown']
        return gr.Dropdown(label=col, choices=choices, value=choices[0])

    # Create input components based on original columns
    input_components = []
    for col in original_columns:
        try:
            component = build_component(col)
        except Exception as e:
            # Any failure while inspecting the column degrades to a text box.
            print(f"Error creating component for {col}: {e}")
            component = gr.Textbox(label=col, value="Unknown")
        input_components.append(component)

    # Create the Gradio interface
    with gr.Blocks(title="Car Price Prediction", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Car Price Prediction System")
        gr.Markdown("Enter car details below to get a price prediction.")

        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("###  Car Details")

                # Split inputs into two side-by-side columns for a better layout
                half = len(input_components) // 2
                left_inputs = input_components[:half]
                right_inputs = input_components[half:]

                with gr.Row():
                    with gr.Column():
                        for widget in left_inputs:
                            widget.render()

                    with gr.Column():
                        for widget in right_inputs:
                            widget.render()

                predict_button = gr.Button("🔮 Predict Price", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### Model Information")
                gr.Markdown(f"""
                **Model Type:** {best_model_type.title()}
                **Validation MAE:** ${best_mae:,.2f}
                **Validation R²:** {best_r2:.4f}
                **Features Used:** {len(final_features)}
                """)

                gr.Markdown("### Prediction Result")
                output = gr.Textbox(
                    label="Predicted Price",
                    placeholder="Click 'Predict Price' to see the result...",
                    interactive=False,
                    lines=2
                )

        # Connect the prediction function
        predict_button.click(
            fn=predict_car_price,
            inputs=input_components,
            outputs=output
        )

        gr.Markdown("""
        ### How it works:
        1. **Select/Enter** car details in the form above
        2. **Click** 'Predict Price' to get the estimated value
        3. **Model** uses advanced ML algorithms trained on car data
        """)

    return interface

# Launch the UI
banner_rule = "=" * 50
print("\n" + banner_rule)
print("🚀 LAUNCHING CAR PRICE PREDICTION UI")
print(banner_rule)

# Build and launch inside one guard so a failure in either step is reported
# with its traceback instead of aborting the script.
try:
    ui = create_prediction_ui()
    print("UI created successfully!")
    print("Launching Gradio interface...")
    ui.launch(share=True, debug=True)
except Exception as launch_error:
    print(f"Failed to launch UI: {str(launch_error)}")
    print(f"Full error: {traceback.format_exc()}")