In [2]:
# Import Data
import pandas as pd

new_used_car = pd.read_csv("../data/processed_used_car.csv")
random_state = 42

y = new_used_car['sales_price_log']
X = new_used_car.drop(['price', 'sales_price_log','model'], axis=1)
# X = new_used_car.drop(['price', 'sales_price_log', 'model', 'int_col', 'ext_col'], axis=1)
print(X.shape)

(4009, 14)


In [3]:
# Data types of features
for column in X.columns:
    print(f"Column: {column}, Data Type: {new_used_car[column].dtype}")

Column: brand, Data Type: object
Column: model_year, Data Type: int64
Column: milage, Data Type: int64
Column: fuel_type, Data Type: object
Column: ext_col, Data Type: object
Column: int_col, Data Type: object
Column: accident, Data Type: object
Column: clean_title, Data Type: object
Column: horsepower, Data Type: float64
Column: displacement, Data Type: float64
Column: cylinders, Data Type: float64
Column: turbo, Data Type: bool
Column: transmission_type, Data Type: object
Column: gears, Data Type: float64


In [4]:
# Inspect Missing Values
perc_missing_per_ftr = new_used_car.isnull().sum(axis=0)/new_used_car.shape[0]
print('fraction of missing values in features:')
print(perc_missing_per_ftr[perc_missing_per_ftr > 0])
print('data types of the features with missing values:')
print(new_used_car[perc_missing_per_ftr[perc_missing_per_ftr > 0].index].dtypes)
frac_missing = sum(new_used_car.isnull().sum(axis=1)!=0)/new_used_car.shape[0]
print('fraction of points with missing values:',frac_missing)

fraction of missing values in features:
fuel_type            0.054128
accident             0.028187
clean_title          0.148666
horsepower           0.201547
displacement         0.054128
cylinders            0.109753
transmission_type    0.121976
gears                0.457221
dtype: float64
data types of the features with missing values:
fuel_type             object
accident              object
clean_title           object
horsepower           float64
displacement         float64
cylinders            float64
transmission_type     object
gears                float64
dtype: object
fraction of points with missing values: 0.6083811424295336


In [None]:
from sklearn.model_selection import train_test_split

# Split to train, CV, and test
X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

print(X_other.shape)
print(X_test.shape)

# Save y for ML models
y_other_series = pd.Series(y_other, name="y_other")
y_other_series.to_csv("../data/y_other.csv", index=False)
y_test_series = pd.Series(y_test, name="y_test")
y_test_series.to_csv("../data/y_test.csv", index=False)

(3207, 14)
(802, 14)


In [49]:
# Group features into numerical and categorical variables
num_ftrs = ['model_year', 'milage', 'horsepower', 'displacement', 'cylinders', 'turbo', 'gears']
# cat_ftrs = ['brand', 'fuel_type', 'accident', 'clean_title', 'transmission_type']
cat_ftrs = ['brand', 'fuel_type', 'ext_col', 'int_col', 'accident', 'clean_title', 'transmission_type']

In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Preprocess
# one-hot encoder for categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# standard scaler for numerical variables
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs)])

In [51]:
import pandas as pd

# Fit-transform training set
X_prep = preprocessor.fit_transform(X_other)
feature_names = preprocessor.get_feature_names_out()
df_other = pd.DataFrame(data=X_prep, columns=feature_names)

# Transform validation and test sets
df_test = pd.DataFrame(data=preprocessor.transform(X_test), columns=feature_names)

# Print shapes of the datasets
print(f"Shape of training set after preprocessing: {df_other.shape}")
print(f"Shape of test set after preprocessing: {df_test.shape}")

Shape of training set after preprocessing: (3207, 483)
Shape of test set after preprocessing: (802, 483)


In [52]:
import pandas as pd

def report_missing_values(df_other, df_test):
    """
    Reports the proportion of missing values per feature and per row for
    training, validation, test, and combined datasets.

    Args:
        df_train: The preprocessed training set as a DataFrame.
        df_val: The preprocessed validation set as a DataFrame.
        df_test: The preprocessed test set as a DataFrame.

    Returns:
        A dictionary with missing value statistics for training, validation,
        test, and combined datasets.
    """
    # Combine datasets
    df_combined = pd.concat([df_other, df_test], ignore_index=True)

    # Helper function to calculate missing value statistics
    def calculate_missing_stats(df, name):
        perc_missing_per_ftr = df.isnull().sum(axis=0) / df.shape[0]
        frac_missing_rows = (df.isnull().sum(axis=1) != 0).mean()
        return {
            'feature_missing_proportion': perc_missing_per_ftr[perc_missing_per_ftr > 0],
            'row_missing_proportion': frac_missing_rows
        }

    # Calculate missing value statistics
    missing_stats = {
        'training': calculate_missing_stats(df_other, 'Training'),
        'test': calculate_missing_stats(df_test, 'Test'),
        'combined': calculate_missing_stats(df_combined, 'Combined')
    }

    # Print results
    for dataset, stats in missing_stats.items():
        print(f"\n{dataset.capitalize()} Dataset:")
        print(f"Proportion of missing values per feature:")
        print(stats['feature_missing_proportion'])
        print(f"Proportion of rows with missing values: {stats['row_missing_proportion']:.4f}")


report_missing_values(df_other, df_test)



Training Dataset:
Proportion of missing values per feature:
num__horsepower      0.197381
num__displacement    0.050514
num__cylinders       0.106330
num__gears           0.455254
dtype: float64
Proportion of rows with missing values: 0.5893

Test Dataset:
Proportion of missing values per feature:
num__horsepower      0.218204
num__displacement    0.068579
num__cylinders       0.123441
num__gears           0.465087
dtype: float64
Proportion of rows with missing values: 0.6209

Combined Dataset:
Proportion of missing values per feature:
num__horsepower      0.201547
num__displacement    0.054128
num__cylinders       0.109753
num__gears           0.457221
dtype: float64
Proportion of rows with missing values: 0.5957


*Missing Value Imputation*

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

def multivariate_imputer(X_other, X_test, random_state=42):
    """
    Constructs a multivariate imputer using IterativeImputer with RandomForestRegressor 
    and imputes missing values in the provided datasets.

    Args:
        X_train (pd.DataFrame): Training feature matrix with missing values.
        X_val (pd.DataFrame): Validation feature matrix with missing values.
        X_test (pd.DataFrame): Test feature matrix with missing values.
        random_state (int): Random seed for reproducibility.

    Returns:
        tuple: Imputed versions of X_train, X_val, X_test as DataFrames.
    """
    # Initialize the IterativeImputer with RandomForestRegressor
    imputer = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=10, random_state=random_state),
        random_state=random_state
    )
    
    # Fit the imputer on the training data and transform all datasets
    print("Fitting the imputer on the training data...")
    X_other_imputed = pd.DataFrame(imputer.fit_transform(X_other), columns=X_other.columns)
    print("Imputing missing values in the test datasets...")
    X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_other.columns)
    
    print("Imputation complete.")
    return X_other_imputed, X_test_imputed

X_other_imputed_mi, X_test_imputed_mi = multivariate_imputer(df_other, df_test, random_state=42)
report_missing_values(X_other_imputed_mi, X_test_imputed_mi)

In [None]:
from xgboost import XGBRegressor

def impute_with_xgboost(df_other, df_test, target_column):
    """
    Imputes missing values in a single column using XGBoost.

    Args:
        df_train (pd.DataFrame): Training set with missing values.
        df_val (pd.DataFrame): Validation set with missing values.
        df_test (pd.DataFrame): Test set with missing values.
        target_column (str): Column to impute.

    Returns:
        tuple: Updated versions of df_train, df_val, and df_test.
    """
    # Separate rows with and without missing values in the training set
    train_data = df_other[df_other[target_column].notnull()]
    missing_data_other = df_other[df_other[target_column].isnull()]

    # Features and target for training
    X_other = train_data.drop(columns=[target_column])
    y_other = train_data[target_column]

    # Features for prediction (rows with missing target values)
    X_missing_train = missing_data_other.drop(columns=[target_column])
    X_missing_test = df_test[df_test[target_column].isnull()].drop(columns=[target_column])

    # Train XGBoost Regressor
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    model.fit(X_other, y_other)

    # Predict missing values in the training set
    if not X_missing_train.empty:
        imputed_values_train = model.predict(X_missing_train)
        df_other.loc[df_other[target_column].isnull(), target_column] = imputed_values_train

    # Predict missing values in the test sets
    if not X_missing_test.empty:
        imputed_values_test = model.predict(X_missing_test)
        df_test.loc[df_test[target_column].isnull(), target_column] = imputed_values_test

    return df_other, df_test


# Impute each column with missing values
X_other_imputed_xgb = df_other.copy()
X_test_imputed_xgb = df_test.copy()

for col in df_other.columns:
    if (
        X_other_imputed_xgb[col].isnull().any()
        or X_test_imputed_xgb[col].isnull().any()
    ):
        print(f"Imputing missing values for column: {col}")
        X_other_imputed_xgb, X_test_imputed_xgb = impute_with_xgboost(
            X_other_imputed_xgb, X_test_imputed_xgb, col
        )

# Check missing values after imputation
report_missing_values(X_other_imputed_xgb, X_test_imputed_xgb)

# Save imputed training data to CSV
X_other_imputed_xgb.to_csv("../data/X_other_imputed_xgb.csv", index=False)
print("Imputed training data saved to results/X_other_imputed_xgb.csv")

# Save imputed test data to CSV
X_test_imputed_xgb.to_csv("../data/X_test_imputed_xgb.csv", index=False)
print("Imputed test data saved to results/X_test_imputed_xgb.csv")


Imputing missing values for column: num__horsepower
Imputing missing values for column: num__displacement
Imputing missing values for column: num__cylinders
Imputing missing values for column: num__gears

Training Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Test Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Combined Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000


In [61]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

def MLpipeline(X_other, y_other, X_test, y_test, ML_algo, param_grid):
    '''
    This function directly uses the provided train, validation, and test splits.
    It evaluates models using MAE, RMSE, and R², and records the best model for each metric.

    Args:
    - X_train, y_train: Training dataset
    - X_val, y_val: Validation dataset for GridSearchCV
    - X_test, y_test: Test dataset for evaluation
    - ML_algo: Machine Learning algorithm
    - param_grid: Parameter grid for hyperparameter tuning

    Returns:
    - A dictionary containing test scores (MAE, RMSE, R²)
    - The best model for each metric
    '''
    # Lists to store results
    test_scores = {
        'MAE': [],
        'RMSE': [],
        'R2': []
    }
    best_models = {
        'MAE': None,
        'RMSE': None,
        'R2': None
    }

    # GridSearchCV
    print("\nPerforming GridSearchCV...")
    grid = GridSearchCV(
        estimator=ML_algo,
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',  # Primary metric for GridSearch
        cv=4,  # Fixed to 4-fold cross-validation
        return_train_score=True,
        n_jobs=-1,
        verbose=True
    )
    grid.fit(X_other, y_other)

    # Save the best model
    best_model = grid.best_estimator_
    print('Best model parameters:', grid.best_params_)
    print('Validation score (RMSE):', -grid.best_score_)

    # Predictions and metrics on the test set
    y_test_pred = best_model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_rmse = root_mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print('Test MAE:', test_mae)
    print('Test RMSE:', test_rmse)
    print('Test R²:', test_r2)

    # Store test scores
    test_scores['MAE'] = test_mae
    test_scores['RMSE'] = test_rmse
    test_scores['R2'] = test_r2

    # Save the best model for each metric
    best_models['MAE'] = best_model
    best_models['RMSE'] = best_model
    best_models['R2'] = best_model

    # Return results
    return test_scores, best_models

In [54]:
import numpy as np
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

y_mean = np.mean(y_test)  # Baseline: Predict the mean for all instances
baseline_rmse = root_mean_squared_error(y_test, [y_mean] * len(y_test))
baseline_mae = mean_absolute_error(y_test, [y_mean] * len(y_test))
baseline_r2 = r2_score(y_test, [y_mean] * len(y_test))

print("Baseline Metrics:")
print(f"Baseline RMSE: {baseline_rmse:.4f}")
print(f"Baseline MAE: {baseline_mae:.4f}")
print(f"Baseline R²: {baseline_r2:.4f}")


Baseline Metrics:
Baseline RMSE: 0.3833
Baseline MAE: 0.2889
Baseline R²: 0.0000


In [56]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_other_imputed_xgb, y_other)
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_other_imputed_xgb.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)
feature_importance_df.to_csv('feature_importance.csv', index=False)

In [None]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

y_pred = model.predict(X_test_imputed_xgb)

# Calculate metrics
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print(f"Model Performance on Test Set:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")


Model Performance on Test Set:
RMSE: 0.1627
MAE: 0.0979
R²: 0.8198


In [None]:
import numpy as np
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Parameter grids
param_grids = {
    'Lasso': {'alpha': [0.01, 0.1, 1, 10, 100]},
    'Ridge': {'alpha': [0.01, 0.1, 1, 10, 100]},
    'ElasticNet': {
        'alpha': [0.01, 0.1, 1, 10, 100],
        'l1_ratio': [0.2, 0.4, 0.6, 0.8]
    },
    'RandomForestRegressor': {
        'max_depth': [1, 3, 10, 30, 100],
        'max_features': [0.25, 0.5, 0.75, 1.0]
    },
    'SVR': {
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.1, 0.2, 0.5],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto', 0.01, 0.1, 1]
    },
    'KNeighborsRegressor': {
        'n_neighbors': [3, 5, 10, 20],
        'weights': ['uniform', 'distance']
    },
    'XGBRegressor': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

# Models to train
models = {
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'XGBRegressor': XGBRegressor()
}

results = {}
best_models_overall = {
    'MAE': {'method': None, 'model': None, 'value': float('inf')},  # Lower is better
    'RMSE': {'method': None, 'model': None, 'value': float('inf')},  # Lower is better
    'R2': {'method': None, 'model': None, 'value': float('-inf')}   # Higher is better
}

# Training pipeline
for model_name, model in models.items():
    print(f"\nTraining {model_name}")
    test_scores, best_models = MLpipeline(
        X_other_imputed_xgb, y_other, X_test_imputed_xgb, y_test, model, param_grids[model_name]
    )
    # Save detailed results for each model
    results[model_name] = {
        'Best Parameters': best_models['R2'].get_params(),  # Assuming best model is selected by R²
        'Validation RMSE': -test_scores.get('Validation_RMSE', 0),  # Replace if validation RMSE available
        'Test MAE': test_scores['MAE'],
        'Test RMSE': test_scores['RMSE'],
        'Test R²': test_scores['R2']
    }

    # Update best overall models for each metric
    if test_scores['MAE'] < best_models_overall['MAE']['value']:
        best_models_overall['MAE'] = {'method': model_name, 'model': best_models['MAE'], 'value': test_scores['MAE']}
    if test_scores['RMSE'] < best_models_overall['RMSE']['value']:
        best_models_overall['RMSE'] = {'method': model_name, 'model': best_models['RMSE'], 'value': test_scores['RMSE']}
    if test_scores['R2'] > best_models_overall['R2']['value']:
        best_models_overall['R2'] = {'method': model_name, 'model': best_models['R2'], 'value': test_scores['R2']}

# Print the best overall models for each metric
print("\nBest Overall Models:")
for metric, info in best_models_overall.items():
    print(f"{metric} - Method: {info['method']}, Value: {info['value']:.4f}")



Training Lasso

Performing GridSearchCV...
Fitting 4 folds for each of 5 candidates, totalling 20 fits
Best model parameters: {'alpha': 0.01}
Validation score (RMSE): 0.17982392978408387
Test MAE: 0.12994943746643633
Test RMSE: 0.19827571038513128
Test R²: 0.7323478373791811

Training Ridge

Performing GridSearchCV...
Fitting 4 folds for each of 5 candidates, totalling 20 fits
Best model parameters: {'alpha': 1}
Validation score (RMSE): 0.14728566013218672
Test MAE: 0.11143840796739744
Test RMSE: 0.17964958087158353
Test R²: 0.7802726366545693

Training ElasticNet

Performing GridSearchCV...
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'alpha': 0.01, 'l1_ratio': 0.2}
Validation score (RMSE): 0.1663927464167528
Test MAE: 0.12036548922221313
Test RMSE: 0.18781770822556532
Test R²: 0.7598377179809125

Training RandomForestRegressor

Performing GridSearchCV...
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'max_

In [None]:
# Save results as a text file
with open("results/xgb_imputed_best_models_results.txt", "w") as f:
    # Write feature columns and their data types
    f.write("\nFeature Columns and Data Types:\n\n")
    for column in X.columns:
        f.write(f"Column: {column}, Data Type: {new_used_car[column].dtype}\n")

    # Write detailed results for each model
    f.write("Detailed Results for Each Model:\n\n")
    for model_name, result in results.items():
        f.write(f"Model: {model_name}\n")
        f.write(f"Best Parameters: {result['Best Parameters']}\n")
        f.write(f"Validation RMSE: {result['Validation RMSE']:.4f}\n")
        f.write(f"Test MAE: {result['Test MAE']:.4f}\n")
        f.write(f"Test RMSE: {result['Test RMSE']:.4f}\n")
        f.write(f"Test R²: {result['Test R²']:.4f}\n")
        f.write("\n")

    # Write the best overall models
    f.write("Best Overall Models:\n")
    for metric, info in best_models_overall.items():
        f.write(f"{metric} - Method: {info['method']}, Value: {info['value']:.4f}\n")