In [17]:
# Import Data
import pandas as pd

new_used_car = pd.read_csv("../data/processed_used_car.csv")
random_state = 42

y = new_used_car['sales_price_log']
# X = new_used_car.drop(['price', 'sales_price_log'], axis=1)
X = new_used_car.drop(['price', 'sales_price_log', 'model', 'int_col', 'ext_col'], axis=1)
print(X.shape)

(4009, 12)


In [18]:
# Data types of features
for column in new_used_car.columns:
    print(f"Column: {column}, Data Type: {new_used_car[column].dtype}")

Column: brand, Data Type: object
Column: model, Data Type: object
Column: model_year, Data Type: int64
Column: milage, Data Type: int64
Column: fuel_type, Data Type: object
Column: ext_col, Data Type: object
Column: int_col, Data Type: object
Column: accident, Data Type: object
Column: clean_title, Data Type: object
Column: price, Data Type: int64
Column: horsepower, Data Type: float64
Column: displacement, Data Type: float64
Column: cylinders, Data Type: float64
Column: turbo, Data Type: bool
Column: transmission_type, Data Type: object
Column: gears, Data Type: float64
Column: sales_price_log, Data Type: float64


In [19]:
# Inspect Missing Values
perc_missing_per_ftr = new_used_car.isnull().sum(axis=0)/new_used_car.shape[0]
print('fraction of missing values in features:')
print(perc_missing_per_ftr[perc_missing_per_ftr > 0])
print('data types of the features with missing values:')
print(new_used_car[perc_missing_per_ftr[perc_missing_per_ftr > 0].index].dtypes)
frac_missing = sum(new_used_car.isnull().sum(axis=1)!=0)/new_used_car.shape[0]
print('fraction of points with missing values:',frac_missing)

fraction of missing values in features:
fuel_type            0.054128
accident             0.028187
clean_title          0.148666
horsepower           0.201547
displacement         0.054128
cylinders            0.109753
transmission_type    0.121976
gears                0.457221
dtype: float64
data types of the features with missing values:
fuel_type             object
accident              object
clean_title           object
horsepower           float64
displacement         float64
cylinders            float64
transmission_type     object
gears                float64
dtype: object
fraction of points with missing values: 0.6083811424295336


In [20]:
from sklearn.model_selection import train_test_split

# Split to train, CV, and test
X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, random_state=random_state)
X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=random_state)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(2405, 12)
(802, 12)
(802, 12)


In [21]:
# Group features into numerical and categorical variables
num_ftrs = ['model_year', 'milage', 'horsepower', 'displacement', 'cylinders', 'turbo', 'gears']
cat_ftrs = ['brand', 'fuel_type', 'accident', 'clean_title', 'transmission_type']
# cat_ftrs = ['brand', 'fuel_type', 'model', 'ext_col', 'int_col', 'accident', 'clean_title', 'transmission_type']

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Preprocess
# one-hot encoder for categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# standard scaler for numerical variables
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs)])

In [23]:
import pandas as pd

# Fit-transform training set
X_prep_train = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
df_train = pd.DataFrame(data=X_prep_train, columns=feature_names)

# Transform validation and test sets
df_val = pd.DataFrame(data=preprocessor.transform(X_val), columns=feature_names)
df_test = pd.DataFrame(data=preprocessor.transform(X_test), columns=feature_names)

# Print shapes of the datasets
print(f"Shape of training set after preprocessing: {df_train.shape}")
print(f"Shape of validation set after preprocessing: {df_val.shape}")
print(f"Shape of test set after preprocessing: {df_test.shape}")

Shape of training set after preprocessing: (2405, 75)
Shape of validation set after preprocessing: (802, 75)
Shape of test set after preprocessing: (802, 75)


In [24]:
import pandas as pd

def report_missing_values(df_train, df_val, df_test):
    """
    Reports the proportion of missing values per feature and per row for
    training, validation, test, and combined datasets.

    Args:
        df_train: The preprocessed training set as a DataFrame.
        df_val: The preprocessed validation set as a DataFrame.
        df_test: The preprocessed test set as a DataFrame.

    Returns:
        A dictionary with missing value statistics for training, validation,
        test, and combined datasets.
    """
    # Combine datasets
    df_combined = pd.concat([df_train, df_val, df_test], ignore_index=True)

    # Helper function to calculate missing value statistics
    def calculate_missing_stats(df, name):
        perc_missing_per_ftr = df.isnull().sum(axis=0) / df.shape[0]
        frac_missing_rows = (df.isnull().sum(axis=1) != 0).mean()
        return {
            'feature_missing_proportion': perc_missing_per_ftr[perc_missing_per_ftr > 0],
            'row_missing_proportion': frac_missing_rows
        }

    # Calculate missing value statistics
    missing_stats = {
        'training': calculate_missing_stats(df_train, 'Training'),
        'validation': calculate_missing_stats(df_val, 'Validation'),
        'test': calculate_missing_stats(df_test, 'Test'),
        'combined': calculate_missing_stats(df_combined, 'Combined')
    }

    # Print results
    for dataset, stats in missing_stats.items():
        print(f"\n{dataset.capitalize()} Dataset:")
        print(f"Proportion of missing values per feature:")
        print(stats['feature_missing_proportion'])
        print(f"Proportion of rows with missing values: {stats['row_missing_proportion']:.4f}")


report_missing_values(df_train, df_val, df_test)



Training Dataset:
Proportion of missing values per feature:
num__horsepower      0.200416
num__displacement    0.050728
num__cylinders       0.111435
num__gears           0.456549
dtype: float64
Proportion of rows with missing values: 0.5900

Validation Dataset:
Proportion of missing values per feature:
num__horsepower      0.209476
num__displacement    0.061097
num__cylinders       0.108479
num__gears           0.441397
dtype: float64
Proportion of rows with missing values: 0.5923

Test Dataset:
Proportion of missing values per feature:
num__horsepower      0.197007
num__displacement    0.057357
num__cylinders       0.105985
num__gears           0.475062
dtype: float64
Proportion of rows with missing values: 0.6160

Combined Dataset:
Proportion of missing values per feature:
num__horsepower      0.201547
num__displacement    0.054128
num__cylinders       0.109753
num__gears           0.457221
dtype: float64
Proportion of rows with missing values: 0.5957


*Missing Value Imputation*

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

def multivariate_imputer(X_train, X_val, X_test, random_state=42):
    """
    Constructs a multivariate imputer using IterativeImputer with RandomForestRegressor 
    and imputes missing values in the provided datasets.

    Args:
        X_train (pd.DataFrame): Training feature matrix with missing values.
        X_val (pd.DataFrame): Validation feature matrix with missing values.
        X_test (pd.DataFrame): Test feature matrix with missing values.
        random_state (int): Random seed for reproducibility.

    Returns:
        tuple: Imputed versions of X_train, X_val, X_test as DataFrames.
    """
    # Initialize the IterativeImputer with RandomForestRegressor
    imputer = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=10, random_state=random_state),
        random_state=random_state
    )
    
    # Fit the imputer on the training data and transform all datasets
    print("Fitting the imputer on the training data...")
    X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    print("Imputing missing values in the validation and test datasets...")
    X_val_imputed = pd.DataFrame(imputer.transform(X_val), columns=X_train.columns)
    X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_train.columns)
    
    print("Imputation complete.")
    return X_train_imputed, X_val_imputed, X_test_imputed

X_train_imputed_mi, X_val_imputed_mi, X_test_imputed_mi = multivariate_imputer(df_train, df_val, df_test, random_state=42)
report_missing_values(X_train_imputed_mi, X_val_imputed_mi, X_test_imputed_mi)

Fitting the imputer on the training data...
Imputing missing values in the validation and test datasets...
Imputation complete.

Training Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Validation Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Test Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Combined Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000




In [None]:
from xgboost import XGBRegressor

def impute_with_xgboost(df_train, df_val, df_test, target_column):
    """
    Imputes missing values in a single column using XGBoost.

    Args:
        df_train (pd.DataFrame): Training set with missing values.
        df_val (pd.DataFrame): Validation set with missing values.
        df_test (pd.DataFrame): Test set with missing values.
        target_column (str): Column to impute.

    Returns:
        tuple: Updated versions of df_train, df_val, and df_test.
    """
    # Separate rows with and without missing values in the training set
    train_data = df_train[df_train[target_column].notnull()]
    missing_data_train = df_train[df_train[target_column].isnull()]

    # Features and target for training
    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]

    # Features for prediction (rows with missing target values)
    X_missing_train = missing_data_train.drop(columns=[target_column])
    X_missing_val = df_val[df_val[target_column].isnull()].drop(columns=[target_column])
    X_missing_test = df_test[df_test[target_column].isnull()].drop(columns=[target_column])

    # Train XGBoost Regressor
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    model.fit(X_train, y_train)

    # Predict missing values in the training set
    if not X_missing_train.empty:
        imputed_values_train = model.predict(X_missing_train)
        df_train.loc[df_train[target_column].isnull(), target_column] = imputed_values_train

    # Predict missing values in the validation and test sets
    if not X_missing_val.empty:
        imputed_values_val = model.predict(X_missing_val)
        df_val.loc[df_val[target_column].isnull(), target_column] = imputed_values_val

    if not X_missing_test.empty:
        imputed_values_test = model.predict(X_missing_test)
        df_test.loc[df_test[target_column].isnull(), target_column] = imputed_values_test

    return df_train, df_val, df_test


# Impute each column with missing values
X_train_imputed_xgb = df_train.copy()
X_val_imputed_xgb = df_val.copy()
X_test_imputed_xgb = df_test.copy()

for col in df_train.columns:
    if (
        X_train_imputed_xgb[col].isnull().any()
        or X_val_imputed_xgb[col].isnull().any()
        or X_test_imputed_xgb[col].isnull().any()
    ):
        print(f"Imputing missing values for column: {col}")
        X_train_imputed_xgb, X_val_imputed_xgb, X_test_imputed_xgb = impute_with_xgboost(
            X_train_imputed_xgb, X_val_imputed_xgb, X_test_imputed_xgb, col
        )

# Check missing values after imputation
report_missing_values(X_train_imputed_xgb, X_val_imputed_xgb, X_test_imputed_xgb)


Imputing missing values for column: num__horsepower
Imputing missing values for column: num__displacement
Imputing missing values for column: num__cylinders
Imputing missing values for column: num__gears

Training Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Validation Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Test Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Combined Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000


In [None]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import make_pipeline

def MLpipeline(X, y, preprocessor, ML_algo, param_grid):
    '''
    This function splits the data into other/test (80/20) and then applies KFold with 4 folds to other.
    It evaluates models using MAE, RMSE, and R², and records the best model for each metric.

    Returns:
    - A dictionary containing test scores (MAE, RMSE, R²) for each iteration
    - The best model for each metric
    '''

    # Lists to store results
    test_scores = {
        'MAE': [],
        'RMSE': [],
        'R2': []
    }
    best_models = {
        'MAE': None,
        'RMSE': None,
        'R2': None
    }
    best_scores = {
        'MAE': float('inf'),
        'RMSE': float('inf'),
        'R2': float('-inf')
    }

    nr_states = 10

    for i in range(nr_states):
        print(f"\nIteration {i+1}")

        # Split data
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state=42 * i)

        # Create KFold object
        kf = KFold(n_splits=4, shuffle=True, random_state=42 * i)

        # Pipeline and GridSearchCV
        pipe = make_pipeline(preprocessor, ML_algo)
        grid = GridSearchCV(
            pipe,
            param_grid=param_grid,
            scoring='neg_root_mean_squared_error',  # Primary metric for GridSearch
            cv=kf,
            return_train_score=True,
            n_jobs=-1,
            verbose=True
        )
        grid.fit(X_other, y_other)

        # Save the best model
        best_model = grid.best_estimator_
        print('Best model parameters:', grid.best_params_)
        print('Validation score (RMSE):', -grid.best_score_)

        # Predictions and metrics on the test set
        y_test_pred = best_model.predict(X_test)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_rmse = root_mean_squared_error(y_test, y_test_pred)  # RMSE
        test_r2 = r2_score(y_test, y_test_pred)

        print('Test MAE:', test_mae)
        print('Test RMSE:', test_rmse)
        print('Test R²:', test_r2)

        # Append test scores
        test_scores['MAE'].append(test_mae)
        test_scores['RMSE'].append(test_rmse)
        test_scores['R2'].append(test_r2)

        # Update best models
        if test_mae < best_scores['MAE']:
            best_scores['MAE'] = test_mae
            best_models['MAE'] = best_model

        if test_rmse < best_scores['RMSE']:
            best_scores['RMSE'] = test_rmse
            best_models['RMSE'] = best_model

        if test_r2 > best_scores['R2']:
            best_scores['R2'] = test_r2
            best_models['R2'] = best_model

    # Return results
    return test_scores, best_models

In [None]:
import numpy as np
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Parameter grids
param_grids = {
    'Lasso': {'lasso__alpha': [0.01, 0.1, 1, 10, 100]},
    'Ridge': {'ridge__alpha': [0.01, 0.1, 1, 10, 100]},
    'ElasticNet': {
        'elasticnet__alpha': [0.01, 0.1, 1, 10, 100],
        'elasticnet__l1_ratio': [0.2, 0.4, 0.6, 0.8]
    },
    'RandomForestRegressor': {
        'randomforestregressor__max_depth': [1, 3, 10, 30, 100],
        'randomforestregressor__max_features': [0.25, 0.5, 0.75, 1.0]
    },
    'SVR': {
        'svr__C': [0.1, 1, 10, 100],
        'svr__epsilon': [0.1, 0.2, 0.5],
        'svr__kernel': ['linear', 'rbf'],
        'svr__gamma': ['scale', 'auto', 0.01, 0.1, 1]
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [3, 5, 10, 20],
        'kneighborsregressor__weights': ['uniform', 'distance']
    },
    'XGBRegressor': {
        'classifier__n_estimators': [50, 100, 150],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
}

# Models to train
models = {
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'XGBRegressor': XGBRegressor()
}

results = {}

# Training pipeline
for model_name, model in models.items():
    print(f"\nTraining {model_name}")
    test_scores, best_models = MLpipeline(X, y, preprocessor, model, param_grids[model_name])
    
    # Calculate metrics
    mean_mae = np.mean(test_scores['MAE'])
    std_mae = np.std(test_scores['MAE'])
    mean_rmse = np.mean(test_scores['RMSE'])
    std_rmse = np.std(test_scores['RMSE'])
    mean_r2 = np.mean(test_scores['R2'])
    std_r2 = np.std(test_scores['R2'])

    # Store results
    results[model_name] = {
        'mean_mae': mean_mae,
        'std_mae': std_mae,
        'mean_rmse': mean_rmse,
        'std_rmse': std_rmse,
        'mean_r2': mean_r2,
        'std_r2': std_r2,
        'best_models': best_models
    }
    
    # Print metrics
    print(f"\n{model_name} Metrics:")
    print(f"Mean MAE: {mean_mae:.4f}, Std MAE: {std_mae:.4f}")
    print(f"Mean RMSE: {mean_rmse:.4f}, Std RMSE: {std_rmse:.4f}")
    print(f"Mean R²: {mean_r2:.4f}, Std R²: {std_r2:.4f}")



Training Lasso

Iteration 1
Fitting 4 folds for each of 5 candidates, totalling 20 fits


ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 980, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1064, in check_array
    _assert_all_finite(
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/utils/validation.py", line 123, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/utils/validation.py", line 172, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
Lasso does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
