In [17]:
# Import Data
import pandas as pd

new_used_car = pd.read_csv("../data/processed_used_car.csv")
random_state = 42

y = new_used_car['sales_price_log']
X = new_used_car.drop(['price', 'sales_price_log'], axis=1)
print(X.shape)

(4009, 15)


In [5]:
# Data types of features
for column in new_used_car.columns:
    print(f"Column: {column}, Data Type: {new_used_car[column].dtype}")

Column: brand, Data Type: object
Column: model, Data Type: object
Column: model_year, Data Type: int64
Column: milage, Data Type: int64
Column: fuel_type, Data Type: object
Column: ext_col, Data Type: object
Column: int_col, Data Type: object
Column: accident, Data Type: object
Column: clean_title, Data Type: object
Column: horsepower, Data Type: float64
Column: displacement, Data Type: float64
Column: cylinders, Data Type: float64
Column: turbo, Data Type: bool
Column: transmission_type, Data Type: object
Column: gears, Data Type: float64


In [18]:
# Inspect Missing Value
perc_missing_per_ftr = new_used_car.isnull().sum(axis=0)/new_used_car.shape[0]
print('fraction of missing values in features:')
print(perc_missing_per_ftr[perc_missing_per_ftr > 0])
print('data types of the features with missing values:')
print(new_used_car[perc_missing_per_ftr[perc_missing_per_ftr > 0].index].dtypes)
frac_missing = sum(new_used_car.isnull().sum(axis=1)!=0)/new_used_car.shape[0]
print('fraction of points with missing values:',frac_missing)

fraction of missing values in features:
fuel_type            0.054128
accident             0.028187
clean_title          0.148666
horsepower           0.201547
displacement         0.054128
cylinders            0.109753
transmission_type    0.121976
gears                0.457221
dtype: float64
data types of the features with missing values:
fuel_type             object
accident              object
clean_title           object
horsepower           float64
displacement         float64
cylinders            float64
transmission_type     object
gears                float64
dtype: object
fraction of points with missing values: 0.6083811424295336


In [19]:
from sklearn.model_selection import train_test_split

# Split to train, CV, and test
X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, random_state=random_state)
X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=random_state)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(2405, 15)
(802, 15)
(802, 15)


In [11]:
# Group features into numerical and categorical variables
num_ftrs = ['model_year', 'milage', 'horsepower', 'displacement', 'cylinders', 'turbo', 'gears']
cat_ftrs = ['brand', 'model', 'fuel_type', 'ext_col', 'int_col', 'accident', 'clean_title', 'transmission_type']

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Preprocess
# one-hot encoder for categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# standard scaler for numerical variables
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs)])

In [None]:
# fit_transform the training set
X_prep = preprocessor.fit_transform(X_train)
# the feature names after fit
feature_names = preprocessor.get_feature_names_out()

# you can convert the numpy array back to a data frame with the feature names if you want
df_train = pd.DataFrame(data=X_prep,columns=feature_names)
print(df_train.shape)

# transform the CV
df_val = preprocessor.transform(X_val)
df_val = pd.DataFrame(data=df_val,columns = feature_names)
print(df_val.shape)

# transform the test
df_test = preprocessor.transform(X_test)
df_test = pd.DataFrame(data=df_test,columns = feature_names)
print(df_test.shape)

(2405, 1777)
(802, 1777)
(802, 1777)
['num__model_year' 'num__milage' 'num__horsepower' ...
 'cat__transmission_type_Automatic' 'cat__transmission_type_Manual'
 'cat__transmission_type_nan']


In [None]:
# Missing Value of training set After transformation
print('data dimensions:',df_train.shape)
perc_missing_per_ftr = df_train.isnull().sum(axis=0)/df_train.shape[0]
print('fraction of missing values in features:')
print(perc_missing_per_ftr[perc_missing_per_ftr > 0])
print('data types of the features with missing values:')
print(df_train[perc_missing_per_ftr[perc_missing_per_ftr > 0].index].dtypes)
frac_missing = sum(df_train.isnull().sum(axis=1)!=0)/df_train.shape[0]
print('fraction of points with missing values:',frac_missing)

data dimensions: (2405, 1777)
fraction of missing values in features:
num__horsepower      0.200416
num__displacement    0.050728
num__cylinders       0.111435
num__gears           0.456549
dtype: float64
data types of the features with missing values:
num__horsepower      float64
num__displacement    float64
num__cylinders       float64
num__gears           float64
dtype: object
fraction of points with missing values: 0.59002079002079


In [8]:
# Check missing values for each feature (column) in the training set
perc_missing_per_ftr_train = df_train.isnull().sum(axis=0) / df_train.shape[0]
print('Proportion of missing values per feature in the training set:')
print(perc_missing_per_ftr_train[perc_missing_per_ftr_train > 0])

# Check the proportion of rows with missing values in the training set
frac_missing_train = (df_train.isnull().sum(axis=1) != 0).mean()
print('Proportion of rows with missing values in the training set:', frac_missing_train)

# Repeat for validation set
perc_missing_per_ftr_val = df_val.isnull().sum(axis=0) / df_val.shape[0]
print('Proportion of missing values per feature in the validation set:')
print(perc_missing_per_ftr_val[perc_missing_per_ftr_val > 0])

frac_missing_val = (df_val.isnull().sum(axis=1) != 0).mean()
print('Proportion of rows with missing values in the validation set:', frac_missing_val)

# Repeat for test set
perc_missing_per_ftr_test = df_test.isnull().sum(axis=0) / df_test.shape[0]
print('Proportion of missing values per feature in the test set:')
print(perc_missing_per_ftr_test[perc_missing_per_ftr_test > 0])

frac_missing_test = (df_test.isnull().sum(axis=1) != 0).mean()
print('Proportion of rows with missing values in the test set:', frac_missing_test)


Proportion of missing values per feature in the training set:
num__horsepower      0.200416
num__displacement    0.050728
num__cylinders       0.111435
num__gears           0.456549
dtype: float64
Proportion of rows with missing values in the training set: 0.59002079002079
Proportion of missing values per feature in the validation set:
num__horsepower      0.209476
num__displacement    0.061097
num__cylinders       0.108479
num__gears           0.441397
dtype: float64
Proportion of rows with missing values in the validation set: 0.5922693266832918
Proportion of missing values per feature in the test set:
num__horsepower      0.197007
num__displacement    0.057357
num__cylinders       0.105985
num__gears           0.475062
dtype: float64
Proportion of rows with missing values in the test set: 0.6159600997506235


In [9]:
# Concatenate the training, validation, and test sets
df_combined = pd.concat([df_train, df_val, df_test], ignore_index=True)

# Calculate the proportion of missing values per feature across all datasets
perc_missing_per_ftr_combined = df_combined.isnull().sum(axis=0) / df_combined.shape[0]
print('Proportion of missing values per feature across all sets:')
print(perc_missing_per_ftr_combined[perc_missing_per_ftr_combined > 0])

# Calculate the proportion of rows with missing values across all datasets
frac_missing_combined = (df_combined.isnull().sum(axis=1) != 0).mean()
print('Proportion of rows with missing values across all sets:', frac_missing_combined)

Proportion of missing values per feature across all sets:
num__horsepower      0.201547
num__displacement    0.054128
num__cylinders       0.109753
num__gears           0.457221
dtype: float64
Proportion of rows with missing values across all sets: 0.595659765527563


In [None]:
import pandas as pd

def report_missing_values(preprocessor, X_train, X_val, X_test):
    """
    Reports the proportion of missing values per feature and per row for
    training, validation, test, and combined datasets after preprocessing.
    
    Args:
        preprocessor: The preprocessing pipeline that transforms the data.
        X_train: The training set before transformation.
        X_val: The validation set before transformation.
        X_test: The test set before transformation.
    
    Returns:
        A dictionary with missing value statistics for training, validation,
        test, and combined datasets.
    """
    # Fit-transform training set
    X_prep_train = preprocessor.fit_transform(X_train)
    feature_names = preprocessor.get_feature_names_out()
    df_train = pd.DataFrame(data=X_prep_train, columns=feature_names)

    # Transform validation and test sets
    df_val = pd.DataFrame(data=preprocessor.transform(X_val), columns=feature_names)
    df_test = pd.DataFrame(data=preprocessor.transform(X_test), columns=feature_names)

    # Combine datasets
    df_combined = pd.concat([df_train, df_val, df_test], ignore_index=True)

    # Helper function to calculate missing value statistics
    def calculate_missing_stats(df, name):
        perc_missing_per_ftr = df.isnull().sum(axis=0) / df.shape[0]
        frac_missing_rows = (df.isnull().sum(axis=1) != 0).mean()
        return {
            'feature_missing_proportion': perc_missing_per_ftr[perc_missing_per_ftr > 0],
            'row_missing_proportion': frac_missing_rows
        }

    # Calculate missing value statistics
    missing_stats = {
        'training': calculate_missing_stats(df_train, 'Training'),
        'validation': calculate_missing_stats(df_val, 'Validation'),
        'test': calculate_missing_stats(df_test, 'Test'),
        'combined': calculate_missing_stats(df_combined, 'Combined')
    }

    # Print results
    for dataset, stats in missing_stats.items():
        print(f"\n{dataset.capitalize()} Dataset:")
        print(f"Proportion of missing values per feature:")
        print(stats['feature_missing_proportion'])
        print(f"Proportion of rows with missing values: {stats['row_missing_proportion']:.4f}")

    return missing_stats

# Example Usage
missing_stats = report_missing_values(preprocessor, X_train, X_val, X_test)


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

def impute_with_xgboost_tuning(df, target_column, exclude_column):
    """
    Impute missing values in a single column using XGBoost with hyperparameter tuning.
    
    Args:
        df (pd.DataFrame): The dataset with missing values.
        target_column (str): The column to impute.
        exclude_column (str): The column to exclude from features (e.g., the target column).
    
    Returns:
        pd.Series: The column with imputed values.
    """
    # Separate rows with and without missing values in the target column
    train_data = df[df[target_column].notnull()]
    missing_data = df[df[target_column].isnull()]
    
    # If no missing values, return the column as-is
    if missing_data.empty:
        return df[target_column]
    
    # Define features (exclude target_column and exclude_column)
    X_train = train_data.drop(columns=[target_column, exclude_column], errors='ignore')
    y_train = train_data[target_column]
    X_missing = missing_data.drop(columns=[target_column, exclude_column], errors='ignore')
    
    # Define the model and parameter grid
    model = XGBRegressor(random_state=42)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    
    # Use GridSearchCV to find the best parameters
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    # Best model
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {target_column}: {grid_search.best_params_}")
    
    # Predict missing values
    imputed_values = best_model.predict(X_missing)
    
    # Replace missing values with predictions
    df.loc[df[target_column].isnull(), target_column] = imputed_values
    
    return df[target_column]

# Impute all columns with missing values except the target column (sales_price_log)
for col in new_used_car.columns:
    if new_used_car[col].isnull().any() and col != 'sales_price_log':
        print(f"Imputing missing values for column: {col}")
        new_used_car[col] = impute_with_xgboost_tuning(new_used_car, col, 'sales_price_log')


In [10]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import make_pipeline

def MLpipeline(X, y, preprocessor, ML_algo, param_grid):
    '''
    This function splits the data into other/test (80/20) and then applies KFold with 4 folds to other.
    It evaluates models using MAE, RMSE, and R², and records the best model for each metric.

    Returns:
    - A dictionary containing test scores (MAE, RMSE, R²) for each iteration
    - The best model for each metric
    '''

    # Lists to store results
    test_scores = {
        'MAE': [],
        'RMSE': [],
        'R2': []
    }
    best_models = {
        'MAE': None,
        'RMSE': None,
        'R2': None
    }
    best_scores = {
        'MAE': float('inf'),
        'RMSE': float('inf'),
        'R2': float('-inf')
    }

    nr_states = 10

    for i in range(nr_states):
        print(f"\nIteration {i+1}")

        # Split data
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state=42 * i)

        # Create KFold object
        kf = KFold(n_splits=4, shuffle=True, random_state=42 * i)

        # Pipeline and GridSearchCV
        pipe = make_pipeline(preprocessor, ML_algo)
        grid = GridSearchCV(
            pipe,
            param_grid=param_grid,
            scoring='neg_root_mean_squared_error',  # Primary metric for GridSearch
            cv=kf,
            return_train_score=True,
            n_jobs=-1,
            verbose=True
        )
        grid.fit(X_other, y_other)

        # Save the best model
        best_model = grid.best_estimator_
        print('Best model parameters:', grid.best_params_)
        print('Validation score (RMSE):', -grid.best_score_)

        # Predictions and metrics on the test set
        y_test_pred = best_model.predict(X_test)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_rmse = root_mean_squared_error(y_test, y_test_pred)  # RMSE
        test_r2 = r2_score(y_test, y_test_pred)

        print('Test MAE:', test_mae)
        print('Test RMSE:', test_rmse)
        print('Test R²:', test_r2)

        # Append test scores
        test_scores['MAE'].append(test_mae)
        test_scores['RMSE'].append(test_rmse)
        test_scores['R2'].append(test_r2)

        # Update best models
        if test_mae < best_scores['MAE']:
            best_scores['MAE'] = test_mae
            best_models['MAE'] = best_model

        if test_rmse < best_scores['RMSE']:
            best_scores['RMSE'] = test_rmse
            best_models['RMSE'] = best_model

        if test_r2 > best_scores['R2']:
            best_scores['R2'] = test_r2
            best_models['R2'] = best_model

    # Return results
    return test_scores, best_models

In [11]:
import numpy as np
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Parameter grids
param_grids = {
    'Lasso': {'lasso__alpha': [0.01, 0.1, 1, 10, 100]},
    'Ridge': {'ridge__alpha': [0.01, 0.1, 1, 10, 100]},
    'ElasticNet': {
        'elasticnet__alpha': [0.01, 0.1, 1, 10, 100],
        'elasticnet__l1_ratio': [0.2, 0.4, 0.6, 0.8]
    },
    'RandomForestRegressor': {
        'randomforestregressor__max_depth': [1, 3, 10, 30, 100],
        'randomforestregressor__max_features': [0.25, 0.5, 0.75, 1.0]
    },
    'SVR': {
        'svr__C': [0.1, 1, 10, 100],
        'svr__epsilon': [0.1, 0.2, 0.5],
        'svr__kernel': ['linear', 'rbf'],
        'svr__gamma': ['scale', 'auto', 0.01, 0.1, 1]
    },
    'KNeighborsRegressor': {
        'kneighborsregressor__n_neighbors': [3, 5, 10, 20],
        'kneighborsregressor__weights': ['uniform', 'distance']
    },
    'XGBRegressor': {
        'classifier__n_estimators': [50, 100, 150],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
}

# Models to train
models = {
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'XGBRegressor': XGBRegressor()
}

results = {}

# Training pipeline
for model_name, model in models.items():
    print(f"\nTraining {model_name}")
    test_scores, best_models = MLpipeline(X, y, preprocessor, model, param_grids[model_name])
    
    # Calculate metrics
    mean_mae = np.mean(test_scores['MAE'])
    std_mae = np.std(test_scores['MAE'])
    mean_rmse = np.mean(test_scores['RMSE'])
    std_rmse = np.std(test_scores['RMSE'])
    mean_r2 = np.mean(test_scores['R2'])
    std_r2 = np.std(test_scores['R2'])

    # Store results
    results[model_name] = {
        'mean_mae': mean_mae,
        'std_mae': std_mae,
        'mean_rmse': mean_rmse,
        'std_rmse': std_rmse,
        'mean_r2': mean_r2,
        'std_r2': std_r2,
        'best_models': best_models
    }
    
    # Print metrics
    print(f"\n{model_name} Metrics:")
    print(f"Mean MAE: {mean_mae:.4f}, Std MAE: {std_mae:.4f}")
    print(f"Mean RMSE: {mean_rmse:.4f}, Std RMSE: {std_rmse:.4f}")
    print(f"Mean R²: {mean_r2:.4f}, Std R²: {std_r2:.4f}")



Training Lasso

Iteration 1
Fitting 4 folds for each of 5 candidates, totalling 20 fits


ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py", line 980, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1064, in check_array
    _assert_all_finite(
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/utils/validation.py", line 123, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "/home/lshiyu/anaconda3/envs/data1030_env/lib/python3.12/site-packages/sklearn/utils/validation.py", line 172, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
Lasso does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
