In [None]:
# Import Data
import pandas as pd

# Seed shared by the later train/test split.
random_state = 42
new_used_car = pd.read_csv("../data/processed_used_car.csv")

# Target is the `sales_price_log` column; the feature matrix drops the target,
# the `price` column and the `model` column.
y = new_used_car['sales_price_log']
X = new_used_car.drop(columns=['price', 'sales_price_log', 'model'])     # XGB
# X = new_used_car.drop(columns=['price', 'sales_price_log', 'model', 'int_col', 'ext_col'])    # Multivariate Imputation
print(X.shape)

(4009, 14)


In [2]:
# Data types of features (looked up on the source frame, restricted to the columns kept in X)
feature_dtypes = new_used_car[X.columns].dtypes
for column, dtype in feature_dtypes.items():
    print(f"Column: {column}, Data Type: {dtype}")

Column: brand, Data Type: object
Column: model_year, Data Type: int64
Column: milage, Data Type: int64
Column: fuel_type, Data Type: object
Column: ext_col, Data Type: object
Column: int_col, Data Type: object
Column: accident, Data Type: object
Column: clean_title, Data Type: object
Column: horsepower, Data Type: float64
Column: displacement, Data Type: float64
Column: cylinders, Data Type: float64
Column: turbo, Data Type: bool
Column: transmission_type, Data Type: object
Column: gears, Data Type: float64


In [3]:
# Inspect Missing Values (computed on the full loaded frame, before any split)
n_rows = new_used_car.shape[0]
null_mask = new_used_car.isnull()

# Per-column fraction of missing entries
perc_missing_per_ftr = null_mask.sum(axis=0) / n_rows
ftrs_with_missing = perc_missing_per_ftr[perc_missing_per_ftr > 0]
print('fraction of missing values in features:')
print(ftrs_with_missing)
print('data types of the features with missing values:')
print(new_used_car[ftrs_with_missing.index].dtypes)

# Fraction of rows that contain at least one missing entry
frac_missing = sum(null_mask.sum(axis=1) != 0) / n_rows
print('fraction of points with missing values:',frac_missing)

fraction of missing values in features:
fuel_type            0.054128
accident             0.028187
clean_title          0.148666
horsepower           0.201547
displacement         0.054128
cylinders            0.109753
transmission_type    0.121976
gears                0.457221
dtype: float64
data types of the features with missing values:
fuel_type             object
accident              object
clean_title           object
horsepower           float64
displacement         float64
cylinders            float64
transmission_type     object
gears                float64
dtype: object
fraction of points with missing values: 0.6083811424295336


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; the remaining rows form the "other" split.
X_other, X_test, y_other, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

for split_features in (X_other, X_test):
    print(split_features.shape)

# Save y for ML models (values only: the row index is not written)
y_other_series = y_other.rename("y_other")
y_test_series = y_test.rename("y_test")
y_other_series.to_csv("../data/y_other.csv", index=False)
y_test_series.to_csv("../data/y_test.csv", index=False)

(3207, 14)
(802, 14)


In [6]:
# Group features into numerical and categorical variables.
# Numerical features go through the standard scaler; this list includes the boolean `turbo` flag.
num_ftrs = [
    'model_year',
    'milage',
    'horsepower',
    'displacement',
    'cylinders',
    'turbo',
    'gears',
]
# Categorical features go through the one-hot encoder.
cat_ftrs = [
    'brand',
    'fuel_type',
    'ext_col',
    'int_col',
    'accident',
    'clean_title',
    'transmission_type',
]     # XGB
# cat_ftrs = ['brand', 'fuel_type', 'accident', 'clean_title', 'transmission_type']     # Multivariate Imputation

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Preprocess
# Numerical variables: standard scaler
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical features: dense one-hot encoding; categories not seen during
# fitting are ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))]
)

# Route each feature group to its transformer. The transformer names
# ('num', 'cat') become the prefixes of the output feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs),
    ]
)

In [8]:
import pandas as pd

# Fit the preprocessor on the "other" (training) split only, and transform it
X_prep = preprocessor.fit_transform(X_other)
feature_names = preprocessor.get_feature_names_out()
df_other = pd.DataFrame(X_prep, columns=feature_names)

# Apply the already-fitted preprocessor to the test split
X_test_prep = preprocessor.transform(X_test)
df_test = pd.DataFrame(X_test_prep, columns=feature_names)

# Print shapes of the datasets
print(f"Shape of training set after preprocessing: {df_other.shape}")
print(f"Shape of test set after preprocessing: {df_test.shape}")

Shape of training set after preprocessing: (3207, 483)
Shape of test set after preprocessing: (802, 483)


In [9]:
import pandas as pd

def report_missing_values(df_other, df_test):
    """
    Prints and returns the proportion of missing values per feature and per
    row for the training ("other") set, the test set, and both combined.

    Args:
        df_other: The preprocessed training set as a DataFrame.
        df_test: The preprocessed test set as a DataFrame.

    Returns:
        A dictionary keyed by 'training', 'test' and 'combined'. Each value is
        a dictionary with two entries:
            'feature_missing_proportion': Series holding the fraction of
                missing values per column, restricted to columns that have at
                least one missing value.
            'row_missing_proportion': fraction of rows that contain at least
                one missing value.
    """
    # Stack both sets row-wise; the original row labels are discarded.
    df_combined = pd.concat([df_other, df_test], ignore_index=True)

    # Helper function to calculate missing value statistics for one frame
    def calculate_missing_stats(df):
        null_mask = df.isnull()
        perc_missing_per_ftr = null_mask.sum(axis=0) / df.shape[0]
        frac_missing_rows = (null_mask.sum(axis=1) != 0).mean()
        return {
            'feature_missing_proportion': perc_missing_per_ftr[perc_missing_per_ftr > 0],
            'row_missing_proportion': frac_missing_rows
        }

    # Calculate missing value statistics
    missing_stats = {
        'training': calculate_missing_stats(df_other),
        'test': calculate_missing_stats(df_test),
        'combined': calculate_missing_stats(df_combined)
    }

    # Print results
    for dataset, stats in missing_stats.items():
        print(f"\n{dataset.capitalize()} Dataset:")
        print(f"Proportion of missing values per feature:")
        print(stats['feature_missing_proportion'])
        print(f"Proportion of rows with missing values: {stats['row_missing_proportion']:.4f}")

    # Hand the numbers back so callers can assert on them instead of reading
    # the printed report.
    return missing_stats


# Missing-value report for the preprocessed splits (before imputation).
# The trailing semicolon keeps the cell from echoing a return value.
report_missing_values(df_other, df_test);



Training Dataset:
Proportion of missing values per feature:
num__horsepower      0.197381
num__displacement    0.050514
num__cylinders       0.106330
num__gears           0.455254
dtype: float64
Proportion of rows with missing values: 0.5893

Test Dataset:
Proportion of missing values per feature:
num__horsepower      0.218204
num__displacement    0.068579
num__cylinders       0.123441
num__gears           0.465087
dtype: float64
Proportion of rows with missing values: 0.6209

Combined Dataset:
Proportion of missing values per feature:
num__horsepower      0.201547
num__displacement    0.054128
num__cylinders       0.109753
num__gears           0.457221
dtype: float64
Proportion of rows with missing values: 0.5957


*Missing Value Imputation*

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

def multivariate_imputer(X_other, X_test, random_state=42):
    """
    Constructs a multivariate imputer using IterativeImputer with
    RandomForestRegressor and imputes missing values in the provided datasets.

    The imputer is fitted on ``X_other`` only and then applied to ``X_test``.

    Args:
        X_other (pd.DataFrame): Training feature matrix with missing values.
            Its column labels are used for both returned frames.
        X_test (pd.DataFrame): Test feature matrix with missing values.
        random_state (int): Random seed passed to both the imputer and the
            random-forest estimator, for reproducibility.

    Returns:
        tuple: ``(X_other_imputed, X_test_imputed)`` as DataFrames. Each one
        keeps the row index of its input when the input has one.
    """
    # Initialize the IterativeImputer with RandomForestRegressor
    imputer = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=10, random_state=random_state),
        random_state=random_state
    )

    # The imputer returns plain arrays, so re-attach the column labels and the
    # original row index; otherwise the rows are silently relabelled 0..n-1.
    print("Fitting the imputer on the training data...")
    X_other_imputed = pd.DataFrame(
        imputer.fit_transform(X_other),
        columns=X_other.columns,
        index=getattr(X_other, "index", None),
    )
    print("Imputing missing values in the test datasets...")
    X_test_imputed = pd.DataFrame(
        imputer.transform(X_test),
        columns=X_other.columns,
        index=getattr(X_test, "index", None),
    )

    print("Imputation complete.")
    return X_other_imputed, X_test_imputed

# Run the multivariate imputation on the preprocessed splits, then report
# whatever missing values remain.
X_other_imputed_mi, X_test_imputed_mi = multivariate_imputer(
    df_other,
    df_test,
    random_state=42,
)
report_missing_values(X_other_imputed_mi, X_test_imputed_mi);

In [10]:
from xgboost import XGBRegressor

def impute_with_xgboost(df_other, df_test, target_column, random_state=42):
    """
    Imputes missing values in a single column using XGBoost.

    A regressor is fitted on the rows of ``df_other`` where ``target_column``
    is present, using every other column as a predictor. It then fills the
    missing entries of ``target_column`` in both ``df_other`` and ``df_test``.
    Rows of ``df_test`` are never used for fitting.

    Args:
        df_other (pd.DataFrame): Training set with missing values.
        df_test (pd.DataFrame): Test set with missing values.
        target_column (str): Column to impute.
        random_state (int): Seed passed to the XGBoost regressor.

    Returns:
        tuple: ``(df_other_imputed, df_test_imputed)``. Both are copies; the
        input frames are left unmodified.

    Raises:
        ValueError: If there are values to impute but ``target_column`` has no
            observed value in ``df_other`` to learn from.
    """
    # Work on copies so the caller's frames are never modified in place.
    df_other = df_other.copy()
    df_test = df_test.copy()

    other_missing = df_other[target_column].isnull()
    test_missing = df_test[target_column].isnull()

    # Nothing to fill in either set: skip training altogether.
    if not other_missing.any() and not test_missing.any():
        return df_other, df_test

    if other_missing.all():
        raise ValueError(
            f"Cannot impute '{target_column}': it has no observed values in the training set."
        )

    # Features and target for training (rows where the target is observed)
    observed_rows = df_other.loc[~other_missing]
    model = XGBRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=5, random_state=random_state
    )
    model.fit(observed_rows.drop(columns=[target_column]), observed_rows[target_column])

    # Predict and fill the missing entries in the training set, then the test set
    for frame, missing_mask in ((df_other, other_missing), (df_test, test_missing)):
        if missing_mask.any():
            predictors = frame.loc[missing_mask].drop(columns=[target_column])
            frame.loc[missing_mask, target_column] = model.predict(predictors)

    return df_other, df_test


# Impute each column with missing values.
# Start from copies so the preprocessed frames (df_other / df_test) stay
# untouched. Columns are imputed one after another, so values filled in for an
# earlier column are available as predictors for the later ones.
X_other_imputed_xgb = df_other.copy()
X_test_imputed_xgb = df_test.copy()

for col in df_other.columns:
    if (
        X_other_imputed_xgb[col].isnull().any()
        or X_test_imputed_xgb[col].isnull().any()
    ):
        print(f"Imputing missing values for column: {col}")
        X_other_imputed_xgb, X_test_imputed_xgb = impute_with_xgboost(
            X_other_imputed_xgb, X_test_imputed_xgb, col
        )

# Check missing values after imputation
report_missing_values(X_other_imputed_xgb, X_test_imputed_xgb)

# Save the imputed splits. The confirmation messages are built from the same
# path variables used for writing, so they always name the real destination
# (they previously said "results/..." while the files went to "../data/").
other_imputed_path = "../data/X_other_imputed_xgb.csv"
test_imputed_path = "../data/X_test_imputed_xgb.csv"

# Save imputed training data to CSV
X_other_imputed_xgb.to_csv(other_imputed_path, index=False)
print(f"Imputed training data saved to {other_imputed_path}")

# Save imputed test data to CSV
X_test_imputed_xgb.to_csv(test_imputed_path, index=False)
print(f"Imputed test data saved to {test_imputed_path}")


Imputing missing values for column: num__horsepower
Imputing missing values for column: num__displacement
Imputing missing values for column: num__cylinders
Imputing missing values for column: num__gears

Training Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Test Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000

Combined Dataset:
Proportion of missing values per feature:
Series([], dtype: float64)
Proportion of rows with missing values: 0.0000
Imputed training data saved to results/X_other_imputed_xgb.csv
Imputed test data saved to results/X_test_imputed_xgb.csv
