In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
class DataPreprocessor:
    """
    Impute missing values in the listing data before modelling.

    ``fit`` learns a lower-cased model -> make lookup and one fallback statistic
    per numeric column from the training frame. ``transform`` fills missing
    ``make`` values from that lookup, and fills missing numeric values with a
    per-model statistic computed on the frame being transformed, falling back
    to the fitted training statistic when the group has no usable value.
    """

    # Columns whose gaps are filled with the per-model mode.
    MODE_FILLED_COLUMNS = ('manufactured', 'power', 'mileage', 'engine_cap', 'depreciation', 'road_tax')
    # Columns whose gaps are filled with the per-model mean.
    MEAN_FILLED_COLUMNS = ('dereg_value', 'coe', 'omv', 'arf')

    # Fallback statistic learned in ``fit`` for each column.
    _MEDIAN_FALLBACK_COLUMNS = ('manufactured', 'power', 'mileage', 'engine_cap', 'depreciation')
    _MEAN_FALLBACK_COLUMNS = ('road_tax', 'dereg_value', 'coe', 'omv', 'arf')

    def __init__(self):
        # Lower-cased model name -> lower-cased make; None until ``fit`` is called.
        self.model_to_make_mapping = None
        # Column name -> fallback value. Despite the name, some entries are means
        # (see ``_MEAN_FALLBACK_COLUMNS``); the attribute name is kept for callers.
        self.overall_medians = {}

    def fit(self, train_df):
        """
        Learn the model -> make lookup and the per-column fallback statistics.

        Parameters
        ----------
        train_df : pandas.DataFrame
            Training frame containing 'make', 'model' and every numeric column
            listed in the class-level column tuples.

        Returns
        -------
        DataPreprocessor
            ``self``, so the call can be chained.
        """
        # Only rows where both make and model are known can contribute a mapping.
        known = train_df.dropna(subset=['make', 'model'])
        # Keys are lower-cased because fill_missing_make looks models up lower-cased.
        self.model_to_make_mapping = dict(
            zip(
                known['model'].astype(str).str.lower(),
                known['make'].astype(str).str.lower(),
            )
        )

        fallbacks = {col: train_df[col].median() for col in self._MEDIAN_FALLBACK_COLUMNS}
        # NOTE: road_tax falls back to the training mean even though its gaps are
        # filled with the per-model mode; this mirrors the original behaviour.
        fallbacks.update({col: train_df[col].mean() for col in self._MEAN_FALLBACK_COLUMNS})
        self.overall_medians = fallbacks
        return self

    def fill_missing_make(self, df):
        """
        Fill missing 'make' values in place using the fitted model -> make lookup.

        Rows whose 'model' is missing or unknown to the lookup keep a missing make.

        Raises
        ------
        RuntimeError
            If a make is missing and ``fit`` has not been called yet.
        """
        missing = df['make'].isna()
        if not missing.any():
            return
        if self.model_to_make_mapping is None:
            raise RuntimeError('DataPreprocessor.fit must be called before filling missing makes.')

        lookup = self.model_to_make_mapping

        def derive_make_from_model(model):
            # A missing model cannot be lower-cased or looked up.
            if pd.isna(model):
                return None
            return lookup.get(str(model).lower())

        derived = df['model'].map(derive_make_from_model)
        # Keep every known make; only the missing ones take the derived value.
        df['make'] = df['make'].where(~missing, derived)

    @staticmethod
    def _first_mode(values):
        """Return the smallest mode of ``values``, or NaN when there is none."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else float('nan')

    def _fill_from_group_stat(self, df, column_name, group_stat):
        """Fill gaps in ``column_name`` from ``group_stat`` (indexed by model), then from the fitted fallback."""
        # Rows whose model is missing or absent from group_stat map to NaN here,
        # so their existing values are kept and only their gaps get the fallback.
        per_model = df['model'].map(group_stat)
        df[column_name] = df[column_name].fillna(per_model).fillna(self.overall_medians[column_name])

    def fill_missing_with_group_mode_or_median(self, df, column_name):
        """
        Fill missing values in ``column_name`` in place with the mode of each model group.

        Groups without a mode, and rows without a model, use the value stored
        for the column in ``overall_medians``.
        """
        group_modes = df.groupby('model')[column_name].agg(self._first_mode)
        self._fill_from_group_stat(df, column_name, group_modes)

    def fill_missing_with_group_mean_or_median(self, df, column_name):
        """
        Fill missing values in ``column_name`` in place with the mean of each model group.

        Groups without any value, and rows without a model, use the value stored
        for the column in ``overall_medians``.
        """
        group_means = df.groupby('model')[column_name].mean()
        self._fill_from_group_stat(df, column_name, group_means)

    def transform(self, df):
        """
        Apply the fitted imputation steps and return the imputed frame.

        The input frame is left untouched; the work happens on a copy so the
        call is idempotent and safe on frames derived from another frame.
        """
        df = df.copy()

        # Fill missing 'make' values using model information
        self.fill_missing_make(df)

        # Fill missing numeric values using the per-model statistic for each column
        for column_name in self.MODE_FILLED_COLUMNS:
            self.fill_missing_with_group_mode_or_median(df, column_name)
        for column_name in self.MEAN_FILLED_COLUMNS:
            self.fill_missing_with_group_mean_or_median(df, column_name)

        return df


In [5]:
# Column roles used by the modelling cells.
target = 'price'
categorical_features = ['make', 'model', 'type_of_vehicle']
numerical_features = [
    'manufactured', 'mileage', 'power', 'engine_cap', 'depreciation',
    'road_tax', 'dereg_value', 'coe', 'omv', 'arf',
]
# Categorical columns first, then the numeric ones.
features = categorical_features + numerical_features

preprocessor = DataPreprocessor()

# Training rows without a target value cannot be used for fitting or evaluation.
train_df = pd.read_csv('data/train.csv').dropna(subset=[target])
test_df = pd.read_csv('data/test.csv')

# Learn the imputation parameters on the training frame only, then apply them to both frames.
preprocessor.fit(train_df)
train_df = preprocessor.transform(train_df)
test_df = preprocessor.transform(test_df)

X, y = train_df[features], train_df[target]

# Hold out 20% of the training rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:


# Step 3: Create preprocessing pipelines for both numeric and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories unseen during fit are encoded as all zeros instead of raising.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Step 4: Combine preprocessing steps into a column transformer
# NOTE: this rebinds `preprocessor`, replacing the DataPreprocessor instance
# created in the data-loading cell.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Step 5: Create a pipeline that first preprocesses the data and then applies the XGBoost model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # ('model', RandomForestRegressor(n_estimators=100, random_state=42))
    ('model', XGBRegressor(n_estimators=500, learning_rate=0.1, max_depth=20, random_state=42))
])

# Step 6: Fit the model pipeline to the training split (the split itself is made in the data-loading cell)
model_pipeline.fit(X_train, y_train)

# Step 7: Make predictions on the validation split
y_pred = model_pipeline.predict(X_valid)

# Step 8: Evaluate the model's performance
mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
# Derive RMSE from MSE: the `squared=False` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mse ** 0.5
r2 = r2_score(y_valid, y_pred)

# Print the evaluation metrics
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared (R2): {r2:.2f}')

KeyboardInterrupt: 

In [17]:
# Preview the first rows of the training feature frame.
# NOTE(review): the saved output of this cell lists a `price` column and no
# `type_of_vehicle`, which does not match `features`; it looks stale — re-run to confirm.
X_train.head()

Unnamed: 0,make,model,manufactured,mileage,power,engine_cap,depreciation,road_tax,dereg_value,coe,omv,arf,price
23311,isuzu,npr85,2019.0,14329.0,111.0,2999.0,14860.0,1462.160899,13348.0,22085,37994.0,1900.0,89800.0
23623,honda,fit,2009.0,55000.0,73.0,1339.0,13510.0,885.0,123.0,14920,14211.0,14211.0,700.0
1020,toyota,sienta,2018.0,80346.0,79.0,1496.0,14530.0,682.0,25880.0,38001,17199.0,17199.0,64800.0
12645,volvo,v40,2018.0,68000.0,140.0,1969.0,15770.0,1176.0,35358.0,36901,22799.0,23919.0,92800.0
1533,kia,carens,2015.0,130000.0,122.0,1999.0,15540.0,1212.0,20117.0,58190,21074.0,21504.0,32888.0


In [None]:
# Helper for logging model parameters and evaluation metrics: each run is appended as a single row to a CSV file for easy tracking.
import csv
import os

def log_model(model_name, model_params, evaluation_metrics, log_path='model_logs.csv'):
    """
    Append one row with the model parameters and evaluation metrics to a CSV log.

    The header row is written first whenever the log file does not exist yet
    or exists but is empty.

    Parameters
    ----------
    model_name : str
        Label written to the 'Model' column.
    model_params : object
        Parameters of the run; written to the 'Parameters' column using the
        csv module's default string conversion.
    evaluation_metrics : iterable
        Metric values in the order MAE, MSE, RMSE, R2.
    log_path : str, optional
        Path of the CSV log file. Defaults to 'model_logs.csv'.
    """
    header = ['Model', 'Parameters', 'MAE', 'MSE', 'RMSE', 'R2']

    # An existing but empty file still needs its header.
    needs_header = not os.path.exists(log_path) or os.path.getsize(log_path) == 0

    # A single append-mode handle writes the optional header and the data row.
    with open(log_path, 'a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(header)
        writer.writerow([model_name, model_params, *evaluation_metrics])

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
import joblib

# X_train / X_valid and y_train / y_valid are expected to exist already.
# Split the training columns by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical.
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Categorical branch: impute with the most frequent value, then one-hot encode
# (categories unseen during fit are ignored rather than raising).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: mean imputation only (no scaling in this pipeline).
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by the XGBoost regressor.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42)),
])

# Candidate hyper-parameter values; the 'model__' prefix routes each one to the
# 'model' step of the pipeline.
xgb_search_space = {
    'n_estimators': [100, 200, 500, 750, 1000],
    'learning_rate': [0.01, 0.05, 0.075, 0.1, 0.2],
    'max_depth': [3, 5, 10, 20, 30, 40, 50],
    'subsample': [0.5, 0.6, 0.8, 1.0],
    'colsample_bytree': [0.4, 0.6, 0.8, 1.0],
    'min_child_weight': [1, 5, 10, 20],
}
param_grid = {f'model__{name}': values for name, values in xgb_search_space.items()}

# Randomized search: 50 sampled combinations, 5-fold CV, scored by (negated) MAE,
# running on all available cores.
random_search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Keep the refitted best pipeline and its parameters, and persist the pipeline to disk.
best_model = random_search.best_estimator_
best_params = random_search.best_params_
joblib.dump(best_model, 'best_xgb_model.pkl')

# Score the best pipeline on the validation split.
y_pred = best_model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)

for label, value in (
    ('Best Parameters', best_params),
    ('MAE', mae),
    ('MSE', mse),
    ('RMSE', rmse),
):
    print(f'{label}: {value}')




Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'model__subsample': 0.8, 'model__n_estimators': 1000, 'model__min_child_weight': 5, 'model__max_depth': 40, 'model__learning_rate': 0.01, 'model__colsample_bytree': 1.0}
MAE: 5311.081275683594
MSE: 721255507.9106631
RMSE: 26856.20054867522


In [19]:
# Score the test frame with the baseline pipeline and export the predictions.
# NOTE(review): the tuned-model cell further down writes to the same
# 'predictions_v1.1.csv' path, so whichever cell runs last wins.
test_predictions = model_pipeline.predict(test_df)
output = pd.DataFrame(
    dict(Id=test_df.index, Predicted=test_predictions)
)
output.to_csv('predictions_v1.1.csv', index=False)

print(output.head())

   Id      Predicted
0   0   19547.953125
1   1   35736.707031
2   2  148128.781250
3   3   78754.007812
4   4   26880.107422


In [23]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price
0,1303772,Honda Vezel 1.5A X,honda,vezel,4614,2015.0,,29-apr-2015,suv,parf car,...,9582.0,112000.0,19229.0,9229.0,,,uncategorized,"powerful 1.5l i-vtec engine producing 128bhp, ...","pioneer touch screen with reverse camera, 16"" ...",
1,1323166,Mazda 3 1.6A SP (COE till 10/2027),mazda,3,extremely well maintained and in pristine cond...,2007.0,,26-oct-2007,mid-sized sedan,"coe car, premium ad car, low mileage car",...,13644.0,120000.0,14347.0,15782.0,,,uncategorized,fuel efficient 1.6l 4-cylinder inline 16-valve...,"multi-function steering wheel, keyless entry, ...",
2,1308405,MINI Cooper S Countryman 2.0A,mini,cooper,1 owner! beautiful island blue color! eurokars...,2019.0,,27-mar-2020,sports car,parf car,...,54818.0,43000.0,39863.0,47809.0,,,uncategorized,"output of 141kw, 189bhp at 5000rpm to 6000rpm,...","18"" sports rims, sports leather seats, navigat...",
3,1216706,Toyota Vios 1.5A G,toyota,vios,fully agent maintain! genuine low mileage at 5...,2019.0,,28-jun-2019,mid-sized sedan,"parf car, premium ad car",...,26363.0,53300.0,15573.0,15573.0,,,uncategorized,"1.5l 4 cylinder 16 valves dohc vvt-i engine, 7...","push start button, toyota factory player, reve...",
4,1298206,Mazda 3 HB 1.5A,mazda,3,workshop check/sta evaluation available. accid...,2015.0,,19-nov-2015,hatchback,"parf car, premium ad car",...,15197.0,149000.0,18097.0,13097.0,,,uncategorized,1.5l 4 cylinder inline dohc 16 valves skyactiv...,factory fitted audio with audio & multi functi...,


In [7]:
import joblib
import pandas as pd

# Rebuild the imputed frames from the raw CSVs so this cell does not depend on
# the frames left behind by earlier cells.
preprocessor = DataPreprocessor()

train_df = pd.read_csv('data/train.csv').dropna(subset=[target])
test_df = pd.read_csv('data/test.csv')

preprocessor.fit(train_df)
train_df = preprocessor.transform(train_df)
# Keep only the model's input columns from the imputed test frame.
test_df = preprocessor.transform(test_df)[features]

# Load the persisted best pipeline (it already contains its preprocessing steps).
# joblib.load unpickles the file, so only load artefacts produced by this notebook.
best_model = joblib.load('best_xgb_model.pkl')

# Make sure the target column is not passed to the model.
test_df = test_df.drop(columns=['price'], errors='ignore')

# Predict, then pair each prediction with the row index of the test frame.
test_predictions = best_model.predict(test_df)
output = pd.DataFrame(
    dict(Id=test_df.index, Predicted=test_predictions)
)

# Export the predictions and show the first few rows.
output.to_csv('predictions_v1.1.csv', index=False)
print(output.head())


   Id      Predicted
0   0   20147.593750
1   1   35544.964844
2   2  146033.046875
3   3   79854.781250
4   4   26131.941406
