In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
class DataPreprocessor:
    def __init__(self):
        self.model_to_make_mapping = None
        self.overall_medians = {}

    def fit(self, train_df):
        """
        Fit the preprocessing on the training data to create mappings and overall statistics.
        """
        # Create model_to_make mapping using non-null entries in the training dataset
        self.model_to_make_mapping = train_df.dropna(subset=['make']).set_index('model')['make'].str.lower().to_dict()

        # Calculate overall medians for columns where necessary
        self.overall_medians = {
            'manufactured': train_df['manufactured'].median(),
            'power': train_df['power'].median(),
            'mileage': train_df['mileage'].median(),
            'engine_cap': train_df['engine_cap'].median(),
            'depreciation': train_df['depreciation'].median(),
            'road_tax': train_df['road_tax'].mean(),
            'dereg_value': train_df['dereg_value'].mean(),
            'coe': train_df['coe'].mean(),
            'omv': train_df['omv'].mean(),
            'arf': train_df['arf'].mean(),
        }

    def fill_missing_make(self, df):
        """
        Fill missing values in the 'make' column using the 'model' column based on the mapping dictionary.
        """
        def derive_make_from_model(row):
            if pd.isnull(row['make']):
                return self.model_to_make_mapping.get(row['model'].lower(), None)
            else:
                return row['make']

        df['make'] = df.apply(derive_make_from_model, axis=1)

    def fill_missing_with_group_mode_or_median(self, df, column_name):
        """
        Fill missing values in a specified column using the mode of each group (model).
        If the mode is not available, use the median of the training dataset.
        """
        overall_median = self.overall_medians[column_name]
        df[column_name] = df.groupby('model')[column_name].transform(
            lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else overall_median)
        )

    def fill_missing_with_group_mean_or_median(self, df, column_name):
        """
        Fill missing values in a specified column using the mean of each group (model).
        If the group mean is not available, use the overall mean from the training dataset.
        """
        overall_mean = self.overall_medians[column_name]
        df[column_name] = df.groupby('model')[column_name].transform(
            lambda x: x.fillna(x.mean() if not x.mode().empty else overall_mean)
        )

    def transform(self, df):
        """
        Apply the preprocessing steps to a dataset (training or test) using the fitted parameters.
        """
        # Fill missing 'make' values using model information
        self.fill_missing_make(df)

        # Fill missing values for other columns using appropriate methods
        self.fill_missing_with_group_mode_or_median(df, 'manufactured')
        self.fill_missing_with_group_mode_or_median(df, 'power')
        self.fill_missing_with_group_mode_or_median(df, 'mileage')
        self.fill_missing_with_group_mode_or_median(df, 'engine_cap')
        self.fill_missing_with_group_mode_or_median(df, 'depreciation')
        self.fill_missing_with_group_mode_or_median(df, 'road_tax')
        self.fill_missing_with_group_mean_or_median(df, 'dereg_value')
        self.fill_missing_with_group_mean_or_median(df, 'coe')
        self.fill_missing_with_group_mean_or_median(df, 'omv')
        self.fill_missing_with_group_mean_or_median(df, 'arf')

        return df


In [4]:
features = ['make', 'model','type_of_vehicle', 'manufactured', 'mileage', 'power','engine_cap', 'depreciation','road_tax','dereg_value','coe', 'omv', 'arf' ]
target = 'price'
numerical_features = ['manufactured', 'mileage', 'power','engine_cap', 'depreciation','road_tax','dereg_value','coe', 'omv', 'arf']
categorical_features = ['make', 'model','type_of_vehicle']

preprocessor = DataPreprocessor()

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

train_df = train_df.dropna(subset=[target])

preprocessor.fit(train_df)

train_df = preprocessor.transform(train_df)
test_df = preprocessor.transform(test_df)

X = train_df[features]
y = train_df[target]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
X_train.head()

Unnamed: 0,make,model,type_of_vehicle,manufactured,mileage,power,engine_cap,depreciation,road_tax,dereg_value,coe,omv,arf
23311,isuzu,npr85,truck,2019.0,14329.0,111.0,2999.0,14860.0,1462.160899,13348.0,22085,37994.0,1900.0
23623,honda,fit,hatchback,2009.0,55000.0,73.0,1339.0,13510.0,885.0,123.0,14920,14211.0,14211.0
1020,toyota,sienta,mpv,2018.0,80346.0,79.0,1496.0,14530.0,682.0,25880.0,38001,17199.0,17199.0
12645,volvo,v40,hatchback,2018.0,68000.0,140.0,1969.0,15770.0,1176.0,35358.0,36901,22799.0,23919.0
1533,kia,carens,mpv,2015.0,130000.0,122.0,1999.0,15540.0,1212.0,20117.0,58190,21074.0,21504.0


In [None]:
# Prepare a function to log model parameters and evaluation metrics. It should be in a single line saved to a csv for easy tracking.
import csv
import os

def log_model(model_name, model_params, evaluation_metrics):
    """
    Log the model parameters and evaluation metrics to a CSV file.
    """
    # Create a new CSV file if it does not exist
    if not os.path.exists('model_logs.csv'):
        with open('model_logs.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Model', 'Parameters', 'MAE', 'MSE', 'RMSE', 'R2'])

    # Append the results to the CSV file
    with open('model_logs.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([model_name, model_params, *evaluation_metrics])

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
import joblib

# Assume you have X_train and X_valid as your training and validation datasets
# Identify categorical and numerical features
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipeline for categorical features (OneHotEncoder in this example)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create the complete pipeline including preprocessing and the KNeighborsRegressor model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor())
])

# Define the parameter grid to search over for KNeighborsRegressor
param_grid = {
    'model__n_neighbors': [3, 5, 7, 10, 15, 20],
    'model__weights': ['uniform', 'distance'],
    'model__p': [1, 2],  # 1 for Manhattan distance, 2 for Euclidean distance
    'model__metric': ['minkowski', 'euclidean', 'manhattan']
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=50,  # Number of different combinations to try
    scoring='neg_mean_absolute_error',  # Use MAE as the scoring metric
    cv=5,  # 5-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Fit the RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Get the best model and parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Save the best model to a file
joblib.dump(best_model, 'best_knn_model.pkl')

# Evaluate the model with the best parameters on validation data
y_pred = best_model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)

print(f'Best Parameters: {best_params}')
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'model__weights': 'distance', 'model__p': 1, 'model__n_neighbors': 5, 'model__metric': 'minkowski'}
MAE: 11587.255677017325
MSE: 1174114642.438395
RMSE: 34265.35630105712


In [19]:
test_predictions = pipeline.predict(test_df)
output = pd.DataFrame({'Id': test_df.index, 'Predicted': test_predictions})
output.to_csv('predictions_v1.1.csv', index=False)

print(output.head())

   Id      Predicted
0   0   19547.953125
1   1   35736.707031
2   2  148128.781250
3   3   78754.007812
4   4   26880.107422


In [23]:
test_df.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price
0,1303772,Honda Vezel 1.5A X,honda,vezel,4614,2015.0,,29-apr-2015,suv,parf car,...,9582.0,112000.0,19229.0,9229.0,,,uncategorized,"powerful 1.5l i-vtec engine producing 128bhp, ...","pioneer touch screen with reverse camera, 16"" ...",
1,1323166,Mazda 3 1.6A SP (COE till 10/2027),mazda,3,extremely well maintained and in pristine cond...,2007.0,,26-oct-2007,mid-sized sedan,"coe car, premium ad car, low mileage car",...,13644.0,120000.0,14347.0,15782.0,,,uncategorized,fuel efficient 1.6l 4-cylinder inline 16-valve...,"multi-function steering wheel, keyless entry, ...",
2,1308405,MINI Cooper S Countryman 2.0A,mini,cooper,1 owner! beautiful island blue color! eurokars...,2019.0,,27-mar-2020,sports car,parf car,...,54818.0,43000.0,39863.0,47809.0,,,uncategorized,"output of 141kw, 189bhp at 5000rpm to 6000rpm,...","18"" sports rims, sports leather seats, navigat...",
3,1216706,Toyota Vios 1.5A G,toyota,vios,fully agent maintain! genuine low mileage at 5...,2019.0,,28-jun-2019,mid-sized sedan,"parf car, premium ad car",...,26363.0,53300.0,15573.0,15573.0,,,uncategorized,"1.5l 4 cylinder 16 valves dohc vvt-i engine, 7...","push start button, toyota factory player, reve...",
4,1298206,Mazda 3 HB 1.5A,mazda,3,workshop check/sta evaluation available. accid...,2015.0,,19-nov-2015,hatchback,"parf car, premium ad car",...,15197.0,149000.0,18097.0,13097.0,,,uncategorized,1.5l 4 cylinder inline dohc 16 valves skyactiv...,factory fitted audio with audio & multi functi...,


In [7]:
import joblib
import pandas as pd

# Load the data
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Drop rows with missing target values
train_df = train_df.dropna(subset=[target])

# Assume 'DataPreprocessor()' is a class that handles all preprocessing (fitting, transforming)
preprocessor = DataPreprocessor()
preprocessor.fit(train_df)

# Transform both training and test data
train_df = preprocessor.transform(train_df)
test_df = preprocessor.transform(test_df)

# Make sure we use only the features required for predictions
test_df = test_df[features]

# Load the best model with the pipeline, which includes preprocessing steps
best_model = joblib.load('best_knn_model.pkl')  # Updated to reflect KNN model

# Ensure that the target column is not in the test dataset
if target in test_df.columns:
    test_df = test_df.drop(columns=[target])

# Make predictions using the test dataset with the best model
test_predictions = best_model.predict(test_df)

# Create a DataFrame with the predictions
output = pd.DataFrame({'Id': test_df.index, 'Predicted': test_predictions})

# Save the predictions to a CSV file
output.to_csv('predictions_v3.1.csv', index=False)

# Display the first few rows of the output DataFrame
print(output.head())


   Id      Predicted
0   0   20147.593750
1   1   35544.964844
2   2  146033.046875
3   3   79854.781250
4   4   26131.941406
