In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
class DataPreprocessor:
    def __init__(self):
        self.model_to_make_mapping = None
        self.overall_medians = {}

    def fit(self, train_df):
        """
        Fit the preprocessing on the training data to create mappings and overall statistics.
        """
        # Create model_to_make mapping using non-null entries in the training dataset
        self.model_to_make_mapping = train_df.dropna(subset=['make']).set_index('model')['make'].str.lower().to_dict()

        # Calculate overall medians for columns where necessary
        self.overall_medians = {
            'manufactured': train_df['manufactured'].median(),
            'power': train_df['power'].median(),
            'mileage': train_df['mileage'].median(),
            'engine_cap': train_df['engine_cap'].median(),
            'depreciation': train_df['depreciation'].median(),
            'road_tax': train_df['road_tax'].mean(),
            'dereg_value': train_df['dereg_value'].mean(),
            'coe': train_df['coe'].mean(),
            'omv': train_df['omv'].mean(),
            'arf': train_df['arf'].mean(),
        }

    def fill_missing_make(self, df):
        """
        Fill missing values in the 'make' column using the 'model' column based on the mapping dictionary.
        """
        def derive_make_from_model(row):
            if pd.isnull(row['make']):
                return self.model_to_make_mapping.get(row['model'].lower(), None)
            else:
                return row['make']

        df['make'] = df.apply(derive_make_from_model, axis=1)

    def fill_missing_with_group_mode_or_median(self, df, column_name):
        """
        Fill missing values in a specified column using the mode of each group (model).
        If the mode is not available, use the median of the training dataset.
        """
        overall_median = self.overall_medians[column_name]
        df[column_name] = df.groupby('model')[column_name].transform(
            lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else overall_median)
        )

    def fill_missing_with_group_mean_or_median(self, df, column_name):
        """
        Fill missing values in a specified column using the mean of each group (model).
        If the group mean is not available, use the overall mean from the training dataset.
        """
        overall_mean = self.overall_medians[column_name]
        df[column_name] = df.groupby('model')[column_name].transform(
            lambda x: x.fillna(x.mean() if not x.mode().empty else overall_mean)
        )

    def transform(self, df):
        """
        Apply the preprocessing steps to a dataset (training or test) using the fitted parameters.
        """
        # Fill missing 'make' values using model information
        self.fill_missing_make(df)

        # Fill missing values for other columns using appropriate methods
        self.fill_missing_with_group_mode_or_median(df, 'manufactured')
        self.fill_missing_with_group_mode_or_median(df, 'power')
        self.fill_missing_with_group_mode_or_median(df, 'mileage')
        self.fill_missing_with_group_mode_or_median(df, 'engine_cap')
        self.fill_missing_with_group_mode_or_median(df, 'depreciation')
        self.fill_missing_with_group_mode_or_median(df, 'road_tax')
        self.fill_missing_with_group_mean_or_median(df, 'dereg_value')
        self.fill_missing_with_group_mean_or_median(df, 'coe')
        self.fill_missing_with_group_mean_or_median(df, 'omv')
        self.fill_missing_with_group_mean_or_median(df, 'arf')

        return df


In [3]:
features = ['make', 'model','type_of_vehicle','category', 'manufactured', 'mileage', 'power','engine_cap', 'depreciation','road_tax','dereg_value','coe', 'omv', 'arf' ]
target = 'price'
numerical_features = ['manufactured', 'mileage', 'power','engine_cap', 'depreciation','road_tax','dereg_value','coe', 'omv', 'arf']
categorical_features = ['make', 'model','type_of_vehicle','category']

preprocessor = DataPreprocessor()

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

train_df = train_df.dropna(subset=[target])

preprocessor.fit(train_df)

train_df = preprocessor.transform(train_df)
test_df = preprocessor.transform(test_df)

X = train_df[features]
y = train_df[target]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
X_train.head()

Unnamed: 0,make,model,type_of_vehicle,category,manufactured,mileage,power,engine_cap,depreciation,road_tax,dereg_value,coe,omv,arf
23311,isuzu,npr85,truck,premium ad car,2019.0,14329.0,111.0,2999.0,14860.0,1462.160899,13348.0,22085,37994.0,1900.0
23623,honda,fit,hatchback,coe car,2009.0,55000.0,73.0,1339.0,13510.0,885.0,123.0,14920,14211.0,14211.0
1020,toyota,sienta,mpv,"parf car, premium ad car",2018.0,80346.0,79.0,1496.0,14530.0,682.0,25880.0,38001,17199.0,17199.0
12645,volvo,v40,hatchback,parf car,2018.0,68000.0,140.0,1969.0,15770.0,1176.0,35358.0,36901,22799.0,23919.0
1533,kia,carens,mpv,parf car,2015.0,130000.0,122.0,1999.0,15540.0,1212.0,20117.0,58190,21074.0,21504.0


In [5]:
# Prepare a function to log model parameters and evaluation metrics. It should be in a single line saved to a csv for easy tracking.
import csv
import os

def log_model(model_name, model_params, evaluation_metrics):
    """
    Log the model parameters and evaluation metrics to a CSV file.
    """
    # Create a new CSV file if it does not exist
    if not os.path.exists('model_logs.csv'):
        with open('model_logs.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Model', 'Parameters', 'MAE', 'MSE', 'RMSE', 'R2'])

    # Append the results to the CSV file
    with open('model_logs.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([model_name, model_params, *evaluation_metrics])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import joblib

# Define a PyTorch neural network model
import torch
import torch.nn as nn

class PyTorchRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, activation='ReLU', dropout_rate=0.2):
        super(PyTorchRegressor, self).__init__()
        self.activation = getattr(nn, activation)()  # Dynamically choose activation function

        # Define the layers of the neural network
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(dropout_rate)

        self.fc2 = nn.Linear(hidden_dim, hidden_dim * 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim * 2)
        self.dropout2 = nn.Dropout(dropout_rate)

        self.fc3 = nn.Linear(hidden_dim * 2, hidden_dim * 2)
        self.bn3 = nn.BatchNorm1d(hidden_dim * 2)
        self.dropout3 = nn.Dropout(dropout_rate)

        self.fc4 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.bn4 = nn.BatchNorm1d(hidden_dim)
        self.dropout4 = nn.Dropout(dropout_rate)

        self.fc5 = nn.Linear(hidden_dim, 1)  # Final output layer

    def forward(self, x):
        x = self.dropout1(self.activation(self.bn1(self.fc1(x))))
        x = self.dropout2(self.activation(self.bn2(self.fc2(x))))
        x = self.dropout3(self.activation(self.bn3(self.fc3(x))))
        x = self.dropout4(self.activation(self.bn4(self.fc4(x))))
        x = self.fc5(x)  # Output layer for regression
        return x


# Create a wrapper to integrate the PyTorch model with scikit-learn
class PyTorchRegressorWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, hidden_dim=128, activation='ReLU', dropout_rate=0.2, learning_rate=0.001, epochs=100, batch_size=32):
        self.hidden_dim = hidden_dim
        self.activation = activation
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None
        self.input_dim = None  # To be determined during fitting

    def fit(self, X, y):
        # Ensure that input data is converted to dense format and then to a NumPy array
        X_dense = X if isinstance(X, np.ndarray) else X.toarray()
        y_array = y.values if hasattr(y, 'values') else y

        # Set input dimension based on the shape of the preprocessed data
        self.input_dim = X_dense.shape[1]

        # Initialize the more complex PyTorch model with the determined input dimension
        self.model = PyTorchRegressor(self.input_dim, self.hidden_dim, self.activation, self.dropout_rate)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Convert the data to PyTorch tensors
        X_tensor = torch.tensor(X_dense.astype(np.float32))
        y_tensor = torch.tensor(y_array.astype(np.float32)).view(-1, 1)

        # Training loop with batch processing
        dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            self.model.train()
            epoch_loss = 0
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()
                outputs = self.model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()

            # Print the average loss for each epoch for monitoring purposes
            if (epoch + 1) % 10 == 0 or epoch == 0:
                print(f'Epoch [{epoch + 1}/{self.epochs}], Loss: {epoch_loss / len(dataloader):.4f}')

        return self

    def predict(self, X):
        # Ensure that input data is converted to dense format and then to a NumPy array
        X_dense = X if isinstance(X, np.ndarray) else X.toarray()

        self.model.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_dense.astype(np.float32))
            predictions = self.model(X_tensor).numpy()
        return predictions.flatten()


# Assume you have X_train and X_valid as your training and validation datasets
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipeline for categorical features (OneHotEncoder in this example)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Ensure dense output
])

# Preprocessing pipeline for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create the complete pipeline including preprocessing and the PyTorch model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', PyTorchRegressorWrapper())
])

# Define the parameter grid to search over
param_grid = {
    'model__hidden_dim': [64, 128, 256, 512],
    'model__activation': ['ReLU', 'LeakyReLU', 'Tanh'],
    'model__dropout_rate': [0.1, 0.2, 0.3, 0.5],
    'model__learning_rate': [0.0001, 0.001, 0.01, 0.1],
    'model__epochs': [100, 200, 300],
    'model__batch_size': [32, 64, 128]
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,  # Number of different combinations to try
    scoring='neg_mean_absolute_error',  # Use MAE as the scoring metric
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Fit the RandomizedSearchCV on the training data
random_search.fit(X, y)

# Get the best model and parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Save the best model to a file
joblib.dump(best_model, 'best_pytorch_model.pkl')

# Evaluate the model with the best parameters on validation data
y_pred = best_model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)

print(f'Best Parameters: {best_params}')
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [7]:
import joblib
import pandas as pd
import torch

# Assuming ComplexPyTorchRegressorWrapper and DataPreprocessor are already defined
preprocessor = DataPreprocessor()

# Load training and test data
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Specify the target variable and features
target = 'price'  # Replace with the actual target column name if different
features = [col for col in train_df.columns if col != target]

# Drop rows with missing target values in the training dataset
train_df = train_df.dropna(subset=[target])

# Fit the preprocessor on the training data
preprocessor.fit(train_df)

# Transform the training and test data using the fitted preprocessor
train_df = preprocessor.transform(train_df)
test_df = preprocessor.transform(test_df)
test_df = test_df[features]  # Keep only the features in the test set

# Load the best neural network model with the preprocessing pipeline
best_model = joblib.load('best_neural_network_model.pkl')

# Ensure that the target column 'price' is not in the test dataset
if target in test_df.columns:
    test_df = test_df.drop(columns=[target])

# Convert the test data to a PyTorch tensor for prediction
X_test_tensor = torch.tensor(test_df.values.astype(np.float32))

# Make predictions using the test dataset with the best neural network model
best_model.model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    test_predictions = best_model.model(X_test_tensor).numpy().flatten()

# Create a DataFrame with the predictions
output = pd.DataFrame({'Id': test_df.index, 'Predicted': test_predictions})

# Save the predictions to a CSV file
output.to_csv('predictions_v2.0.csv', index=False)

# Display the first few rows of the output DataFrame
print(output.head())


   Id      Predicted
0   0   20147.593750
1   1   35544.964844
2   2  146033.046875
3   3   79854.781250
4   4   26131.941406
