# Construct preprocessing functions


In [333]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.cluster import KMeans, DBSCAN
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import LocalOutlierFactor
from sklearn.tree import DecisionTreeClassifier
from imblearn import FunctionSampler
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [334]:
non_gaussian_features = [21, 26, 32]


# Create a custom transformer to remove outliers for gaussian features
class GaussianOutlierDetector(BaseEstimator, OutlierMixin):
    def __init__(self, sd=2):
        self.sd = sd

    def fit(self, X, y=None):
        return self

    def fit_predict(self, X, y=None):
        likelihoods = np.zeros_like(X, dtype=float)
        filters = np.zeros_like(X, dtype=float)

        for col in range(X.shape[1]):
            # Calculate the mean and standard deviation for the column
            mean = X[:, col].mean()
            std_dev = X[:, col].std()
            # Calculate the PDF (likelihood) for each row in the column
            likelihoods[:, col] = stats.norm.pdf(X[:, col], loc=mean, scale=std_dev)
            filters[:, col] = stats.norm.pdf(
                (mean + (self.sd * std_dev)), loc=mean, scale=std_dev
            )

        condition_mask = np.any(likelihoods < filters, axis=1)

        # Convert the condition mask to -1 values being outliers and 1 values being inliers
        condition_mask = np.where(condition_mask, -1, 1)
        return condition_mask


dbscan = DBSCAN()
lof = LocalOutlierFactor()
isolation_forest = IsolationForest()
gaussian_outlier_detector = GaussianOutlierDetector()


def gaussian_outlier_removal(X, y, **kwargs):
    god = GaussianOutlierDetector(**kwargs)
    mask = god.fit_predict(X)
    return X[mask == 1], y[mask == 1]


def lof_outlier_removal(X, y, **kwargs):
    lof = LocalOutlierFactor(**kwargs)
    mask = lof.fit_predict(X)
    return X[mask == 1], y[mask == 1]


def isolation_forest_outlier_removal(X, y, **kwargs):
    isolation_forest = IsolationForest(**kwargs)
    mask = isolation_forest.fit_predict(X)
    return X[mask == 1], y[mask == 1]


def dbscan_outlier_removal(X, y, **kwargs):
    dbscan = DBSCAN(**kwargs)
    mask = dbscan.fit_predict(X)
    return X[mask == 1], y[mask == 1]


def k_means_outlier_removal(X, y, **kwargs):
    k_means = KMeans(**kwargs)
    mask = k_means.fit_predict(X)
    return X[mask == 1], y[mask == 1]


# Imputers
numerical_imputer = SimpleImputer()
nominal_imputer = SimpleImputer(strategy="most_frequent")
# knn_imputer = KNNImputer()

# Make the pipelines


## Common column transformers across all pipelines


In [335]:
impute_missing_values = ColumnTransformer(
    [
        ("numerical_imputer", numerical_imputer, list(range(100))),
        ("nominal_imputer", nominal_imputer, list(range(100, 128))),
    ]
)

scale_numerical_features = ColumnTransformer(
    [
        ("numerical_scaler", StandardScaler(), list(range(100))),
    ]
)

## Reusable parameters across all pipelines


In [336]:
contamination_range = np.arange(0.03, 0.12, 0.03)


simple_imputer_params = {}

common_params = {
    "impute__numerical_imputer": [numerical_imputer],
    "impute__numerical_imputer__strategy": ["mean"],
    "remove_gaussian_outliers__kw_args": [{"sd": 3}],
}


common_params_simple_imputer = {**simple_imputer_params, **common_params}

# common_params_knn_imputer = {**knn_imputer_params, **common_params}

lof_params = {
    "remove_non_gaussian_outliers__func": [lof_outlier_removal],
    "remove_non_gaussian_outliers__kw_args": [
        {"n_neighbors": n_neighbors, "contamination": contamination, "n_jobs": -1}
        for n_neighbors, contamination in zip(np.arange(3, 9, 2), contamination_range)
    ],
}

isolation_forest_params = {
    "remove_non_gaussian_outliers__func": [isolation_forest_outlier_removal],
    "remove_non_gaussian_outliers__kw_args": [
        {"contamination": contamination, "n_jobs": -1}
        for contamination in contamination_range
    ],
}

dbscan_params = {
    "remove_non_gaussian_outliers__func": [dbscan_outlier_removal],
    "remove_non_gaussian_outliers__kw_args": [
        {"eps": eps, "n_jobs": -1} for eps in np.arange(0.5, 2, 0.5)
    ],
}


dt_param_grid = [lof_params, isolation_forest_params, dbscan_params]


def create_params(custom_model_params):
    # for imputer_method in [simple_imputer_params, knn_imputer_params]:
    for outlier_method in [lof_params, isolation_forest_params]:
        yield {**common_params, **outlier_method, **custom_model_params}

## Decision Tree Pipeline


In [337]:
dt_pipeline = Pipeline(
    steps=[
        ("impute", impute_missing_values),
        (
            "remove_gaussian_outliers",
            FunctionSampler(func=gaussian_outlier_removal, kw_args={}),
        ),
        (
            "remove_non_gaussian_outliers",
            FunctionSampler(
                func=None,
                kw_args={},
            ),
        ),
        ("dt", DecisionTreeClassifier()),
    ]
)

dt_param_grid = list(
    create_params(
        {
            "dt__criterion": ["gini", "entropy"],
            "dt__max_depth": [None, 3, 5],
        }
    )
)

## Random Forest pipeline


In [338]:
rf_pipeline = Pipeline(
    steps=[
        ("impute", impute_missing_values),
        (
            "remove_gaussian_outliers",
            FunctionSampler(func=gaussian_outlier_removal, kw_args={}),
        ),
        (
            "remove_non_gaussian_outliers",
            FunctionSampler(
                func=None,
                kw_args={},
            ),
        ),
        ("rf", RandomForestClassifier()),
    ]
)

rf_param_grid = list(
    create_params(
        {
            "rf__criterion": ["gini", "entropy"],
            "rf__max_depth": [None, 3, 5],
            "rf__n_jobs": [-1],
        }
    )
)

print(rf_param_grid)

[{'impute__numerical_imputer': [SimpleImputer()], 'impute__numerical_imputer__strategy': ['mean'], 'remove_gaussian_outliers__kw_args': [{'sd': 3}], 'remove_non_gaussian_outliers__func': [<function lof_outlier_removal at 0x174b1cf70>], 'remove_non_gaussian_outliers__kw_args': [{'n_neighbors': 3, 'contamination': 0.03, 'n_jobs': -1}, {'n_neighbors': 5, 'contamination': 0.06, 'n_jobs': -1}, {'n_neighbors': 7, 'contamination': 0.09, 'n_jobs': -1}], 'rf__criterion': ['gini', 'entropy'], 'rf__max_depth': [None, 3, 5], 'rf__n_jobs': [-1]}, {'impute__numerical_imputer': [SimpleImputer()], 'impute__numerical_imputer__strategy': ['mean'], 'remove_gaussian_outliers__kw_args': [{'sd': 3}], 'remove_non_gaussian_outliers__func': [<function isolation_forest_outlier_removal at 0x174b1f400>], 'remove_non_gaussian_outliers__kw_args': [{'contamination': 0.03, 'n_jobs': -1}, {'contamination': 0.06, 'n_jobs': -1}, {'contamination': 0.09, 'n_jobs': -1}], 'rf__criterion': ['gini', 'entropy'], 'rf__max_depth

## K-NN Pipeline


In [339]:
from sklearn.neighbors import KNeighborsClassifier

knn_pipeline = Pipeline(
    steps=[
        ("impute", impute_missing_values),
        (
            "remove_gaussian_outliers",
            FunctionSampler(func=gaussian_outlier_removal, kw_args={}),
        ),
        (
            "remove_non_gaussian_outliers",
            FunctionSampler(
                func=None,
                kw_args={},
            ),
        ),
        ("scale", scale_numerical_features),
        ("knn", KNeighborsClassifier()),
    ]
)


knn_param_grid = list(
    create_params({"knn__n_neighbors": np.arange(3, 13, 2), "knn__n_jobs": [-1]})
)

## Gaussian Naive Bayes pipeline


In [340]:
from sklearn.naive_bayes import GaussianNB

nb_pipeline = Pipeline(
    steps=[
        ("impute", impute_missing_values),
        (
            "remove_gaussian_outliers",
            FunctionSampler(func=gaussian_outlier_removal, kw_args={}),
        ),
        (
            "remove_non_gaussian_outliers",
            FunctionSampler(
                func=None,
                kw_args={},
            ),
        ),
        ("scale", scale_numerical_features),
        ("nb", GaussianNB()),
    ]
)

nb_param_gid = list(create_params({}))

# Run the pipelines via grid search cv


## Prepare the data for grid search cv


In [341]:
import pandas as pd

data = pd.concat([pd.read_csv("train_1.csv"), pd.read_csv("train_2.csv")])

## Make a function to run grid search cv and get the best model


In [342]:
# Import joblib
import joblib


def get_best_estimator(pipeline, param_grid, dataset):
    # Need to loop through each training data in order to get a variety of test metrics
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]
    grid = GridSearchCV(
        pipeline,
        param_grid,
        cv=10,
        scoring="f1_macro",
        n_jobs=-1,
        return_train_score=True,
        verbose=2,
    )
    grid.fit(X, y)
    std = grid.cv_results_["std_test_score"][grid.best_index_]
    return grid.best_score_, grid.best_estimator_, grid.best_params_, std
    # return grid.best_score_, grid.best_estimator_, grid.best_params_

## Decision trees Grid Search CV


In [343]:
dt_best_score, dt_best_estimator, dt_best_params, dt_std = get_best_estimator(
    dt_pipeline, list(dt_param_grid), data
)

print("Best Estimator: ", dt_best_estimator)
print("Best Params: ", dt_best_params)
print("Best Score: ", dt_best_score)
print("Std Dev: ", dt_std)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[CV] END dt__criterion=gini, dt__max_depth=None, impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_outliers__func=<function lof_outlier_removal at 0x11d3dfe20>, remove_non_gaussian_outliers__kw_args={'n_neighbors': 3, 'contamination': 0.03, 'n_jobs': -1}; total time=   0.8s
[CV] END dt__criterion=gini, dt__max_depth=None, impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_outliers__func=<function lof_outlier_removal at 0x130ba7e20>, remove_non_gaussian_outliers__kw_args={'n_neighbors': 3, 'contamination': 0.03, 'n_jobs': -1}; total time=   0.9s
[CV] END dt__criterion=gini, dt__max_depth=None, impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, remove_gaussian_outliers__kw_args={'sd': 3}, remove_n

## Random Forest Grid Search CV


In [344]:
rf_best_score, rf_best_estimator, rf_best_params, rf_std = get_best_estimator(
    rf_pipeline, rf_param_grid, data
)

print("Best Estimator: ", rf_best_estimator)
print("Best Params: ", rf_best_params)
print("Best Score: ", rf_best_score)
print("Std Dev: ", rf_std)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[CV] END impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_outliers__func=<function lof_outlier_removal at 0x114837d90>, remove_non_gaussian_outliers__kw_args={'n_neighbors': 3, 'contamination': 0.03, 'n_jobs': -1}, rf__criterion=gini, rf__max_depth=None, rf__n_jobs=-1; total time=   2.2s
[CV] END impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_outliers__func=<function lof_outlier_removal at 0x11c3bf760>, remove_non_gaussian_outliers__kw_args={'n_neighbors': 3, 'contamination': 0.03, 'n_jobs': -1}, rf__criterion=gini, rf__max_depth=None, rf__n_jobs=-1; total time=   2.5s
[CV] END impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussia

## K-NN Grid Search CV


In [345]:
knn_best_score, knn_best_estimator, knn_best_params, knn_std = get_best_estimator(
    knn_pipeline, knn_param_grid, data
)

print("Best Estimator: ", knn_best_estimator)
print("Best Params: ", knn_best_params)
print("Best Score: ", knn_best_score)
print("Std Dev: ", knn_std)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] END impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, knn__n_jobs=-1, knn__n_neighbors=3, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_outliers__func=<function lof_outlier_removal at 0x114f67e20>, remove_non_gaussian_outliers__kw_args={'n_neighbors': 3, 'contamination': 0.03, 'n_jobs': -1}; total time=   0.4s
[CV] END impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, knn__n_jobs=-1, knn__n_neighbors=3, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_outliers__func=<function lof_outlier_removal at 0x115337e20>, remove_non_gaussian_outliers__kw_args={'n_neighbors': 3, 'contamination': 0.03, 'n_jobs': -1}; total time=   0.4s
[CV] END impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, knn__n_jobs=-1, knn__n_neighbors=3, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_

# Naive Bayes Grid Search CV


In [346]:
nb_best_score, nb_best_estimator, nb_best_params, nb_std = get_best_estimator(
    nb_pipeline, nb_param_gid, data
)

print("Best Estimator: ", nb_best_estimator)
print("Best Params: ", nb_best_params)
print("Best Score: ", nb_best_score)
print("Std Dev: ", nb_std)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] END impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_outliers__func=<function lof_outlier_removal at 0x114f67d90>, remove_non_gaussian_outliers__kw_args={'n_neighbors': 3, 'contamination': 0.03, 'n_jobs': -1}; total time=   0.4s
[CV] END impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_outliers__func=<function lof_outlier_removal at 0x103431510>, remove_non_gaussian_outliers__kw_args={'n_neighbors': 3, 'contamination': 0.03, 'n_jobs': -1}; total time=   0.5s
[CV] END impute__numerical_imputer=SimpleImputer(), impute__numerical_imputer__strategy=mean, remove_gaussian_outliers__kw_args={'sd': 3}, remove_non_gaussian_outliers__func=<function lof_outlier_removal at 0x102d89510>, remove_non_gaussian_outliers__kw_args={'n_neighb

## Ensemble of ensembles Grid Search CV


In [351]:
# Import bagging classifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier

ee_param_grid_1 = {
    "estimators": [
        [
            ("rf", rf_best_estimator),
            ("knn", BaggingClassifier(knn_best_estimator)),
            ("nb", BaggingClassifier(nb_best_estimator)),
        ]
    ],
}

ee_param_grid_2 = {
    "estimators": [
        [
            ("rf", rf_best_estimator),
            ("knn", BaggingClassifier(knn_best_estimator)),
        ]
    ],
}

ee_param_grid_3 = {
    "estimators": [
        [
            ("rf", rf_best_estimator),
            ("nb", BaggingClassifier(nb_best_estimator)),
        ]
    ],
}

ee_param_grid_4 = {
    "estimators": [
        [
            ("knn", BaggingClassifier(knn_best_estimator)),
            ("nb", BaggingClassifier(nb_best_estimator)),
        ]
    ],
}

ee_param_grid_5 = {
    "estimators": [[("rf", rf_best_estimator), ("knn", knn_best_estimator)]],
}

ee_param_grid_6 = {
    "estimators": [[("rf", rf_best_estimator), ("nb", nb_best_estimator)]],
}

ee_param_grid_7 = {
    "estimators": [[("knn", knn_best_estimator), ("nb", nb_best_estimator)]],
}

params = [
    ee_param_grid_1,
    ee_param_grid_2,
    ee_param_grid_3,
    ee_param_grid_4,
    ee_param_grid_5,
    ee_param_grid_6,
    ee_param_grid_7,
]

ee = (
    VotingClassifier(
        estimators=[
            # ("rf", rf_best_estimator),
            # ("knn", BaggingClassifier(knn_best_estimator)),
            # ("nb", BaggingClassifier(nb_best_estimator)),
        ],
        voting="hard",
    ),
)

# Perform grid search
ee_best_score, ee_best_estimator, ee_best_params, ee_std = get_best_estimator(
    ee[0], params, data
)

Fitting 10 folds for each of 7 candidates, totalling 70 fits


[CV] END estimators=[('rf', Pipeline(steps=[('impute',
                 ColumnTransformer(transformers=[('numerical_imputer',
                                                  SimpleImputer(),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15, 16,
                                                   17, 18, 19, 20, 21, 22, 23,
                                                   24, 25, 26, 27, 28, 29, ...]),
                                                 ('nominal_imputer',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  [100, 101, 102, 103, 104, 105,
                                                   106, 107, 108, 109, 110, 111,
                                                   112, 113, 114, 115, 116, 117,
                                                   118, 119, 120, 121, 122, 123,
  

In [352]:
# np.average(cv["test_score"])
print(ee_best_score)
print(ee_best_estimator)
print(ee_best_params)
print(ee_std)
# estimator = ee_trained["estimator"][0]

0.967521900878517
VotingClassifier(estimators=[('rf',
                              Pipeline(steps=[('impute',
                                               ColumnTransformer(transformers=[('numerical_imputer',
                                                                                SimpleImputer(),
                                                                                [0,
                                                                                 1,
                                                                                 2,
                                                                                 3,
                                                                                 4,
                                                                                 5,
                                                                                 6,
                                                                                 7,
                   

# Run best model on test set


In [349]:
# Load the test data
df_test = pd.read_csv("test.csv")

In [355]:
from sklearn.metrics import f1_score

# Make predictions on the test data using the best estimator from dt
predictions = ee_best_estimator.predict(df_test)

# Generate a dataframe with the predictions
df_predictions = pd.DataFrame(predictions)


# Append the accuracy and f1 scores of the best model on the training data
# via cross-validatoion to the dataframe,
# as a new row, accuracy in the first column and f1 in the second

# df_predictions = df_predictions.reset_index()

# Save the dataframe to a csv file
df_predictions.to_csv("s47850385_2.csv", index=False, header=False)

# Chosen model and final code


In [373]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from imblearn import FunctionSampler
from imblearn.pipeline import Pipeline
from scipy import stats

non_gaussian_features = [21, 26, 32]


# Create a custom transformer to remove outliers for gaussian features
class GaussianOutlierDetector(BaseEstimator, OutlierMixin):
    def __init__(self, sd=2):
        self.sd = sd

    def fit(self, X, y=None):
        return self

    def fit_predict(self, X, y=None):
        likelihoods = np.zeros_like(X, dtype=float)
        filters = np.zeros_like(X, dtype=float)

        for col in range(X.shape[1]):
            # Calculate the mean and standard deviation for the column
            mean = X[:, col].mean()
            std_dev = X[:, col].std()
            # Calculate the PDF (likelihood) for each row in the column
            likelihoods[:, col] = stats.norm.pdf(X[:, col], loc=mean, scale=std_dev)
            filters[:, col] = stats.norm.pdf(
                (mean + (self.sd * std_dev)), loc=mean, scale=std_dev
            )

        condition_mask = np.any(likelihoods < filters, axis=1)

        # Convert the condition mask to -1 values being outliers and 1 values being inliers
        condition_mask = np.where(condition_mask, -1, 1)
        return condition_mask


isolation_forest = IsolationForest()
gaussian_outlier_detector = GaussianOutlierDetector()


def gaussian_outlier_removal(X, y, **kwargs):
    god = GaussianOutlierDetector(**kwargs)
    mask = god.fit_predict(X)
    return X[mask == 1], y[mask == 1]


def isolation_forest_outlier_removal(X, y, **kwargs):
    isolation_forest = IsolationForest(**kwargs)
    mask = isolation_forest.fit_predict(X)
    return X[mask == 1], y[mask == 1]


# Imputers
numerical_imputer = SimpleImputer(strategy="mean")
nominal_imputer = SimpleImputer(strategy="most_frequent")


impute_missing_values = ColumnTransformer(
    [
        ("numerical_imputer", numerical_imputer, list(range(100))),
        ("nominal_imputer", nominal_imputer, list(range(100, 128))),
    ]
)

rf_pipeline = Pipeline(
    steps=[
        ("impute", impute_missing_values),
        (
            "remove_gaussian_outliers",
            FunctionSampler(func=gaussian_outlier_removal, kw_args={"sd": 3}),
        ),
        (
            "remove_non_gaussian_outliers",
            FunctionSampler(
                func=isolation_forest_outlier_removal,
                kw_args={"contamination": 0.06, "n_jobs": -1},
            ),
        ),
        ("rf", RandomForestClassifier(criterion="entropy", n_jobs=-1)),
    ]
)

# Load the training
data = pd.concat([pd.read_csv("train.csv"), pd.read_csv("add_train.csv")])
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

test_data = pd.read_csv("test.csv")

# Train the model on the training data
rf_trained = rf_pipeline.fit(X, y)

# Get th F1 score and accuracy of the model on the training data via cross-validation
cv = cross_validate(
    rf_trained,
    X,
    y,
    cv=10,
    scoring=["f1_macro", "accuracy"],
    return_train_score=True,
    n_jobs=-1,
)

accuracy = round(np.average(cv["test_accuracy"]), 3)
f1 = round(np.average(cv["test_f1_macro"]), 3)

# Make a prediction on the test data
predictions = rf_trained.predict(test_data)

# Write the predcitions and scores to a csv file
df_predictions = pd.DataFrame(predictions)
df_predictions[1] = np.nan
df_predictions.loc[len(df_predictions)] = [accuracy, f1]

df_predictions.to_csv("s47850385.csv", index=False, header=False)