# Importing Libraries

In [54]:
!python -m pip install gurobi-optimods
!pip install catboost

import numpy as np
import pandas as pd

import sklearn
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
import sklearn.pipeline
import sklearn.preprocessing
from sklearn import datasets
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

from catboost import CatBoostRegressor
from gurobi_optimods.regression import LADRegression



In [55]:
# Used for styling of graphs
!pip install SciencePlots
import matplotlib.pyplot as plt
import scienceplots
plt.style.use(['science', 'grid', 'no-latex'])



# Preparing Data

In [56]:
# Load scikit-learn's bundled diabetes dataset. Later cells index the result
# as diabetes["data"] (features) and diabetes["target"] (regression target).
diabetes = datasets.load_diabetes()

# Testing the Effect of Outlier Removal

In [None]:
# reference: https://www.kaggle.com/code/mateuszk013/playground-series-s3e25-mohs-hardness#-5-%7C-Modelling-%E2%86%91

def remove_outliers(data, detector):
    """Return ``data`` without the rows that ``detector`` flags as outliers.

    Parameters
    ----------
    data : pd.DataFrame
        Samples to filter, one row per sample.
    detector : object
        Any object exposing ``fit_predict(data)`` that returns one label per
        row, with ``-1`` marking an outlier.

    Returns
    -------
    pd.DataFrame
        The rows of ``data`` whose label is not ``-1``; the original index
        labels are preserved.

    Raises
    ------
    TypeError
        If ``data`` is not a ``pd.DataFrame``.
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(f"'data' must be {pd.DataFrame!r} instance")

    # np.asarray makes the comparison element-wise even when the detector
    # returns a plain list (``[1, -1] == -1`` would be the scalar False and
    # silently keep every row).
    labels = np.asarray(detector.fit_predict(data))
    isOutlier = pd.Series(labels == -1, index=data.index, dtype=bool)

    return data[~isOutlier]

# Outlier detector: power-transform the features (Yeo-Johnson, standardised),
# then label samples with LocalOutlierFactor.
# also can use isolation forest
detector = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.PowerTransformer(method="yeo-johnson", standardize=True),
    sklearn.neighbors.LocalOutlierFactor(),
)

# Regressor that is re-fitted for every fold / contamination setting below.
reg = CatBoostRegressor(iterations=250,
                          learning_rate=1,
                          depth=8,
                          silent=True) # any regression model

# 3 shuffled folds with a fixed seed so the split is reproducible.
kFold = sklearn.model_selection.KFold(n_splits=3, shuffle=True, random_state=42) # maybe lower depending on data


# Pipeline parameter that is swept; passed to detector.set_params() in the loop.
hyperparameter = "localoutlierfactor__contamination"
# None stands for the baseline run (no outlier removal); the remaining values
# step by 0.01 from 0.01 up to (but not including) 0.25.
hyperparameterValues = [None] + np.arange(0.01, 0.25, 0.01).tolist()
# Maps "<contamination> - <fold>" to the validation median absolute error;
# the baseline run is stored under "0 - <fold>".
noOutliersMedae = {}

# For every fold: score a baseline model trained on the full training split,
# then re-train after removing outliers at each contamination level and score
# again on the same (unfiltered) validation split. k is the 1-based fold number.
for k, (trainIds, validIds) in enumerate(kFold.split(diabetes["data"], diabetes["target"]), start=1):
    xTrain, yTrain = diabetes["data"][trainIds], diabetes["target"][trainIds]
    xValid, yValid = diabetes["data"][validIds], diabetes["target"][validIds]

    # Baseline: fit on the unfiltered training split.
    reg.fit(xTrain, yTrain)
    defaultMedae = sklearn.metrics.median_absolute_error(yValid, reg.predict(xValid))

    for hpValue in hyperparameterValues:
        if hpValue is None:
            # Record the baseline under contamination "0"; it is the first
            # entry stored for this fold.
            noOutliersMedae[f"0 - {k}"] = defaultMedae
            continue

        # Apply the contamination level being tested to the detector.
        detector.set_params(**{hyperparameter: hpValue})
        # Wrapping xTrain in a DataFrame gives it a fresh 0..n-1 index, so the
        # index labels of the surviving rows can be used as positions into yTrain.
        xNoOutliers = remove_outliers(pd.DataFrame(xTrain), detector)
        yNoOutliers = yTrain[xNoOutliers.index]

        # Re-fit on the filtered training data; validation data is left untouched.
        reg.fit(xNoOutliers, yNoOutliers)
        cleanMedae = sklearn.metrics.median_absolute_error(yValid, reg.predict(xValid))
        noOutliersMedae[f"{hpValue} - {k}"] = cleanMedae

In [None]:
# Convert the raw scores into per-fold differences from the baseline.
#
# Keys of noOutliersMedae have the form "<contamination> - <fold>"; a
# contamination of "0" marks the baseline fit on the unfiltered training split.

noOutliersMedaeClean = {}  # fold -> baseline MedAE
outliersMedae = {}         # fold -> [MedAE - baseline MedAE, ...] in insertion order

for key, medae in noOutliersMedae.items():
    hpLabel, _, fold = key.split()

    if hpLabel == '0':
        noOutliersMedaeClean[fold] = medae

    # The baseline is the first entry stored for each fold, so it is already
    # known here; the first element of every list is therefore 0. A missing
    # baseline now raises KeyError directly instead of being swallowed by a
    # bare `except`.
    outliersMedae.setdefault(fold, []).append(medae - noOutliersMedaeClean[fold])

# Plotting Results

In [None]:
X_LABEL = "LocalOutlierFactor contamination"

# Initialises settings of the graph
plt.figure(figsize=(7,4))

plt.xlabel(X_LABEL)
# The plotted values are differences from the baseline (no outlier removal),
# so values below 0 mean outlier removal lowered the validation error.
plt.ylabel('Change in Median Absolute Error vs. baseline')

# The baseline run is stored as None in hyperparameterValues; plot it at
# contamination 0 so it lines up with its (zero) difference.
xPlot1 = [0.0 if hpValue is None else hpValue for hpValue in hyperparameterValues]

# One line per cross-validation fold, in the order the folds were evaluated.
for fold, medaeDeltas in outliersMedae.items():
    plt.plot(xPlot1, medaeDeltas, label=f'Fold {fold}')

plt.legend()
plt.show()