In [2]:
import pandas as pd
import numpy as np
import random
import math

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import (ColumnTransformer, make_column_selector as selector)
from sklearn.model_selection import (KFold, LeaveOneOut)
from sklearn.utils import resample
from sklearn.metrics import (mean_squared_error, mean_absolute_error)

In [2]:
df = pd.read_csv("data60.csv", low_memory=False)

# replace ? for NAN
df = df.replace('?', np.nan)

# Delete rows with missing values
df = df.dropna()
df = df.astype(float)

df = df.sample(n = 1000, random_state = 42)

target_columns = df.iloc[:,-33:].columns
y = df.loc[:, target_columns].astype(float).max(axis=1)
df.drop(columns=target_columns, inplace=True)

In [3]:
# transform data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

transform = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category"))
    ]
)

#models
dtr = Pipeline(
    steps = [
        ('model', DecisionTreeRegressor(max_depth = 5, min_samples_split = 20, min_samples_leaf = 10))
    ]
)

svr = Pipeline(
    steps = [
        ('transform', transform),
        ('model', SVR(kernel = 'rbf', epsilon = 0.01))
    ]
)

knr = Pipeline(
    steps = [
        ('transform', transform),
        ('model', KNeighborsRegressor(n_neighbors = 10, p = 1, weights = "distance"))
    ]
)

mlp = Pipeline(
    steps = [
        ('transform', transform),
        ('model', MLPRegressor(hidden_layer_sizes=(10,10), alpha = 0.01,max_iter = 300))
    ]
)

models = [dtr, svr, knr, mlp]

# Train/Test split

In [None]:
n_iter = 10
train_test_rez = {}
for j in range(n_iter):
    # train test data
    X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.7, random_state=100+j)

    for i in models:
        # fit
        i.fit(X_train, y_train)

        #prediction
        y_pred = i.predict(X_test)

        # score - Mean Squared Error
        score = mean_squared_error(y_pred, y_test)

        col_name = f"Model_{i}"
        
        if col_name not in train_test_rez:
            train_test_rez[col_name] = []

        train_test_rez[col_name].append(score)

In [None]:
train_test_rezultati = pd.DataFrame(train_test_rez)
#train_test_rezultati.to_csv('train_test_rezultati.csv', index=False)

column_names = {train_test_rezultati.columns[0]: 'DesisionTreeRegressor_mse',
                train_test_rezultati.columns[1]: 'SVR_mse',
                train_test_rezultati.columns[2]: 'KNeighborsRegressor_mse',
                train_test_rezultati.columns[3]: 'MLPRegressor_mse'}

train_test_rezultati = train_test_rezultati.rename(columns=column_names)
train_test_rezultati.to_csv('train_test_rezultati_max.csv', index=False)
train_test_rezultati

In [6]:
train_test_rezultati = pd.read_csv("train_test_rezultati_max.csv")
train_test_rezultati


Unnamed: 0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
0,0.022688,0.021432,0.023015,0.127994
1,0.022688,0.021432,0.023015,0.153937
2,0.022688,0.021432,0.023015,0.169872
3,0.022688,0.021432,0.023015,0.184118
4,0.022688,0.021432,0.023015,0.107671
5,0.022688,0.021432,0.023015,0.161581
6,0.022688,0.021432,0.023015,0.392963
7,0.022688,0.021432,0.023015,0.529274
8,0.022688,0.021432,0.023015,0.100334
9,0.022688,0.021432,0.023015,0.146155


In [3]:
train_test_rezultati = pd.read_csv("train_test_rezultati_max.csv")
train_test_rezultati = pd.DataFrame(train_test_rezultati)[['DesisionTreeRegressor_mse',
                                                           'SVR_mse',
                                                           'KNeighborsRegressor_mse',
                                                           'MLPRegressor_mse']].agg(['mean', 'var']).round(5)
train_test_rezultati

Unnamed: 0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
mean,0.02269,0.02143,0.02301,0.20739
var,0.0,0.0,0.0,0.0196


# Cross Validation

In [None]:
n_repeats = 100
n_splits = 5
cv_rez = {}
cv_rez['n_repeats'] = []
cv_rez['n_split'] = []

for i in range(n_repeats):
    
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)
    for j, (train_index, test_index) in enumerate(kf.split(df)):
        cv_rez['n_repeats'].append(i)
        cv_rez['n_split'].append(j)

        # train and test split
        X_train = df.iloc[train_index, :]
        y_train = y.iloc[train_index]
        X_test = df.iloc[test_index, :]
        y_test = y.iloc[test_index]

        for k in models:
            # fit
            k.fit(X_train, y_train)

            # predict
            y_pred = k.predict(X_test)
            
            # score - Mean Squared Error
            score = mean_squared_error(y_pred, y_test)

            col_name = f"Model_{k}"
            
            if col_name not in cv_rez:
                cv_rez[col_name] = []

            cv_rez[col_name].append(score)

In [None]:
cv_rezultati = pd.DataFrame(cv_rez)

column_names = {cv_rezultati.columns[0]: 'n_repeats',
                cv_rezultati.columns[1]: 'n_split',
                cv_rezultati.columns[2]: 'DesisionTreeRegressor_mse',
                cv_rezultati.columns[3]: 'SVR_mse',
                cv_rezultati.columns[4]: 'KNeighborsRegressor_mse',
                cv_rezultati.columns[5]: 'MLPRegressor_mse'}

cv_rezultati = cv_rezultati.rename(columns=column_names)
cv_rezultati.to_csv('cv_rezultati_max.csv', index=False)

In [4]:
cv_rezultati = pd.read_csv("cv_rezultati_max.csv")
cv_rezultati

Unnamed: 0,n_repeats,n_split,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
0,0,0,0.019740,0.025177,0.024313,0.092048
1,0,1,0.014931,0.019483,0.024240,0.045766
2,0,2,0.018408,0.021725,0.022635,0.175425
3,0,3,0.023300,0.024440,0.024323,0.114298
4,0,4,0.016858,0.021001,0.022153,0.080805
...,...,...,...,...,...,...
495,99,0,0.019740,0.025177,0.024313,0.059330
496,99,1,0.014931,0.019483,0.024240,0.072421
497,99,2,0.018408,0.021725,0.022635,0.366861
498,99,3,0.021342,0.024440,0.024323,0.087687


In [5]:
cv_rezultati = pd.read_csv("cv_rezultati_max.csv")
cv_rezultati = pd.DataFrame(cv_rezultati)[['n_repeats',
                                           'DesisionTreeRegressor_mse', 
                                           'SVR_mse',
                                           'KNeighborsRegressor_mse',
                                           'MLPRegressor_mse']].groupby('n_repeats').agg(['mean'])
cv_rezultati

Unnamed: 0_level_0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
Unnamed: 0_level_1,mean,mean,mean,mean
n_repeats,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0.018647,0.022365,0.023533,0.101668
1,0.018256,0.022365,0.023533,0.151024
2,0.018269,0.022365,0.023533,0.182190
3,0.018266,0.022365,0.023533,0.079429
4,0.018256,0.022365,0.023533,0.238858
...,...,...,...,...
95,0.018256,0.022365,0.023533,3.023619
96,0.018256,0.022365,0.023533,0.234662
97,0.018256,0.022365,0.023533,0.093770
98,0.018255,0.022365,0.023533,0.148184


In [7]:
cv_rezultati = pd.DataFrame(cv_rezultati)[['DesisionTreeRegressor_mse', 
                                           'SVR_mse',
                                           'KNeighborsRegressor_mse',
                                           'MLPRegressor_mse']].groupby('n_repeats').agg(['mean']).agg(['mean', 'var']).round(5)

cv_rezultati

Unnamed: 0_level_0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
Unnamed: 0_level_1,mean,mean,mean,mean
Unnamed: 0_level_2,mean,mean,mean,mean
mean,0.0183,0.02237,0.02353,0.33858
var,0.0,0.0,0.0,0.4396


# Bootstrap

In [None]:
repeat = 10
n_bootstrap_samples = 5
bootstrap_rez = {}
bootstrap_rez['repeat'] = []
bootstrap_rez['n_bootstrap_samples'] = []

for i in range(repeat):
    X_train = resample(df, replace=True)
    y_train = y.loc[X_train.index]
    test_idx = [idx for idx in df.index if idx not in X_train.index]
    X_test = df.loc[test_idx, :]
    y_test = y.loc[test_idx]

    for j in range(n_bootstrap_samples):
        bootstrap_rez['repeat'].append(i)
        bootstrap_rez['n_bootstrap_samples'].append(j)

        for k in models:
            # fit
            k.fit(X_train, y_train)

            #prediction
            y_pred = k.predict(X_test)

            # score - Mean Squared Error
            score = mean_squared_error(y_pred, y_test)

            col_name = f"Model_{k}"
            if col_name not in bootstrap_rez:
                bootstrap_rez[col_name] = []
            bootstrap_rez[col_name].append(score)

In [None]:
bootstrap_rezultati = pd.DataFrame(bootstrap_rez)

column_names = {bootstrap_rezultati.columns[0]: 'n_repeats',
                bootstrap_rezultati.columns[1]: 'n_bootstrap_samples',
                bootstrap_rezultati.columns[2]: 'DesisionTreeRegressor_mse',
                bootstrap_rezultati.columns[3]: 'SVR_mse',
                bootstrap_rezultati.columns[4]: 'KNeighborsRegressor_mse',
                bootstrap_rezultati.columns[5]: 'MLPRegressor_mse'}

bootstrap_rezultati = bootstrap_rezultati.rename(columns=column_names)
bootstrap_rezultati.to_csv('bootstrap_rezultati_max.csv', index=False)
bootstrap_rezultati

In [8]:
bootstrap_rezultati = pd.read_csv("bootstrap_rezultati_max.csv")
bootstrap_rezultati

Unnamed: 0,n_repeats,n_bootstrap_samples,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
0,0,0,0.018601,0.025459,0.026905,0.164712
1,0,1,0.018601,0.025459,0.026905,0.077645
2,0,2,0.018601,0.025459,0.026905,0.141134
3,0,3,0.018601,0.025459,0.026905,0.105622
4,0,4,0.018601,0.025459,0.026905,0.13705
5,1,0,0.026114,0.025548,0.026997,2.901451
6,1,1,0.026114,0.025548,0.026997,0.25424
7,1,2,0.026114,0.025548,0.026997,0.792395
8,1,3,0.026114,0.025548,0.026997,1111.97836
9,1,4,0.026114,0.025548,0.026997,0.459443


In [9]:

bootstrap_rezultati = pd.DataFrame(bootstrap_rezultati)[['n_repeats',
                                                         'DesisionTreeRegressor_mse',
                                                         'SVR_mse',
                                                         'KNeighborsRegressor_mse',
                                                         'MLPRegressor_mse']].groupby('n_repeats').agg(['mean']).agg(['mean', 'var']).round(7)

bootstrap_rezultati

Unnamed: 0_level_0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
Unnamed: 0_level_1,mean,mean,mean,mean
mean,0.021919,0.024396,0.027673,22.943936
var,1.8e-05,1.2e-05,9e-06,4956.507254


# Leave one out (LOOCV)

In [None]:
df = df.sample(n = 100, random_state = 42)
repeat = 10
loocv_rez = {}
loocv_rez['repeat'] = []
loocv_rez['indeks'] = []

for i in range(repeat):

    loo = LeaveOneOut()
    for j, (train_index, test_index) in enumerate(loo.split(df)):
        loocv_rez['repeat'].append(i)
        loocv_rez['indeks'].append(j)
        
        # train and test split
        X_train = df.iloc[train_index, :]
        y_train = y.iloc[train_index]
        X_test = df.iloc[test_index, :]
        y_test = y.iloc[test_index]

        for k in models:
            # fit
            k.fit(X_train, y_train)

            #predict
            y_pred = k.predict(X_test)
            
            # score - Mean Squared Error
            score = mean_squared_error(y_pred, y_test)

            col_name = f"Model_{k}"
            
            if col_name not in loocv_rez:
                loocv_rez[col_name] = []
            loocv_rez[col_name].append(score)


In [None]:
loocv_rezultati = pd.DataFrame(loocv_rez)

column_names = {loocv_rezultati.columns[0]: 'n_repeats',
                loocv_rezultati.columns[1]: 'indeks',
                loocv_rezultati.columns[2]: 'DesisionTreeRegressor_mse',
                loocv_rezultati.columns[3]: 'SVR_mse',
                loocv_rezultati.columns[4]: 'KNeighborsRegressor_mse',
                loocv_rezultati.columns[5]: 'MLPRegressor_mse'}

loocv_rezultati = loocv_rezultati.rename(columns=column_names)
loocv_rezultati.to_csv('loocv_rezultati_max.csv', index=False)
loocv_rezultati

In [10]:
loocv_rezultati = pd.read_csv("loocv_rezultati_max.csv")
loocv_rezultati

Unnamed: 0,n_repeats,indeks,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
0,0,0,0.157197,0.024759,0.050741,0.005374
1,0,1,0.011928,0.009349,0.000280,0.039991
2,0,2,0.001376,0.001268,0.005332,0.177172
3,0,3,0.005558,0.020030,0.003918,0.003500
4,0,4,0.013417,0.043381,0.028170,53.595917
...,...,...,...,...,...,...
995,9,95,0.100522,0.039380,0.075444,0.081259
996,9,96,0.000723,0.000025,0.001723,0.083318
997,9,97,0.000323,0.000958,0.000108,0.000680
998,9,98,0.041558,0.143403,0.111544,4.929243


In [11]:
loocv_rezultati = pd.DataFrame(loocv_rezultati)[['n_repeats',
                                                 'DesisionTreeRegressor_mse',
                                                 'SVR_mse',
                                                 'KNeighborsRegressor_mse',
                                                 'MLPRegressor_mse']].groupby('n_repeats').agg(['mean']).agg(['mean', 'var']).round(7)

loocv_rezultati

Unnamed: 0_level_0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
Unnamed: 0_level_1,mean,mean,mean,mean
mean,0.031082,0.032286,0.032019,21.512177
var,0.0,0.0,0.0,318.638068
