In [1]:
import pandas as pd
import numpy as np
import random
import math

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import (ColumnTransformer, make_column_selector as selector)
from sklearn.model_selection import (KFold, LeaveOneOut)
from sklearn.utils import resample
from sklearn.metrics import (mean_squared_error, mean_absolute_error)

In [2]:
df = pd.read_csv("data60.csv", low_memory=False)

# replace ? for NAN
df = df.replace('?', np.nan)

# Delete rows with missing values
df = df.dropna()
df = df.astype(float)

df = df.sample(n = 1000, random_state = 42)

target_columns = df.iloc[:,-33:].columns
y = df.loc[:, target_columns].astype(float).sum(axis=1)
df.drop(columns=target_columns, inplace=True)

In [None]:
#df2 = df
#df2['sum'] = y
#df.to_csv('data_sum.csv', index = False)

In [3]:
# transform data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

transform = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category"))
    ]
)

#models
dtr = Pipeline(
    steps = [
        ('model', DecisionTreeRegressor(max_depth = 5, min_samples_split = 20, min_samples_leaf = 10))
    ]
)

svr = Pipeline(
    steps = [
        ('transform', transform),
        ('model', SVR(kernel = 'rbf', epsilon = 0.01))
    ]
)

knr = Pipeline(
    steps = [
        ('transform', transform),
        ('model', KNeighborsRegressor(n_neighbors = 10, p = 1, weights = "distance"))
    ]
)

mlp = Pipeline(
    steps = [
        ('transform', transform),
        ('model', MLPRegressor(hidden_layer_sizes=(10,10), alpha = 0.01,max_iter = 300))
    ]
)

models = [dtr, svr, knr, mlp]

# Train/Test split

In [None]:
n_iter = 100
train_test_rez = {}
for j in range(n_iter):
    # train test data
    X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.7, random_state=100)

    for i in models:
        # fit
        i.fit(X_train, y_train)

        #prediction
        y_pred = i.predict(X_test)

        # score - Mean Squared Error
        score = mean_squared_error(y_pred, y_test)

        col_name = f"Model_{i}"
        
        if col_name not in train_test_rez:
            train_test_rez[col_name] = []

        train_test_rez[col_name].append(score)

In [None]:
train_test_rezultati = pd.DataFrame(train_test_rez)
#train_test_rezultati.to_csv('train_test_rezultati.csv', index=False)

column_names = {train_test_rezultati.columns[0]: 'DesisionTreeRegressor_mse',
                train_test_rezultati.columns[1]: 'SVR_mse',
                train_test_rezultati.columns[2]: 'KNeighborsRegressor_mse',
                train_test_rezultati.columns[3]: 'MLPRegressor_mse'}

train_test_rezultati = train_test_rezultati.rename(columns=column_names)
train_test_rezultati.to_csv('train_test_rezultati_sum.csv', index=False)
train_test_rezultati

In [4]:
train_test_rezultati = pd.read_csv("train_test_rezultati_sum.csv")
train_test_rezultati

Unnamed: 0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
0,0.002303,0.052932,0.308832,0.040895
1,0.002303,0.052932,0.308832,0.116115
2,0.002303,0.052932,0.308832,0.091066
3,0.002303,0.052932,0.308832,0.129802
4,0.002303,0.052932,0.308832,0.370049
...,...,...,...,...
95,0.002303,0.052932,0.308832,0.069164
96,0.002303,0.052932,0.308832,0.084895
97,0.002303,0.052932,0.308832,0.080778
98,0.002303,0.052932,0.308832,0.055536


In [5]:
cv_rezultati = pd.read_csv("train_test_rezultati_sum.csv")
train_test_rezultati = pd.DataFrame(train_test_rezultati)[['DesisionTreeRegressor_mse',
                                                           'SVR_mse',
                                                           'KNeighborsRegressor_mse',
                                                           'MLPRegressor_mse']].agg(['mean', 'var']).round(5)
train_test_rezultati

Unnamed: 0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
mean,0.0023,0.05293,0.30883,0.15395
var,0.0,0.0,0.0,0.00645


# Cross Validation

In [None]:
n_repeats = 100
n_splits = 5
cv_rez = {}
cv_rez['n_repeats'] = []
cv_rez['n_split'] = []

for i in range(n_repeats):
    
    kf = KFold(n_splits=n_splits, random_state=42+i, shuffle=True)
    for j, (train_index, test_index) in enumerate(kf.split(df)):
        cv_rez['n_repeats'].append(i)
        cv_rez['n_split'].append(j)

        # train and test split
        X_train = df.iloc[train_index, :]
        y_train = y.iloc[train_index]
        X_test = df.iloc[test_index, :]
        y_test = y.iloc[test_index]

        for k in models:
            # fit
            k.fit(X_train, y_train)
            #predict
            y_pred = k.predict(X_test)
            
            # score - Mean Squared Error
            score = mean_squared_error(y_pred, y_test)

            col_name = f"Model_{k}"
            
            if col_name not in cv_rez:
                cv_rez[col_name] = []

            cv_rez[col_name].append(score)
            

In [None]:
cv_rezultati = pd.DataFrame(cv_rez)

column_names = {cv_rezultati.columns[0]: 'n_repeats',
                cv_rezultati.columns[1]: 'n_split',
                cv_rezultati.columns[2]: 'DesisionTreeRegressor_mse',
                cv_rezultati.columns[3]: 'SVR_mse',
                cv_rezultati.columns[4]: 'KNeighborsRegressor_mse',
                cv_rezultati.columns[5]: 'MLPRegressor_mse'}

cv_rezultati = cv_rezultati.rename(columns=column_names)
cv_rezultati.to_csv('cv_rezultati_sum.csv', index=False)
cv_rezultati

In [6]:
cv_rezultati = pd.read_csv("cv_rezultati_sum.csv")
cv_rezultati

Unnamed: 0,n_repeats,n_split,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
0,0,0,0.536630,0.310115,0.341660,1.237374
1,0,1,0.523731,0.326053,0.494702,0.636685
2,0,2,0.542332,0.387452,0.440100,1.244355
3,0,3,0.502751,0.322201,0.495175,0.657435
4,0,4,0.603925,0.417710,0.510620,0.529149
...,...,...,...,...,...,...
495,99,0,0.536630,0.310115,0.341660,1.660422
496,99,1,0.523731,0.326053,0.494702,0.532002
497,99,2,0.556222,0.387452,0.440100,6.353344
498,99,3,0.502751,0.322201,0.495175,0.638659


In [7]:
cv_rezultati = pd.DataFrame(cv_rezultati)[['n_repeats',
                                           'DesisionTreeRegressor_mse', 
                                           'SVR_mse',
                                           'KNeighborsRegressor_mse',
                                           'MLPRegressor_mse']].groupby('n_repeats').agg(['mean'])

cv_rezultati

Unnamed: 0_level_0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
Unnamed: 0_level_1,mean,mean,mean,mean
n_repeats,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0.541874,0.352706,0.456452,0.861000
1,0.541742,0.352706,0.456452,2.181460
2,0.539563,0.352706,0.456452,0.723559
3,0.540806,0.352706,0.456452,1.094019
4,0.541742,0.352706,0.456452,0.910204
...,...,...,...,...
95,0.543116,0.352706,0.456452,0.593525
96,0.539563,0.352706,0.456452,0.830264
97,0.543248,0.352706,0.456452,2.782559
98,0.539563,0.352706,0.456452,0.876196


In [8]:
cv_rezultati = pd.read_csv("cv_rezultati_sum.csv")
cv_rezultati = pd.DataFrame(cv_rezultati)[['n_repeats',
                                           'DesisionTreeRegressor_mse', 
                                           'SVR_mse',
                                           'KNeighborsRegressor_mse',
                                           'MLPRegressor_mse']].groupby('n_repeats').agg(['mean']).agg(['mean', 'var']).round(7)

cv_rezultati

Unnamed: 0_level_0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
Unnamed: 0_level_1,mean,mean,mean,mean
mean,0.542034,0.352706,0.456452,1.223178
var,3e-06,0.0,0.0,0.574562


# Bootstrap

In [None]:
repeat = 10
n_bootstrap_samples = 3
bootstrap_rez = {}
bootstrap_rez['repeat'] = []
bootstrap_rez['n_bootstrap_samples'] = []

for i in range(repeat):
    X_train = resample(df, replace=True)
    y_train = y.loc[X_train.index]
    test_idx = [idx for idx in df.index if idx not in X_train.index]
    X_test = df.loc[test_idx, :]
    y_test = y.loc[test_idx]

    for j in range(n_bootstrap_samples):
        bootstrap_rez['repeat'].append(i)
        bootstrap_rez['n_bootstrap_samples'].append(j)

        for k in models:
            # fit
            k.fit(X_train, y_train)

            #prediction
            y_pred = k.predict(X_test)

            # score - Mean Squared Error
            score = mean_squared_error(y_pred, y_test)

            col_name = f"Model_{k}"
            if col_name not in bootstrap_rez:
                bootstrap_rez[col_name] = []
            bootstrap_rez[col_name].append(score)

In [None]:
bootstrap_rezultati = pd.DataFrame(bootstrap_rez)

column_names = {bootstrap_rezultati.columns[0]: 'n_repeats',
                bootstrap_rezultati.columns[1]: 'n_bootstrap_samples',
                bootstrap_rezultati.columns[2]: 'DesisionTreeRegressor_mse',
                bootstrap_rezultati.columns[3]: 'SVR_mse',
                bootstrap_rezultati.columns[4]: 'KNeighborsRegressor_mse',
                bootstrap_rezultati.columns[5]: 'MLPRegressor_mse'}

bootstrap_rezultati = bootstrap_rezultati.rename(columns=column_names)
bootstrap_rezultati.to_csv('bootstrap_rezultati_sum.csv', index=False)
bootstrap_rezultati

In [9]:
bootstrap_rezultati = pd.read_csv("bootstrap_rezultati_sum.csv")
bootstrap_rezultati

Unnamed: 0,n_repeats,n_bootstrap_samples,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
0,0,0,0.550473,0.386886,0.496479,0.629764
1,0,1,0.547831,0.386886,0.496479,0.874631
2,0,2,0.552259,0.386886,0.496479,0.622173
3,1,0,0.560899,0.359298,0.507576,0.724059
4,1,1,0.557535,0.359298,0.507576,0.796448
5,1,2,0.560899,0.359298,0.507576,0.790171
6,2,0,0.53285,0.387681,0.492842,4.467721
7,2,1,0.53285,0.387681,0.492842,7.985326
8,2,2,0.53285,0.387681,0.492842,24.114
9,3,0,0.564781,0.394853,0.569298,1.601676


In [10]:
bootstrap_rezultati = pd.DataFrame(bootstrap_rezultati)[['n_repeats',
                                                         'DesisionTreeRegressor_mse',
                                                         'SVR_mse',
                                                         'KNeighborsRegressor_mse',
                                                         'MLPRegressor_mse']].groupby('n_repeats').agg(['mean']).agg(['mean', 'var']).round(7)

bootstrap_rezultati

Unnamed: 0_level_0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
Unnamed: 0_level_1,mean,mean,mean,mean
mean,0.561388,0.374747,0.529831,2.147538
var,0.001134,0.000796,0.000879,12.734953


# Leave one out (LOOCV)

In [None]:
df = df.sample(n = 100, random_state = 42)
repeat = 10
loocv_rez = {}
loocv_rez['repeat'] = []
loocv_rez['indeks'] = []

for i in range(repeat):

    loo = LeaveOneOut()
    for j, (train_index, test_index) in enumerate(loo.split(df)):
        loocv_rez['repeat'].append(i)
        loocv_rez['indeks'].append(j)
        
        # train and test split
        X_train = df.iloc[train_index, :]
        y_train = y.iloc[train_index]
        X_test = df.iloc[test_index, :]
        y_test = y.iloc[test_index]

        for k in models:
            # fit
            k.fit(X_train, y_train)

            #predict
            y_pred = k.predict(X_test)
            
            # score - Mean Squared Error
            score = mean_squared_error(y_pred, y_test)

            col_name = f"Model_{k}"
            
            if col_name not in loocv_rez:
                loocv_rez[col_name] = []
            loocv_rez[col_name].append(score)


In [None]:
loocv_rezultati = pd.DataFrame(loocv_rez)

column_names = {loocv_rezultati.columns[0]: 'n_repeats',
                loocv_rezultati.columns[1]: 'indeks',
                loocv_rezultati.columns[2]: 'DesisionTreeRegressor_mse',
                loocv_rezultati.columns[3]: 'SVR_mse',
                loocv_rezultati.columns[4]: 'KNeighborsRegressor_mse',
                loocv_rezultati.columns[5]: 'MLPRegressor_mse'}

loocv_rezultati = loocv_rezultati.rename(columns=column_names)
loocv_rezultati.to_csv('loocv_rezultati_sum.csv', index=False)
loocv_rezultati

In [11]:
loocv_rezultati = pd.read_csv("loocv_rezultati_sum.csv")
loocv_rezultati

Unnamed: 0,n_repeats,indeks,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
0,0,0,0.600970,0.875778,0.938502,0.000018
1,0,1,0.000872,0.245061,0.226098,0.000151
2,0,2,0.127805,0.001407,0.001979,2.163419
3,0,3,0.542975,0.874434,1.051786,3.087933
4,0,4,0.369973,0.210683,0.638640,30.331078
...,...,...,...,...,...,...
995,9,95,1.060842,0.904349,1.305105,0.100728
996,9,96,2.976541,1.139645,1.207944,4.047036
997,9,97,0.945690,0.081326,0.317523,0.044571
998,9,98,0.007565,0.813013,0.408227,4.384282


In [12]:
loocv_rezultati = pd.DataFrame(loocv_rezultati)[['n_repeats',
                                                 'DesisionTreeRegressor_mse',
                                                 'SVR_mse',
                                                 'KNeighborsRegressor_mse',
                                                 'MLPRegressor_mse']].groupby('n_repeats').agg(['mean']).agg(['mean', 'var']).round(7)

loocv_rezultati

Unnamed: 0_level_0,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
Unnamed: 0_level_1,mean,mean,mean,mean
mean,0.857505,0.727768,0.730993,2.220602
var,0.000118,0.0,0.0,0.08212
