In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import math

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import (ColumnTransformer, make_column_selector as selector)
from sklearn.model_selection import (RepeatedKFold, LeaveOneOut)
from sklearn.utils import resample
from sklearn.metrics import (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, 
                             r2_score, 
                             recall_score, accuracy_score, roc_auc_score)

In [2]:
df = pd.read_csv("data60.csv", low_memory=False)

# replace ? for NAN
df = df.replace('?', np.nan)

# Delete rows with missing values
df = df.dropna()
df = df.astype(float)

df = df.sample(n = 10000, random_state = 42)

target_columns = df.iloc[:,-33:].columns
y = df.loc[:, target_columns].astype(float).sum(axis=1)
df.drop(columns=target_columns, inplace=True)

In [54]:
df2 = df
df2['sum'] = y
#df.to_csv('data_sum.csv', index = False)

In [61]:
# transform data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

transform = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category"))
    ]
)

dtr = Pipeline(
    steps = [
        ('model', DecisionTreeRegressor(max_depth = 5, min_samples_split = 20, min_samples_leaf = 10))
    ]
)

svr = Pipeline(
    steps = [
        ('transform', transform),
        ('model', SVR(kernel = 'rbf', epsilon = 0.01))
    ]
)

knr = Pipeline(
    steps = [
        ('transform', transform),
        ('model', KNeighborsRegressor(n_neighbors = 10, p = 1, weights = "distance"))
    ]
)

mlp = Pipeline(
    steps = [
        ('transform', transform),
        ('model', MLPRegressor(hidden_layer_sizes=(10,10), alpha = 0.01,max_iter = 200))
    ]
)

models = [dtr, svr, knr, mlp]

# Train/Test split

In [62]:
n_iter = 100
train_test_rez = {}
for j in range(n_iter):
    # train test data
    X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.7, random_state=100)

    for i in models:
        # fit
        i.fit(X_train, y_train)

        #prediction
        y_pred = i.predict(X_test)

        # score - Mean Squared Error
        score = mean_squared_error(y_pred, y_test)

        col_name = f"Model_{i}"
        
        if col_name not in train_test_rez:
            train_test_rez[col_name] = []

        train_test_rez[col_name].append(score)



In [48]:
train_test_rezultati = pd.DataFrame(train_test_rez)
#train_test_rezultati.to_csv('train_test_rezultati.csv', index=False)

column_names = {train_test_rezultati.columns[0]: 'DesisionTreeRegressor_mse',
                train_test_rezultati.columns[1]: 'SVR_mse',
                train_test_rezultati.columns[2]: 'KNeighborsRegressor_mse',
                train_test_rezultati.columns[3]: 'MLPRegressor_mse'}

train_test_rezultati = train_test_rezultati.rename(columns=column_names)
train_test_rezultati.to_csv('train_test_rezultati.csv', index=False)
train_test_rezultati

Unnamed: 0,DesisionTreeRegressor_r2,SVR_r2,KNeighborsRegressor_r2,MLPRegressor_r2
0,0.236559,0.648663,0.376528,0.544283
1,0.236559,0.648663,0.376528,0.617840
2,0.236398,0.648663,0.376528,0.560547
3,0.236559,0.648663,0.376528,0.563959
4,0.236559,0.648663,0.376528,0.618803
...,...,...,...,...
95,0.236398,0.648663,0.376528,0.599428
96,0.236559,0.648663,0.376528,0.523390
97,0.236398,0.648663,0.376528,0.551323
98,0.236559,0.648663,0.376528,0.552497


In [49]:
train_test_rezultati = pd.DataFrame(train_test_rezultati)[['DesisionTreeRegressor_mse', 
                                                       'SVR_mse',
                                                       'KNeighborsRegressor_mse',
                                                       'MLPRegressor_mse']].agg(['mean', 'var']).round(5)
train_test_rezultati

Unnamed: 0,DesisionTreeRegressor_r2,SVR_r2,KNeighborsRegressor_r2,MLPRegressor_r2
mean,0.2365,0.64866,0.37653,0.57862
var,0.0,0.0,0.0,0.00265


# Cross Validation

In [None]:
n_repeats = 100
n_splits = 10
cv_rez = {}
cv_rez['n_repeats'] = [i for spl in [[j+1] * n_splits for j in range(n_repeats)] for i in spl]

rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)
for j, (train_index, test_index) in enumerate(rkf.split(df)):
    # train and test split
    X_train = df.iloc[train_index, :]
    y_train = y.iloc[train_index]
    X_test = df.iloc[test_index, :]
    y_test = y.iloc[test_index]

    for i in models:
        # fit
        i.fit(X_train, y_train)

        #predict
        y_pred = i.predict(X_test)
        
        # score - Mean Squared Error
        score = mean_squared_error(y_pred, y_test)

        col_name = f"Model_{i}"
        
        if col_name not in cv_rez:
            cv_rez[col_name] = []

        cv_rez[col_name].append(score)
        

In [None]:
cv_rezultati = pd.DataFrame(cv_rez)

column_names = {cv_rezultati.columns[0]: 'n_repeats',
                cv_rezultati.columns[1]: 'DesisionTreeRegressor_mse',
                cv_rezultati.columns[2]: 'SVR_mse',
                cv_rezultati.columns[3]: 'KNeighborsRegressor_mse',
                cv_rezultati.columns[4]: 'MLPRegressor_mse'}

cv_rezultati = cv_rezultati.rename(columns=column_names)
cv_rezultati.to_csv('cv_rezultati.csv', index=False)
cv_rezultati

In [None]:
cv_rezultati = pd.DataFrame(cv_rezultati)[['n_repeats',
                                           'DesisionTreeRegressor_mse', 
                                           'SVR_mse',
                                           'KNeighborsRegressor_mse',
                                           'MLPRegressor_mse']].groupby('n_repeats').agg(['mean']).agg(['mean', 'var']).round(5)

cv_rezultati

# Bootstrap