In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import math

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import (ColumnTransformer, make_column_selector as selector)
from sklearn.model_selection import (KFold, LeaveOneOut)
from sklearn.utils import resample
from sklearn.metrics import (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, 
                             r2_score, 
                             recall_score, accuracy_score, roc_auc_score)

In [52]:
df = pd.read_csv("data60.csv", low_memory=False)

# replace ? for NAN
df = df.replace('?', np.nan)

# Delete rows with missing values
df = df.dropna()
df = df.astype(float)

df = df.sample(n = 20, random_state = 42)

target_columns = df.iloc[:,-33:].columns
y = df.loc[:, target_columns].astype(float).sum(axis=1)
df.drop(columns=target_columns, inplace=True)

In [48]:
# transform data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

transform = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category"))
    ]
)

dtr = Pipeline(
    steps = [
        ('model', DecisionTreeRegressor(max_depth = 5, min_samples_split = 20, min_samples_leaf = 10))
    ]
)

svr = Pipeline(
    steps = [
        ('transform', transform),
        ('model', SVR(kernel = 'rbf', epsilon = 0.01))
    ]
)

knr = Pipeline(
    steps = [
        ('transform', transform),
        ('model', KNeighborsRegressor(n_neighbors = 10, p = 1, weights = "distance"))
    ]
)

mlp = Pipeline(
    steps = [
        ('transform', transform),
        ('model', MLPRegressor(hidden_layer_sizes=(10,10), alpha = 0.01,max_iter = 300))
    ]
)

models = [dtr, svr, knr, mlp]

In [51]:
df = df.sample(n = 20, random_state = 42)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [53]:
n_repeats = 2
n_splits = 2
cv_rez = {}
cv_rez['n_repeats'] = []
cv_rez['n_split'] = []

for i in range(n_repeats):
    
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)
    for j, (train_index, test_index) in enumerate(kf.split(df)):
        cv_rez['n_repeats'].append(i)
        cv_rez['n_split'].append(j)

        # train and test split
        X_train = df.iloc[train_index, :]
        y_train = y.iloc[train_index]
        X_test = df.iloc[test_index, :]
        y_test = y.iloc[test_index]

        for k in models:
            # fit
            k.fit(X_train, y_train)

            #predict
            y_pred = k.predict(X_test)
            
            # score - Mean Squared Error
            score = mean_squared_error(y_pred, y_test)

            col_name = f"Model_{k}"
            
            if col_name not in cv_rez:
                cv_rez[col_name] = []

            cv_rez[col_name].append(score)



In [55]:
cv_rezultati = pd.DataFrame(cv_rez)

column_names = {cv_rezultati.columns[0]: 'n_repeats',
                cv_rezultati.columns[1]: 'n_split',
                cv_rezultati.columns[2]: 'DesisionTreeRegressor_mse',
                cv_rezultati.columns[3]: 'SVR_mse',
                cv_rezultati.columns[4]: 'KNeighborsRegressor_mse',
                cv_rezultati.columns[5]: 'MLPRegressor_mse'}

cv_rezultati = cv_rezultati.rename(columns=column_names)
#cv_rezultati.to_csv('cv_rezultati.csv', index=False)
cv_rezultati

Unnamed: 0,n_repeats,n_split,DesisionTreeRegressor_mse,SVR_mse,KNeighborsRegressor_mse,MLPRegressor_mse
0,0,0,0.793807,0.870161,0.762588,2205.601424
1,0,1,0.876868,0.97347,0.835048,155.585669
2,1,0,0.793807,0.870161,0.762588,3479.195018
3,1,1,0.876868,0.97347,0.835048,161.629699
