In [33]:
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
import xgboost
from scipy.spatial.distance import pdist, squareform
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import os

K = 10

In [34]:
file_dir = os.path.dirname(os.path.realpath('__file__'))

data = pd.read_csv(os.path.join(file_dir, 'data\\iris.csv'), delimiter=',', header=0)
data = data.drop('variety', axis=1)

data.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [35]:
distances = pdist(data.values, metric='euclidean')
dist_matrix = squareform(distances)

rep = []
for row in dist_matrix:
    mean_dist = np.mean(np.sort(row)[K])

    represent = 1 / (1 + mean_dist)

    rep.append(represent)

data['rep'] = rep
data.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,rep
0,5.1,3.5,1.4,0.2,0.769231
1,4.9,3.0,1.4,0.2,0.750941
2,4.7,3.2,1.3,0.2,0.759747
3,4.6,3.1,1.5,0.2,0.769231
4,5.0,3.6,1.4,0.2,0.779519


In [36]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# split the dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

### Model Selection

In [37]:
# Using the models with default parameters

svr = SVR()
enet = ElasticNet()
xgb = xgboost.XGBRegressor()

svr.fit(X_train, y_train)
enet.fit(X_train, y_train)
xgb.fit(X_train, y_train)

pred_svr = svr.predict(X_test)
pred_enet = enet.predict(X_test)
pred_xgb = xgb.predict(X_test)

mse_svr = mean_squared_error(y_test, pred_svr)
mse_enet = mean_squared_error(y_test, pred_enet)
mse_xgb = mean_squared_error(y_test, pred_xgb)

print("Support Vector Machine MSE: ", mse_svr, "\n Elastic Net MSE: ", mse_enet, "\n XGBoost MSE: ", mse_xgb)

Support Vector Machine MSE:  0.0027735417341868667 
 Elastic Net MSE:  0.004542296300686732 
 XGBoost MSE:  0.001602397165373108


## XGBoost achieves best results, therefore I will use it in the final API.
I will continue using default parameters, beacuse the model already achieves satysfying performance.

### Optimizing hyper-parameters

In [38]:
#In the algorithm, there are 2 external parameters: K and L. I will find the optimal value of these parameters.

def calculate_mse(data):
    xgb = xgboost.XGBRegressor()
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(data.iloc[:, :-1], data.iloc[:,-1], test_size=0.1, random_state=0)
    xgb.fit(X_train_split, y_train_split)

    pred_xgb = xgb.predict(X_test)

    return mean_squared_error(y_test, pred_xgb)

min_mse = 1
for K in range(1,12):
    for L in range(1,10):
        shuffled = X_train.sample(frac=1)
        data_split = np.array_split(shuffled, L)

        for split in data_split :
            distances = pdist(split.values, metric='euclidean')
            dist_matrix = squareform(distances)

            rep = []
            for row in dist_matrix:
                mean_dist = np.mean(np.sort(row)[K])

                represent = 1 / (1 + mean_dist)

                rep.append(represent)

            split['rep'] = rep


            mse_split = calculate_mse(data=split)

            if mse_split < min_mse :
                min_mse = mse_split
                ans = {'K':K, 'L':L, "min_mse":min_mse}

ans

{'K': 5, 'L': 2, 'min_mse': 0.0021318575960917324}

## {'K': 5, 'L': 2, 'min_mse': 0.0021318575960917324} are the optimal parameters