In [2]:
# %matplotlib inline

import pandas as pd
import numpy as np
import time
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import DistanceMetric
from sklearn_extra.cluster import KMedoids
import matplotlib.pyplot as plt
import statistics
import pyclustering
from pyclustering.cluster.kmedoids import kmedoids
import gower
import seaborn as sns
import pyclustering
from sklearn.metrics.pairwise import pairwise_distances
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.kmedoids import kmedoids

# Read dataset

In [3]:
df = pd.read_excel("artificial_dataset.xlsx")
df

Unnamed: 0,alpha,beta,gamma,delta,label
0,B,C,381.513614,0.229575,0
1,B,E,348.888315,0.360779,0
2,B,E,313.475708,0.720267,0
3,A,D,391.312302,0.623395,0
4,A,D,342.378427,0.506764,1
...,...,...,...,...,...
95,B,E,366.991464,0.408907,0
96,A,D,334.777715,0.751939,0
97,A,D,314.411841,0.619102,0
98,B,E,234.261070,1.061910,0


Below we define which features are categorical and which are numerical. 

In [4]:
#Create lists of categorical and numerical features
cat_features = ['alpha','beta']
num_features = ['gamma','delta'] 

# k-medoids clustering

In [5]:
#This function compute the Gower distance matrix
# input: NxM feature matrix
# output: NxN distance matrix 

def gower_distance(matrix):
    output = gower.gower_matrix(matrix)
    return output

In [74]:
#This function gives a summary of the cluster
# input: NxM feature matrix, dictionary with clusters, number of cluster 
# output: summary of given cluster number 

def summarize_cluster(matrix,clusters,cluster_number):
    c_indices = clusters[cluster_number]
    matrix_subset = matrix.loc[c_indices,:]
    print('----------------------------------------------------------------------------------------------')
    print("Summary of cluster: " , cluster_number)
    print('Samples in cluster: ' , len(c_indices))
    print('---------------------------------------Features-----------------------------------------------')
    for col in matrix_subset.columns:
        print(col ,'\t', matrix_subset[col].tolist())
    print('----------------------------------------------------------------------------------------------')

Below we define the feature matrix (X) and the lavel vector (y). We also choose the number of clusters k and initialize the first cluster. 

In [84]:
X = df.loc[: , df.columns != 'label']
y = df.loc[: , df.columns == 'label']
k = 5
n = len(X)

initial_medoids = np.sort(np.random.choice(n,k))
kmedoids_instance = kmedoids(gower_distance(X),initial_medoids,data_type='distance_matrix',iter_max=1000)
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()
centers = kmedoids_instance.get_medoids()

# Train ML algorithm 

In [9]:
import xgboost as xgb
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

In [100]:
data = df.loc[: , df.columns != 'label']
label = df.loc[: , df.columns == 'label']

le = preprocessing.LabelEncoder()
for column in cat_features:
    le.fit(data[column])
    data[column] = le.fit_transform(data[column])

X_train, X_test, y_train, y_test = train_test_split(data, np.ravel(label), test_size=0.30, random_state=78) # 70% training and 30% test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column] = le.fit_transform(data[column])


### 1. Logistic Regression 

In [101]:
#train logistic regression model and check performance
clf = LogisticRegression(random_state=100).fit(X_train, y_train)
clf_predictions_train = clf.predict(X_train)
clf_predictions_test = clf.predict(X_test)
clf_predictions_total = clf.predict(data)

p_correct_train = 1 - (abs(clf_predictions_train-y_train).sum() / len(y_train))
p_correct_test = 1 - (abs(clf_predictions_test-y_test).sum() / len(y_test))
p_correct_total = 1 - (abs(clf_predictions_total-np.ravel(label)).sum() / len(np.ravel(label)))

print('correct predictions train set' ,'\t', p_correct_train*100)
print('correct predictions test set' , '\t',  p_correct_test*100)
print('correct predictions total set' , '\t', p_correct_total*100)

correct predictions train set 	 65.71428571428571
correct predictions test set 	 63.33333333333333
correct predictions total set 	 65.0


### 2. XGboost classifier

In [102]:
#train XGboost
xg_clf = xgb.XGBClassifier(max_depth=5, gamma=1)
xg_clf.fit(X_train, y_train)

xg_predictions_train = xg_clf.predict(X_train)
xg_predictions_test = xg_clf.predict(X_test)
xg_predictions_total = xg_clf.predict(data)

p_correct_train = 1 - (abs(xg_predictions_train-y_train).sum() / len(y_train))
p_correct_test = 1 - (abs(xg_predictions_test-y_test).sum() / len(y_test))
p_correct_total = 1 - (abs(xg_predictions_total-np.ravel(label)).sum() / len(np.ravel(label)))

print('correct predictions train set' ,'\t', p_correct_train*100)
print('correct predictions test set' , '\t',  p_correct_test*100)
print('correct predictions total set' , '\t', p_correct_total*100)

correct predictions train set 	 84.28571428571429
correct predictions test set 	 56.666666666666664
correct predictions total set 	 76.0




### 3. Support Vector Machine classifier 

In [103]:
from sklearn import svm

svm_clf = svm.SVC()
svm_clf.fit(X_train,y_train)

svm_predictions_train = svm_clf.predict(X_train)
svm_predictions_test = svm_clf.predict(X_test)
svm_predictions_total = svm_clf.predict(data)

p_correct_train = 1 - (abs(svm_predictions_train-y_train).sum() / len(y_train))
p_correct_test = 1 - (abs(svm_predictions_test-y_test).sum() / len(y_test))
p_correct_total = 1 - (abs(svm_predictions_total-np.ravel(label)).sum() / len(np.ravel(label)))

print('correct predictions train set' ,'\t', p_correct_train*100)
print('correct predictions test set' , '\t',  p_correct_test*100)
print('correct predictions total set' , '\t', p_correct_total*100)

correct predictions train set 	 65.71428571428571
correct predictions test set 	 63.33333333333333
correct predictions total set 	 65.0


# Validation

For each cluster we will compute the percentage correct predicted labels.

In [104]:
#input: array with predicted labels, array with true labels
#output: percentage correct predicted labels 

def correct_predicted(predicted_labels,true_labels):
    output = (1 - (abs(predicted_labels-true_labels).sum() / len(true_labels))) * 100
    return output

In [105]:
#summarize clusters
for i in np.arange(k):
    summarize_cluster(X,clusters,i)

----------------------------------------------------------------------------------------------
Summary of cluster:  0
Samples in cluster:  17
---------------------------------------Features-----------------------------------------------
alpha 	 ['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B']
beta 	 ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D']
gamma 	 [316.4427812731012, 438.6389583103979, 363.5354911387662, 258.1767777232494, 401.3809827383549, 377.9564017133732, 286.3695336330434, 398.6828607234717, 387.2588567343811, 309.0108283148952, 369.9955956928401, 285.5834653587883, 355.1922344420338, 368.8696890709799, 261.6543443069412, 331.1820581715405, 394.8745207400268]
delta 	 [0.389054482289937, 0.4677094006869512, 0.07033947845761684, 0.5116565728780655, 0.8182538768899543, 0.7506398299475989, 0.288842717051486, 0.439229424737304, 0.6637188521557412, 0.71574559680886, 0.7450268891793663, 0.3217983488176098,

### 1. Logistic Regression 

In [106]:
for i in np.arange(k):
    indices = clusters[i]
    subset_X = data.loc[indices,:]
    subset_y = label.loc[indices,:]
    clf_predictions_cluster = clf.predict(subset_X)
    print('Cluster ', i , '\t', correct_predicted(clf_predictions_cluster,np.ravel(subset_y)))

Cluster  0 	 70.58823529411764
Cluster  1 	 50.0
Cluster  2 	 57.14285714285714
Cluster  3 	 75.0
Cluster  4 	 65.38461538461539


### 2. XGboost classifier 

In [107]:
for i in np.arange(k):
    indices = clusters[i]
    subset_X = data.loc[indices,:]
    subset_y = label.loc[indices,:]
    xg_predictions_cluster = xg_clf.predict(subset_X)
    print('Cluster ', i , '\t', correct_predicted(xg_predictions_cluster,np.ravel(subset_y)))

Cluster  0 	 82.35294117647058
Cluster  1 	 91.66666666666666
Cluster  2 	 66.66666666666667
Cluster  3 	 83.33333333333334
Cluster  4 	 65.38461538461539


### 3. SVM classifier 

In [108]:
for i in np.arange(k):
    indices = clusters[i]
    subset_X = data.loc[indices,:]
    subset_y = label.loc[indices,:]
    svm_predictions_cluster = svm_clf.predict(subset_X)
    print('Cluster ', i , '\t', correct_predicted(svm_predictions_cluster,np.ravel(subset_y)))

Cluster  0 	 70.58823529411764
Cluster  1 	 50.0
Cluster  2 	 57.14285714285714
Cluster  3 	 75.0
Cluster  4 	 65.38461538461539
