# Gautam Phadke
# USC ID: 2661440757

Import all the essential Python libraries. 

In [1]:
#Import essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
warnings.filterwarnings("ignore", category=ConvergenceWarning)
%matplotlib inline

Upload the Anuran Calls dataset from UCI repository

In [2]:
data_frogs = pd.read_csv('https://github.com/GautamSPhadke/FrogsDataset/blob/master/Frogs_MFCCs.csv?raw=true')

Drop the RecordID column as it won't be needed for analysis

In [3]:
data_frogs = data_frogs.drop(columns = ['RecordID'])

Display first few rows of the dataset

In [4]:
data_frogs.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,0.012022,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,0.083536,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,...,0.062837,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre


Dimensions of the dataset

In [5]:
data_frogs.shape

(7195, 25)

## Choose 70% of data randomly as the training set

In [6]:
label_family = data_frogs['Family']
label_genus = data_frogs['Genus']
label_species = data_frogs['Species']

In [7]:
data_frogs_features = data_frogs.drop(columns = ['Family','Genus','Species'])
label_frogs = pd.concat([label_family, label_genus, label_species], axis = 1)

Use the train_test_split function to split the data into training set and test set. 'stratify = label_frogs' indicates that the class distribution in both training and test sets is almost equal. 

In [8]:
data_frog_features_train, data_frog_features_test, label_frogs_train, label_frogs_test = train_test_split(data_frogs_features,
                                                                                                         label_frogs,
                                                                                                         test_size = 0.3,
                                                                                                         random_state = 42,
                                                                                                         stratify = label_frogs)

In [9]:
data_frog_features_train.shape, data_frog_features_test.shape, label_frogs_train.shape, label_frogs_test.shape

((5036, 22), (2159, 22), (5036, 3), (2159, 3))

In [10]:
label_family_train = label_frogs_train['Family']
label_family_test = label_frogs_test['Family']
label_genus_train = label_frogs_train['Genus']
label_genus_test = label_frogs_test['Genus']
label_species_train = label_frogs_train['Species']
label_species_test = label_frogs_test['Species']

##  Exact match and hamming score/ loss

1. Exact match: For multi-label classification, the exact match/ accuracy score is defined as follows: If the entire set of predicted labels for a sample strictly matches with the true set of labels, then the subset accuracy is 1.0; otherwise it is 0.0. If $ \hat{y}_i $ is the predicted value of $ i $-th sample and $ y_i $ is the corresponding true value, then the fraction of correct predictions over $ n_\text{samples} $ is defined as:

\begin{align}
\texttt{accuracy}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples}-1} 1(\hat{y}_i = y_i)
\end{align}

2. Hamming loss: If $\hat{y}_j$ is the predicted value for the $j$-th label of a given sample, $y_j$ is the corresponding true value, and $n_\text{labels} $ is the number of classes or labels, then the Hamming loss between two samples is defined as:

\begin{align}
L_{Hamming}(y, \hat{y}) = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} 1(\hat{y}_j \not= y_j)
\end{align}

where, $ 1(\hat{y}_j \not= y_j)$ represents the indicator function.
##### Reference: Sckit learn model evaluation docs

## Train SVM using Gaussian kernel and one vs all classifier

A function for Gaussian SVM Kernel is defined. Stratified K Fold cross-validation is used for the purpose of selecting best set of hyperparameters. 

In [11]:
def svm_gaussian_kernel(data_train, label_train):
    gamma_values = np.logspace(-6,6, num = 10)
    c_values = np.logspace(-2,7, num = 10)
    best_score = 0
    best_c = 0
    best_gamma = 0
    
    skf = StratifiedKFold(n_splits = 10, random_state = 42)
    for gamma_val in gamma_values:
        for c_val in c_values:
            gaussian_svc = SVC(kernel = 'rbf', C = c_val, gamma = gamma_val, decision_function_shape= 'ovr')
            score = cross_val_score(gaussian_svc, data_train, label_train, cv = skf)
            score = score.mean()
            if(score > best_score):
                best_score = score
                best_c = c_val
                best_gamma = gamma_val
            print("c =", c_val, ', gamma =', gamma_val)
            print("Score values =", score)
            print()
    
    return best_c, best_gamma                

Return the most optimal 'c' and 'gamma' values for gaussian kernel

In [12]:
best_c_family, best_gamma_family = svm_gaussian_kernel(data_frog_features_train, label_family_train)

c = 0.01 , gamma = 1e-06
Score values = 0.6141802418989304

c = 0.1 , gamma = 1e-06
Score values = 0.6141802418989304

c = 1.0 , gamma = 1e-06
Score values = 0.6141802418989304

c = 10.0 , gamma = 1e-06
Score values = 0.6141802418989304

c = 100.0 , gamma = 1e-06
Score values = 0.6141802418989304

c = 1000.0 , gamma = 1e-06
Score values = 0.6245080193760528

c = 10000.0 , gamma = 1e-06
Score values = 0.8679475175270974

c = 100000.0 , gamma = 1e-06
Score values = 0.9271233448386793

c = 1000000.0 , gamma = 1e-06
Score values = 0.9398297330150456

c = 10000000.0 , gamma = 1e-06
Score values = 0.940822198832981

c = 0.01 , gamma = 2.1544346900318823e-05
Score values = 0.6141802418989304

c = 0.1 , gamma = 2.1544346900318823e-05
Score values = 0.6141802418989304

c = 1.0 , gamma = 2.1544346900318823e-05
Score values = 0.6141802418989304

c = 10.0 , gamma = 2.1544346900318823e-05
Score values = 0.6141802418989304

c = 100.0 , gamma = 2.1544346900318823e-05
Score values = 0.7297355255487751

### SVM Hyperparameters for label family

In [13]:
print("Optimal Weight of SVM penalty for family:", best_c_family)
print("Optimal width of Gaussian kernel for family:", best_gamma_family)

Optimal Weight of SVM penalty for family: 10.0
Optimal width of Gaussian kernel for family: 4.641588833612772


In [14]:
best_c_genus, best_gamma_genus = svm_gaussian_kernel(data_frog_features_train, label_genus_train)

c = 0.01 , gamma = 1e-06
Score values = 0.5766567476294664

c = 0.1 , gamma = 1e-06
Score values = 0.5766567476294664

c = 1.0 , gamma = 1e-06
Score values = 0.5766567476294664

c = 10.0 , gamma = 1e-06
Score values = 0.5766567476294664

c = 100.0 , gamma = 1e-06
Score values = 0.5766567476294664

c = 1000.0 , gamma = 1e-06
Score values = 0.5768543760879644

c = 10000.0 , gamma = 1e-06
Score values = 0.8071949885543631

c = 100000.0 , gamma = 1e-06
Score values = 0.9304984451275692

c = 1000000.0 , gamma = 1e-06
Score values = 0.9559040702326087

c = 10000000.0 , gamma = 1e-06
Score values = 0.9630426914193482

c = 0.01 , gamma = 2.1544346900318823e-05
Score values = 0.5766567476294664

c = 0.1 , gamma = 2.1544346900318823e-05
Score values = 0.5766567476294664

c = 1.0 , gamma = 2.1544346900318823e-05
Score values = 0.5766567476294664

c = 10.0 , gamma = 2.1544346900318823e-05
Score values = 0.5766567476294664

c = 100.0 , gamma = 2.1544346900318823e-05
Score values = 0.652902461617510

### SVM Hyperparameters for label Genus

In [15]:
print("Optimal Weight of SVM penalty for genus:", best_c_genus)
print("Optimal width of Gaussian kernel for genus:", best_gamma_genus)

Optimal Weight of SVM penalty for genus: 10.0
Optimal width of Gaussian kernel for genus: 4.641588833612772


In [16]:
best_c_species, best_gamma_species = svm_gaussian_kernel(data_frog_features_train, label_species_train)

c = 0.01 , gamma = 1e-06
Score values = 0.483326702841924

c = 0.1 , gamma = 1e-06
Score values = 0.483326702841924

c = 1.0 , gamma = 1e-06
Score values = 0.483326702841924

c = 10.0 , gamma = 1e-06
Score values = 0.483326702841924

c = 100.0 , gamma = 1e-06
Score values = 0.483326702841924

c = 1000.0 , gamma = 1e-06
Score values = 0.5182812022536518

c = 10000.0 , gamma = 1e-06
Score values = 0.8443280799574755

c = 100000.0 , gamma = 1e-06
Score values = 0.9414251962705157

c = 1000000.0 , gamma = 1e-06
Score values = 0.9714011244871801

c = 10000000.0 , gamma = 1e-06
Score values = 0.975780087147841

c = 0.01 , gamma = 2.1544346900318823e-05
Score values = 0.483326702841924

c = 0.1 , gamma = 2.1544346900318823e-05
Score values = 0.483326702841924

c = 1.0 , gamma = 2.1544346900318823e-05
Score values = 0.483326702841924

c = 10.0 , gamma = 2.1544346900318823e-05
Score values = 0.483326702841924

c = 100.0 , gamma = 2.1544346900318823e-05
Score values = 0.6330461620884276

c = 100

### SVM Hyperparameters for label species

In [17]:
print("Optimal Weight of SVM penalty for species:", best_c_species)
print("Optimal width of Gaussian kernel for species:", best_gamma_species)

Optimal Weight of SVM penalty for species: 1000.0
Optimal width of Gaussian kernel for species: 0.21544346900318823


Retrain the model using best set of hyperparameters again, this time on the entire training dataset

In [18]:
svc_gaussian_family = SVC(kernel = 'rbf', C = best_c_family, 
                          gamma = best_gamma_family, decision_function_shape= 'ovr')
svc_gaussian_genus = SVC(kernel = 'rbf', C = best_c_genus, 
                          gamma = best_gamma_genus, decision_function_shape= 'ovr')
svc_gaussian_species = SVC(kernel = 'rbf', C = best_c_species, 
                          gamma = best_gamma_species, decision_function_shape= 'ovr')

In [19]:
svc_gaussian_famiy = svc_gaussian_family.fit(data_frog_features_train, label_family_train)
predicted_family_values = svc_gaussian_family.predict(data_frog_features_test)

svc_gaussian_genus = svc_gaussian_genus.fit(data_frog_features_train, label_genus_train)
predicted_genus_values = svc_gaussian_genus.predict(data_frog_features_test)

svc_gaussian_species = svc_gaussian_species.fit(data_frog_features_train, label_species_train)
predicted_species_values = svc_gaussian_species.predict(data_frog_features_test)

Calculate the Exact match score and the Hamming loss for above model

In [20]:
def exact_match_score(family_real, genus_real, species_real, family_pred, genus_pred, species_pred):
    indicator_value = 0
    for i in range(len(family_real)):
        if((family_real[i] == family_pred[i]) and (genus_real[i] == genus_pred[i]) and (species_real[i] 
                                                                                       == species_pred[i])):
            indicator_value += 1
    
    exact_match = indicator_value/(len(family_real))
    return exact_match

def hamming_loss_score(family_real, genus_real, species_real, family_pred, genus_pred, species_pred):
    hamming_loss_family = hamming_loss(family_real, family_pred)
    hamming_loss_genus = hamming_loss(genus_real, genus_pred)
    hamming_loss_species = hamming_loss(species_real, species_pred)
    
    hamming_loss_value = (hamming_loss_family + hamming_loss_genus + hamming_loss_species)/3
    return hamming_loss_value, 1 - hamming_loss_value

In [21]:
exact_match_svm_gaussian = exact_match_score(np.array(label_family_test), np.array(label_genus_test),
                                            np.array(label_species_test), predicted_family_values,
                                            predicted_genus_values, predicted_species_values)

### Value of Exact match for Gaussian SVM 

In [22]:
print ("Gaussian SVM exact match value:", exact_match_svm_gaussian)

Gaussian SVM exact match value: 0.9759147753589624


In [23]:
hamming_loss_svm_gaussian, hamming_score_svm_gaussian = hamming_loss_score(np.array(label_family_test), np.array(label_genus_test),
                                            np.array(label_species_test), predicted_family_values,
                                            predicted_genus_values, predicted_species_values)

### Value of Hamming loss for Gaussian SVM

In [24]:
print("Gaussian SVM hamming loss value:",hamming_loss_svm_gaussian)

Gaussian SVM hamming loss value: 0.012351397251814111


##  L1 penalized SVM

Use the 'StandardScaler()' function to standardize the features. 

In [25]:
## Standardize the features for L1
def standardize_features(data_train, data_test):
    sc = StandardScaler()
    data_train_transform = sc.fit_transform(data_train)
    data_test_transform = sc.transform(data_test)
    
    return data_train_transform, data_test_transform

In [26]:
data_frog_features_train_transform, data_frog_features_test_transform = standardize_features(data_frog_features_train,
                                                                                            data_frog_features_test)

Follow the same process as done for kernelized SVM

In [31]:
def l1_penalized_svm(data_train, label_train):
    best_score = 0
    best_penalty = 0
    penalty_value = np.logspace(-5,5,10)
    skf = StratifiedKFold(n_splits = 10, random_state = 42)
    for pen_val in penalty_value:
        lin_svc = LinearSVC(penalty = 'l1', C = pen_val, dual = False)
        score = cross_val_score(lin_svc, data_train, label_train, cv = skf)
        score = score.mean()
        if(score > best_score):
            best_score = score
            best_penalty = pen_val
        print("Penalty:", pen_val)
        print("score:", score)
        print()
    return best_penalty

In [32]:
pen_value_family = l1_penalized_svm(data_frog_features_train_transform, label_family_train)

Penalty: 1e-05
score: 0.009529775019086747

Penalty: 0.0001291549665014884
score: 0.07525759963762792

Penalty: 0.0016681005372000592
score: 0.8719252903362922

Penalty: 0.021544346900318846
score: 0.9273261123352375

Penalty: 0.2782559402207126
score: 0.9334816803130579

Penalty: 3.593813663804626
score: 0.9340769199798442

Penalty: 46.41588833612782
score: 0.9340769199798442

Penalty: 599.4842503189421
score: 0.9340769199798442

Penalty: 7742.636826811277
score: 0.9340769199798442

Penalty: 100000.0
score: 0.9340769199798442



### Optimal value of penalty for family label

In [33]:
print("Optimal value of penalty term for family:", pen_value_family)

Optimal value of penalty term for family: 3.593813663804626


In [34]:
pen_value_genus = l1_penalized_svm(data_frog_features_train_transform, label_genus_train)

Penalty: 1e-05
score: 0.5766567476294664

Penalty: 0.0001291549665014884
score: 0.5766567476294664

Penalty: 0.0016681005372000592
score: 0.8331942267721164

Penalty: 0.021544346900318846
score: 0.9245206910254792

Penalty: 0.2782559402207126
score: 0.941801102083946

Penalty: 3.593813663804626
score: 0.9457690431864266

Penalty: 46.41588833612782
score: 0.9475547809523779

Penalty: 599.4842503189421
score: 0.9475547809523779

Penalty: 7742.636826811277
score: 0.9475547809523779

Penalty: 100000.0
score: 0.9475547809523779



### Optimal value of penalty for genus label

In [35]:
print("Optimal value of penalty term for genus:", pen_value_genus)

Optimal value of penalty term for genus: 46.41588833612782


In [36]:
pen_value_species = l1_penalized_svm(data_frog_features_train_transform, label_species_train)

Penalty: 1e-05
score: 0.0933300447875424

Penalty: 0.0001291549665014884
score: 0.0933300447875424

Penalty: 0.0016681005372000592
score: 0.8264531771368109

Penalty: 0.021544346900318846
score: 0.9277052864193853

Penalty: 0.2782559402207126
score: 0.9565115999453144

Penalty: 3.593813663804626
score: 0.9575016940335288

Penalty: 46.41588833612782
score: 0.9573040655750308

Penalty: 599.4842503189421
score: 0.9575052506165488

Penalty: 7742.636826811277
score: 0.9575052506165488

Penalty: 100000.0
score: 0.9575052506165488



### Optimal value of penalty for species label

In [37]:
print("Optimal value of penalty term for species:", pen_value_species)

Optimal value of penalty term for species: 599.4842503189421


In [38]:
svc_l1_family = LinearSVC(penalty = 'l1', C = pen_value_family, dual = False)
svc_l1_genus = LinearSVC(penalty = 'l1', C = pen_value_genus, dual = False)
svc_l1_species = LinearSVC(penalty = 'l1', C = pen_value_species, dual = False)

In [39]:
svc_l1_famiy = svc_l1_family.fit(data_frog_features_train, label_family_train)
predicted_family_values_l1 = svc_l1_family.predict(data_frog_features_test)

svc_l1_genus = svc_l1_genus.fit(data_frog_features_train, label_genus_train)
predicted_genus_values_l1 = svc_l1_genus.predict(data_frog_features_test)

svc_l1_species = svc_l1_species.fit(data_frog_features_train, label_species_train)
predicted_species_values_l1 = svc_l1_species.predict(data_frog_features_test)

### Exact match for L1 penalized SVM

In [40]:
exact_match_svm_l1 = exact_match_score(np.array(label_family_test), np.array(label_genus_test),
                                            np.array(label_species_test), predicted_family_values_l1,
                                            predicted_genus_values_l1, predicted_species_values_l1)

In [41]:
print ("L1 penalized SVM exact match value:", exact_match_svm_l1)

L1 penalized SVM exact match value: 0.9133858267716536


In [42]:
hamming_loss_svm_l1, hamming_score_svm_l1 = hamming_loss_score(np.array(label_family_test), np.array(label_genus_test),
                                            np.array(label_species_test), predicted_family_values_l1,
                                            predicted_genus_values_l1, predicted_species_values_l1)

### Hamming Loss for L1 penalized SVM

In [43]:
print("L1 penalized SVM hamming loss value:",hamming_loss_svm_l1)

L1 penalized SVM hamming loss value: 0.049251196541608776


##  L1 penalized svm with balanced weights to remedy class imbalance

In [44]:
#Instead of SMOTE, I have set the class_weight = "balanced", in order to remedy the class imbalance
def l1_penalized_svm_balanced(data_train, label_train):
    best_score = 0
    best_penalty = 0
    penalty_value = np.logspace(-5,5,10)
    skf = StratifiedKFold(n_splits = 10, random_state = 42)
    for pen_val in penalty_value:
        lin_svc = LinearSVC(penalty = 'l1', C = pen_val, dual = False, class_weight = 'balanced')
        score = cross_val_score(lin_svc, data_train, label_train, cv = skf)
        score = score.mean()
        if(score > best_score):
            best_score = score
            best_penalty = pen_val
        print("Penalty:", pen_val)
        print("score:", score)
        print()
    return best_penalty

In [45]:
pen_value_family_balanced = l1_penalized_svm_balanced(data_frog_features_train_transform, label_family_train)

Penalty: 1e-05
score: 0.009529775019086747

Penalty: 0.0001291549665014884
score: 0.009529775019086747

Penalty: 0.0016681005372000592
score: 0.8379799541795357

Penalty: 0.021544346900318846
score: 0.9169963405816614

Penalty: 0.2782559402207126
score: 0.9237483221634276

Penalty: 3.593813663804626
score: 0.9255360181623955

Penalty: 46.41588833612782
score: 0.9253383991053283

Penalty: 599.4842503189421
score: 0.9253383991053283

Penalty: 7742.636826811277
score: 0.9253383991053283

Penalty: 100000.0
score: 0.9253383991053283



### Penalty term for family label

In [46]:
print("Optimal value of penalty term for family with balanced dataset:", pen_value_family_balanced)

Optimal value of penalty term for family with balanced dataset: 3.593813663804626


In [47]:
pen_value_genus_balanced = l1_penalized_svm_balanced(data_frog_features_train_transform, label_genus_train)

Penalty: 1e-05
score: 0.5766567476294664

Penalty: 0.0001291549665014884
score: 0.5766567476294664

Penalty: 0.0016681005372000592
score: 0.9019075823979407

Penalty: 0.021544346900318846
score: 0.9364540678495856

Penalty: 0.2782559402207126
score: 0.9396413930215456

Penalty: 3.593813663804626
score: 0.9420132944160198

Penalty: 46.41588833612782
score: 0.9420125039271816

Penalty: 599.4842503189421
score: 0.9418148754686836

Penalty: 7742.636826811277
score: 0.9418148754686836

Penalty: 100000.0
score: 0.9416172470101856



### Penalty term for genus label

In [48]:
print("Optimal value of penalty term for genus with balanced dataest:", pen_value_genus_balanced)

Optimal value of penalty term for genus with balanced dataest: 3.593813663804626


In [49]:
pen_value_species_balanced = l1_penalized_svm_balanced(data_frog_features_train_transform, label_species_train)

Penalty: 1e-05
score: 0.0933300447875424

Penalty: 0.0001291549665014884
score: 0.0933300447875424

Penalty: 0.0016681005372000592
score: 0.893769374359399

Penalty: 0.021544346900318846
score: 0.9487524063344074

Penalty: 0.2782559402207126
score: 0.9547238173942963

Penalty: 3.593813663804626
score: 0.9569024435942144

Penalty: 46.41588833612782
score: 0.9567048340515092

Penalty: 599.4842503189421
score: 0.9567076063946143

Penalty: 7742.636826811277
score: 0.9567076063946143

Penalty: 100000.0
score: 0.9567076063946143



### Penalty term for species label

In [50]:
print("Optimal value of penalty term for species with balanced dataset:", pen_value_species_balanced)

Optimal value of penalty term for species with balanced dataset: 3.593813663804626


In [51]:
svc_l1_balanced_family = LinearSVC(penalty = 'l1', C = pen_value_family_balanced,
                                   dual = False, class_weight = 'balanced')
svc_l1_balanced_genus = LinearSVC(penalty = 'l1', C = pen_value_genus_balanced,
                                   dual = False, class_weight = 'balanced')
svc_l1_balanced_species = LinearSVC(penalty = 'l1', C = pen_value_species_balanced,
                                   dual = False, class_weight = 'balanced')

In [52]:
svc_l1_balanced_famiy = svc_l1_balanced_family.fit(data_frog_features_train, label_family_train)
predicted_family_values_l1_balanced = svc_l1_balanced_family.predict(data_frog_features_test)

svc_l1_balanced_genus = svc_l1_balanced_genus.fit(data_frog_features_train, label_genus_train)
predicted_genus_values_l1_balanced = svc_l1_balanced_genus.predict(data_frog_features_test)

svc_l1_balanced_species = svc_l1_balanced_species.fit(data_frog_features_train, label_species_train)
predicted_species_values_l1_balanced = svc_l1_balanced_species.predict(data_frog_features_test)

In [53]:
exact_match_svm_l1_balanced = exact_match_score(np.array(label_family_test), np.array(label_genus_test),
                                            np.array(label_species_test), predicted_family_values_l1_balanced,
                                            predicted_genus_values_l1_balanced, predicted_species_values_l1_balanced)

### Exact match for L1 penalized SVM with balanced weights

In [54]:
print ("L1 penalized balanced weights SVM exact match value:", exact_match_svm_l1_balanced)

L1 penalized balanced weights SVM exact match value: 0.9133858267716536


In [55]:
hamming_loss_svm_l1_balanced, hamming_score_svm_l1_balanced = hamming_loss_score(np.array(label_family_test),
                                                                                 np.array(label_genus_test),
                                            np.array(label_species_test), predicted_family_values_l1_balanced,
                                            predicted_genus_values_l1_balanced, predicted_species_values_l1_balanced)

### Hamming loss for L1 penalized SVM with balanced weights

In [56]:
print("L1 penalized balanced weights SVM hamming loss value:",hamming_loss_svm_l1_balanced)

L1 penalized balanced weights SVM hamming loss value: 0.049251196541608776


##  Monte-carlo simulation, Choosing best value of k, and determining which family is in majority by reading true labels

In [58]:
##Lists for Hamming distance, hamming score and Hamming loss
hamming_distance_list = []
hamming_score_list = []
hamming_loss_list = []

##Monte Carlo Estimation
for i in range(1, 51):
    print("Iteration number:", i)
    best_k_value = 0
    best_score = 0
    
    ##Select the best value of k (no. of clusters) based on Silhouette score
    for k in range(2,51):
        k_means = KMeans(n_clusters= k, random_state = np.random.randint(1000))
        cluster_labels = k_means.fit_predict(data_frogs_features)
        silhouette_avg = silhouette_score(data_frogs_features, cluster_labels)
        if (silhouette_avg > best_score):
            best_score = silhouette_avg
            best_k_value = k
    print("Best k value:", best_k_value)
    
    #Choose the best value of k and perform K means clustering
    k_means = KMeans(n_clusters = best_k_value, random_state = np.random.randint(1000))
    clusters_predict = k_means.fit_predict(data_frogs_features)
    
    #Form a Dataframe of Clusters and corresponding labels
    k_means_family = pd.concat([pd.DataFrame(clusters_predict, columns = ['Cluster']), label_family], axis = 1)
    k_means_genus = pd.concat([pd.DataFrame(clusters_predict, columns = ['Cluster']), label_genus], axis = 1)
    k_means_species = pd.concat([pd.DataFrame(clusters_predict, columns = ['Cluster']), label_species], axis = 1)
    
    
    #Compare the values in the cluster to the true label
    for cluster in range(best_k_value):
        temp_family_pred = k_means_family.loc[k_means_family['Cluster'] == cluster]['Family'].to_list()
        temp_genus_pred = k_means_genus.loc[k_means_genus['Cluster'] == cluster]['Genus'].to_list()
        temp_species_pred = k_means_species.loc[k_means_species['Cluster'] == cluster]['Species'].to_list()
        
    k_means_family_predict = []
    k_means_genus_predict = []
    k_means_species_predict = []
    
    #Find the majority labels in every cluster
    family_list = pd.crosstab(k_means_family['Family'], k_means_family['Cluster']).idxmax(axis=0)
    genus_list = pd.crosstab(k_means_genus['Genus'], k_means_genus['Cluster']).idxmax(axis=0)
    species_list = pd.crosstab(k_means_species['Species'], k_means_species['Cluster']).idxmax(axis=0)
    
    print("Family cluster majority")
    print(family_list)
    
    print()
    
    print("Genus Cluster majority")
    print(genus_list)
    print()
    
    print("Species_Cluster majority")
    print(species_list)
    print()
    
    #Form a list of Predicted labels due to comparison between KMeans and majority labels
    for fam in k_means_family['Cluster']:
        k_means_family_predict.append(family_list[fam])
    
    for gen in k_means_genus['Cluster']:
        k_means_genus_predict.append(genus_list[gen])
        
    for spe in k_means_species['Cluster']:
        k_means_species_predict.append(species_list[spe])
        
        
    #Calculate the Hamming loss
    temp_hamming_loss = (hamming_loss(label_family,k_means_family_predict) + hamming_loss(label_genus,
                                                                                         k_means_genus_predict) + 
                        hamming_loss(label_species, k_means_species_predict))/3
    
    print("Hamming loss is:",temp_hamming_loss)
    hamming_loss_list.append(temp_hamming_loss)
    
    predicted_label_frogs = pd.concat([pd.DataFrame(k_means_family_predict), pd.DataFrame(k_means_genus_predict),
                          pd.DataFrame(k_means_species_predict)], axis = 1)
    
    #Calculate the hamming distance
    hamming_distance_k_means = np.sum(np.sum(np.not_equal(label_frogs,predicted_label_frogs), axis = 1))/len(k_means_family_predict)
    print("Hamming distance is:", hamming_distance_k_means)
    
    hamming_distance_list.append(hamming_distance_k_means)
    
    #Calculate the Hamming score
    hamming_score_k_means = ((np.average(np.equal(label_family,k_means_family_predict))) +
                             (np.average(np.equal(label_genus,k_means_genus_predict))) +
                              (np.average(np.equal(label_species,k_means_species_predict))))/3
    
    print("Hamming score is:", hamming_score_k_means)
    hamming_score_list.append(hamming_score_k_means)
    print()
    print()

Iteration number: 1
Best k value: 4
Family cluster majority
Cluster
0    Leptodactylidae
1            Hylidae
2            Hylidae
3      Dendrobatidae
dtype: object

Genus Cluster majority
Cluster
0    Adenomera
1    Hypsiboas
2    Hypsiboas
3     Ameerega
dtype: object

Species_Cluster majority
Cluster
0    AdenomeraHylaedactylus
1         HypsiboasCordobae
2      HypsiboasCinerascens
3        Ameeregatrivittata
dtype: object

Hamming loss is: 0.22242297892054666
Hamming distance is: 0.66726893676164
Hamming score is: 0.7775770210794534


Iteration number: 2
Best k value: 4
Family cluster majority
Cluster
0    Leptodactylidae
1            Hylidae
2      Dendrobatidae
3            Hylidae
dtype: object

Genus Cluster majority
Cluster
0    Adenomera
1    Hypsiboas
2     Ameerega
3    Hypsiboas
dtype: object

Species_Cluster majority
Cluster
0    AdenomeraHylaedactylus
1         HypsiboasCordobae
2        Ameeregatrivittata
3      HypsiboasCinerascens
dtype: object

Hamming loss is: 0.2

Best k value: 4
Family cluster majority
Cluster
0    Leptodactylidae
1            Hylidae
2      Dendrobatidae
3            Hylidae
dtype: object

Genus Cluster majority
Cluster
0    Adenomera
1    Hypsiboas
2     Ameerega
3    Hypsiboas
dtype: object

Species_Cluster majority
Cluster
0    AdenomeraHylaedactylus
1      HypsiboasCinerascens
2        Ameeregatrivittata
3         HypsiboasCordobae
dtype: object

Hamming loss is: 0.22228399351401437
Hamming distance is: 0.6668519805420431
Hamming score is: 0.7777160064859857


Iteration number: 17
Best k value: 4
Family cluster majority
Cluster
0      Dendrobatidae
1    Leptodactylidae
2            Hylidae
3            Hylidae
dtype: object

Genus Cluster majority
Cluster
0     Ameerega
1    Adenomera
2    Hypsiboas
3    Hypsiboas
dtype: object

Species_Cluster majority
Cluster
0        Ameeregatrivittata
1    AdenomeraHylaedactylus
2      HypsiboasCinerascens
3         HypsiboasCordobae
dtype: object

Hamming loss is: 0.22228399351401437


Best k value: 4
Family cluster majority
Cluster
0    Leptodactylidae
1            Hylidae
2      Dendrobatidae
3            Hylidae
dtype: object

Genus Cluster majority
Cluster
0    Adenomera
1    Hypsiboas
2     Ameerega
3    Hypsiboas
dtype: object

Species_Cluster majority
Cluster
0    AdenomeraHylaedactylus
1      HypsiboasCinerascens
2        Ameeregatrivittata
3         HypsiboasCordobae
dtype: object

Hamming loss is: 0.22242297892054666
Hamming distance is: 0.66726893676164
Hamming score is: 0.7775770210794534


Iteration number: 32
Best k value: 4
Family cluster majority
Cluster
0    Leptodactylidae
1            Hylidae
2            Hylidae
3      Dendrobatidae
dtype: object

Genus Cluster majority
Cluster
0    Adenomera
1    Hypsiboas
2    Hypsiboas
3     Ameerega
dtype: object

Species_Cluster majority
Cluster
0    AdenomeraHylaedactylus
1         HypsiboasCordobae
2      HypsiboasCinerascens
3        Ameeregatrivittata
dtype: object

Hamming loss is: 0.22242297892054666
Ha

Best k value: 4
Family cluster majority
Cluster
0            Hylidae
1    Leptodactylidae
2      Dendrobatidae
3            Hylidae
dtype: object

Genus Cluster majority
Cluster
0    Hypsiboas
1    Adenomera
2     Ameerega
3    Hypsiboas
dtype: object

Species_Cluster majority
Cluster
0      HypsiboasCinerascens
1    AdenomeraHylaedactylus
2        Ameeregatrivittata
3         HypsiboasCordobae
dtype: object

Hamming loss is: 0.22242297892054666
Hamming distance is: 0.66726893676164
Hamming score is: 0.7775770210794534


Iteration number: 47
Best k value: 4
Family cluster majority
Cluster
0    Leptodactylidae
1            Hylidae
2            Hylidae
3      Dendrobatidae
dtype: object

Genus Cluster majority
Cluster
0    Adenomera
1    Hypsiboas
2    Hypsiboas
3     Ameerega
dtype: object

Species_Cluster majority
Cluster
0    AdenomeraHylaedactylus
1         HypsiboasCordobae
2      HypsiboasCinerascens
3        Ameeregatrivittata
dtype: object

Hamming loss is: 0.22242297892054666
Ha

## In the above output, I have mentioned the following:
#### 1. Iteration number
#### 2. Best value of k (Which equals 4 when Silhouettes is used)
#### 3. Majority family in each cluster, majority genus in each cluster, and majority species in each cluster respecitvely.
#### 4. Hamming loss, hamming distance and hamming score for each iteration.

##  Average Hamming Distance, score and loss

In [62]:
print("Average Hamming distance:", np.mean(hamming_distance_list))
print("Standard deviation of Hamming distance", np.std(hamming_distance_list))

Average Hamming distance: 0.6631494093120222
Standard deviation of Hamming distance 0.02920548416189047


In [63]:
print("Average Hamming score:", np.mean(hamming_score_list))

Average Hamming score: 0.7789501968959927


In [64]:
print("Average Hamming loss:", np.mean(hamming_loss_list))

Average Hamming loss: 0.2210498031040074
