### Name: Mukund Tamizharasan
### USC ID: 7355725345

# Multi-class and Multi-Label Classification Using Support Vector Machines

In [364]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn import svm
from sklearn.metrics import hamming_loss
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings("ignore")

In [365]:
# Read the Frogs MFCCs CSV and discard the RecordID column in one expression.
df = pd.read_csv("../data/Frogs_MFCCs.csv").drop(columns=["RecordID"])
df

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,0.012022,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,0.083536,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,0.062837,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,-0.000861,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.006457,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.008696,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.001924,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber


In [366]:
# Hold out 30% of the rows as the test set; random_state pins the shuffle so the split is repeatable.
train_set, test_set = train_test_split(df, test_size=0.3, random_state=42)

In [367]:
# Ground-truth label matrix for the test rows: the last three columns of test_set
# (Family, Genus, Species in the frame displayed above).
y_multi_true = test_set.iloc[:,-3:].values

#### 1. (b) i. Research exact match and hamming score/ loss methods for evaluating multi-label classification and use them in evaluating the classifiers in this problem.

Exact match ratio is the fraction of instances for which every predicted label matches the corresponding true label.
Hamming loss is the total number of predicted labels that do not match the true label, divided by the product of the number of labels and the number of instances.

In [369]:
def eval_model(y_true, y_pred):
    """Print and return the Hamming loss and exact match ratio of multi-label predictions.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_labels)
        True labels, one row per instance and one column per label.
    y_pred : array-like of shape (n_samples, n_labels)
        Predicted labels, laid out like ``y_true``.

    Returns
    -------
    dict
        ``{"hamming_loss": float, "exact_match_ratio": float}``.

    Raises
    ------
    ValueError
        If ``y_true`` is not 2-D, the two shapes differ, or there are no entries.
    """
    # Object dtype makes the comparison element-wise for any label type
    # (e.g. an object array from a DataFrame versus a unicode array).
    y_true = np.asarray(y_true, dtype=object)
    y_pred = np.asarray(y_pred, dtype=object)

    if y_true.ndim != 2:
        raise ValueError("y_true must be 2-D (n_samples, n_labels), got shape %s" % (y_true.shape,))
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    if y_true.size == 0:
        raise ValueError("cannot evaluate empty label arrays")

    mismatches = y_true != y_pred

    # Hamming loss: wrong labels divided by (instances * labels).
    # The local is named *_value so it does not reuse the name of sklearn's hamming_loss.
    hamming_loss_value = int(mismatches.sum()) / mismatches.size

    # Exact match ratio: share of rows with no wrong label at all.
    exact_rows = ~mismatches.any(axis=1)
    exact_match_ratio = int(exact_rows.sum()) / y_true.shape[0]

    print("Hamming Loss: ",hamming_loss_value)
    print("Exact Match Ratio: ",exact_match_ratio)

    return {"hamming_loss":hamming_loss_value,"exact_match_ratio":exact_match_ratio}

#### 1. (b) ii. Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation.

In [371]:
# references 
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

def best_params(xtrain,ytrain,params,estimator,scoring,cv):
    """Run an exhaustive grid search and report the score of every candidate.

    Fits a ``GridSearchCV`` over ``params`` for ``estimator`` on
    (``xtrain``, ``ytrain``) using the given ``scoring`` and ``cv`` splitter,
    prints the mean test score of each parameter combination followed by the
    best score and parameters, and returns the fitted search object.
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=params,
        scoring=scoring,
        n_jobs=-1,
        cv=cv,
        verbose=2,
    )
    search.fit(xtrain,ytrain)

    print("Results:")
    results = search.cv_results_
    for candidate, mean_score in zip(results['params'], results['mean_test_score']):
        print(candidate," gives mean test score of ", mean_score)

    print("Best score is ",search.best_score_," for parameters ",search.best_params_)

    return search

In [372]:
# Family label: keep the features plus Family only, so the target is the last column.
family_train = train_set.drop(columns=['Genus','Species'])
family_test = test_set.drop(columns=['Genus','Species'])

family_xtrain = family_train.iloc[:,:-1].to_numpy()
family_ytrain = family_train.iloc[:,-1].to_numpy()
family_xtest = family_test.iloc[:,:-1].to_numpy()
family_ytest = family_test.iloc[:,-1].to_numpy()

In [373]:
# Baseline RBF SVC on the Family label with fixed C and gamma, fitted before any grid search.
clf = svm.SVC(decision_function_shape='ovr', kernel='rbf', C=0.1, gamma=0.1)
clf.fit(family_xtrain, family_ytrain)

SVC(C=0.1, gamma=0.1)

In [374]:
# Score of the baseline SVC on the same rows it was fitted on (a training score, not a held-out estimate).
clf.score(family_xtrain,family_ytrain)

0.8854249404289118

In [375]:
# Tune C and gamma of the RBF SVM for the Family label (raw features) with stratified 10-fold CV.
estimator = svm.SVC(kernel='rbf',decision_function_shape='ovr')
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid: C over powers of ten 10^-1 .. 10^6, gamma from 0.1 to 1.0 in steps of 0.1.
C_list = [10**exponent for exponent in range(-1,7)]
gamma_list = np.arange(0.1,1.1,0.1)
params = dict(C=C_list, gamma=gamma_list)

clf = best_params(
    family_xtrain,
    family_ytrain,
    params=params,
    estimator=estimator,
    scoring="f1_weighted",
    cv=splitter,
)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
Results:
{'C': 0.1, 'gamma': 0.1}  gives mean test score of  0.868673139416216
{'C': 0.1, 'gamma': 0.2}  gives mean test score of  0.9196710306347746
{'C': 0.1, 'gamma': 0.30000000000000004}  gives mean test score of  0.9353689803716152
{'C': 0.1, 'gamma': 0.4}  gives mean test score of  0.9406411361220128
{'C': 0.1, 'gamma': 0.5}  gives mean test score of  0.9444227779772059
{'C': 0.1, 'gamma': 0.6}  gives mean test score of  0.9477253187347445
{'C': 0.1, 'gamma': 0.7000000000000001}  gives mean test score of  0.9506530848215077
{'C': 0.1, 'gamma': 0.8}  gives mean test score of  0.9532150076429344
{'C': 0.1, 'gamma': 0.9}  gives mean test score of  0.9555640822945832
{'C': 0.1, 'gamma': 1.0}  gives mean test score of  0.9567240158109026
{'C': 1, 'gamma': 0.1}  gives mean test score of  0.9392789455502568
{'C': 1, 'gamma': 0.2}  gives mean test score of  0.9545637207167648
{'C': 1, 'gamma': 0.30000000000000004}  gives mean

In [376]:
# Predict Family for the test rows; clf is the grid search fitted on the Family label in the preceding cell.
family_pred = clf.predict(family_xtest)

In [377]:
# Genus label: keep the features plus Genus only, so the target is the last column.
genus_train = train_set.drop(columns=['Family','Species'])
genus_test = test_set.drop(columns=['Family','Species'])

genus_xtrain = genus_train.iloc[:,:-1].to_numpy()
genus_ytrain = genus_train.iloc[:,-1].to_numpy()
genus_xtest = genus_test.iloc[:,:-1].to_numpy()
genus_ytest = genus_test.iloc[:,-1].to_numpy()

In [378]:
# Tune C and gamma of the RBF SVM for the Genus label (raw features) with stratified 10-fold CV.
estimator = svm.SVC(kernel='rbf',decision_function_shape='ovr')
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid: C over powers of ten 10^-1 .. 10^6, gamma from 0.1 to 1.0 in steps of 0.1.
C_list = [10**exponent for exponent in range(-1,7)]
gamma_list = np.arange(0.1,1.1,0.1)
params = dict(C=C_list, gamma=gamma_list)

clf = best_params(
    genus_xtrain,
    genus_ytrain,
    params=params,
    estimator=estimator,
    scoring="f1_weighted",
    cv=splitter,
)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
Results:
{'C': 0.1, 'gamma': 0.1}  gives mean test score of  0.7680102689170887
{'C': 0.1, 'gamma': 0.2}  gives mean test score of  0.8432699003314763
{'C': 0.1, 'gamma': 0.30000000000000004}  gives mean test score of  0.8567512840581116
{'C': 0.1, 'gamma': 0.4}  gives mean test score of  0.8856318089421304
{'C': 0.1, 'gamma': 0.5}  gives mean test score of  0.9008352356533653
{'C': 0.1, 'gamma': 0.6}  gives mean test score of  0.9060913592467713
{'C': 0.1, 'gamma': 0.7000000000000001}  gives mean test score of  0.910671761409429
{'C': 0.1, 'gamma': 0.8}  gives mean test score of  0.9148756201884328
{'C': 0.1, 'gamma': 0.9}  gives mean test score of  0.9185072969726417
{'C': 0.1, 'gamma': 1.0}  gives mean test score of  0.9217456325598388
{'C': 1, 'gamma': 0.1}  gives mean test score of  0.9201669253545612
{'C': 1, 'gamma': 0.2}  gives mean test score of  0.941907212050354
{'C': 1, 'gamma': 0.30000000000000004}  gives mean 

In [379]:
# Predict Genus for the test rows; clf is the grid search fitted on the Genus label in the preceding cell.
genus_pred = clf.predict(genus_xtest)

In [380]:
# Species label: keep the features plus Species only, so the target is the last column.
species_train = train_set.drop(columns=['Family','Genus'])
species_test = test_set.drop(columns=['Family','Genus'])

species_xtrain = species_train.iloc[:,:-1].to_numpy()
species_ytrain = species_train.iloc[:,-1].to_numpy()
species_xtest = species_test.iloc[:,:-1].to_numpy()
species_ytest = species_test.iloc[:,-1].to_numpy()

In [381]:
# Tune C and gamma of the RBF SVM for the Species label (raw features) with stratified 10-fold CV.
estimator = svm.SVC(kernel='rbf',decision_function_shape='ovr')
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid: C over powers of ten 10^-1 .. 10^6, gamma from 0.1 to 1.0 in steps of 0.1.
C_list = [10**exponent for exponent in range(-1,7)]
gamma_list = np.arange(0.1,1.1,0.1)
params = dict(C=C_list, gamma=gamma_list)

clf = best_params(
    species_xtrain,
    species_ytrain,
    params=params,
    estimator=estimator,
    scoring="f1_weighted",
    cv=splitter,
)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
Results:
{'C': 0.1, 'gamma': 0.1}  gives mean test score of  0.7882765336309369
{'C': 0.1, 'gamma': 0.2}  gives mean test score of  0.853164434516262
{'C': 0.1, 'gamma': 0.30000000000000004}  gives mean test score of  0.8862454500381698
{'C': 0.1, 'gamma': 0.4}  gives mean test score of  0.900891425838276
{'C': 0.1, 'gamma': 0.5}  gives mean test score of  0.9084030552187276
{'C': 0.1, 'gamma': 0.6}  gives mean test score of  0.9121080103426562
{'C': 0.1, 'gamma': 0.7000000000000001}  gives mean test score of  0.9151822066814399
{'C': 0.1, 'gamma': 0.8}  gives mean test score of  0.918445168174082
{'C': 0.1, 'gamma': 0.9}  gives mean test score of  0.9238803778223394
{'C': 0.1, 'gamma': 1.0}  gives mean test score of  0.9280819602303966
{'C': 1, 'gamma': 0.1}  gives mean test score of  0.9345708917094504
{'C': 1, 'gamma': 0.2}  gives mean test score of  0.9596331534550699
{'C': 1, 'gamma': 0.30000000000000004}  gives mean t

In [382]:
# Predict Species for the test rows; clf is the grid search fitted on the Species label in the preceding cell.
species_pred = clf.predict(species_xtest)

In [383]:
# Stack the three per-label prediction vectors and transpose, so each row holds
# (Family, Genus, Species) for one test instance, the same layout as y_multi_true.
y_multi_pred = np.array([family_pred,genus_pred,species_pred]).T

In [384]:
# Collects the metrics dict returned by eval_model for each model variant, keyed by a model name.
summary_dict={}

In [385]:
# Evaluate the RBF SVMs trained on raw (unstandardized) features and record the metrics.
summary_dict["SVM_Gaussian_no_std"] = eval_model(y_multi_true,y_multi_pred)

Hamming Loss:  0.008491585610622202
Exact Match Ratio:  0.9861046780917091


In [390]:
# Standardize the feature columns (everything except the last three label columns).
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(train_set.iloc[:,:-3])

sc_xtrain, sc_xtest = (scaler.transform(part.iloc[:,:-3]) for part in (train_set, test_set))

In [391]:
# Family label on standardized features.
family_xtrain = sc_xtrain
family_ytrain = train_set["Family"].to_numpy()
family_xtest = sc_xtest
family_ytest = test_set["Family"].to_numpy()

In [392]:
# Tune C and gamma of the RBF SVM for the Family label (standardized features) with stratified 10-fold CV.
estimator = svm.SVC(kernel='rbf',decision_function_shape='ovr')
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid: C over powers of ten 10^-1 .. 10^6, gamma from 0.1 to 1.0 in steps of 0.1.
C_list = [10**exponent for exponent in range(-1,7)]
gamma_list = np.arange(0.1,1.1,0.1)
params = dict(C=C_list, gamma=gamma_list)

clf = best_params(
    family_xtrain,
    family_ytrain,
    params=params,
    estimator=estimator,
    scoring="f1_weighted",
    cv=splitter,
)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
Results:
{'C': 0.1, 'gamma': 0.1}  gives mean test score of  0.9676481020701507
{'C': 0.1, 'gamma': 0.2}  gives mean test score of  0.972968382406143
{'C': 0.1, 'gamma': 0.30000000000000004}  gives mean test score of  0.9664316763140359
{'C': 0.1, 'gamma': 0.4}  gives mean test score of  0.9092609468319843
{'C': 0.1, 'gamma': 0.5}  gives mean test score of  0.8568517298071286
{'C': 0.1, 'gamma': 0.6}  gives mean test score of  0.8018064176676809
{'C': 0.1, 'gamma': 0.7000000000000001}  gives mean test score of  0.7543943229280274
{'C': 0.1, 'gamma': 0.8}  gives mean test score of  0.7173251563766844
{'C': 0.1, 'gamma': 0.9}  gives mean test score of  0.6970664956074097
{'C': 0.1, 'gamma': 1.0}  gives mean test score of  0.6825305406673648
{'C': 1, 'gamma': 0.1}  gives mean test score of  0.9880027181864584
{'C': 1, 'gamma': 0.2}  gives mean test score of  0.9844838519511697
{'C': 1, 'gamma': 0.30000000000000004}  gives mean

In [393]:
# Predict Family on the standardized test features; clf is the Family grid search from the preceding cell.
family_pred = clf.predict(family_xtest)

In [394]:
# Genus label on standardized features.
genus_xtrain = sc_xtrain
genus_ytrain = train_set["Genus"].to_numpy()
genus_xtest = sc_xtest
genus_ytest = test_set["Genus"].to_numpy()

In [395]:
# Tune C and gamma of the RBF SVM for the Genus label (standardized features) with stratified 10-fold CV.
estimator = svm.SVC(kernel='rbf',decision_function_shape='ovr')
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid: C over powers of ten 10^-1 .. 10^6, gamma from 0.1 to 1.0 in steps of 0.1.
C_list = [10**exponent for exponent in range(-1,7)]
gamma_list = np.arange(0.1,1.1,0.1)
params = dict(C=C_list, gamma=gamma_list)

clf = best_params(
    genus_xtrain,
    genus_ytrain,
    params=params,
    estimator=estimator,
    scoring="f1_weighted",
    cv=splitter,
)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
Results:
{'C': 0.1, 'gamma': 0.1}  gives mean test score of  0.9473538937595436
{'C': 0.1, 'gamma': 0.2}  gives mean test score of  0.9054456471657776
{'C': 0.1, 'gamma': 0.30000000000000004}  gives mean test score of  0.8606419280251396
{'C': 0.1, 'gamma': 0.4}  gives mean test score of  0.8172532232731872
{'C': 0.1, 'gamma': 0.5}  gives mean test score of  0.7731646565373423
{'C': 0.1, 'gamma': 0.6}  gives mean test score of  0.7291387642473826
{'C': 0.1, 'gamma': 0.7000000000000001}  gives mean test score of  0.6815805738645466
{'C': 0.1, 'gamma': 0.8}  gives mean test score of  0.651964648585581
{'C': 0.1, 'gamma': 0.9}  gives mean test score of  0.6359519397343095
{'C': 0.1, 'gamma': 1.0}  gives mean test score of  0.624761976431642
{'C': 1, 'gamma': 0.1}  gives mean test score of  0.9847762701109788
{'C': 1, 'gamma': 0.2}  gives mean test score of  0.9754274317369841
{'C': 1, 'gamma': 0.30000000000000004}  gives mean 

In [396]:
# Predict Genus on the standardized test features; clf is the Genus grid search from the preceding cell.
genus_pred = clf.predict(genus_xtest)

In [397]:
# Species label on standardized features.
species_xtrain = sc_xtrain
species_ytrain = train_set["Species"].to_numpy()
species_xtest = sc_xtest
species_ytest = test_set["Species"].to_numpy()

In [398]:
# Tune C and gamma of the RBF SVM for the Species label (standardized features) with stratified 10-fold CV.
estimator = svm.SVC(kernel='rbf',decision_function_shape='ovr')
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid: C over powers of ten 10^-1 .. 10^6, gamma from 0.1 to 1.0 in steps of 0.1.
C_list = [10**exponent for exponent in range(-1,7)]
gamma_list = np.arange(0.1,1.1,0.1)
params = dict(C=C_list, gamma=gamma_list)

clf = best_params(
    species_xtrain,
    species_ytrain,
    params=params,
    estimator=estimator,
    scoring="f1_weighted",
    cv=splitter,
)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
Results:
{'C': 0.1, 'gamma': 0.1}  gives mean test score of  0.9560529032059986
{'C': 0.1, 'gamma': 0.2}  gives mean test score of  0.9207817082510765
{'C': 0.1, 'gamma': 0.30000000000000004}  gives mean test score of  0.8665131992664392
{'C': 0.1, 'gamma': 0.4}  gives mean test score of  0.804131055493175
{'C': 0.1, 'gamma': 0.5}  gives mean test score of  0.73785799328791
{'C': 0.1, 'gamma': 0.6}  gives mean test score of  0.6739442593011596
{'C': 0.1, 'gamma': 0.7000000000000001}  gives mean test score of  0.6154655770715121
{'C': 0.1, 'gamma': 0.8}  gives mean test score of  0.5753511910120397
{'C': 0.1, 'gamma': 0.9}  gives mean test score of  0.5507665958046895
{'C': 0.1, 'gamma': 1.0}  gives mean test score of  0.5295910465650484
{'C': 1, 'gamma': 0.1}  gives mean test score of  0.9827154924722235
{'C': 1, 'gamma': 0.2}  gives mean test score of  0.9723286262503574
{'C': 1, 'gamma': 0.30000000000000004}  gives mean t

In [399]:
# Predict Species on the standardized test features; clf is the Species grid search from the preceding cell.
species_pred = clf.predict(species_xtest)

In [400]:
# Stack the three per-label prediction vectors and transpose, so each row holds
# (Family, Genus, Species) for one test instance, the same layout as y_multi_true.
y_multi_pred = np.array([family_pred,genus_pred,species_pred]).T

In [401]:
# Evaluate the RBF SVMs trained on standardized features and record the metrics.
summary_dict["SVM_Gaussian_std"] = eval_model(y_multi_true,y_multi_pred)

Hamming Loss:  0.011733827389223406
Exact Match Ratio:  0.983788791106994


#### 1. (b) iii. Repeat 1(b)ii with L1-penalized SVMs.

In [402]:
# Standardize the feature columns (everything except the last three label columns).
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(train_set.iloc[:,:-3])

sc_xtrain, sc_xtest = (scaler.transform(part.iloc[:,:-3]) for part in (train_set, test_set))

In [403]:
# Family label on standardized features.
family_xtrain = sc_xtrain
family_ytrain = train_set["Family"].to_numpy()
family_xtest = sc_xtest
family_ytest = test_set["Family"].to_numpy()

In [404]:
# Tune C of the L1-penalized linear SVM for the Family label with stratified 10-fold CV.
estimator = svm.LinearSVC(penalty="l1", dual=False, verbose=1)
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid: C over powers of ten 10^-3 .. 10^6.
C_list = [10**exponent for exponent in range(-3,7)]
params = dict(C=C_list)

clf = best_params(
    family_xtrain,
    family_ytrain,
    params=params,
    estimator=estimator,
    scoring="f1_weighted",
    cv=splitter,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LibLinear]Results:
{'C': 0.001}  gives mean test score of  0.8228733949753286
{'C': 0.01}  gives mean test score of  0.9265293311275651
{'C': 0.1}  gives mean test score of  0.9340622181757438
{'C': 1}  gives mean test score of  0.9368967217762026
{'C': 10}  gives mean test score of  0.9373835037156646
{'C': 100}  gives mean test score of  0.9371808706425874
{'C': 1000}  gives mean test score of  0.9371808706425874
{'C': 10000}  gives mean test score of  0.9371808706425874
{'C': 100000}  gives mean test score of  0.9371808706425874
{'C': 1000000}  gives mean test score of  0.9371808706425874
Best score is  0.9373835037156646  for parameters  {'C': 10}


In [405]:
# Predict Family with the L1-penalized linear SVM search fitted in the preceding cell.
family_pred = clf.predict(family_xtest)

In [406]:
# Genus label on standardized features.
genus_xtrain = sc_xtrain
genus_ytrain = train_set["Genus"].to_numpy()
genus_xtest = sc_xtest
genus_ytest = test_set["Genus"].to_numpy()

In [407]:
# Tune C of the L1-penalized linear SVM for the Genus label with stratified 10-fold CV.
estimator = svm.LinearSVC(penalty="l1", dual=False, verbose=1)
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid: C over powers of ten 10^-3 .. 10^6.
C_list = [10**exponent for exponent in range(-3,7)]
params = dict(C=C_list)

clf = best_params(
    genus_xtrain,
    genus_ytrain,
    params=params,
    estimator=estimator,
    scoring="f1_weighted",
    cv=splitter,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LibLinear]Results:
{'C': 0.001}  gives mean test score of  0.770115180413346
{'C': 0.01}  gives mean test score of  0.8935414027200412
{'C': 0.1}  gives mean test score of  0.9336781445758738
{'C': 1}  gives mean test score of  0.9449990142972574
{'C': 10}  gives mean test score of  0.9472039596183117
{'C': 100}  gives mean test score of  0.9472427696120382
{'C': 1000}  gives mean test score of  0.9474892972001158
{'C': 10000}  gives mean test score of  0.9474892972001158
{'C': 100000}  gives mean test score of  0.9474892972001158
{'C': 1000000}  gives mean test score of  0.9474892972001158
Best score is  0.9474892972001158  for parameters  {'C': 1000}


In [408]:
# Predict Genus with the L1-penalized linear SVM search fitted in the preceding cell.
genus_pred = clf.predict(genus_xtest)

In [409]:
# Species label on standardized features.
species_xtrain = sc_xtrain
species_ytrain = train_set["Species"].to_numpy()
species_xtest = sc_xtest
species_ytest = test_set["Species"].to_numpy()

In [410]:
# Tune C of the L1-penalized linear SVM for the Species label with stratified 10-fold CV.
estimator = svm.LinearSVC(penalty="l1", dual=False, verbose=1)
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid: C over powers of ten 10^-3 .. 10^6.
C_list = [10**exponent for exponent in range(-3,7)]
params = dict(C=C_list)

clf = best_params(
    species_xtrain,
    species_ytrain,
    params=params,
    estimator=estimator,
    scoring="f1_weighted",
    cv=splitter,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LibLinear]Results:
{'C': 0.001}  gives mean test score of  0.7309206369190463
{'C': 0.01}  gives mean test score of  0.8968970195368582
{'C': 0.1}  gives mean test score of  0.9454963140717639
{'C': 1}  gives mean test score of  0.9584510960306802
{'C': 10}  gives mean test score of  0.9589722439814052
{'C': 100}  gives mean test score of  0.9589681719300496
{'C': 1000}  gives mean test score of  0.9587872731346753
{'C': 10000}  gives mean test score of  0.9589681719300496
{'C': 100000}  gives mean test score of  0.9589681719300496
{'C': 1000000}  gives mean test score of  0.9589759158966917
Best score is  0.9589759158966917  for parameters  {'C': 1000000}


In [411]:
# Predict Species with the L1-penalized linear SVM search fitted in the preceding cell.
species_pred = clf.predict(species_xtest)

In [412]:
# Stack the three per-label prediction vectors and transpose, so each row holds
# (Family, Genus, Species) for one test instance, the same layout as y_multi_true.
y_multi_pred = np.array([family_pred,genus_pred,species_pred]).T

In [413]:
# Evaluate the L1-penalized linear SVMs and record the metrics.
summary_dict["l1_svm"] = eval_model(y_multi_true,y_multi_pred)

Hamming Loss:  0.05697081982399259
Exact Match Ratio:  0.9124594719777674


#### 1. (b) iv. Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance.

In [414]:
# https://stackoverflow.com/questions/35632634/how-to-pass-a-parameter-to-only-one-part-of-a-pipeline-object-in-scikit-learn

# Family label: SMOTE oversampling followed by an L1-penalized linear SVM.
# SMOTE sits inside the imblearn Pipeline that is handed to the grid search, so it is
# applied when the pipeline is fitted during cross-validation.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
C_list = [10**i for i in range(-3,7)]

# The "l1svm__" prefix routes C to the pipeline step named 'l1svm'.
params = {"l1svm__C":C_list}

# random_state seeds SMOTE's synthetic-sample generation; without it the CV scores
# and the selected C change from run to run.
estimator = Pipeline([('smote', SMOTE(random_state=42)), ('l1svm', svm.LinearSVC(penalty="l1", dual=False, verbose=1))])

clf = best_params(family_xtrain,family_ytrain,params=params,estimator=estimator,scoring="f1_weighted",cv=splitter)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LibLinear]Results:
{'l1svm__C': 0.001}  gives mean test score of  0.854702099709024
{'l1svm__C': 0.01}  gives mean test score of  0.9061892857330488
{'l1svm__C': 0.1}  gives mean test score of  0.926525143075347
{'l1svm__C': 1}  gives mean test score of  0.9282154950672223
{'l1svm__C': 10}  gives mean test score of  0.9276708688349837
{'l1svm__C': 100}  gives mean test score of  0.9278875430122449
{'l1svm__C': 1000}  gives mean test score of  0.9298941619560743
{'l1svm__C': 10000}  gives mean test score of  0.9293225752819823
{'l1svm__C': 100000}  gives mean test score of  0.9274696551482486
{'l1svm__C': 1000000}  gives mean test score of  0.9276949306358275
Best score is  0.9298941619560743  for parameters  {'l1svm__C': 1000}


In [415]:
# Predict Family with the SMOTE + L1 SVM pipeline search fitted in the preceding cell.
family_pred = clf.predict(family_xtest)

In [416]:
# Genus label: SMOTE oversampling followed by an L1-penalized linear SVM.
# SMOTE sits inside the imblearn Pipeline that is handed to the grid search, so it is
# applied when the pipeline is fitted during cross-validation.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
C_list = [10**i for i in range(-3,7)]

# The "l1svm__" prefix routes C to the pipeline step named 'l1svm'.
params = {"l1svm__C":C_list}

# random_state seeds SMOTE's synthetic-sample generation; without it the CV scores
# and the selected C change from run to run.
estimator = Pipeline([('smote', SMOTE(random_state=42)), ('l1svm', svm.LinearSVC(penalty="l1", dual=False, verbose=1))])

clf = best_params(genus_xtrain,genus_ytrain,params=params,estimator=estimator,scoring="f1_weighted",cv=splitter)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LibLinear]Results:
{'l1svm__C': 0.001}  gives mean test score of  0.885969383236991
{'l1svm__C': 0.01}  gives mean test score of  0.9091759525357885
{'l1svm__C': 0.1}  gives mean test score of  0.9209599628272731
{'l1svm__C': 1}  gives mean test score of  0.9210247598513929
{'l1svm__C': 10}  gives mean test score of  0.9202556803968867
{'l1svm__C': 100}  gives mean test score of  0.922144622447026
{'l1svm__C': 1000}  gives mean test score of  0.9216432248976529
{'l1svm__C': 10000}  gives mean test score of  0.922043532077985
{'l1svm__C': 100000}  gives mean test score of  0.9214970311099393
{'l1svm__C': 1000000}  gives mean test score of  0.9196431839587763
Best score is  0.922144622447026  for parameters  {'l1svm__C': 100}


In [417]:
# Predict Genus with the SMOTE + L1 SVM pipeline search fitted in the preceding cell.
genus_pred = clf.predict(genus_xtest)

In [418]:
# Species label: SMOTE oversampling followed by an L1-penalized linear SVM.
# SMOTE sits inside the imblearn Pipeline that is handed to the grid search, so it is
# applied when the pipeline is fitted during cross-validation.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
C_list = [10**i for i in range(-3,7)]

# The "l1svm__" prefix routes C to the pipeline step named 'l1svm'.
params = {"l1svm__C":C_list}

# random_state seeds SMOTE's synthetic-sample generation; without it the CV scores
# and the selected C change from run to run.
estimator = Pipeline([('smote', SMOTE(random_state=42)), ('l1svm', svm.LinearSVC(penalty="l1", dual=False, verbose=1))])

clf = best_params(species_xtrain,species_ytrain,params=params,estimator=estimator,scoring="f1_weighted",cv=splitter)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LibLinear]Results:
{'l1svm__C': 0.001}  gives mean test score of  0.9313761290334988
{'l1svm__C': 0.01}  gives mean test score of  0.9532767026297838
{'l1svm__C': 0.1}  gives mean test score of  0.9578066343446643
{'l1svm__C': 1}  gives mean test score of  0.9563405759201347
{'l1svm__C': 10}  gives mean test score of  0.9579226627695172
{'l1svm__C': 100}  gives mean test score of  0.9585685382129002
{'l1svm__C': 1000}  gives mean test score of  0.9584064220252273
{'l1svm__C': 10000}  gives mean test score of  0.9584679111441714
{'l1svm__C': 100000}  gives mean test score of  0.9579700981694474
{'l1svm__C': 1000000}  gives mean test score of  0.9582735020776953
Best score is  0.9585685382129002  for parameters  {'l1svm__C': 100}


In [419]:
# Predict Species with the SMOTE + L1 SVM pipeline search fitted in the preceding cell.
species_pred = clf.predict(species_xtest)

In [420]:
# Stack the three per-label prediction vectors and transpose, so each row holds
# (Family, Genus, Species) for one test instance, the same layout as y_multi_true.
y_multi_pred = np.array([family_pred,genus_pred,species_pred]).T

In [421]:
# Evaluate the SMOTE + L1-penalized linear SVMs and record the metrics.
summary_dict["l1_svm_smote"] = eval_model(y_multi_true,y_multi_pred)

Hamming Loss:  0.07626987802995214
Exact Match Ratio:  0.8559518295507179


In [429]:
# Build the comparison table in one step from the collected metrics, instead of
# growing an empty DataFrame one row at a time with .loc.
# Passing columns= keeps the column order and still yields the right header when
# summary_dict is empty.
summary_df = pd.DataFrame(
    [
        {
            "Model": model,
            "Hamming loss": metrics["hamming_loss"],
            "Exact match ratio": metrics["exact_match_ratio"],
        }
        for model, metrics in summary_dict.items()
    ],
    columns=["Model","Hamming loss","Exact match ratio"],
)

summary_df

Unnamed: 0,Model,Hamming loss,Exact match ratio
0,SVM_Gaussian_no_std,0.008492,0.986105
1,SVM_Gaussian_std,0.011734,0.983789
2,l1_svm,0.056971,0.912459
3,l1_svm_smote,0.07627,0.855952


# K-Means Clustering on a Multi-Class and Multi-Label Data Set

In [302]:
# Read the Frogs MFCCs CSV and discard the RecordID column in one expression.
df = pd.read_csv("../data/Frogs_MFCCs.csv").drop(columns=["RecordID"])
df

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,0.012022,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,0.083536,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,0.062837,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,-0.000861,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.006457,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.008696,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.001924,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber


In [353]:
# https://medium.com/analytics-vidhya/how-to-determine-the-optimal-k-for-k-means-708505d204eb

def optimalk(x,seed,k_min=2,k_max=50):
    """Return the number of clusters with the highest silhouette score.

    For every k in ``k_min..k_max`` (inclusive) a ``KMeans`` model is fitted on
    ``x`` with ``random_state=seed`` and scored with the Euclidean silhouette
    score of its labels. The first k reaching the highest score is returned.

    Parameters
    ----------
    x : feature matrix passed to ``KMeans.fit`` and ``silhouette_score``.
    seed : random state passed to ``KMeans``.
    k_min, k_max : inclusive bounds of the candidate cluster counts. The
        defaults reproduce the original fixed search over 2..50.

    Raises
    ------
    ValueError
        If ``k_min`` is below 2 or the range is empty.
    """
    if k_min < 2:
        raise ValueError("k_min must be at least 2 to compute a silhouette score")
    if k_max < k_min:
        raise ValueError("k_max must be greater than or equal to k_min")

    # Start below any attainable score so a candidate is always selected.
    max_score = float("-inf")
    best_k = k_min
    for k in range(k_min, k_max + 1):
        labels = KMeans(n_clusters = k, random_state=seed).fit(x).labels_
        silhouette_sc = silhouette_score(x, labels, metric = 'euclidean')

        # Strict comparison keeps the smallest k among ties.
        if silhouette_sc > max_score:
            max_score = silhouette_sc
            best_k = k
    return best_k

def hamming_metrics(best_k,y_true,y_label):
    """Hamming loss and distance between true labels and per-cluster majority labels.

    Each cluster ``k`` in ``range(best_k)`` is assigned, for every label column
    of ``y_true``, the most frequent value among its members. Every member
    whose true value differs from that majority value counts as one mismatch.

    Parameters
    ----------
    best_k : int
        Number of clusters; cluster ids ``0 .. best_k-1`` are examined.
    y_true : pandas.DataFrame
        True labels, one column per label. It is not modified.
    y_label : array-like of length ``len(y_true)``
        Cluster id of each row, matched to ``y_true`` by position.

    Returns
    -------
    (hamming_loss, hamming_distance) : tuple of float
        ``hamming_distance`` is mismatches per instance; ``hamming_loss`` is
        mismatches per (instance, label) pair.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or ``y_label`` has a different length.
    """
    cluster_ids = np.asarray(y_label)
    n_samples, n_labels = y_true.shape
    if n_samples == 0 or n_labels == 0:
        raise ValueError("y_true must contain at least one row and one label column")
    if cluster_ids.shape[0] != n_samples:
        raise ValueError("y_label must have one cluster id per row of y_true")

    dist = 0
    for k in range(best_k):
        y_cluster = y_true[cluster_ids == k]
        # An empty cluster contributes no mismatches.
        if y_cluster.empty:
            continue
        for column in y_cluster.columns:
            counts = y_cluster[column].value_counts()
            # Mismatches = members minus those carrying the majority value. Ties between
            # values do not matter because tied values have the same count.
            majority_count = int(counts.iloc[0]) if len(counts) else 0
            dist += len(y_cluster) - majority_count

    hamming_distance = dist / n_samples
    hamming_loss = dist / (n_samples * n_labels)

    return hamming_loss, hamming_distance
    

In [355]:
# Monte Carlo study: 50 K-Means runs with different seeds. Each run picks k by
# silhouette score, refits K-Means with that k, and scores the clustering
# against the majority labels of each cluster.
hamming_losses=[]
hamming_distances=[]

# Feature and label slices do not change between runs, so take them once.
features = df.iloc[:,:-3]
true_labels = df.iloc[:,-3:]

for m in range(50):
    seed = (m+1)*10
    print("Monte Carlo Simulation ",m+1)
    best_k = optimalk(features,seed)

    print("Optimal k :",best_k)
    kmeans = KMeans(n_clusters=best_k,random_state=seed)
    kmeans.fit(features)

    y_label = kmeans.labels_
    # Named run_loss / run_distance so the hamming_loss function imported from
    # sklearn.metrics is not overwritten at module level.
    run_loss, run_distance = hamming_metrics(best_k,true_labels,y_label)
    print("Hamming loss:",run_loss," Hamming distance:",run_distance)
    print()
    hamming_losses.append(run_loss)
    hamming_distances.append(run_distance)

Monte Carlo Simulation  1
Optimal k : 4
Hamming loss: 0.22242297892054666  Hamming distance: 0.66726893676164

Monte Carlo Simulation  2
Optimal k : 4
Hamming loss: 0.23405142460041695  Hamming distance: 0.7021542738012508

Monte Carlo Simulation  3
Optimal k : 4
Hamming loss: 0.22242297892054666  Hamming distance: 0.66726893676164

Monte Carlo Simulation  4
Optimal k : 4
Hamming loss: 0.22242297892054666  Hamming distance: 0.66726893676164

Monte Carlo Simulation  5
Optimal k : 4
Hamming loss: 0.22242297892054666  Hamming distance: 0.66726893676164

Monte Carlo Simulation  6
Optimal k : 4
Hamming loss: 0.2800092656937688  Hamming distance: 0.8400277970813065

Monte Carlo Simulation  7
Optimal k : 4
Hamming loss: 0.2800092656937688  Hamming distance: 0.8400277970813065

Monte Carlo Simulation  8
Optimal k : 4
Hamming loss: 0.22242297892054666  Hamming distance: 0.66726893676164

Monte Carlo Simulation  9
Optimal k : 4
Hamming loss: 0.22242297892054666  Hamming distance: 0.6672689367616

In [356]:
# Mean and spread of the Hamming loss over the Monte Carlo runs.
mean_hamming_loss = np.mean(hamming_losses)
std_hamming_loss = np.std(hamming_losses)
print("Mean Hamming loss:",mean_hamming_loss)
print("Standard deviation of Hamming loss:",std_hamming_loss)

Mean Hamming loss: 0.2269854065323141
Standard deviation of Hamming loss: 0.012384199360472468


In [357]:
# Mean and spread of the Hamming distance over the Monte Carlo runs.
mean_hamming_distance = np.mean(hamming_distances)
std_hamming_distance = np.std(hamming_distances)
print("Mean Hamming distance:",mean_hamming_distance)
print("Standard deviation of Hamming distance:",std_hamming_distance)

Mean Hamming distance: 0.6809562195969421
Standard deviation of Hamming distance: 0.03715259808141741


In [361]:
# Hamming score is reported as 1 - Hamming loss. Subtracting from a constant does not
# change the spread, so the standard deviation of the losses is reused for the score.
print("Mean Hamming score:",1-np.mean(hamming_losses))
print("Standard deviation of Hamming score:",np.std(hamming_losses))

Mean Hamming score: 0.7730145934676859
Standard deviation of Hamming score: 0.012384199360472468


## 3. ISLR 12.6.2

a. Complete Linkage

<img src="https://drive.google.com/uc?export=view&id=1lDIU0aAKtVT_NS6jg4lTWJTCQrpEgVTE" width="300">

b. Single linkage

<img src="https://drive.google.com/uc?export=view&id=1Eme-ppnwv30deYlH8AkjgieJxAQ_dsci" width="300">

c. Suppose that we cut the dendrogram obtained in (a) such that two clusters result. Which observations are in each cluster?

Cluster 1 - 1,2

Cluster 2 - 3,4

d. Suppose that we cut the dendrogram obtained in (b) such that two clusters result. Which observations are in each cluster?

Cluster 1 - 1,2,3

Cluster 2 - 4

e. Draw a dendrogram that is equivalent to the dendrogram in (a), for which two or more of the leaves are repositioned, but for which the meaning of the dendrogram is the same.

<img src="https://drive.google.com/uc?export=view&id=1q9FogoVpJ1ooMVxYE7rDc8MFW_xlNlH-" width="300">