In [82]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as skm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
import imblearn.pipeline as imb
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

# 1. Multi-Class and Multi-Label Classification using SVMs
Using the Anuran Calls (MFCCs) Data Set

## a) Splitting Training and Test Set

In [2]:
df = pd.read_csv('../data/Frogs_MFCCs.csv')
X = df.iloc[:, :22]
y = df.iloc[:, 22:25]

In [3]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=1)
train_y_fam = train_y['Family']
test_y_fam = test_y['Family']
train_y_genus = train_y['Genus']
test_y_genus = test_y['Genus']
train_y_species = train_y['Species']
test_y_species = test_y['Species']

## b) Multi-Class and Multi-Label Learning Problems

### i. Exact Match and Hamming Score
- **Hamming Loss**: Fraction of labels that are incorrectly predicted
    - This takes into account individual labels that are predicted correctly, so the metric can handle partial matched
- **Exact Match Ratio**: Percentage of labels that have all their labels classified correctly
    - More strict than hamming loss because even if a datapoint is just partially matched, the metric doesn't acknowledge it
    
### ii. L2-Penalized SVM with Gaussian Kernel
**Disclaimer about the term 'Accuracy'**: The approach is to repeat one-vs-all classifiers for each label. Therefore, the "score" and "hamming loss" are essentially the same things: simply the accuracy of predicting one classifier over the others. In part d), we will be combining the results for each of the labels, and then evaluating the Exact Match Ratio and Hamming Loss.


#### a) Family Label
#### Exploratory Analysis: I am doing a rough analysis of the bounds for lambda and gamma to narrow down my search space, so that my cross-validation method will not take too long. My threshold will be accuracy above 70%.

In [4]:
def svm_gaussian(c, gam, trainy, testy, predict):
    svm_gaussian = OneVsRestClassifier(SVC(kernel='rbf', C=c, gamma=gam))
    svm_gaussian.fit(train_X, trainy)
    print('Accuracy: ' + str(svm_gaussian.score(test_X, testy)))
    prediction = svm_gaussian.predict(test_X)
    
    if predict:
        return prediction

In [5]:
def svm_gaussian_cv(params, trainy, testy):
    svm_gaussian = OneVsRestClassifier(SVC(kernel='rbf'))
    svm_gaussian_cv = GridSearchCV(estimator=svm_gaussian, param_grid=params, n_jobs=-1, cv=10)
    svm_gaussian_cv.fit(train_X, trainy)
    
    # Extract ideal parameters from cross-validation
    ideal_C = svm_gaussian_cv.best_params_['estimator__C']
    ideal_gamma = svm_gaussian_cv.best_params_['estimator__gamma']
    print("Ideal Lambda: " + str(1/ideal_C))
    print("Ideal Gamma: " + str(ideal_gamma))
    
    return ideal_C, ideal_gamma

In [6]:
# Narrowing down the range of lambda
lamb_lb = 1e-4
gamma_lb = 0.1

lamb_ub = 1e2
gamma_ub = 5

svm_gaussian(1/lamb_lb, gamma_lb, train_y_fam, test_y_fam, False)

Accuracy: 0.9884205650764243


**Findings**
- It seems like lambda has most of an impact in determining the accuracy of the model. Higher lambdas (beyond 10) actually don't produce great results
- Bounds for log(lambda) = {-4, -3, ..., 2}
- Bounds for gamma = [0.1, 5]

In [7]:
# Parameters to tune
params_fam = {"estimator__C": 1/np.logspace(-4, 2, 20), "estimator__gamma":np.linspace(0.1, 5, 20)}

ideal_C_fam, ideal_gamma_fam = svm_gaussian_cv(params_fam, train_y_fam, test_y_fam)
prediction_fam = svm_gaussian(ideal_C_fam, ideal_gamma_fam, train_y_fam, test_y_fam, False)

Ideal Lambda: 0.01623776739188721
Ideal Gamma: 2.421052631578948
Accuracy: 0.9916628068550255


#### b) Genus Label

In [34]:
lamb_lb = 1e-5
gamma_lb = 0.2

lamb_ub = 1e2
gamma_ub = 5

print("Both LB")
svm_gaussian(1/lamb_lb, gamma_lb, train_y_genus, test_y_genus, False)

print("Both UB")
svm_gaussian(1/lamb_ub, gamma_ub, train_y_genus, test_y_genus, False)

print("Lamb_lb, Gamma_ub")
svm_gaussian(1/lamb_lb, gamma_ub, train_y_genus, test_y_genus, False)

print("Lamb_ub, Gamma_lb")
svm_gaussian(1/lamb_ub, gamma_lb, train_y_genus, test_y_genus, False)

Both LB
Accuracy: 0.9870310328855951
Both UB
Accuracy: 0.8846688281611857
Lamb_lb, Gamma_ub
Accuracy: 0.9884205650764243
Lamb_ub, Gamma_lb
Accuracy: 0.7068087077350625


**Findings**
- log(lambda) = {-5, -4, ..., 2}
- gamma = [0.2, 5]

In [11]:
params_genus = {"estimator__C": 1/np.logspace(-5, 2, 20), "estimator__gamma":np.linspace(0.2, 5, 20)}
ideal_C_genus, ideal_gamma_genus = svm_gaussian_cv(params_genus, train_y_genus, test_y_genus)
prediction_genus = svm_gaussian(ideal_C_genus, ideal_gamma_genus, train_y_genus, test_y_genus, True)

Ideal Lambda: 1e-05
Ideal Gamma: 2.7263157894736842
Exact Match Ratio: 0.9884205650764243
Hamming Loss: 0.01157943492357573


In [32]:
prediction_genus = svm_gaussian(1/1e-05, 2.7263157894736842, train_y_genus, test_y_genus, True)

Accuracy: 0.9884205650764243


#### c) Species Label

In [35]:
lamb_lb = 1e-6
gamma_lb = 0.2

lamb_ub = 1e2
gamma_ub = 6

print("Both LB")
svm_gaussian(1/lamb_lb, gamma_lb, train_y_species, test_y_species, False)

print("Both UB")
svm_gaussian(1/lamb_ub, gamma_ub, train_y_species, test_y_species, False)

print("Lamb_lb, Gamma_ub")
svm_gaussian(1/lamb_lb, gamma_ub, train_y_species, test_y_species, False)

print("Lamb_ub, Gamma_lb")
svm_gaussian(1/lamb_ub, gamma_lb, train_y_species, test_y_species, False)

Both LB
Accuracy: 0.9865678554886521
Both UB
Accuracy: 0.9059749884205651
Lamb_lb, Gamma_ub
Accuracy: 0.9870310328855951
Lamb_ub, Gamma_lb
Accuracy: 0.729504400185271


**Findings**:
- log(lambda) = {-6, -5, ..., 2}
- gamma = [0.2, 6]

In [20]:
params_species = {"estimator__C": 1/np.logspace(-6, 2, 20), "estimator__gamma":np.linspace(0.2, 6, 20)}
svm_gaussian_cv('rbf', params_species, train_y_species, test_y_species)

Ideal Lambda: 0.04281332398719396
Ideal Gamma: 2.031578947368421
Exact Match Ratio: 0.9902732746641963
Hamming Loss: 0.009726725335803613


In [36]:
prediction_species = svm_gaussian(1/0.04281332398719396, 2.031578947368421, train_y_species, test_y_species, True)

Accuracy: 0.9902732746641963


#### d) Combining Results from All 3 Classes (Multi-label evaluation)

### RESULTS:
- Hamming Score: 0.990119
- Exact Match Ratio: 0.9866

In [9]:
def print_metrics(test_y, prediction):
    prediction_numpy = prediction.to_numpy()
    testy_numpy = test_y.to_numpy()
    print("Hamming Score: " + str(np.sum(np.equal(testy_numpy, prediction_numpy))/float(testy_numpy.size)))

    def exact_match(row):
        if False in row:
            return 0
        else:
            return 1

    equal = np.equal(testy_numpy, prediction_numpy)
    not_equal = np.not_equal(testy_numpy, prediction_numpy)
    print("Exact Match Ratio: " + 
          str(sum(np.apply_along_axis(exact_match, arr=equal, axis=1))/np.size(testy_numpy, 0)))

In [79]:
prediction = pd.DataFrame({})
prediction['family'] = prediction_fam
prediction['genus'] = prediction_genus
prediction['species'] = prediction_species
    
print_metrics(test_y, prediction)

Hamming Score: 0.9901188821985487
Exact Match Ratio: 0.9865678554886521


### iii. L1-Penalized SVM
**Note on Cross-Validation function**: I scaled each fold through the Pipeline to decide the tuning parameters. However, in training the final model, I used the entire training set that was standardized.

#### a) Family Class

In [65]:
def svm_l1(c, train_X, test_X, trainy, testy, predict):
    svm_l1 = LinearSVC(C=c, max_iter=1000, dual=False)
    svm_l1.fit(train_X, trainy)
    print('Accuracy: ' + str(svm_l1.score(test_X, testy)))
    prediction = svm_l1.predict(test_X)
    
    if predict:
        return prediction
    
def svm_l1_cv(params, train_X, trainy):
    pipe = Pipeline([('scaler', StandardScaler()), 
                         ('svc', LinearSVC(max_iter=1000, dual=False))])
    svm_l1_cv = GridSearchCV(estimator=pipe, param_grid=params, n_jobs=-1, cv=10)
    svm_l1_cv.fit(train_X, trainy)
    
    # Extract ideal parameters from cross-validation
    ideal_C = svm_l1_cv.best_params_['svc__C']
    print("Ideal Lambda: " + str(1/ideal_C))
    
    return ideal_C

In [72]:
standardize = StandardScaler()
train_X_std = standardize.fit_transform(train_X)
test_X_std = standardize.fit_transform(test_X)

In [74]:
lamb_lb = 1e-4
lamb_ub = 1e6

print("lb")
svm_l1(1/lamb_lb, train_X_std, test_X_std, train_y_fam, test_y_fam, False)

print("ub")
svm_l1(1/lamb_ub, train_X_std, test_X_std, train_y_fam, test_y_fam, False)

lb
Accuracy: 0.9328392774432608
ub
Accuracy: 0.7823066234367763


In [75]:
params_fam = {"svc__C": 1/np.logspace(-4, 6, 20)}

ideal_C_fam_l1 = svm_l1_cv(params_fam, train_X, train_y_fam)
prediction_fam_l1 = svm_l1(ideal_C_fam_l1, train_X_std, test_X_std, train_y_fam, test_y_fam, True)

Ideal Lambda: 0.0001
Accuracy: 0.9328392774432608


#### b) Genus Class

In [76]:
lamb_lb = 1e-5
lamb_ub = 1e5

print("lb")
svm_l1(1/lamb_lb, train_X_std, test_X_std, train_y_genus, test_y_genus, False)

print("ub")
svm_l1(1/lamb_ub, train_X_std, test_X_std, train_y_genus, test_y_genus, False)

lb
Accuracy: 0.9458082445576655
ub
Accuracy: 0.7841593330245484


In [77]:
params_genus = {"svc__C": 1/np.logspace(-5, 5, 20)}
ideal_C_genus_l1 = svm_l1_cv(params_genus, train_X, train_y_genus)
prediction_genus_l1 = svm_l1(ideal_C_genus_l1, train_X_std, test_X_std, train_y_genus, test_y_genus, True)

Ideal Lambda: 1e-05
Accuracy: 0.9458082445576655


#### c) Species Class

In [78]:
lamb_lb = 1e-4
lamb_ub = 1e6

print("lb")
svm_l1(1/lamb_lb, train_X_std, test_X_std, train_y_species, test_y_species, False)

print("ub")
svm_l1(1/lamb_ub, train_X_std, test_X_std, train_y_species, test_y_species, False)

lb
Accuracy: 0.9555349698934692
ub
Accuracy: 0.8341824918943955


In [79]:
params_species = {"svc__C": 1/np.logspace(-4, 6, 20)}
ideal_C_species_l1 = svm_l1_cv(params_species, train_X, train_y_species)
prediction_species_l1 = svm_l1(ideal_C_species_l1, train_X_std, test_X_std, train_y_species, test_y_species, True)

Ideal Lambda: 1.6237767391887208
Accuracy: 0.9550717924965262


#### d) Combining Results from All 3 Classes (Multi-label evaluation)
**RESULTS**:
- Hamming Score: 0.9446
- Exact Match Ratio: 0.9125

In [65]:
prediction_l1 = pd.DataFrame({})
prediction_l1['family'] = prediction_fam_l1
prediction_l1['genus'] = prediction_genus_l1
prediction_l1['species'] = prediction_species_l1
    
print_metrics(test_y, prediction_l1)

Hamming Score: 0.9445731048324841
Exact Match Ratio: 0.9124594719777674


### iv. SMOTE
Note that in cross-validating to decide the hyperparameters, I baked in SMOTE in the pipeline for better results. However, in the end, I trained the final model with a train dataset that was applied SMOTE directly.

#### a) Family Class

In [90]:
def svm_l1_cv_smote(params, train_X, trainy):
    pipe = imb.Pipeline([('scaler', StandardScaler()), ('smote', SMOTE()),
                         ('svc', LinearSVC(max_iter=1000, dual=False))])
    svm_l1_cv = GridSearchCV(estimator=pipe, param_grid=params, cv=10)
    svm_l1_cv.fit(train_X, trainy)
    
    # Extract ideal parameters from cross-validation
    ideal_C = svm_l1_cv.best_params_['svc__C']
    print("Ideal Lambda: " + str(1/ideal_C))
    
    return ideal_C

In [84]:
sm = SMOTE()
train_X_smote_fam, train_y_smote_fam = sm.fit_resample(train_X_std, train_y_fam)

In [85]:
lamb_lb = 1e-6
lamb_ub = 1e5

print("lb")
svm_l1(1/lamb_lb, train_X_smote_fam, test_X_std, train_y_smote_fam, test_y_fam, False)

print("ub")
svm_l1(1/lamb_ub, train_X_smote_fam, test_X_std, train_y_smote_fam, test_y_fam, False)

lb
Accuracy: 0.9249652616952293
ub
Accuracy: 0.7693376563223715


In [91]:
params_fam = {"svc__C": 1/np.logspace(-6, 5, 20)}

# Cross-Validation with SMOTE built in the pipeline
ideal_C_fam_l1 = svm_l1_cv_smote(params_fam, train_X, train_y_fam)
prediction_fam_l1 = svm_l1(ideal_C_fam_l1, train_X_smote_fam, test_X_std, train_y_smote_fam, test_y_fam, True)

Ideal Lambda: 0.04281332398719396
Accuracy: 0.9249652616952293


#### b) Genus Class

In [92]:
sm = SMOTE()
train_X_smote_genus, train_y_smote_genus = sm.fit_resample(train_X_std, train_y_genus)

In [93]:
lamb_lb = 1e-6
lamb_ub = 1e5

print("lb")
svm_l1(1/lamb_lb, train_X_smote_genus, test_X_std, train_y_smote_genus, test_y_genus, False)

print("ub")
svm_l1(1/lamb_ub, train_X_smote_genus, test_X_std, train_y_smote_genus, test_y_genus, False)

lb
Accuracy: 0.9221861973135711
ub
Accuracy: 0.7568318666049096


In [95]:
params_genus = {"svc__C": 1/np.logspace(-6, 5, 20)}
ideal_C_genus_l1 = svm_l1_cv_smote(params_genus, train_X, train_y_genus)
prediction_genus_l1 = svm_l1(ideal_C_genus_l1, train_X_smote_genus, test_X_std, 
                             train_y_smote_genus, test_y_genus, True)

Ideal Lambda: 0.04281332398719396
Accuracy: 0.9231125521074571


#### c) Species Class

In [96]:
sm = SMOTE()
train_X_smote_species, train_y_smote_species = sm.fit_resample(train_X_std, train_y_species)

In [97]:
lamb_lb = 1e-6
lamb_ub = 1e6

print("lb")
svm_l1(1/lamb_lb, train_X_smote_species, test_X_std, train_y_smote_species, test_y_species, False)

print("ub")
svm_l1(1/lamb_ub, train_X_smote_species, test_X_std, train_y_smote_species, test_y_species, False)

lb
Accuracy: 0.9499768411301528
ub
Accuracy: 0.7684113015284854


In [98]:
params_species = {"svc__C": 1/np.logspace(-6, 6, 20)}
ideal_C_species_l1 = svm_l1_cv_smote(params_species, train_X, train_y_species)
prediction_species_l1 = svm_l1(ideal_C_species_l1, train_X_smote_species, test_X_std, train_y_smote_species, 
                               test_y_species, True)

Ideal Lambda: 37.92690190732246
Accuracy: 0.953219082908754


#### d) Combining Results from All 3 Classes (Multi-label evaluation)
**RESULTS**:
- Hamming Score: 0.9348
- Exact Match Ratio: 0.8745

In [99]:
prediction_l1_smote = pd.DataFrame({})
prediction_l1_smote['family'] = prediction_fam_l1
prediction_l1_smote['genus'] = prediction_genus_l1
prediction_l1_smote['species'] = prediction_species_l1

print_metrics(test_y, prediction_l1_smote)

Hamming Score: 0.9337656322371468
Exact Match Ratio: 0.87308939323761


# 2. K-Means Clustering
We perform 50 iterations of k-means clustering on the same Anuran Calls (MFCCs) data set. On each iteration:

    a) Choose k in {1, ..., 50} that maximizes the Silhouette Score using a specified seed
    b) In each cluster, determine the majority family, genus and species. We treat these majorities as predictions
    c) Calculate Hamming distance, score, and loss between the true labels and majority (predicted) labels

Then, we evaluate the mean and standard deviation of the Hamming Scores obtained after 50 iterations

**Hamming Distance Definition**: There doesn't seem to be a definition for Hamming Distance that suits our particular learning problem. Based off of various sources, they use Hamming distance as a metric in comparing the difference between two bitstrings. So, I am simply doing a comparison of both labels, and counting how many of the individual labels don't match the true label. In this case, then, **Hamming Distance = Hamming Loss * # Data points * # Labels each data point.**

**Hamming Distance Results after 50 simulations**:
- Mean: 5195.22
- Standard Deviation: 804.91

In [5]:
X_std = standardize.fit_transform(X)
y_fam = y['Family']
y_genus = y['Genus']
y_species = y['Species']

In [56]:
def find_max_clusters(y_label, ideal_k, labels):
    count_fam = {}
    fam_max = {}
    for i in range(ideal_k):
        count_fam[i] = []
        fam_max[i] = []

    # family
    for i in range(len(labels)):
        count_fam[labels[i]].append(y_label[i])

    for i in count_fam:
        fam_max[i] = max(count_fam[i],key=count_fam[i].count)

    return fam_max

def print_metrics_hamming(test_y, prediction):
    prediction_numpy = prediction.to_numpy()
    testy_numpy = test_y.to_numpy()
    print("c) Metrics")
    print("--> Hamming Score: " + str(np.sum(np.equal(testy_numpy, prediction_numpy))/float(testy_numpy.size)))
    print("--> Hamming Loss: " + str(np.sum(np.not_equal(testy_numpy, prediction_numpy))/float(testy_numpy.size)))
    hamming_dist = np.sum(np.not_equal(testy_numpy, prediction_numpy))
    print("--> Hamming Distance: " + str(hamming_dist))
    return hamming_dist

In [60]:
hamming_distances = []
for iterations in range(50):
    print("----- Simulation #: " + str(iterations+1) + " -----")
    sil_scores = []
    for k in range(2, 50):
        km = MiniBatchKMeans(n_clusters=k).fit(X_std)
        labels = km.labels_
        sil_scores.append(skm.silhouette_score(X_std, labels))
    
    ideal_k = np.argmax(sil_scores)+2
    print("a) Ideal # of Clusters: " + str(ideal_k))

    km = KMeans(n_clusters=ideal_k, random_state=1).fit(X_std)
    labels = km.labels_

    fam = find_max_clusters(y_fam, ideal_k, labels)
    gen = find_max_clusters(y_genus, ideal_k, labels)
    spec = find_max_clusters(y_species, ideal_k, labels)
    dic_final = {}
    
    for i in range(ideal_k):
        dic_final[i] = [fam[i], gen[i], spec[i]]
    
    print("b) Majority Label in Clusters: " + str(dic_final))
    prediction = [dic_final[cluster] for cluster in labels]

    prediction = pd.DataFrame(data=prediction, columns=['Family', 'Genus', 'Species'])
    hamming_distances.append(print_metrics_hamming(y, prediction))
    print("\n")

----- Simulation #: 1 -----
a) Ideal # of Clusters: 3
b) Majority Label in Clusters: {0: ['Hylidae', 'Hypsiboas', 'HypsiboasCinerascens'], 1: ['Hylidae', 'Hypsiboas', 'HypsiboasCordobae'], 2: ['Leptodactylidae', 'Adenomera', 'AdenomeraHylaedactylus']}
c) Metrics
--> Hamming Score: 0.7179986101459347
--> Hamming Loss: 0.2820013898540653
--> Hamming Distance: 6087


----- Simulation #: 2 -----
a) Ideal # of Clusters: 3
b) Majority Label in Clusters: {0: ['Hylidae', 'Hypsiboas', 'HypsiboasCinerascens'], 1: ['Hylidae', 'Hypsiboas', 'HypsiboasCordobae'], 2: ['Leptodactylidae', 'Adenomera', 'AdenomeraHylaedactylus']}
c) Metrics
--> Hamming Score: 0.7179986101459347
--> Hamming Loss: 0.2820013898540653
--> Hamming Distance: 6087


----- Simulation #: 3 -----
a) Ideal # of Clusters: 5
b) Majority Label in Clusters: {0: ['Leptodactylidae', 'Adenomera', 'AdenomeraHylaedactylus'], 1: ['Hylidae', 'Hypsiboas', 'HypsiboasCordobae'], 2: ['Hylidae', 'Hypsiboas', 'HypsiboasCinerascens'], 3: ['Leptodact

a) Ideal # of Clusters: 4
b) Majority Label in Clusters: {0: ['Hylidae', 'Hypsiboas', 'HypsiboasCinerascens'], 1: ['Leptodactylidae', 'Adenomera', 'AdenomeraHylaedactylus'], 2: ['Leptodactylidae', 'Adenomera', 'AdenomeraAndre'], 3: ['Hylidae', 'Hypsiboas', 'HypsiboasCordobae']}
c) Metrics
--> Hamming Score: 0.7525596479036368
--> Hamming Loss: 0.24744035209636323
--> Hamming Distance: 5341


----- Simulation #: 22 -----
a) Ideal # of Clusters: 4
b) Majority Label in Clusters: {0: ['Hylidae', 'Hypsiboas', 'HypsiboasCinerascens'], 1: ['Leptodactylidae', 'Adenomera', 'AdenomeraHylaedactylus'], 2: ['Leptodactylidae', 'Adenomera', 'AdenomeraAndre'], 3: ['Hylidae', 'Hypsiboas', 'HypsiboasCordobae']}
c) Metrics
--> Hamming Score: 0.7525596479036368
--> Hamming Loss: 0.24744035209636323
--> Hamming Distance: 5341


----- Simulation #: 23 -----
a) Ideal # of Clusters: 6
b) Majority Label in Clusters: {0: ['Leptodactylidae', 'Adenomera', 'AdenomeraHylaedactylus'], 1: ['Leptodactylidae', 'Adenome

a) Ideal # of Clusters: 5
b) Majority Label in Clusters: {0: ['Leptodactylidae', 'Adenomera', 'AdenomeraHylaedactylus'], 1: ['Hylidae', 'Hypsiboas', 'HypsiboasCordobae'], 2: ['Hylidae', 'Hypsiboas', 'HypsiboasCinerascens'], 3: ['Leptodactylidae', 'Adenomera', 'AdenomeraAndre'], 4: ['Leptodactylidae', 'Adenomera', 'AdenomeraAndre']}
c) Metrics
--> Hamming Score: 0.8033356497567755
--> Hamming Loss: 0.19666435024322446
--> Hamming Distance: 4245


----- Simulation #: 40 -----
a) Ideal # of Clusters: 5
b) Majority Label in Clusters: {0: ['Leptodactylidae', 'Adenomera', 'AdenomeraHylaedactylus'], 1: ['Hylidae', 'Hypsiboas', 'HypsiboasCordobae'], 2: ['Hylidae', 'Hypsiboas', 'HypsiboasCinerascens'], 3: ['Leptodactylidae', 'Adenomera', 'AdenomeraAndre'], 4: ['Leptodactylidae', 'Adenomera', 'AdenomeraAndre']}
c) Metrics
--> Hamming Score: 0.8033356497567755
--> Hamming Loss: 0.19666435024322446
--> Hamming Distance: 4245


----- Simulation #: 41 -----
a) Ideal # of Clusters: 3
b) Majority Labe

In [63]:
print("Mean Hamming Distance: " + str(np.mean(hamming_distances)))
print("Standard Deviation: " + str(np.std(hamming_distances)))

Mean Hamming Distance: 5195.22
Standard Deviation: 804.91463621927


# 3. ISLR 10.7.2
![alt text](3.jpg "Solutions for 10.7.2")