In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas: show up to 50 columns / 50 rows and print floats with 3 decimals
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))

# Display all cell outputs: every bare expression in a cell is shown, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython.display import Image
from IPython import get_ipython
ipython = get_ipython()

# Autoreload extension: load it only if it is not already loaded, so this cell can be re-run
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# NOTE(review): presumably imported for a cell magic that tolerates expected
# exceptions; it is not used in the visible cells -- confirm it is still needed.
import expectexception

import os
# NOTE(review): presumably a workaround for a duplicate OpenMP runtime error -- confirm.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Visualizations
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

# Activate the automatic conversion for pandas
from rpy2.robjects import pandas2ri
pandas2ri.activate()

# Load the needed extension for the %%R cell magic
%load_ext rpy2.ipython

# Suppress rpy2 runtime warnings
import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore", category=RRuntimeWarning)

# Load ipywidgets module
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [3]:
%%R

# Attach the R packages for the R session behind the %%R cell magic.

# Visualization
library(ggplot2)
library(ggalt)
library(ggExtra)
library(ggcorrplot)
library(ggthemes)
library(ggdendro)
library(GGally)
library(ggrepel)
# Make theme_light() the default theme for subsequent ggplot2 figures
theme_set(theme_light())

# Data wrangling (tidyverse and related packages)
library(scales)
library(dplyr)
library(tidyr)
library(forcats)
library(readr)
library(purrr)

# General utilities (tools) and date-time handling (lubridate)
library(tools)
library(lubridate)

# Machine Learning Related
library(car)
library(caTools)
library(pROC)

# Misc
library(psych)
library(mice)

In [4]:
# 1a. Import data and train test split

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read the MFCC data set; the path is relative to the notebook's working directory.
calls = pd.read_csv('../../Homework Data/Homework 5 Data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')
# The first 22 columns are used as the feature matrix.
X = calls.iloc[:, :22]
# Columns 22-24 are kept as the label frame; split() later selects the
# 'Family', 'Genus' and 'Species' columns from it.
y = calls.iloc[:, 22:25]

In [5]:
def split(label):
    '''
    Create a stratified 70/30 train/test split and return the data for one label.

    The module-level feature frame ``X`` and label frame ``y`` are split with a
    fixed ``random_state``, stratifying on all columns of ``y`` at once. The
    features are standardized with statistics fitted on the training part only,
    and the requested label column is integer-encoded with an encoder fitted on
    the training labels.

    Parameters
    ----------
    label : str
        Name of the column of ``y`` to return as the target.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train_label, y_test_label)``: standardized
        feature arrays followed by the encoded train/test targets.
    '''
    features_tr, features_te, labels_tr, labels_te = train_test_split(
        X, y, test_size=0.3, random_state=408, stratify=y)

    # Keep only the requested target column of each part
    target_tr = labels_tr.loc[:, label]
    target_te = labels_te.loc[:, label]

    # Standardize the features (fit on train, apply to both)
    scaler = StandardScaler()
    scaled_tr = scaler.fit_transform(features_tr)
    scaled_te = scaler.transform(features_te)

    # Label-encode the targets (fit on train, apply to both)
    encoder = LabelEncoder()
    encoded_tr = encoder.fit_transform(target_tr)
    encoded_te = encoder.transform(target_te)

    return scaled_tr, scaled_te, encoded_tr, encoded_te

In [6]:
# split() uses a fixed random_state on the same X and y, so every call reproduces
# the same row partition and the same standardized features; only the encoded
# label column differs. The feature matrices are therefore kept from the first call.
X_train, X_test, y_train_family, y_test_family = split('Family')
_, _, y_train_genus, y_test_genus = split('Genus')
_, _, y_train_species, y_test_species = split('Species')

In [7]:
# 1bi. Exact Match and Hamming Score

**Exact Match** counts a record as correct only if every one of its labels is predicted correctly.\
**Hamming Score** gives partial credit: it is the fraction of a record's labels that are predicted correctly, averaged over all records.

In [8]:
# 1bii. SVM

# Let's use Bayesian Optimization to search for the best set of C and gamma
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


def bayes_opt(y_train):
    '''
    Search for the SVC hyperparameters ``C`` and ``gamma`` with hyperopt (TPE).

    Each candidate is scored by the mean 10-fold cross-validated accuracy on the
    module-level ``X_train``; the quantity minimized is ``1 - mean accuracy``.

    Parameters
    ----------
    y_train : array-like
        Encoded training labels aligned with the rows of ``X_train``.

    Returns
    -------
    dict
        The best ``{'C': ..., 'gamma': ...}`` found in 100 evaluations.
    '''
    # Both hyperparameters are sampled log-uniformly: exp(uniform(low, high))
    search_space = {
        'C': hp.loguniform('C', -4, 3),
        'gamma': hp.loguniform('gamma', -9, 0),
    }

    def cv_error(sampled):
        # Build the candidate model from the sampled point
        candidate = SVC(C=sampled['C'], gamma=sampled['gamma'])
        fold_accuracy = cross_val_score(candidate, X_train, y_train, cv=10,
                                        scoring='accuracy', n_jobs=-1)  # 10-fold cv
        return 1 - fold_accuracy.mean()

    # Seeded so the sequence of sampled candidates is repeatable
    return fmin(fn=cv_error,
                space=search_space,
                algo=tpe.suggest,
                max_evals=100,  # Control how many evaluations to take
                rstate=np.random.RandomState(408))


def svc(y_train):
    '''
    Tune an SVC with ``bayes_opt``, refit it on the training set and predict the test set.

    Uses the module-level ``X_train`` for fitting and ``X_test`` for prediction.

    Parameters
    ----------
    y_train : array-like
        Encoded training labels aligned with the rows of ``X_train``.

    Returns
    -------
    tuple
        ``(best_param, predictions)``: the tuned hyperparameter dict and the
        predicted labels for ``X_test``.
    '''
    tuned = bayes_opt(y_train)  # best C / gamma from Bayesian Optimization
    classifier = SVC(**tuned)   # unpack the tuned values as keyword arguments
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return tuned, predictions

**C** controls the SVM penalty on margin violations; *a larger C tends to overfit.*\
**gamma** controls the width of the Gaussian kernel (a larger gamma gives a narrower kernel); *a larger gamma tends to overfit.*

In [9]:
# Tune, fit and predict for the Family label; the bare name displays the tuned parameters.
family_param, y_pred_family = svc(y_train_family)
family_param

100%|██████████| 100/100 [01:39<00:00,  1.01it/s, best loss: 0.006351609162889127]


{'C': 6.81837089718689, 'gamma': 0.0549696314164752}

The above dictionary represents the best parameter set for the **family** label.

In [10]:
# Tune, fit and predict for the Genus label; the bare name displays the tuned parameters.
genus_param, y_pred_genus = svc(y_train_genus)
genus_param

100%|██████████| 100/100 [01:59<00:00,  1.20s/it, best loss: 0.00952634711287248]


{'C': 4.01055152682592, 'gamma': 0.062134977013934}

The above dictionary represents the best parameter set for the **genus** label.

In [11]:
# Tune, fit and predict for the Species label; the bare name displays the tuned parameters.
species_param, y_pred_species = svc(y_train_species)
species_param

100%|██████████| 100/100 [01:54<00:00,  1.14s/it, best loss: 0.00952280161576291]


{'C': 4.696022937441232, 'gamma': 0.04838100736907537}

The above dictionary represents the best parameter set for the **species** label.

In [43]:
# Get Exact match accuracy and Hamming score

# Stack the per-label vectors column-wise (Family, Genus, Species) so that each
# row holds the three labels of one test record.
y_test = np.column_stack((y_test_family, y_test_genus, y_test_species))
y_pred = np.column_stack((y_pred_family, y_pred_genus, y_pred_species))

In [44]:
def accuracy_score(y_test, y_pred):
    '''
    Exact-match accuracy for multi-label predictions.

    A row counts as correct only when every label in it matches.

    Parameters
    ----------
    y_test, y_pred : 2-D arrays of identical shape (records x labels)

    Returns
    -------
    Fraction of rows in which all labels are predicted correctly.
    '''
    row_is_exact = np.equal(y_test, y_pred).all(axis=1)
    n_records = y_test.shape[0]
    return row_is_exact.sum() / n_records

def hamming_score(y_test, y_pred):
    '''
    Hamming score for multi-label predictions.

    For each row, compute the fraction of labels predicted correctly, then
    average those fractions over all rows.

    Parameters
    ----------
    y_test, y_pred : 2-D arrays of identical shape (records x labels)

    Returns
    -------
    Mean per-row fraction of correctly predicted labels.
    '''
    label_hits = np.equal(y_test, y_pred)
    n_labels = y_test.shape[1]
    fraction_correct = label_hits.sum(axis=1) / n_labels
    return fraction_correct.mean()

In [45]:
# Exact-match accuracy first, then Hamming score. Both values are displayed
# because the setup cell sets ast_node_interactivity to 'all'.
accuracy_score(y_test, y_pred)
hamming_score(y_test, y_pred)

0.984251968503937

0.9895013123359578

The accuracy score (exact match) is 0.984 and the Hamming score is 0.990. **This is quite impressive!**

In [71]:
# 1biii. L1-penalized Linear SVM

from sklearn.linear_model import SGDClassifier # Much more efficient than LinearSVC,
                                               # requires less memory, allows incremental (online) learning
    
def bayes_opt2(y_train):
    '''
    Tune the regularization strength ``alpha`` of an L1-penalized linear SVM.

    A hinge-loss ``SGDClassifier`` with an L1 penalty is scored by the mean
    10-fold cross-validated accuracy on the module-level ``X_train``; hyperopt's
    TPE algorithm minimizes ``1 - mean accuracy`` over 200 evaluations.

    Parameters
    ----------
    y_train : array-like
        Encoded training labels aligned with the rows of ``X_train``.

    Returns
    -------
    dict
        ``{'alpha': value}`` for the best evaluation found.
    '''
    # alpha is sampled log-uniformly: exp(uniform(-10, 3))
    space = {'alpha': hp.loguniform('alpha', -10, 3)}

    def objective(params):
        # Hinge loss is the SGDClassifier default, which makes this a linear SVM.
        # random_state pins the data shuffling and the early-stopping validation
        # split, so the loss depends only on alpha and reruns are reproducible.
        lin_svc = SGDClassifier(alpha=params['alpha'], penalty='l1',  # Specify L1-penalty
                                max_iter=30000,
                                learning_rate='optimal', early_stopping=True,
                                random_state=408)
        mean_accuracy = cross_val_score(lin_svc, X_train, y_train, cv=10,
                                        scoring='accuracy', n_jobs=-1).mean()  # 10-fold cv
        return 1 - mean_accuracy

    # Run the algorithm
    best = fmin(fn=objective,
                space=space,
                max_evals=200,  # Control how many evaluations to take
                rstate=np.random.RandomState(408),
                algo=tpe.suggest)

    return best


def svc2(y_train):
    '''
    Tune, fit and apply an L1-penalized linear SVM.

    ``alpha`` is chosen by ``bayes_opt2``; the final hinge-loss ``SGDClassifier``
    is fitted on the module-level ``X_train`` and used to predict ``X_test``.

    Parameters
    ----------
    y_train : array-like
        Encoded training labels aligned with the rows of ``X_train``.

    Returns
    -------
    tuple
        ``(best_param, predictions)``: the tuned ``{'alpha': ...}`` dict and
        the predicted labels for ``X_test``.
    '''
    best_param = bayes_opt2(y_train)  # Get best parameters from Bayesian Optimization
    # random_state makes the final fit (shuffling and early-stopping split)
    # reproducible across reruns.
    lin_svc = SGDClassifier(**best_param, penalty='l1',
                            max_iter=30000,
                            learning_rate='optimal', early_stopping=True,
                            random_state=408)
    lin_svc.fit(X_train, y_train)
    return best_param, lin_svc.predict(X_test)  # Get best parameters and predictions

In [72]:
# L1-penalized linear SVM for the Family label. This rebinds family_param and
# y_pred_family, which previously held the Gaussian-kernel SVC results.
family_param, y_pred_family = svc2(y_train_family)
family_param

100%|██████████| 200/200 [00:31<00:00,  6.45it/s, best loss: 0.06036949220218868]


{'alpha': 0.0004162382758552744}

The above dictionary represents the best parameter set for the **family** label.

In [73]:
# L1-penalized linear SVM for the Genus label. This rebinds genus_param and
# y_pred_genus, which previously held the Gaussian-kernel SVC results.
genus_param, y_pred_genus = svc2(y_train_genus)
genus_param

100%|██████████| 200/200 [00:54<00:00,  3.64it/s, best loss: 0.051631442547194206]


{'alpha': 7.985691909245828e-05}

The above dictionary represents the best parameter set for the **genus** label.

In [74]:
# L1-penalized linear SVM for the Species label. This rebinds species_param and
# y_pred_species, which previously held the Gaussian-kernel SVC results.
species_param, y_pred_species = svc2(y_train_species)
species_param

100%|██████████| 200/200 [01:04<00:00,  3.10it/s, best loss: 0.03990385640192218]


{'alpha': 0.00015048561935824306}

The above dictionary represents the best parameter set for the **species** label.

In [75]:
# Get Exact match accuracy and Hamming score

# y_pred_family / y_pred_genus / y_pred_species were rebound by the svc2 cells,
# so y_pred now holds the L1-penalized linear SVM predictions. y_test is rebuilt
# from the same encoded test labels as before.
y_test = np.column_stack((y_test_family, y_test_genus, y_test_species))
y_pred = np.column_stack((y_pred_family, y_pred_genus, y_pred_species))

In [76]:
# Exact-match accuracy first, then Hamming score, for the L1-penalized linear SVM.
# Both values are displayed because ast_node_interactivity is set to 'all'.
accuracy_score(y_test, y_pred)
hamming_score(y_test, y_pred)

0.8994905048633627

0.9413308630538829

The accuracy score (exact match) is 0.900 and the Hamming score is 0.941.\
The **performance is worse** than that of the SVM with the Gaussian kernel.

In [89]:
# 1biv. L1-penalized Linear SVM with resampling
    
def bayes_opt3(X_train, y_train):
    '''
    Tune ``alpha`` of an L1-penalized linear SVM on an explicitly supplied training set.

    A hinge-loss ``SGDClassifier`` with an L1 penalty is scored by the mean
    10-fold cross-validated accuracy on ``X_train`` / ``y_train``; hyperopt's
    TPE algorithm minimizes ``1 - mean accuracy`` over 150 evaluations.

    NOTE(review): the notebook passes SMOTE-resampled data to this function. If
    oversampling happens before cross-validation, synthetic samples can end up
    in the validation folds and the CV accuracy may be optimistic -- consider
    resampling inside each fold instead.

    Parameters
    ----------
    X_train : array-like
        Training features (shadows the module-level ``X_train``).
    y_train : array-like
        Training labels aligned with the rows of ``X_train``.

    Returns
    -------
    dict
        ``{'alpha': value}`` for the best evaluation found.
    '''
    # alpha is sampled log-uniformly: exp(uniform(-12, -4))
    space = {'alpha': hp.loguniform('alpha', -12, -4)}

    def objective(params):
        # Hinge loss is the SGDClassifier default, which makes this a linear SVM.
        # random_state pins the data shuffling and the early-stopping validation
        # split, so the loss depends only on alpha and reruns are reproducible.
        lin_svc = SGDClassifier(alpha=params['alpha'], penalty='l1',  # Specify L1-penalty
                                max_iter=30000,
                                learning_rate='optimal', early_stopping=True,
                                random_state=408)
        mean_accuracy = cross_val_score(lin_svc, X_train, y_train, cv=10,
                                        scoring='accuracy', n_jobs=-1).mean()  # 10-fold cv
        return 1 - mean_accuracy

    # Run the algorithm
    best = fmin(fn=objective,
                space=space,
                max_evals=150,  # Control how many evaluations to take
                rstate=np.random.RandomState(408),
                algo=tpe.suggest)

    return best


def svc3(X_train, y_train):
    '''
    Tune, fit and apply an L1-penalized linear SVM on a supplied training set.

    ``alpha`` is chosen by ``bayes_opt3``; the final hinge-loss ``SGDClassifier``
    is fitted on the given ``X_train`` / ``y_train``. Predictions are made for
    the module-level ``X_test``, which is not a parameter of this function.

    Parameters
    ----------
    X_train : array-like
        Training features (shadows the module-level ``X_train``).
    y_train : array-like
        Training labels aligned with the rows of ``X_train``.

    Returns
    -------
    tuple
        ``(best_param, predictions)``: the tuned ``{'alpha': ...}`` dict and
        the predicted labels for the module-level ``X_test``.
    '''
    best_param = bayes_opt3(X_train, y_train)  # Get best parameters from Bayesian Optimization
    # random_state makes the final fit (shuffling and early-stopping split)
    # reproducible across reruns.
    lin_svc = SGDClassifier(**best_param, penalty='l1',
                            max_iter=30000,
                            learning_rate='optimal', early_stopping=True,
                            random_state=408)
    lin_svc.fit(X_train, y_train)
    return best_param, lin_svc.predict(X_test)  # Get best parameters and predictions

In [90]:
from imblearn.over_sampling import SMOTE

# Oversample the training data only; X_test is not touched here.
# fit_resample is called once per label, so each label gets its own resampled
# feature matrix paired with its resampled targets.
sm = SMOTE(random_state=408)
X_train_family_res, y_train_family_res = sm.fit_resample(X_train, y_train_family)
X_train_genus_res, y_train_genus_res = sm.fit_resample(X_train, y_train_genus)
X_train_species_res, y_train_species_res = sm.fit_resample(X_train, y_train_species)

In [91]:
# Tune and fit on the SMOTE-resampled Family training set, then predict the
# non-resampled X_test. Rebinds family_param and y_pred_family once more.
family_param, y_pred_family = svc3(X_train_family_res, y_train_family_res)
family_param

100%|██████████| 150/150 [00:47<00:00,  3.16it/s, best loss: 0.05341684935797064]


{'alpha': 1.0587163720405958e-05}

The above dictionary represents the best parameter set for the **family** label.

In [92]:
# Tune and fit on the SMOTE-resampled Genus training set, then predict the
# non-resampled X_test. Rebinds genus_param and y_pred_genus once more.
genus_param, y_pred_genus = svc3(X_train_genus_res, y_train_genus_res)
genus_param

100%|██████████| 150/150 [02:44<00:00,  1.10s/it, best loss: 0.05043992179168144]


{'alpha': 6.3404696329266766e-06}

The above dictionary represents the best parameter set for the **genus** label.

In [93]:
# Tune and fit on the SMOTE-resampled Species training set, then predict the
# non-resampled X_test. Rebinds species_param and y_pred_species once more.
species_param, y_pred_species = svc3(X_train_species_res, y_train_species_res)
species_param

100%|██████████| 150/150 [03:36<00:00,  1.44s/it, best loss: 0.044074748701342537]


{'alpha': 0.0003350144587853562}

The above dictionary represents the best parameter set for the **species** label.

In [94]:
# Get Exact match accuracy and Hamming score

# y_pred_family / y_pred_genus / y_pred_species were rebound by the svc3 cells,
# so y_pred now holds the predictions of the models trained on SMOTE-resampled
# data. y_test is rebuilt from the same encoded test labels as before.
y_test = np.column_stack((y_test_family, y_test_genus, y_test_species))
y_pred = np.column_stack((y_pred_family, y_pred_genus, y_pred_species))

In [95]:
# Exact-match accuracy first, then Hamming score, for the SMOTE-trained models.
# Both values are displayed because ast_node_interactivity is set to 'all'.
accuracy_score(y_test, y_pred)
hamming_score(y_test, y_pred)

0.8462251042149143

0.9223405897792187

The accuracy score (exact match) is 0.846 and the Hamming score is 0.922.\
The **performance is even worse** than that of the same model trained on the original data without SMOTE.

Refer to Homework 5b for the next part!