In [1]:
import numpy as np
import os
import librosa
import keras
import pandas as pd
import time

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

2023-05-30 13:19:09.130971: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [2]:
path = 'BirdCLEF/birdnet_embeddings'
species = [f.split('.')[0] for f in os.listdir(path)][:10]
species

['bswdov1-27',
 'bawhor2-47',
 'categr-166',
 'bltbar1-7',
 'combul2-200',
 'carcha1-153',
 'brctch1-62',
 'barswa-500',
 'afpwag1-81',
 'afrthr1-45']

In [3]:
embeddings = [np.load('%s/%s.npy' % (path, s)) for s in species]

In [5]:
X = np.concatenate(embeddings)
y = np.concatenate([np.full(len(embeddings[i]), species[i]) for i in range(10)])

In [7]:
X.shape

(12546, 1024)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    shuffle=True,
                                                    test_size = 0.25, 
                                                    random_state = 457)

In [20]:
import random

In [9]:
X_train.shape

(9409, 1024)

In [14]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
reg = LogisticRegression(max_iter=1000,
                         penalty=None).fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

In [15]:
y_test_pred = reg.predict(X_test_scaled)

In [16]:
correct = np.count_nonzero(y_test == y_test_pred)
all = len(y_test)
correct, all, correct/all

(2948, 3137, 0.9397513547975773)

In [17]:
y_train_pred = reg.predict(X_train_scaled)
correct = np.count_nonzero(y_train == y_train_pred)
all = len(y_train)
correct, all, correct/all

(9409, 9409, 1.0)

### Trying out some classifiers with and without PCA preprocessing

In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

from matplotlib.pyplot import cm

In [23]:
## Make the kfold object here
kfold = KFold(n_splits = 5,
              shuffle=True,
              random_state= 423)

##### KNN without PCA (since it liked the higher dimension anyways)

In [24]:
ks = range(1, 51)
n_splits = 5

st = time.time()

## Makes a 3-D array of to record the accuracies in
knn_accs = np.zeros((n_splits, len(ks)))
knn_accs2 = np.zeros((n_splits, len(ks)))

i = 0
for train_index, test_index in kfold.split(X_train, y_train):
    ## Keeps track of the cross validation split you are on
    ## This loop can be a little long
    print("CV Split", i)
    tt = X_train[train_index]
    ho = X_train[test_index]
    y_tt = y_train[train_index]
    y_ho = y_train[test_index]
    
    j = 0
    for neighbors in ks:
        dimst = time.time()
        knn = KNeighborsClassifier(neighbors)

        ## Fit on the tt data
        knn.fit(tt, y_tt)

        ## Get the holdout prediction
        pred = knn.predict(ho)

        knn_accs[i,j] = accuracy_score(y_ho, pred)
        knn_accs2[i,j] = accuracy_score(y_tt, knn.predict(tt))
        
        j = j + 1
        dimtime = time.time() - dimst
        print("Num Neighbors = %s" % neighbors)
        print(dimtime)
    i = i + 1
et = time.time() - st
print(et)

CV Split 0
Num Neighbors = 1
0.7771410942077637
Num Neighbors = 2
0.747157096862793
Num Neighbors = 3
0.7669477462768555
Num Neighbors = 4
0.7226390838623047
Num Neighbors = 5
0.7641696929931641
Num Neighbors = 6
0.75303053855896
Num Neighbors = 7
0.7644138336181641
Num Neighbors = 8
0.7433485984802246
Num Neighbors = 9
0.7778043746948242
Num Neighbors = 10
0.7793691158294678
Num Neighbors = 11
0.7791078090667725
Num Neighbors = 12
0.77044677734375
Num Neighbors = 13
0.8440263271331787
Num Neighbors = 14
0.8039662837982178
Num Neighbors = 15
0.8082520961761475
Num Neighbors = 16
0.7545433044433594
Num Neighbors = 17
0.7846107482910156
Num Neighbors = 18
0.8188536167144775
Num Neighbors = 19
0.7713170051574707
Num Neighbors = 20
0.7579357624053955
Num Neighbors = 21
0.8488991260528564
Num Neighbors = 22
0.828467845916748
Num Neighbors = 23
0.8006579875946045
Num Neighbors = 24
0.828087329864502
Num Neighbors = 25
0.8373463153839111
Num Neighbors = 26
0.8239786624908447
Num Neighbors = 2

In [25]:
## This code will print out the best # components - k combo for you
## It also prints out the highest AVG CV Accuracy
max_index = np.argmax(np.mean(knn_accs, axis=0))

print(max_index)

print("The k with the highest AVG CV Accuracy was",
         "k =", ks[max_index])
print("The highest AVG CV Accuracy was", np.max(np.mean(knn_accs, axis=0)))

print("The corresponding AVG CV Accuracy was", np.mean(knn_accs2[:,max_index]))

0
The k with the highest AVG CV Accuracy was k = 1
The highest AVG CV Accuracy was 0.959613191029937
The corresponding AVG CV Accuracy was 1.0


##### KNN with PCA

In [26]:
ks = range(1, 11)
comps = range(2,20)
n_splits = 5
## Makes a 3-D array of to record the accuracies in
pca_accs = np.zeros((n_splits, len(comps), len(ks)))
pca_accs2 = np.zeros((n_splits, len(comps), len(ks)))

i = 0
for train_index, test_index in kfold.split(X_train, y_train):
    ## Keeps track of the cross validation split you are on
    ## This loop can be a little long
    print("CV Split", i)
    tt = X_train[train_index]
    ho = X_train[test_index]
    y_tt = y_train[train_index]
    y_ho = y_train[test_index]
    
    j = 0
    for n_comps in comps:
        ## Make the PCA pipeline here
        pca_pipe = Pipeline([('scale', StandardScaler()),
                            ('pca', PCA(n_comps))])
        
        ## Fit and then get the PCA transformed tt data here
        pca_tt = pca_pipe.fit_transform(tt)
        
        ## Get the transformed holdout data here
        pca_ho = pca_pipe.transform(ho)
        
        k = 0
        for neighbors in ks:
            knn = KNeighborsClassifier(neighbors)
            
            ## Fit on the tt data
            knn.fit(pca_tt, y_tt)

            ## Get the holdout prediction
            pred = knn.predict(pca_ho)

            pca_accs[i,j,k] = accuracy_score(y_ho, pred)
            pca_accs2[i,j,k] = accuracy_score(y_tt, knn.predict(pca_tt))
            
            
            k = k + 1
        j = j + 1
    i = i + 1

CV Split 0
CV Split 1
CV Split 2
CV Split 3
CV Split 4


##### The above CV analysis with KNN takes ~10 min

In [27]:
## This code will print out the best # components - k combo for you
## It also prints out the highest AVG CV Accuracy
max_index = np.unravel_index(np.argmax(np.mean(pca_accs, axis=0), axis=None), 
                                       np.mean(pca_accs, axis=0).shape)

print(max_index)
print(ks[max_index[1]])
print(comps[max_index[0]])
print(np.round(comps[max_index[0]],2))
print(pca_accs.shape)
print("The pair with the highest AVG CV Accuracy was",
         "k =", ks[max_index[1]],
         "and number of components =", np.round(comps[max_index[0]],2))
print("The highest AVG CV validation Accuracy was", np.max(np.mean(pca_accs, axis=0)))

print("The AVG CV training Accuracy for this pair was", np.mean(pca_accs2[:,max_index[0], max_index[1]]))


(17, 0)
1
19
19
(5, 18, 10)
The pair with the highest AVG CV Accuracy was k = 1 and number of components = 19
The highest AVG CV validation Accuracy was 0.9332548031916005
The AVG CV training Accuracy for this pair was 1.0


##### It looks like the ideal PCA dimension to reduce to was the highest we tested still

#### Let's try out some small NN's:

In [28]:
from sklearn.neural_network import MLPClassifier

In [41]:
mlp = MLPClassifier(hidden_layer_sizes=(5,5,),
                    max_iter=10000)

mlp.fit(X_train, y_train)

print(accuracy_score(mlp.predict(X_test), y_test))
print(accuracy_score(mlp.predict(X_train), y_train))

0.8983104877271278
0.999468593899458


In [42]:
mlp = MLPClassifier(hidden_layer_sizes=(8,5,),
                    max_iter=10000)

mlp.fit(X_train, y_train)

print(accuracy_score(mlp.predict(X_test), y_test))
print(accuracy_score(mlp.predict(X_train), y_train))

0.9193496971628945
1.0


In [43]:
mlp = MLPClassifier(hidden_layer_sizes=(8,8,5,),
                    max_iter=10000)

mlp.fit(X_train, y_train)

print(accuracy_score(mlp.predict(X_test), y_test))
print(accuracy_score(mlp.predict(X_train), y_train))

0.8721708638826905
0.9968115633967478


In [44]:
mlp = MLPClassifier(hidden_layer_sizes=(100,50,5,),
                    max_iter=10000)

mlp.fit(X_train, y_train)

print(accuracy_score(mlp.predict(X_test), y_test))
print(accuracy_score(mlp.predict(X_train), y_train))

0.9273190946764425
0.9919226272717611


In [45]:
mlp = MLPClassifier(hidden_layer_sizes=(1024,128,128,24,),
                    max_iter=10000)

mlp.fit(X_train, y_train)

print(accuracy_score(mlp.predict(X_test), y_test))
print(accuracy_score(mlp.predict(X_train), y_train))

0.9591966847306344
1.0


In [46]:
mlp = MLPClassifier(hidden_layer_sizes=(1024,128,128,128,128,24,),
                    max_iter=10000)

mlp.fit(X_train, y_train)

print(accuracy_score(mlp.predict(X_test), y_test))
print(accuracy_score(mlp.predict(X_train), y_train))

0.9617468919349698
1.0


##### Something must be weird with the MLPClassifier if the accuracy scores are all the same.

#### Random forests with CV:

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
## note this will take a minute or so to run

max_depths = range(1, 11)
n_trees = [100,500]

## Make an array of zeros that will hold the cv accuracies
rf_accs = np.zeros((5, len(max_depths), len(n_trees)))
rf_accs2 = np.zeros((5, len(max_depths), len(n_trees)))


i = 0
for train_index, test_index in kfold.split(X_train, y_train):
    tt = X_train[train_index]
    ho = X_train[test_index]
    y_tt = y_train[train_index]
    y_ho = y_train[test_index]
    
    ## Loop through the max_depth options
    j = 0
    for depth in max_depths:
        ## Look through the number of estimators options
        k = 0
        for n_est in n_trees:
            ## This will help you keep track of where the loop is
            print(i,j,k)
            ## Make the model object, include a random state
            rf = RandomForestClassifier(max_depth=depth,
                                        n_estimators=n_est,
                                        max_samples = int(.8*len(tt)))
            
            ## Fit the model
            rf.fit(tt, y_tt)
            
            ## predict on the holdout set
            pred = rf.predict(ho)
            
            ## Record the accuracy
            rf_accs[i,j,k] = accuracy_score(y_ho,  pred)
            #rf_accs2[i,j,k] = accuracy_score(y_tt, rf.predict(pca_tt))
            k = k + 1
        j = j + 1
    i = i + 1

0 0 0
0 0 1
0 1 0
0 1 1
0 2 0
0 2 1
0 3 0
0 3 1
0 4 0
0 4 1
0 5 0
0 5 1
0 6 0
0 6 1
0 7 0
0 7 1
0 8 0
0 8 1
0 9 0
0 9 1
1 0 0
1 0 1
1 1 0
1 1 1
1 2 0
1 2 1
1 3 0
1 3 1
1 4 0
1 4 1
1 5 0
1 5 1
1 6 0
1 6 1
1 7 0
1 7 1
1 8 0
1 8 1
1 9 0
1 9 1
2 0 0
2 0 1
2 1 0
2 1 1
2 2 0
2 2 1
2 3 0
2 3 1
2 4 0
2 4 1
2 5 0
2 5 1
2 6 0
2 6 1
2 7 0
2 7 1
2 8 0
2 8 1
2 9 0
2 9 1
3 0 0
3 0 1
3 1 0
3 1 1
3 2 0
3 2 1
3 3 0
3 3 1
3 4 0
3 4 1
3 5 0
3 5 1
3 6 0
3 6 1
3 7 0
3 7 1
3 8 0
3 8 1
3 9 0
3 9 1
4 0 0
4 0 1
4 1 0
4 1 1
4 2 0
4 2 1
4 3 0
4 3 1
4 4 0
4 4 1
4 5 0
4 5 1
4 6 0
4 6 1
4 7 0
4 7 1
4 8 0
4 8 1
4 9 0
4 9 1


In [38]:
## This gives you the optimal depth and number of trees
max_index = np.unravel_index(np.argmax(np.mean(rf_accs, axis=0), axis=None), 
                                       np.mean(rf_accs, axis=0).shape)


print(max_depths[max_index[0]],n_trees[max_index[1]])

10 500


In [39]:
## What is the best avg cv accuracy? (Note here we only tried it out for 100 classifiers, hence the index of 0 below)
np.mean(rf_accs, axis = 0)[max_depths[max_index[0]]-1, 0]

0.8572644618340686

In [40]:
np.mean(rf_accs, axis = 0)[max_depths[max_index[0]]-1, 1]

0.8601338628185766

#### Bayes Classifiers:

In [47]:
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [48]:
## Make a holder for the accuracies
bayes_accs = np.zeros((5,3))


i = 0
for train_index, test_index in kfold.split(X_train, y_train):
    tt = X_train[train_index]
    ho = X_train[test_index]
    y_tt = y_train[train_index]
    y_ho = y_train[test_index]
    
    ## Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis()
    
    lda.fit(tt, y_tt)
    lda_pred = lda.predict(ho)
    
    ## Records the accuracies
    bayes_accs[i, 0] = accuracy_score(y_ho,
                                      lda_pred)
    
    ## Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis()
    
    qda.fit(tt,y_tt)
    
    qda_pred = qda.predict(ho)
    
    ## Records the accuracies
    bayes_accs[i, 1] = accuracy_score(y_ho,
                                      qda_pred)
    
    
    ## Gaussian Naive Bayes
    nb = GaussianNB()
    
    nb.fit(tt, y_tt)
    nb_pred = nb.predict(ho)
    
    ## Records the accuracies
    bayes_accs[i, 2] = accuracy_score(y_ho,
                                      nb_pred)
    
    i = i + 1



In [49]:
## Print the Average CV Accuracies here
np.mean(bayes_accs, axis=0)

array([0.93091749, 0.58539814, 0.88011493])

In [50]:
bayes_accs

array([[0.93092455, 0.59829968, 0.88150903],
       [0.9314559 , 0.57013815, 0.89001063],
       [0.92773645, 0.58501594, 0.88363443],
       [0.93092455, 0.5770457 , 0.8639745 ],
       [0.93354599, 0.59649123, 0.88144604]])

#### Log Reg with PCA first:

In [None]:
import time

In [51]:
st = time.time()
dimst = time.time()

comps = range(2,30)
n_splits = 5
## Makes a 3-D array of to record the accuracies in
log_reg_pca_accs = np.zeros((n_splits, len(comps)))
log_reg_pca_accs2 = np.zeros((n_splits, len(comps)))

i = 0
for train_index, test_index in kfold.split(X_train, y_train):
    ## Keeps track of the cross validation split you are on
    ## This loop can be a little long
    print("CV Split", i)
    tt = X_train[train_index]
    ho = X_train[test_index]
    y_tt = y_train[train_index]
    y_ho = y_train[test_index]
    
    j = 0
    for n_comps in comps:
        dimst = time.time()
        ## Make the PCA pipeline here
        pca_pipe = Pipeline([('scale', StandardScaler()),
                            ('pca', PCA(n_comps))])
        
        ## Fit and then get the PCA transformed tt data here
        pca_tt = pca_pipe.fit_transform(tt)
        
        ## Get the transformed holdout data here
        pca_ho = pca_pipe.transform(ho)
        
        #Logistic Regression here:
        log_reg = LogisticRegression(max_iter=10000,
                                     penalty=None)
        
        log_reg.fit(tt, y_tt)
        
        pred = log_reg.predict((ho))
        
        log_reg_pca_accs[i,j] = accuracy_score(y_ho, pred)
        log_reg_pca_accs2[i,j] = accuracy_score(y_tt, log_reg.predict(tt))
        
        j = j + 1
        dimtime = time.time() - dimst
        print("num comps = %s" % n_comps)
        print(dimtime)
    i = i + 1
et = time.time() - st
print("Total time: %s" % et)

CV Split 0
num comps = 2
1.9700865745544434
num comps = 3
2.179180383682251
num comps = 4
2.737400531768799
num comps = 5
3.0433292388916016
num comps = 6
3.0809240341186523
num comps = 7
3.133175849914551
num comps = 8
2.9243555068969727
num comps = 9
2.5698952674865723
num comps = 10
2.7075486183166504
num comps = 11
2.2993109226226807
num comps = 12
2.515131950378418
num comps = 13
2.4111814498901367
num comps = 14
2.4186789989471436
num comps = 15
2.377835988998413
num comps = 16
2.5014870166778564
num comps = 17
2.5859837532043457
num comps = 18
2.7157304286956787
num comps = 19
2.8009626865386963
num comps = 20
2.6406571865081787
num comps = 21
2.7591912746429443
num comps = 22
2.853931427001953
num comps = 23
2.3202052116394043
num comps = 24
2.302853584289551
num comps = 25
2.3452091217041016
num comps = 26
2.3269312381744385
num comps = 27
2.8968541622161865
num comps = 28
2.196471929550171
num comps = 29
2.7563438415527344
CV Split 1
num comps = 2
2.9180824756622314
num comps

In [52]:
## This code will print out the best # components - k combo for you
## It also prints out the highest AVG CV Accuracy
max_index = np.argmax(np.mean(log_reg_pca_accs, axis=0))

print(max_index)

print("The case with the highest AVG CV Accuracy had number of components =", np.round(comps[max_index],2))
print("The highest AVG CV Accuracy was", np.max(np.mean(log_reg_pca_accs, axis=0)))

print("The corresponding AVG CV Accuracy was", np.mean(log_reg_pca_accs2[:,max_index]))

0
The case with the highest AVG CV Accuracy had number of components = 2
The highest AVG CV Accuracy was 0.9415456652774177
The corresponding AVG CV Accuracy was 1.0
