# BirdNET Spectrogram Data and Logistic Regression

#### In this notebook, we load the preprocessed BirdNET spectrogram dataset and explore using Logistic Regression.

In [1]:
## Import necessary modules
import numpy as np
import os
import librosa
import keras
import pandas as pd
import time

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

2023-06-01 14:31:00.755501: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [2]:
import classifiers

In [3]:
spectrograms = pd.read_csv('/home/birdsong/processed_data/spectrograms/spectrograms.csv').values[:,1:]
labels = pd.read_csv('/home/birdsong/processed_data/spectrograms/labels_fp_are_tp.csv').values[:,1]
#labels = np.concatenate((labels_tp, np.full(7781, -1, dtype = int)))
probabilities = pd.read_csv('/home/birdsong/processed_data/spectrograms/probabilities.csv').values

In [4]:
print(spectrograms.shape)
print(labels.shape)
print(probabilities.shape)

(8997, 2816)
(8997,)
(8997, 25)


In [5]:
## Only consider the true-positive data
spectrograms_tp = spectrograms[:1216]
print(spectrograms_tp.shape)

## Only consider the false-positive data
spectrograms_fp = spectrograms[1216:]
print(spectrograms_fp.shape)

(1216, 2816)
(7781, 2816)


In [6]:
arr0 = np.zeros(7781, dtype=int)
arr1 = np.ones(1216, dtype=int)
tp_or_fp = np.concatenate((arr1, arr0))

In [7]:
cls_tp = classifiers.Classifier(input_X = spectrograms_tp,
                                input_y = labels[:1216])

cls_tp.t_t_split()

In [8]:
cls_tp.log_reg(CV = False)
print("---------------------")

cls_tp.log_reg(CV = True)
print("---------------------")

cls_tp.svm(CV = False)
print("---------------------")
cls_tp.svm(CV = True)
print("---------------------")
cls_tp.knn(CV = True,
        max_n_neighbors = 30)
print("---------------------")

cls_tp.save_statistics('/home/birdsong/classifier_stats/cls_tp_stats')
classifiers.save_object_to_pickle(cls_tp, '/home/birdsong/classifier_stats/cls_tp')

#cls_tp.log_reg_PCA(PCA_dims = [i for i in range(2, 50)])
print("---------------------")

cls_tp.save_statistics('/home/birdsong/classifier_stats/cls_tp_stats')
classifiers.save_object_to_pickle(cls_tp, '/home/birdsong/classifier_stats/cls_tp')

for dim in range(100, 901, 100):
    cls_tp.log_reg_PCA(CV = False, PCA_dim = dim)
    print("---------------------")
    
cls_tp.save_statistics('/home/birdsong/classifier_stats/cls_tp_stats')
classifiers.save_object_to_pickle(cls_tp, '/home/birdsong/classifier_stats/cls_tp')

#cls_tp.rand_forest(n_trees = [500])

The Logistic Regression test accuracy was 0.6284153005464481
The Logistic Regression train accuracy was 1.0
---------------------
CV Split: 0
CV Split: 1
CV Split: 2
CV Split: 3
CV Split: 4
Elapsed Time: 5.393780946731567
The highest CV Logistic Regression test accuracy was 0.6376811594202898
The highest CV Logistic Regression train accuracy was 1.0
---------------------
The SVM test accuracy was 0.4972677595628415
The SVM train accuracy was 0.8238141335914811
---------------------
CV Split: 0
CV Split: 1
CV Split: 2
CV Split: 3
CV Split: 4
Elapsed Time: 9.415290594100952
The highest CV SVM test accuracy was 0.5048543689320388
The highest CV SVM train accuracy was 0.8305084745762712
---------------------
CV Split: 0
CV Split: 1
CV Split: 2
CV Split: 3
CV Split: 4
Elapsed Time: 9.870601892471313
The k with the highest AVG CV Accuracy was k = 1
The highest CV KNN test accuracy was 0.3514094085643263
The corresponding CV KNN train accuracy was 0.4498988438037072
---------------------
----

In [9]:
cls_tp.save_statistics('/home/birdsong/classifier_stats/cls_tp_stats')
classifiers.save_object_to_pickle(cls_tp, '/home/birdsong/classifier_stats/cls_tp')

In [10]:
cls_tp_fp = classifiers.Classifier(input_X = spectrograms,
                                   input_y = labels, 
                                   stratify_var = tp_or_fp.reshape(-1,1))
cls_tp_fp.t_t_split()

In [11]:
cls_tp_fp.log_reg(CV = False)
print("---------------------")

cls_tp_fp.log_reg(CV = True)
print("---------------------")

cls_tp_fp.svm(CV = False)
print("---------------------")
cls_tp_fp.svm(CV = True)
print("---------------------")
cls_tp_fp.knn(CV = True,
        max_n_neighbors = 30)
print("---------------------")

cls_tp_fp.save_statistics('/home/birdsong/classifier_stats/cls_tp_fp_stats')
classifiers.save_object_to_pickle(cls_tp_fp, '/home/birdsong/classifier_stats/cls_tp_fp')

for dim in range(100, 901, 100):
    cls_tp_fp.log_reg_PCA(CV = False, PCA_dim = dim)
    print("---------------------")

cls_tp_fp.save_statistics('/home/birdsong/classifier_stats/cls_tp_fp_stats')
classifiers.save_object_to_pickle(cls_tp_fp, '/home/birdsong/classifier_stats/cls_tp_fp')
    
#cls_tp_fp.log_reg_PCA(PCA_dims = [i for i in range(2, 50)])
print("---------------------")

cls_tp_fp.save_statistics('/home/birdsong/classifier_stats/cls_tp_fp_stats')
classifiers.save_object_to_pickle(cls_tp_fp, '/home/birdsong/classifier_stats/cls_tp_fp')

#cls_tp_fp.rand_forest(n_trees = [500])

The Logistic Regression test accuracy was 0.2777777777777778
The Logistic Regression train accuracy was 1.0
---------------------
CV Split: 0
CV Split: 1
CV Split: 2
CV Split: 3
CV Split: 4
Elapsed Time: 84.08681750297546
The highest CV Logistic Regression test accuracy was 0.26405228758169935
The highest CV Logistic Regression train accuracy was 1.0
---------------------
The SVM test accuracy was 0.2651851851851852
The SVM train accuracy was 0.49758075062115864
---------------------
CV Split: 0
CV Split: 1
CV Split: 2
CV Split: 3
CV Split: 4
Elapsed Time: 575.3640592098236
The highest CV SVM test accuracy was 0.26880313930673644
The highest CV SVM train accuracy was 0.500817260542661
---------------------
CV Split: 0
CV Split: 1
CV Split: 2
CV Split: 3
CV Split: 4
Elapsed Time: 155.1172456741333
The k with the highest AVG CV Accuracy was k = 1
The highest CV KNN test accuracy was 0.15561745256201456
The corresponding CV KNN train accuracy was 0.23110388077578214
---------------------


In [12]:
cls_tp_fp.save_statistics('/home/birdsong/classifier_stats/cls_tp_fp_stats')
classifiers.save_object_to_pickle(cls_tp_fp, '/home/birdsong/classifier_stats/cls_tp_fp')

In [13]:
cls_tp_fp_unstrat = classifiers.Classifier(input_X = spectrograms,
                                   input_y = labels)
cls_tp_fp_unstrat.t_t_split()

In [14]:
cls_tp_fp_unstrat.log_reg(CV = False)
print("---------------------")

cls_tp_fp_unstrat.svm(CV = False)
print("---------------------")

cls_tp_fp_unstrat.log_reg(CV = True)
print("---------------------")

cls_tp_fp_unstrat.svm(CV = True)
print("---------------------")
cls_tp_fp_unstrat.knn(CV = True,
        max_n_neighbors = 30)
print("---------------------")

cls_tp_fp_unstrat.save_statistics('/home/birdsong/classifier_stats/cls_tp_fp_unstrat_stats')
classifiers.save_object_to_pickle(cls_tp_fp_unstrat, '/home/birdsong/classifier_stats/cls_tp_fp_unstrat')

for dim in range(100, 901, 100):
    cls_tp_fp_unstrat.log_reg_PCA(CV = False, PCA_dim = dim)
    print("---------------------")

cls_tp_fp_unstrat.save_statistics('/home/birdsong/classifier_stats/cls_tp_fp_unstrat_stats')
classifiers.save_object_to_pickle(cls_tp_fp_unstrat, '/home/birdsong/classifier_stats/cls_tp_fp_unstrat')
    
#cls_tp_fp.log_reg_PCA(PCA_dims = [i for i in range(2, 50)])
print("---------------------")

cls_tp_fp_unstrat.save_statistics('/home/birdsong/classifier_stats/cls_tp_fp_unstrat_stats')
classifiers.save_object_to_pickle(cls_tp_fp_unstrat, '/home/birdsong/classifier_stats/cls_tp_fp_unstrat')

#cls_tp_fp.rand_forest(n_trees = [500])

The Logistic Regression test accuracy was 0.2777777777777778
The Logistic Regression train accuracy was 1.0
---------------------
CV Split: 0
CV Split: 1
CV Split: 2
CV Split: 3
CV Split: 4
Elapsed Time: 83.47860145568848
The highest CV Logistic Regression test accuracy was 0.26405228758169935
The highest CV Logistic Regression train accuracy was 1.0
---------------------


KeyboardInterrupt: 

In [19]:
labels_old = pd.read_csv('/home/birdsong/processed_data/spectrograms/labels.csv').values[:,1]
labels_old.shape

(8997,)

In [20]:
cls_tp_fp_old = classifiers.Classifier(input_X = spectrograms,
                                   input_y = labels_old, 
                                   stratify_var = tp_or_fp.reshape(-1,1))
cls_tp_fp_old.t_t_split()

In [21]:
cls_tp_fp_old.log_reg(CV = False)
print("---------------------")

cls_tp_fp_old.svm(CV = False)

cls_tp_fp_old.knn(CV = True,
        max_n_neighbors = 30)
print("---------------------")

cls_tp_fp_old.save_statistics('/home/birdsong/classifier_stats/cls_tp_fp_stats_old')
classifiers.save_object_to_pickle(cls_tp_fp_old, '/home/birdsong/classifier_stats/cls_tp_fp_old')

The Logistic Regression test accuracy was 0.8711111111111111
The Logistic Regression train accuracy was 1.0
---------------------
The SVM test accuracy was 0.8637037037037038
The SVM train accuracy was 0.8775990584542958
CV Split: 0
CV Split: 1
CV Split: 2
CV Split: 3
CV Split: 4
Elapsed Time: 155.42831254005432
The k with the highest AVG CV Accuracy was k = 1
The highest CV KNN test accuracy was 0.8658299456691332
The corresponding CV KNN train accuracy was 0.8663855354530217
---------------------


In [22]:
cls_tp.total_statistics

{'ttsplit': [array([[-26.07621 , -28.625114, -28.038353, ..., -54.84223 , -54.84223 ,
          -54.84223 ],
         [-24.521595, -22.424809, -23.177723, ..., -59.69963 , -59.69963 ,
          -59.69963 ],
         [ -9.001114, -10.670686,  -9.204363, ..., -59.994778, -59.994778,
          -59.994778],
         ...,
         [-19.07335 , -19.442553, -21.958101, ..., -57.718227, -57.718227,
          -57.718227],
         [-24.545622, -19.59078 , -22.405176, ..., -74.20475 , -74.20475 ,
          -74.20475 ],
         [-33.640034, -25.323025, -23.485662, ..., -75.25044 , -75.25044 ,
          -75.25044 ]]),
  array([[-24.870014 , -24.146294 , -19.906849 , ..., -81.47645  ,
          -81.47645  , -81.47645  ],
         [-20.263443 ,  -7.3932524, -12.107784 , ..., -64.78799  ,
          -64.78799  , -64.78799  ],
         [ -7.6386857,  -5.042379 , -12.9616585, ..., -79.15178  ,
          -79.15178  , -79.15178  ],
         ...,
         [-18.565304 , -18.835167 , -16.506273 , ..., -56.6

In [23]:
accuracy_score(cls_tp.y_test, cls_tp.y_log_reg_test_pred)

0.6284153005464481

In [40]:
labels

array([141, 231, 121, ..., 174, 141,  61])

In [25]:
(cls_tp.y_test == cls_tp.y_log_reg_test_pred).astype(int)

array([1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1])

In [86]:
cls_tp.y_test.shape

(183,)

In [27]:
cls_tp.y_log_reg_test_pred

array([ 71,  11, 101,  51,  81,  11,  11,  51, 231, 121,  71,   1, 234,
       211,  81, 121, 164, 231, 171, 211, 211, 174, 211,  71,   1, 101,
       141,  51,  11, 131, 221,  21, 111, 211, 171,  21, 101, 201,  91,
       164, 164, 221, 231, 131,  21,  21, 221,  81, 181,  11,  61,  21,
        41, 231,  31,  81, 111, 151, 164, 141,  51,  51, 131,  51, 181,
        91, 171,  71, 141, 101, 101,  41, 111, 101, 181, 121, 121,  41,
       131,  11,   1,  41,  11, 231, 151, 231, 231, 211,  51, 231, 181,
       221,  71, 151,  61, 151, 164, 121,  11, 141, 231, 181, 201, 234,
       234,  21,  81,  91, 164,  61, 101,  61, 174,  71, 171,  31,   1,
       191,  21, 231, 231,  71, 221, 141, 121,  91,  71, 211,  81, 181,
       181, 221,  71, 131, 221, 191, 111, 211, 111,  31, 131, 151,  61,
       101, 141,  11, 211, 164, 164, 111, 231, 191,  21, 191, 191,   1,
       101, 141,  51, 231,  71,  81, 191, 234, 174, 121, 164, 141,  51,
       191, 151,  11, 111, 171,  61,  51, 221,  11, 141,  31, 18

In [99]:
test_pred = np.concatenate((cls_tp.y_test.reshape(-1,1), cls_tp.y_log_reg_test_pred.reshape(-1,1)), axis = 1)

In [100]:
test_pred

array([[ 71,  71],
       [ 11,  11],
       [221, 101],
       [ 51,  51],
       [ 81,  81],
       [234,  11],
       [ 11,  11],
       [ 51,  51],
       [231, 231],
       [121, 121],
       [ 71,  71],
       [  1,   1],
       [234, 234],
       [211, 211],
       [  1,  81],
       [121, 121],
       [ 91, 164],
       [234, 231],
       [111, 171],
       [ 81, 211],
       [ 81, 211],
       [174, 174],
       [211, 211],
       [ 71,  71],
       [121,   1],
       [234, 101],
       [141, 141],
       [ 51,  51],
       [141,  11],
       [131, 131],
       [151, 221],
       [ 21,  21],
       [111, 111],
       [ 41, 211],
       [ 31, 171],
       [221,  21],
       [101, 101],
       [111, 201],
       [ 31,  91],
       [164, 164],
       [164, 164],
       [ 61, 221],
       [231, 231],
       [131, 131],
       [ 21,  21],
       [ 21,  21],
       [221, 221],
       [ 81,  81],
       [ 81, 181],
       [ 11,  11],
       [171,  61],
       [151,  21],
       [ 41,

In [52]:
frog_types = [0, 1, 2, 3, 4, 8, 12, 14, 16, 18, 21]
bird_types = [5, 6, 7, 9, 10, 11, 13, 15, 17, 19, 20, 22, 23]

frog_labels = []
for i in frog_types:
    if i == 16:
        new_i = 164
        frog_labels.append(new_i)
    else:
        new_i = int(str(i) + '1')
        frog_labels.append(new_i)
print(frog_labels)

bird_labels = []
for i in bird_types:
    if i == 17 or i == 23:
        new_i_1 = int(str(i) + '1')
        new_i_4 = int(str(i) + '4')
        bird_labels.extend([new_i_1, new_i_4])
    else:
        new_i = int(str(i) + '1')
        bird_labels.append(new_i)
print(bird_labels)

frog_bird_labels = frog_labels + bird_labels
print(frog_bird_labels)

[1, 11, 21, 31, 41, 81, 121, 141, 164, 181, 211]
[51, 61, 71, 91, 101, 111, 131, 151, 171, 174, 191, 201, 221, 231, 234]
[1, 11, 21, 31, 41, 81, 121, 141, 164, 181, 211, 51, 61, 71, 91, 101, 111, 131, 151, 171, 174, 191, 201, 221, 231, 234]


In [101]:
frog_bird_labels_str = [str(item) for item in frog_bird_labels]
mat = np.zeros((len(frog_bird_labels) + 1, len(frog_bird_labels) + 1), dtype = int)
mat[0, 1:] = frog_bird_labels
mat[1:,0] = frog_bird_labels

for item in test_pred:
    test = item[0]
    ind = np.where(mat[0] == test)[0][0]
    pred = item[1]
    ind_pred = np.where(mat[:,0] == pred)[0][0]
    if test == pred:
        mat[ind, ind] += 2
    else:
        mat[ind, ind] += 1
        mat[ind_pred, ind] += 1

In [106]:
np.savetxt('/home/birdsong/classifier_stats/cls_tp_log_reg_CM.csv', mat, delimiter = ',')

In [None]:
mat = np.zeros((len(frog_bird_labels) + 1, len(frog_bird_labels) + 1), dtype = int)
mat[0, 1:] = frog_bird_labels
mat[1:,0] = frog_bird_labels

for item in test_pred:
    test = item[0]
    ind = np.where(mat[0] == test)[0][0]
    pred = item[1]
    ind_pred = np.where(mat[:,0] == pred)[0][0]
    if test == pred:
        mat[ind, ind] += 2
    else:
        mat[ind, ind] += 1
        mat[ind_pred, ind] += 1