In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
import itertools
import os
import shutil
import random
import glob
import matplotlib.pyplot as plt
import warnings
import scipy
import scipy.integrate as integrate
import sklearn
from collections import Counter
import joblib
import gc
from skimage.io import imread
from skimage.transform import resize
from sklearn import datasets
import pickle
from joblib import dump, load
from skimage.transform import rescale
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
def resize_all(src, pklname, include, width=150, height=None):
    height = height if height is not None else width

    data = dict()
    data['description'] = 'resized ({0}x{1})mini wav images'.format(int(width), int(height))
    data['label'] = []
    data['filename'] = []
    data['data'] = []

    pklname = f"{pklname}_{width}x{height}px.pkl"

    # read all images in PATH, resize and write to DESTINATION_PATH
    for subdir in os.listdir(src):
        if subdir in include:
            print(f"Reading images for {subdir} ...")
            current_path = os.path.join(src, subdir)

            for file in os.listdir(current_path):
                if file[-3:] in {'jpg', 'png'}:
                    im = imread(os.path.join(current_path, file))
                    im = resize(im, (width, height)) #[:,:,::-1]
                    data['label'].append(subdir[:])
                    data['filename'].append(file)
                    data['data'].append(im)
            gc.collect()

        joblib.dump(data, pklname)



In [3]:
# parameters
IMAGE_PATH = '/content/drive/MyDrive/wavForm/test'
CLASSES = os.listdir(IMAGE_PATH)
BASE_NAME = '/content/drive/MyDrive/wavForm/test_data'
WIDTH = 180

# load & resize the images
resize_all(src=IMAGE_PATH, pklname=BASE_NAME, width=WIDTH, include=CLASSES)

Reading images for 62872wav ...
Reading images for 63705wav ...
Reading images for 64560wav ...
Reading images for 63879wav ...
Reading images for 62663wav ...
Reading images for 63660wav ...
Reading images for 68585wav ...
Reading images for 64968wav ...
Reading images for 64798wav ...
Reading images for 64701wav ...
Reading images for 64914wav ...
Reading images for 67687wav ...
Reading images for 65372wav ...
Reading images for 67320wav ...
Reading images for 65438wav ...
Reading images for 68384wav ...
Reading images for 72048wav ...
Reading images for 74239wav ...
Reading images for 70560wav ...
Reading images for 72298wav ...
Reading images for 70730wav ...
Reading images for 73213wav ...
Reading images for 73657wav ...
Reading images for 71726wav ...
Reading images for 70869wav ...
Reading images for 77607wav ...
Reading images for 77771wav ...
Reading images for 77426wav ...
Reading images for 76629wav ...
Reading images for 74364wav ...
Reading images for 75263wav ...
Reading 

In [4]:
# read from the pickle/dictionary
BASE_NAME = '/content/drive/MyDrive/wavForm/test_data'
WIDTH = 180
data = joblib.load(f'{BASE_NAME}_{WIDTH}x{WIDTH}px.pkl')

print('number of samples: ', len(data['data']))
print('keys: ', list(data.keys()))
print('description: ', data['description'])
print('image shape: ', data['data'][0].shape)
print('labels:', np.unique(data['label']))

Counter(data['label'])

number of samples:  16221
keys:  ['description', 'label', 'filename', 'data']
description:  resized (180x180)mini wav images
image shape:  (180, 180, 4)
labels: ['62663wav' '62872wav' '63660wav' '63705wav' '63879wav' '64560wav'
 '64701wav' '64798wav' '64914wav' '64968wav' '65372wav' '65438wav'
 '67320wav' '67687wav' '68384wav' '68585wav' '70560wav' '70730wav'
 '70869wav' '71726wav' '72048wav' '72298wav' '73213wav' '73657wav'
 '74239wav' '74364wav' '75263wav' '75377wav' '75665wav' '76326wav'
 '76617wav' '76629wav' '77426wav' '77607wav' '77771wav' '77934wav'
 '78241wav' '79105wav' '79203wav' '79683wav']


Counter({'62872wav': 484,
         '63705wav': 197,
         '64560wav': 659,
         '63879wav': 322,
         '62663wav': 597,
         '63660wav': 385,
         '68585wav': 345,
         '64968wav': 543,
         '64798wav': 570,
         '64701wav': 338,
         '64914wav': 319,
         '67687wav': 306,
         '65372wav': 332,
         '67320wav': 1900,
         '65438wav': 338,
         '68384wav': 296,
         '72048wav': 626,
         '74239wav': 370,
         '70560wav': 284,
         '72298wav': 692,
         '70730wav': 451,
         '73213wav': 570,
         '73657wav': 282,
         '71726wav': 333,
         '70869wav': 819,
         '77607wav': 245,
         '77771wav': 193,
         '77426wav': 214,
         '76629wav': 233,
         '74364wav': 106,
         '75263wav': 324,
         '76326wav': 508,
         '76617wav': 82,
         '75665wav': 260,
         '75377wav': 141,
         '77934wav': 194,
         '78241wav': 362,
         '79683wav': 193,
         '79

In [None]:
# use np.unique to get all unique values in the list of labels
labels = np.unique(data['label'])

In [None]:
# feature engineering
x = np.array(data['data'])
y = np.array(data['label'])
data=None
gc.collect()
del data

# split - train & test
SIZE = 0.3
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=SIZE,shuffle=True,random_state=42)

In [None]:
x = None
y = None
del x
del y
gc.collect()

0

In [None]:
# normalisation
x_train = x_train/255.0
x_test = x_test/255.0

# reshape the array (4d to 2d)
nsamples, nx, ny, nrgb = x_train.shape
x_train2 = x_train.reshape((nsamples,nx*ny*nrgb))

nsamples, nx, ny, nrgb = x_test.shape
x_test2 = x_test.reshape((nsamples,nx*ny*nrgb))

In [None]:
x_train = None
x_test = None
del x_train
del x_test
gc.collect()

0

In [None]:
# random forest classifier - baseline
rfc = RandomForestClassifier(n_estimators=2000)

# fit
rfc.fit(x_train2,y_train)

# predict
y_pred = rfc.predict(x_test2)

# accuracy score
acc = '{:.1%}'.format(accuracy_score(y_test, y_pred))
print(f"Accuracy for Random Forrest: {acc}")

Accuracy for Random Forrest: 84.4%


In [None]:
#save model
dump(rfc, '/content/drive/MyDrive/wavForm/project/rfc.joblib')

['/content/drive/MyDrive/wavForm/project/rfc.joblib']

In [None]:
#load model
rfc2 = load('/content/drive/MyDrive/wavForm/project/rfc.joblib')
# predict
y_pred = rfc2.predict(x_test2)
# accuracy score
acc = '{:.1%}'.format(accuracy_score(y_test, y_pred))
rec=recall_score(y_test, y_pred, average='macro')
pre=precision_score(y_test, y_pred, average='macro')
f1=f1_score(y_test, y_pred, average='macro')
print(f"Accuracy for Random Forrest: {acc}")

print(f"recall_score:{rec}")
print(f"precision_score:{pre}")
print(f"f1_score:{f1}")

Accuracy for Random Forrest: 84.4%
recall_score:0.7887668760902675
precision_score:0.8784695185459895
f1_score:0.8095778303376697
