In [25]:
import os
import pickle
import sys
from collections import Counter

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

from features import FeatureExtractor


# %%---------------------------------------------------------------------------
#
#		                 Load Data From Disk
#
# -----------------------------------------------------------------------------

data_dir = 'testing_data' # directory where the data files are stored

# the filenames should be in the form 'speaker-data-subject-1.csv', e.g. 'speaker-data-Erik-1.csv'.

class_names = [] # the set of classes, i.e. speakers

# Empty accumulator used by the (currently disabled) multi-speaker loader below.
# It is overwritten by the single-file load further down.
data = np.zeros((0,8002)) #8002 = 1 (timestamp) + 8000 (for 8kHz audio data) + 1 (label)

# Disabled: loads and concatenates every 'speaker-data-*.csv' file in data_dir,
# assigning each speaker an integer label. Kept for reference; this notebook
# currently evaluates a single hard-coded file instead.
# for filename in os.listdir(data_dir):
# 	if filename.endswith(".csv") and filename.startswith("speaker-data"):
# 		filename_components = filename.split("-") # split by the '-' character
# 		speaker = filename_components[2]
# 		print("Loading data for {}.".format(speaker))
# 		if speaker not in class_names:
# 			class_names.append(speaker)
# 		speaker_label = class_names.index(speaker)
# 		sys.stdout.flush()
# 		data_file = os.path.join(data_dir, filename)
# 		data_for_current_speaker = np.genfromtxt(data_file, delimiter=',')
# 		print("Loaded {} raw labelled audio data samples.".format(len(data_for_current_speaker)))
# 		sys.stdout.flush()
# 		data = np.append(data, data_for_current_speaker, axis=0)

# Load one recording file; each row is one labelled audio window.
data_file = os.path.join(data_dir, 'speaker-data-Silent-0.csv')
data = np.genfromtxt(data_file, delimiter=',')

# genfromtxt returns a 1-D array when the file holds a single row. Promote it
# to shape (1, n_columns) so that len(data) counts windows and row-wise
# iteration downstream yields whole windows rather than individual scalars.
# An empty result is left untouched so it still iterates as zero windows.
if data.ndim == 1 and data.size > 0:
    data = data[np.newaxis, :]

print("Loaded {} raw labelled audio data samples.".format(len(data)))
sys.stdout.flush()





# print("Found data for {} speakers : {}".format(len(class_names), ", ".join(class_names)))
# %%---------------------------------------------------------------------------
#
#		                Extract Features & Labels
#
# -----------------------------------------------------------------------------

# Update this depending on how you compute your features
n_features = 985

print("Extracting features and labels for {} audio windows...".format(data.shape[0]))
sys.stdout.flush()

# change debug to True to show print statements we've included:
feature_extractor = FeatureExtractor(debug=False)

nr_total_windows = 0
nr_bad_windows = 0
nr_windows_with_zeros = 0

# Collect per-window results in lists and stack once after the loop; calling
# np.append inside the loop re-copies the whole matrix on every iteration.
feature_rows = []
labels = []

for window_with_timestamp_and_label in data:
    # Row layout: [timestamp, audio samples..., label]
    window = window_with_timestamp_and_label[1:-1]
    label = window_with_timestamp_and_label[-1]
    nr_total_windows += 1
    try:
        x = feature_extractor.extract_features(window)
    except KeyError as e:
        # The extractor could not process this window: report it, count it,
        # and note separately whether the window was pure silence (all zeros).
        print(e)
        nr_bad_windows += 1
        if np.all((window == 0)):
            nr_windows_with_zeros += 1
        continue

    feature_vector = np.ravel(x)
    if feature_vector.size != n_features:
        # Fail with an actionable message instead of letting the stacking step
        # raise an opaque shape-mismatch ValueError.
        raise ValueError("Received feature vector of length {}. Expected feature vector of length {}.".format(feature_vector.size, n_features))
    feature_rows.append(feature_vector)
    labels.append(label)

# The leading (0, n_features) block keeps the result 2-D with the right width
# even when no window produced features.
X = np.vstack([np.zeros((0, n_features))] + feature_rows)
y = np.asarray(labels, dtype=float)

print("{} windows found".format(nr_total_windows))
print("{} bad windows found, with {} windows with only zeros".format(nr_bad_windows, nr_windows_with_zeros))

print("Finished feature extraction over {} windows".format(len(X)))
print("Unique labels found: {}".format(set(y)))
sys.stdout.flush()

Loaded 196 raw labelled audio data samples.
Extracting features and labels for 196 audio windows...
196 windows found
0 bad windows found, with 0 windows with only zeros
Finished feature extraction over 196 windows
Unique labels found: {0.0}


In [26]:
# SECURITY: unpickling can execute arbitrary code. Only load classifier files
# produced by this project's own training run, never files from untrusted sources.
# The context manager closes the file handle once the model is loaded.
with open('training_output/classifier.pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Score of the loaded model on the extracted windows (presumably mean accuracy
# if this is a scikit-learn classifier - confirm against the training script).
result = loaded_model.score(X, y)
# print(result)

# Predicted label for every window in X.
prediction = loaded_model.predict(X)
print(len(prediction))

196


In [27]:
# Tally how many windows were assigned to each predicted label.
# Counter is imported in the notebook's import cell.
print(Counter(prediction))

Counter({0.0: 184, 2.0: 12})
