In [None]:
# Reload imported modules before each cell executes, so edits to the local
# helper modules (myfunctions, myplot, mylearn, myhelpers) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import myfunctions
import myplot
import mylearn
import myhelpers

# Recording descriptors. Fields, in order:
#   (path, sync_adjustment[, adjust_missing_samples[, name]])
# `name` is the short tag stored on the loaded dataset and printed when
# classifier scores are reported, so it should be unique per recording.
logitech_step1_fast = ('logitech-fast/niels-step_1-1617765899871', 0)
logitech_step1_slow = ('logitech-slow/niels-step_1-1617765658270', 60, True)
lenovo_slow = ('lenovo-slow-easy/niels-step_5-1617952576650', 0)
lenovo_slow2 = ('lenovo-slow-easy/niels-step_5-1618197371437', 0)
lenovo_hard = ('lenovo-hard/niels-step_1-1618199054005', 0)
noweb1 = ('winform/winform-1618904428438', 60, False)
noweb2 = ('winform/winform-1619147742756', 75, False)

# Home, quiet, long measurement, (Space,Return,Back,Tilde), 903 keystrokes with 731 > 0.1
noweb_logitech1 = ('winform/winform-1619400514590', -80, True, 'home,long,4key') # Still lost some samples after about 100 seconds

# Home, quiet, short measurement taken a little while after noweb_logitech1, (Space,Return,Back,Tilde), 42 keystrokes with 19 > 0.1
noweb_logitech2 = ('winform/winform-1619417452127', 70, False, 'home,short,4key')

# NTU lab, long, (Space,Return,Back,Tilde)
noweb_logitech3 = ('winform/winform-1619490434553', 70, False, 'NTU,long,4key')

# NTU lab, short, (Space,Return,Back,Tilde)
noweb_logitech4 = ('winform/winform-1619497973221', 70, False, 'NTU,short,4key')

# NTU lab, short, AC noise, (Space,Return,Back,Tilde)
noweb_logitech5 = ('winform/winform-1619498839675', 70, False, 'NTU,short,4key,AC')

# NTU lab, short, AC noise, typing some text naturally.
# Tagged 'text' rather than '4key': this recording is free typing, and reusing
# noweb_logitech5's tag made the two indistinguishable in score reports.
noweb_logitech6 = ('winform/winform-1619499075929', 70, False, 'NTU,short,text,AC')


def getData2(x, keystroke_min_peak_level):
    """Load one recording through ``myfunctions.getData`` and tag it with a name.

    Parameters
    ----------
    x : tuple
        Recording descriptor with 2 to 4 fields, in order:
        ``(path, sync_adjustment[, adjust_missing_samples[, name]])``.
        A missing ``adjust_missing_samples`` is treated as ``False`` and a
        missing ``name`` as ``''``.
        NOTE(review): ``False`` for 2-field descriptors is an assumption
        (no missing-sample adjustment) -- confirm for the older recordings.
    keystroke_min_peak_level
        Forwarded unchanged to ``myfunctions.getData``.

    Returns
    -------
    The object returned by ``myfunctions.getData`` with ``d['name']`` set.

    Raises
    ------
    ValueError
        If the descriptor does not have 2, 3 or 4 fields.
    """
    if not 2 <= len(x) <= 4:
        raise ValueError(
            f'recording descriptor must have 2 to 4 fields, got {len(x)}: {x!r}')

    path, sync_adjustment = x[0], x[1]
    adjust_missing_samples = x[2] if len(x) > 2 else False
    name = x[3] if len(x) > 3 else ''

    d = myfunctions.getData(path,
                            sync_adjustment=sync_adjustment,
                            adjust_missing_samples=adjust_missing_samples,
                            keystroke_min_peak_level=keystroke_min_peak_level)
    d['name'] = name

    # Summarise the first field of every keystroke entry (presumably the key
    # label) as group percentages, one group per line.
    first_fields = [keystroke[0] for keystroke in d['keystrokes']]
    print(myhelpers.getListGroupPercentages(first_fields, '\n'))
    return d

In [None]:
# Load the six winform/Logitech recordings with one shared peak threshold.
# They are loaded in order 1..6, so the printed per-recording summaries
# appear in that order too.
d1, d2, d3, d4, d5, d6 = [
    getData2(recording, keystroke_min_peak_level=0.05)
    for recording in (
        noweb_logitech1,
        noweb_logitech2,
        noweb_logitech3,
        noweb_logitech4,
        noweb_logitech5,
        noweb_logitech6,
    )
]

In [None]:
# Scratch dataset for the exploratory cells in this notebook. Later cells read
# `d` (clustering, cluster-count search, librosa feature shapes,
# plotMatplotlib), so it has to stay bound: resetting it to None at the end of
# this cell makes those cells fail on Restart & Run All.
# NOTE(review): d6 is simply the last dataset that was selected here -- point
# `d` at whichever recording the later cells should analyse.
d = d6

# myplot.plotLostSamples(d)

# myplot.plotKeystrokeContext(d, 20, 70)

# for i in range(21, 31):
#     myplot.plotKeystroke(d, i)

# d.keys()

# set([x[3] for x in d['down_events']])

# for i in range(10, 17):
#     myplot.plotKeystroke(d, i)

In [None]:
# Earlier experiment that worked pretty well (lenovo):
# mylearn.testClustering(getData2(noweb2, keystroke_min_peak_level=0.1),
#     keep=None,
#     features=['mfcc_mean', 'mfcc_std'])

# Larger dataset (6 min, ~800 keystrokes) gives a half-decent result, but it
# is still an easy case with just 4 keys (Return, Space, Tilde, Backspace).
# The recording is reloaded here with a stricter peak threshold (0.1).
mylearn.testClustering(
    getData2(noweb_logitech1, keystroke_min_peak_level=0.1),
    features=['mfcc_mean', 'mfcc_std'],
    keep=None,
)

In [None]:
# Same clustering test on the scratch dataset `d`, using the MFCC mean/std
# features; the commented-out names are further candidates to try.
mylearn.testClustering(
    d,
    features=[
        'mfcc_mean',
        'mfcc_std',
        # 'mfcc_max',
        # 'mfcc_argmax_time',
        # 'mfcc_argmax_channel'
    ],
    keep=None,
)



In [None]:
def findNumberOfClusters(features, cluster_counts=None, random_state=None):
    """Score K-means and Gaussian-mixture fits over a range of cluster counts.

    Parameters
    ----------
    features
        Feature matrix passed unchanged to the estimators' ``fit``, ``score``,
        ``predict`` and ``bic`` methods and to ``silhouette_score``.
    cluster_counts : iterable of int, optional
        Cluster counts to evaluate. Defaults to 2..19 inclusive.
    random_state : optional
        Forwarded to both estimators; set it to make runs repeatable.

    Returns
    -------
    (x, scores)
        ``x`` is the list of evaluated cluster counts and ``scores`` holds one
        tuple per count:
        ``(kmeans_score, kmeans_silhouette_score, gm_score, gm_bic_score)``.
    """
    x = list(range(2, 20)) if cluster_counts is None else list(cluster_counts)
    scores = []
    for n_clusters in tqdm(x):
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        kmeans.fit(features)
        kmeans_score = kmeans.score(features)
        clustering = kmeans.predict(features)
        kmeans_silhouette_score = silhouette_score(features, clustering)

        gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
        gm.fit(features)
        gm_score = gm.score(features)
        gm_bic_score = gm.bic(features)

        scores.append((kmeans_score, kmeans_silhouette_score, gm_score, gm_bic_score))
    return x, scores

# Build the feature matrix once and hand that same matrix to the search.
features = mylearn.getConcatenatedFeatures(d, ['mfcc_max', 'mfcc_mean', 'mfcc_std'])
number_of_clusters, scores = findNumberOfClusters(features)

# One scatter plot per metric; the names follow the order of the fields in
# each score tuple returned by findNumberOfClusters.
score_names = ['kmeans_score', 'kmeans_silhouette_score', 'gm_score', 'gm_bic_score']
for (score_index, score_name) in enumerate(score_names):
    plt.figure()
    plt.title(score_name)
    plt.xlabel('number of clusters')
    plt.scatter(number_of_clusters, [score_tuple[score_index] for score_tuple in scores])
    plt.show()

In [None]:
import librosa

# Frame parameters in samples, assuming d['sr'] is the sample rate in Hz:
# a 10 ms analysis window advanced in 2.5 ms hops.
sr = d['sr']
srms = int(sr/1000)  # samples per millisecond (truncated)
window_length = int(10*srms)
hop_length = int(2.5*srms)
wav = d['keystroke_wavs'][0]
print( len(wav) )         # length of the first keystroke clip in samples
print( len(wav) / srms )  # ... and in milliseconds

# Print the output shape of a few librosa features for that clip. The audio and
# sample rate are passed by keyword: librosa >= 0.10 rejects them positionally.
print( librosa.feature.spectral_centroid(y=wav, sr=sr, win_length=window_length, hop_length=hop_length).shape )
print( librosa.feature.chroma_stft(y=wav, sr=sr, win_length=window_length, hop_length=hop_length).shape )
print( librosa.feature.zero_crossing_rate(y=wav, frame_length=window_length, hop_length=hop_length).shape )

In [None]:
# PCA plot (via myplot.plotPCA) of dataset d1 using the 'mfcc_max' and
# 'mfcc_mean' features.
myplot.plotPCA(d1, ['mfcc_max', 'mfcc_mean'])
# Candidate feature names: 'mfcc_max', 'mfcc_mean', 'mfcc_std', 'mfcc_argmax_time', 'mfcc_argmax_channel'

In [None]:
# Same PCA plot for dataset d3, for comparison with d1.
myplot.plotPCA(d3, ['mfcc_max', 'mfcc_mean'])

In [None]:
# Plot the scratch dataset `d` with myplot.plotMatplotlib.
# The commented-out lines are alternative ad-hoc loads/plots kept for reference.
# d=getData2(('winform-1619490289360', 0, False), 0)
myplot.plotMatplotlib(d)
# myplot.plotWaveAndKeys(d)
# getData2(x, keystroke_min_peak_level)

In [None]:


# Notes from an earlier experiment: about 100 training samples seem to be
# enough (for 4 keys).
# NOTE(review): logRegTrain / logRegTest are not defined anywhere in the
# visible notebook, so the snippets below cannot be re-enabled as they are.
# [(training_sample_count, logRegTrain(d1, training_sample_count=training_sample_count)[2])
#  for training_sample_count in [10, 20, 30, 40, 50, 75, 100, 125, 150, 175, 200, 250, 300, 400, 500, 600, 700, 800]
# ]

# scaler, lr = logRegTrain(d1)
# logRegTest(d3, scaler, lr)

# Silence every Python warning from here on, to keep the score listings
# readable. NOTE(review): this also hides warnings that may matter; consider
# filtering specific categories instead.
import warnings
warnings.filterwarnings('ignore')

def test(d_train, d_test, featurenames, classifier, labels_filter=None):
    def dname(data):
        if type(data) != list:
            data = [data]
        return '+'.join([d['name'] for d in data])
    scaler, classifier = mylearn.classifierTrain(d_train, featurenames=featurenames, classifier=classifier, labels_filter=labels_filter)
    score = mylearn.classifierTest(scaler, classifier, d_test, labels_filter=labels_filter, featurenames=featurenames)
    if labels_filter == None:
        print (f'Score {score:1.3f} when trained on {dname(d_train):30} and tested using {dname(d_test):20}')
    else:
        print (f'Score {score:1.3f} when trained on {dname(d_train):30} and tested using {dname(d_test):20} with labels filter {labels_filter}')
    return score

# default solver is incredibly slow which is why it was changed to 'lbfgs'
def testAll(featurenames, classifierFactory=lambda: LogisticRegression(solver = 'lbfgs')):
    scores = []
    scores.append(test(d1, d2, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test(d1, d3, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test(d1, d4, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test(d1, d5, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test(d1, d6, featurenames=featurenames, classifier=classifierFactory()))
    print()
    scores.append(test(d3, d1, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test(d3, d2, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test(d3, d4, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test(d3, d5, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test(d3, d6, featurenames=featurenames, classifier=classifierFactory()))
    print()
    scores.append(test([d1, d3], d2, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test([d1, d3], d4, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test([d1, d3], d5, featurenames=featurenames, classifier=classifierFactory()))
    scores.append(test([d1, d3], d6, featurenames=featurenames, classifier=classifierFactory()))
    print()
    scores.append(test(d1, d2, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test(d1, d3, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test(d1, d4, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test(d1, d5, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test(d1, d6, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    print()
    scores.append(test(d3, d1, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test(d3, d2, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test(d3, d4, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test(d3, d5, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test(d3, d6, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    print()
    scores.append(test([d1, d3], d2, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test([d1, d3], d4, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test([d1, d3], d5, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    scores.append(test([d1, d3], d6, featurenames=featurenames, classifier=classifierFactory(), labels_filter=['Space']))
    print (f'Mean score: {np.mean(scores)}')

In [None]:
# Shape of d1's 'mfcc_flattened' entry -- the only feature passed to testAll
# in the cells below.
d1['mfcc_flattened'].shape

In [None]:
# Baseline run: the default factory builds a LogisticRegression per evaluation.
testAll(featurenames=['mfcc_flattened'])

In [None]:
# k-nearest-neighbours with default settings; the class itself is the factory.
testAll(featurenames=['mfcc_flattened'], classifierFactory=KNeighborsClassifier)

In [None]:
# Multi-layer perceptron with default settings; the class itself is the factory.
testAll(featurenames=['mfcc_flattened'], classifierFactory=MLPClassifier)

In [None]:
# Gaussian naive Bayes with default settings; the class itself is the factory.
testAll(featurenames=['mfcc_flattened'], classifierFactory=GaussianNB)

In [None]:
import random

def specshowRandomMfccFeatures(data, key):
    """Display the normalised MFCC matrix of one randomly chosen keystroke of ``key``.

    Parameters
    ----------
    data
        Dataset exposing ``'keystroke_labels'`` and
        ``'normalised_mfcc_features'``, indexed in parallel.
    key
        Label to pick a keystroke for (compared with ``==``).

    Raises
    ------
    ValueError
        If no keystroke in ``data`` carries the label ``key``.
    """
    labels = data['keystroke_labels']
    indexes = [i for i, label in enumerate(labels) if label == key]
    if not indexes:
        # random.choice([]) would fail with an IndexError that names neither the
        # key nor the labels that do exist.
        available = sorted(set(labels), key=str)
        raise ValueError(f'No keystroke labelled {key!r}; available labels: {available}')
    index = random.choice(indexes)
    print (f'Key {data["keystroke_labels"][index]} at index {index}')
    librosa.display.specshow(data['normalised_mfcc_features'][index])

# Show a random 'Space' keystroke from d1.
specshowRandomMfccFeatures(data=d1, key='Space')

In [None]:
# Show a random 'Oemtilde' keystroke from d1, for comparison with 'Space'.
specshowRandomMfccFeatures(data=d1, key='Oemtilde')

In [None]:
# Distinct keystroke labels present in d1.
{*d1["keystroke_labels"]}

In [None]:
import librosa

# Second field of d1's first keystroke entry -- presumably the audio clip,
# since it is passed to librosa as the signal below.
w = d1['keystrokes'][0][1]
sr = d1['sr']
print(w.shape)
print(sr)

# MFCC output shape when the FFT window and the hop both span the whole clip.
# The signal and sample rate are passed by keyword: librosa >= 0.10 rejects
# them positionally.
print(librosa.feature.mfcc(y=w, sr=sr, n_mfcc=32, n_fft=len(w), hop_length=len(w)).shape)