In [None]:
# Imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

## Audio loading and preprocessing

In [None]:
# Load the dataset through the project-local preprocess_data helper.
from preprocess_data import preprocess_data

# The helper's return value is unpacked as (features, (labels, labels_ohe), transformers).
# NOTE(review): labels_ohe is presumably a one-hot encoding of labels and transformers the
# fitted preprocessing objects -- confirm against preprocess_data.
features, (labels, labels_ohe), transformers = preprocess_data()

In [None]:
# Show the distinct label values present in the dataset.
labels.unique()

In [None]:
# Print the dimensions of the feature table as reported by its .shape attribute.
print(features.shape)

## K-Means Clustering and t-SNE for data visualization

In [None]:
# Cluster the keystroke feature vectors with K-Means and report the clustering quality.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Set k to the number of unique labels logged, so that each cluster could in
# principle line up with one label value.
cluster_k = len(labels.unique())
# random_state is fixed so that re-running the cell reproduces the same clusters.
kmeans = KMeans(n_clusters=cluster_k, random_state=0, n_init="auto").fit(features)

print(f"{cluster_k} clusters")
print('K-Means Clustering')
print(f"\tSilhouette score: {silhouette_score(features, kmeans.labels_) :.2f}")
print('')

# Get the cluster prediction for every row in one vectorised call.
# predict() accepts the whole (n_samples, n_features) table and returns a 1-D
# array with one cluster index per row, so no per-row loop or squeeze() is
# needed. Passing the same DataFrame the model was fitted on also keeps the
# column names, which avoids scikit-learn's missing-feature-names warning.
predicted_clusters = kmeans.predict(features)

In [None]:
# Plot the data with t-SNE dimensionality reduction to determine if the keystrokes make clusters

from sklearn.manifold import TSNE
import seaborn as sn

def tsne_plot(data, labels=None, title="t-SNE keystrokes"):
    """Project ``data`` to two dimensions with t-SNE and show a scatter plot.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix handed to ``TSNE.fit_transform``.
    labels : 1-D array-like of length n_samples, optional
        One label per row of ``data``. When given, points are coloured by
        label and a legend is drawn to the right of the axes. When omitted,
        all points share one colour and no legend is drawn.
    title : str, optional
        Title of the plot. Defaults to the title this function always used.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``labels`` is given but is not 1-D with one entry per row of ``data``.
    """
    # Two output dimensions; random_state is fixed so the embedding is
    # reproducible. Every other hyper-parameter is left at whatever default the
    # installed scikit-learn version provides.
    model = TSNE(n_components=2, random_state=1)
    tsne_data = model.fit_transform(data)

    # Keep the coordinates in their own numeric columns. Stacking the labels
    # into the same ndarray would force one common dtype on coordinates and
    # labels alike (text labels would turn the coordinates non-numeric).
    tsne_df = pd.DataFrame(data=tsne_data, columns=("Dim_1", "Dim_2"))

    if labels is not None:
        # np.asarray drops any pandas index so values are assigned by position
        # rather than aligned by index label.
        label_values = np.asarray(labels)
        if label_values.ndim != 1 or label_values.shape[0] != len(tsne_df):
            raise ValueError(
                f"labels must be 1-D with {len(tsne_df)} entries, "
                f"got shape {label_values.shape}"
            )
        tsne_df["label"] = label_values

        # Colour the points by label and place the legend outside the axes so
        # it does not cover any points.
        ax = sn.scatterplot(data=tsne_df, x="Dim_1", y="Dim_2",
                            hue="label", palette="bright")
        ax.legend(bbox_to_anchor=(1.05, 1.0), loc="upper left")
    else:
        # No hue variable: a palette would be ignored and there is nothing to
        # put in a legend, so neither is requested.
        ax = sn.scatterplot(data=tsne_df, x="Dim_1", y="Dim_2")

    ax.set_title(title)
    plt.show()

# Embed the features with t-SNE and colour each point by its label.
tsne_plot(features, labels)