In [1]:
from sklearn.cluster import KMeans
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
from datetime import datetime as dt
import pandas as pd
import numpy as np
import random

In [2]:
DATA_PATH = "../../data/02_intermediate/"

In [3]:
def cc(arg):
    return mcolors.to_rgba(arg, alpha=0.6)

def all_colors():
    return ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple", "tab:brown", "tab:pink", "tab:gray", "tab:olive", "tab:cyan",
            "b", "g", "r", "c", "m", "y", "k",
            "limegreen", "cornflowerblue", "mediumblue", "darkorange", "maroon", "deepskyblue", "darkmagenta"]

def random_color():
    pallette = all_colors()
    return cc(random.choice(pallette))

def get_color(index):
    pallette = all_colors()
    return cc(pallette[index])

def all_lines():
    return ["-", "--", "-.", "."]

def all_markers():
    return [".", ",", "o", "v", "^", "1", "8", "*", "H", "d"]

def random_marker():
    return random.choice(all_markers())

In [4]:
def custom_scatter_2D(matrix, labels, cluster_centers, n_clusters, digit, coeffs=[0, 1]):
    """Scatter plot of 2 dimensions of kmeans results"""
    fig = plt.figure()
    ax = fig.add_subplot()

    clustered_matrix = parse_labels(matrix=matrix, labels=labels, n_clusters=n_clusters)
    # for cluster in clustered_matrix:
    for index in range(len(clustered_matrix)):
        cluster = clustered_matrix[index]
        xs = cluster[:, coeffs[0]]
        ys = cluster[:, coeffs[1]]
        ax.scatter(x=xs, y=ys, s=0.5, color=get_color(index=index), marker=random_marker())
    
    ax.set_xlabel(f"MFCC {coeffs[0]}")
    ax.set_ylabel(f"MFCC {coeffs[1]}")
    ax.set_title(f"K-Means Result for Digit {digit} with {n_clusters} Clusters")

    plt.show()

In [5]:
def create_filter_arr(labels, cluster):
    """Create boolean flagged filter array to apply to filter np array"""
    filter_arr = []
    for label in labels:
        if label == cluster:
            filter_arr.append(True)
        else:
            filter_arr.append(False)
    return filter_arr

def parse_labels(matrix, labels, n_clusters):
    """Filter matrix and labels into clustered matrices for scatter plotting"""
    clustered_matrix = []

    for cluster in range(n_clusters):
        filter_arr = create_filter_arr(labels=labels, cluster=cluster)
        sub_matrix = matrix[filter_arr]
        clustered_matrix.append(sub_matrix)

    return clustered_matrix

def analyze_cluster(matrix, labels, cluster_centers, n_clusters):
    """Compute covariance and pi value for gmm vars of clusters"""
    covariance_matrix = []
    pi_matrix = []

    for cluster in range(n_clusters):
        filter_arr = create_filter_arr(labels=labels, cluster=cluster)
        sub_matrix = matrix[filter_arr]

        pi = len(sub_matrix) / len(matrix)
        covariance = np.cov(np.transpose(sub_matrix))

        pi_matrix.append(pi)
        covariance_matrix.append(covariance)
        
    return (covariance_matrix, pi_matrix)

In [6]:
class GaussParams:
    """ Gaussian Mixture Model object to encapsulate params """
    def __init__(self, u, pi, cov):
        self.u = u
        self.pi = pi
        self.cov = cov

    def __str__(self):
        return f"u: {self.u}\npi: {self.pi}\ncov: {self.cov}"

In [7]:
def create_model(digits, clusters, model_coeffs, train_path):
    """
    Create a gaussian mixture model from parameters
    digits - max digit to train through
    cluters - array of clutser counts (indexed by digit)
    model_coeffs - range of model coefficients to use for modeling
    train_path - relative path to training data
    """
    gauss_results = []
    for digit in range(digits):
        # Read in train file and parse as dataframe
        filename = f"{train_path}{digit}.txt"
        df = pd.read_csv(filename, skip_blank_lines=True, delimiter=' ', header=None)
        df.dropna(axis=0, inplace=True)

        # Filter dataframe down to only model coefficient columns
        df_filter = df.iloc[:, model_coeffs]
        matrix = df_filter.values
        n_clusters = clusters[digit]

        # Apply kmeans on the matrix of values
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans.fit(matrix)
        labels = kmeans.labels_
        cluster_centers = kmeans.cluster_centers_

        # Record the GMM results (u, pi, and cov)
        cluster_covariance, cluster_pi = analyze_cluster(matrix=matrix, labels=labels, cluster_centers=cluster_centers, n_clusters=n_clusters)    
        gauss = GaussParams(u=cluster_centers, pi=cluster_pi, cov=cluster_covariance)
        gauss_results.append(gauss)

        # Visualize the kmeans plot as scatter in 2D
        # custom_scatter_2D(matrix=matrix, labels=labels, cluster_centers=cluster_centers, n_clusters=n_clusters, digit=digit, coeffs=PLOT_COEFFS)
    return gauss_results

In [8]:
def get_pi_df(gauss_results):
    """ Printing probability of ending up in each mixture"""
    pi_vals = []
    for index in range(len(gauss_results)):
        result = gauss_results[index]
        pi_vals.append(result.pi)

    # print(f"PI VALUES FOR GAUSS RESULTS (cluster result x digit)")
    pi_df = pd.DataFrame(pi_vals)
    return pi_df


In [9]:
def get_all_dataframes(digit, write_path, read_path, stopwatch):
    """
    Get all of the dataframes for a single digit 
    Use single_person data folder as intermediary for pandas read csv ease of use
    """
    start_time = dt.now()
    read_filename = f"{read_path}{digit}.txt"
    write_filename = f"{write_path}{digit}.txt"

    f = open(write_filename, "w")
    line_count = 0

    df_all = []

    # Open file and build out data
    with open(read_filename, "r") as file:
        for line in file:
            if len(line.strip()) != 0:
                f.write(line)
                line_count += 1
            elif line_count > 0:
                # Close file descriptor, read in written data, update dataframes
                f.close()
                df = pd.read_csv(write_filename, skip_blank_lines=True, delimiter=' ', header=None)
                df_all.append(df)

                # Reset line count and file descriptor for new dataframe parse
                line_count = 0
                f = open(write_filename, "w")

    # Likely have one more (no missing line on final line)
    if line_count > 0:
        f.close()
        df = pd.read_csv(write_filename, skip_blank_lines=True, delimiter=' ', header=None)
        df_all.append(df)

    end_time = dt.now()
    total_time = (end_time - start_time).total_seconds()

    if (stopwatch):
        print(f"Parsed {len(df_all)} frames in {total_time} sec")

    return df_all
     
def print_summary(digit, total_time, correct, utterances):
    """Output summary from classification to console"""
    accuracy = correct / utterances * 100
    accuracy = round(accuracy, 3)
    dt_format = "%H:%M:%S"
    cur_time = dt.strftime(dt.now(), dt_format) 
    print(f"#{digit}\taccuracy: {accuracy}%\tcorrect: {correct}\tutterances: {utterances}\ttotal_time: {round(total_time, 3)} sec\tcur_time: {cur_time}")

In [10]:
def classify_dataframe(df, gauss_results, digits, model_coeffs):
    """
    Classify a dataframe based on gaussian results
    df - dataframe to classify
    gauss_results - gmm mixture model results
    digits - total digits to validate
    model_coeffs - model coefficients (array)
    """

    # Perform classification on some test data
    posterior_all = []
    for d in range(digits):
        """Iterate over all digits (possible classifications"""
        posterior_digit = 0

        for n, row in df.iterrows():
            """Iterate over all n frames of the sample"""
            frames_n = row.to_numpy()[model_coeffs]

            sum_m = 0
            result_m = gauss_results[d]
            """Iterate over all results from gmm parameters"""
            cov, pi, u = result_m.cov, result_m.pi, result_m.u   

            for m in range(len(u)):
                """Iterate over all m dimensions of mixture model"""
                u_m = u[m]
                cov_m = cov[m]
                pi_m = pi[m]

                y = multivariate_normal.pdf(x=frames_n, mean=u_m, cov=cov_m)
                posterior_i = y * pi_m
                sum_m += posterior_i

            # end sum over all gauss components for digit
            posterior_digit += np.log(sum_m)
            
            # circuit break on underflow, no longer needed with logpdf
            # y = multivariate_normal.pdf(x=frames_n, mean=u_m, cov=cov_m)  # this causes underflow
            if posterior_digit == 0:
                sys.exit()

        # TODO - normalize by the number of samples (is this necessary?)
        # end product of all n frames
        # if (debug):
        #     print(f"digit: {d}\tposterior_digit: {posterior_digit}")
        posterior_all.append(posterior_digit)
    
    classification = posterior_all.index(max(posterior_all))
    return (classification, posterior_all)

In [11]:
def test_model(classify_every, gauss_results, model_coeffs, digits):
    """
    Using previously computed gauss results, test the model
    classify_every - increase for speed (skip values)
    gauss_results - gaussian mixture model parameters
    model_coeffs - coefficients from model used in generation
    digits - total digits to train on
    """

    test_read_path = f"{DATA_PATH}test_digits/test_0"
    test_write_path = f"{DATA_PATH}single_person/test_0"

    classify_results = []
    summary_lists = []
    for digit in range(digits):
        total_classified = 0
        correct = 0
        df_all = get_all_dataframes(digit=digit, write_path=test_write_path, read_path=test_read_path, stopwatch=False)
        classify_digit = [0]*digits

        index = 0
        start_time = dt.now()
        for df in df_all:
            if index % classify_every == 0:
                (classification, posterior_all) = classify_dataframe(df=df, gauss_results=gauss_results, digits=digits, model_coeffs=model_coeffs) 
                total_classified += 1
                classify_digit[classification] += 1
                if classification == digit:
                    correct += 1
            index += 1

        classify_results.append(classify_digit)

        end_time = dt.now()
        total_time = round((end_time - start_time).total_seconds(), 3)
        accuracy = round(correct / total_classified * 100, 3)
        # total_time = f"{total_time} sec"
        # accuracy = f"{accuracy} %"
        summary = [digit, accuracy, correct, total_classified, total_time]
        summary_lists.append(summary)

    summary_df = pd.DataFrame(summary_lists, columns=["Digit", "Accuracy (%)", "Correct", "Classified", "Time (s)"])
    classify_df = pd.DataFrame(classify_results)

    return (summary_df, classify_df)

In [12]:
def name_files(use_coeffs, num_clusters):
    return f"{use_coeffs}mfcc_{num_clusters}clust"
    

In [13]:
# Define all relative file paths to actually get files
TRAIN_PATH = f"{DATA_PATH}train_digits/train_0"
TEST_PATH = f"{DATA_PATH}test_digits/test_0"
WRITE_PATH = f"../../data/03_results/"

PI_SUFFIX = "_pi.csv"
CONF_SUFFIX = "_conf.csv"
RESULTS_SUFFIX = "_results.csv"
SUMMARY_SUFFIX = "_summary.csv"

DIGITS = 10
PLOT_COEFFS = [0, 1]

In [14]:
use_coeffs = 7
digits = 10
model_coeffs = range(use_coeffs)
num_clusters = 3
clusters = [num_clusters]*digits
classify_every = 5

MAX_CLUSTERS = 8
for num_clusters in range(1,MAX_CLUSTERS):
    clusters = [num_clusters]*digits

    # Run model on parameters
    gauss_results = create_model(digits=digits, clusters=clusters, model_coeffs=model_coeffs, train_path=TRAIN_PATH)
    (summary_df, classify_df) = test_model(classify_every=classify_every, gauss_results=gauss_results, model_coeffs=model_coeffs, digits=digits)
    pi_df = get_pi_df(gauss_results=gauss_results)

    # Write output files and send useful printout
    prefix = name_files(use_coeffs=use_coeffs, num_clusters=num_clusters)
    pi_filename = f"{WRITE_PATH}{prefix}{PI_SUFFIX}"
    conf_filename = f"{WRITE_PATH}{prefix}{CONF_SUFFIX}"
    results_filename = f"{WRITE_PATH}{prefix}{RESULTS_SUFFIX}"
    # summary_filename = f"{WRITE_PATH}{prefix}{SUMMARY_SUFFIX}"

    # TODO write to datafile
    print(f"{prefix}: \n{summary_df.mean(axis=0)}\n")
    summary_df.to_csv(results_filename)
    classify_df.to_csv(conf_filename)
    pi_df.to_csv(pi_filename)

7mfcc_1clust: 
Digit            4.5000
Accuracy (%)    70.7677
Correct         31.2000
Classified      44.1000
Time (s)         5.9973
dtype: float64

7mfcc_2clust: 
Digit            4.5000
Accuracy (%)    79.1363
Correct         34.9000
Classified      44.1000
Time (s)         8.8299
dtype: float64

7mfcc_3clust: 
Digit            4.5000
Accuracy (%)    79.8183
Correct         35.2000
Classified      44.1000
Time (s)        11.8460
dtype: float64

7mfcc_4clust: 
Digit            4.5000
Accuracy (%)    80.2576
Correct         35.4000
Classified      44.1000
Time (s)        14.4753
dtype: float64

7mfcc_5clust: 
Digit            4.5000
Accuracy (%)    78.9041
Correct         34.8000
Classified      44.1000
Time (s)        18.3012
dtype: float64

7mfcc_6clust: 
Digit            4.5000
Accuracy (%)    78.7020
Correct         34.7000
Classified      44.1000
Time (s)        22.5781
dtype: float64

7mfcc_7clust: 
Digit            4.5000
Accuracy (%)    80.7271
Correct         35.6000
Classif

In [16]:
MAX_COEFFS = 11
digits = 10
model_coeffs = range(use_coeffs)
num_clusters = 4
clusters = [num_clusters]*digits
classify_every = 5

for use_coeffs in range(1,MAX_COEFFS):
    model_coeffs = range(use_coeffs)

    # Run model on parameters
    gauss_results = create_model(digits=digits, clusters=clusters, model_coeffs=model_coeffs, train_path=TRAIN_PATH)
    (summary_df, classify_df) = test_model(classify_every=classify_every, gauss_results=gauss_results, model_coeffs=model_coeffs, digits=digits)
    pi_df = get_pi_df(gauss_results=gauss_results)

    # Write output files and send useful printout
    prefix = name_files(use_coeffs=use_coeffs, num_clusters=num_clusters)
    pi_filename = f"{WRITE_PATH}{prefix}{PI_SUFFIX}"
    conf_filename = f"{WRITE_PATH}{prefix}{CONF_SUFFIX}"
    results_filename = f"{WRITE_PATH}{prefix}{RESULTS_SUFFIX}"
    # summary_filename = f"{WRITE_PATH}{prefix}{SUMMARY_SUFFIX}"

    # Useful printout for debug, save files
    print(f"{prefix}: \n{summary_df.mean(axis=0)}\n")
    summary_df.to_csv(results_filename)
    classify_df.to_csv(conf_filename)
    pi_df.to_csv(pi_filename)

    txt_out = f"{WRITE_PATH}{num_clusters}clust_out.txt"
    f = open(txt_out, "w")
    f.write(f"{prefix}: \n{summary_df.mean(axis=0)}\n")
    f.close()

1mfcc_4clust: 
Digit            4.5000
Accuracy (%)    33.1515
Correct         14.6000
Classified      44.1000
Time (s)         6.2872
dtype: float64

2mfcc_4clust: 
Digit            4.5000
Accuracy (%)    43.0709
Correct         19.0000
Classified      44.1000
Time (s)         7.6714
dtype: float64

3mfcc_4clust: 
Digit            4.5000
Accuracy (%)    69.8385
Correct         30.8000
Classified      44.1000
Time (s)         7.2889
dtype: float64

4mfcc_4clust: 
Digit            4.5000
Accuracy (%)    72.0910
Correct         31.8000
Classified      44.1000
Time (s)         7.7127
dtype: float64

5mfcc_4clust: 
Digit            4.5000
Accuracy (%)    75.0505
Correct         33.1000
Classified      44.1000
Time (s)         8.3568
dtype: float64

6mfcc_4clust: 
Digit            4.5000
Accuracy (%)    76.8486
Correct         33.9000
Classified      44.1000
Time (s)        13.1460
dtype: float64

7mfcc_4clust: 
Digit            4.5000
Accuracy (%)    80.2576
Correct         35.4000
Classif