First thing we need to do is manage our imports. Without importing these things, we cannot use them.

In [None]:
import letp
import sklearn.metrics as me
import sklearn.datasets as d
import sklearn.neighbors as nn
import numpy as np
import pandas as pd

Next, we define some analysis functions and register them, by name, in a dictionary.

In [2]:
def record(measurement):
    """Return the measurement unchanged (a pass-through analysis)."""
    return measurement

def confusion_matrix(measurement):
    """Compute the confusion matrix for a (predicted, true) label pair.

    Args:
        measurement: Two-item sequence ``(predicted_labels, true_labels)``,
            the order in which the step functions in this notebook yield
            the ``'pred_true_values'`` measurement.

    Returns:
        The result of ``sklearn.metrics.confusion_matrix``: rows index the
        true labels and columns index the predicted labels.
    """
    predicted, actual = measurement[0], measurement[1]
    # scikit-learn's signature is (y_true, y_pred); passing the pair in
    # measurement order would transpose the matrix.
    return me.confusion_matrix(actual, predicted)

def precision(measurement):
    """Compute the precision score for a (predicted, true) label pair.

    Args:
        measurement: Two-item sequence ``(predicted_labels, true_labels)``,
            the order in which the step functions in this notebook yield
            the ``'pred_true_values'`` measurement.

    Returns:
        The result of ``sklearn.metrics.precision_score`` with its default
        averaging.
    """
    predicted, actual = measurement[0], measurement[1]
    # scikit-learn's signature is (y_true, y_pred). ``average='samples'`` is
    # only defined for multilabel targets and raises on plain label vectors,
    # so the default averaging is used here, as in ``recall`` and ``f1``.
    # NOTE(review): the default is binary averaging; multiclass labels need
    # an explicit ``average`` (e.g. 'macro') - confirm the intended mode.
    return me.precision_score(actual, predicted)

def recall(measurement):
    """Compute the recall score for a (predicted, true) label pair.

    Args:
        measurement: Two-item sequence ``(predicted_labels, true_labels)``,
            the order in which the step functions in this notebook yield
            the ``'pred_true_values'`` measurement.

    Returns:
        The result of ``sklearn.metrics.recall_score`` with its default
        averaging.
    """
    predicted, actual = measurement[0], measurement[1]
    # scikit-learn's signature is (y_true, y_pred); passing the pair in
    # measurement order would compute precision instead of recall.
    return me.recall_score(actual, predicted)

def f1(measurement):
    """Compute the F1 score for a (predicted, true) label pair.

    Args:
        measurement: Two-item sequence ``(predicted_labels, true_labels)``,
            the order in which the step functions in this notebook yield
            the ``'pred_true_values'`` measurement.

    Returns:
        The result of ``sklearn.metrics.f1_score`` with its default
        averaging.
    """
    predicted, actual = measurement[0], measurement[1]
    # Pass the labels in scikit-learn's documented (y_true, y_pred) order.
    return me.f1_score(actual, predicted)

def accuracy(measurement):
    """Return the fraction of paired items that compare equal.

    Args:
        measurement: Sequence of two equally long sequences; items are
            paired position by position.

    Returns:
        Number of equal pairs divided by the length of the first sequence.
    """
    n_items = len(measurement[0])
    n_matches = sum(1 for left, right in zip(*measurement) if left == right)
    return n_matches / n_items

# Maps each analysis name to the function that computes it.
analysis_functions = {
    "conf_matrix": confusion_matrix,
    "accuracy": accuracy,
    # Currently disabled:
    # "precision": precision,
    # "recall": recall,
    # "f1": f1,
}

Neat, next thing we need to do is define the measurements we want to use our analysis functions on.

In [3]:
# Maps each measurement name to the names of the analyses to run on it.
measurements = {
    "pred_true_values": ["conf_matrix", "accuracy"],
    # Not currently requested: "precision", "recall", "f1"
}

Now we make the analyzer object.

In [4]:
# Build the analyzer from the measurement-to-analyses mapping and the
# analysis functions.
# NOTE(review): the third argument '.' looks like an output location -
# confirm against the letp.Analyzer signature.
analyzer = letp.Analyzer(measurements, analysis_functions, '.')

Next, we bring in the dataset. We'll load it from sklearn for brevity's sake.

In [5]:
# Load the digits dataset bundled with scikit-learn.
data = d.load_digits()
# Show the shape of a single image before flattening.
print(data['images'][0].shape)
# Flatten every image into a 1-D feature vector; targets are used as-is.
input_data = {
    'X': np.array([image.flatten() for image in data['images']]),
    'Y': data['target'],
}

(8, 8)


Define a partitioner. The partitioner will break the dataset into training and testing components. We'll use one training and testing set for now.

In [6]:
def partitioner(data):
    """Yield one train/test split: the first 80% of rows, then the rest.

    Args:
        data: Mapping with sliceable ``'X'`` (features) and ``'Y'`` (labels).

    Yields:
        A single dict with ``train_data``, ``train_labels``, ``test_data``
        and ``test_labels``.
    """
    labels = data['Y']
    features = data['X']
    cut = int(len(labels) * .8)
    yield {
        'train_data': features[:cut],
        'train_labels': labels[:cut],
        'test_data': features[cut:],
        'test_labels': labels[cut:],
    }

Instantiate the data handler.

In [7]:
# Wrap the dataset together with the partitioner that splits it into
# train/test parts.
data_handler = letp.DataHandler(input_data, partitioner=partitioner)

Define a step function. This is responsible for each experimental step, and is expected to be run on every iteration of the partitioner.

In [8]:
def step(data):
    """Train a k-nearest-neighbours classifier and yield its predictions.

    Args:
        data: Partition dict with ``train_data``, ``train_labels``,
            ``test_data`` and ``test_labels``.

    Yields:
        ``('pred_true_values', (predicted_labels, true_labels))``.
    """
    model = nn.KNeighborsClassifier()
    # Fit once; a second identical fit only repeats the work.
    model.fit(data['train_data'], data['train_labels'])
    output_labels = model.predict(data['test_data'])
    yield ('pred_true_values', (output_labels, data['test_labels']))

Now instantiate a cycle.

In [9]:
# Tie the analyzer, data handler and step function together under a
# descriptive name.
cycle = letp.Cycle(analyzer, data_handler, step, name='K Nearest Neighbors')

Now that a cycle has been created, it must still be run, which we do with the following command.

In [10]:
# Execute the cycle and dump everything it returns.
results = cycle.run()
print(results)

{'pred_true_values-0': (array([2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 5, 5, 6, 5, 0, 9, 8, 9, 8, 4, 1, 7,
       7, 3, 5, 1, 0, 0, 2, 2, 7, 8, 2, 0, 1, 2, 6, 3, 3, 7, 3, 3, 4, 6,
       6, 6, 4, 9, 1, 5, 0, 9, 5, 2, 8, 2, 0, 0, 1, 7, 6, 3, 2, 1, 7, 4,
       6, 3, 1, 3, 9, 1, 7, 6, 8, 4, 3, 1, 4, 0, 5, 3, 6, 9, 6, 1, 7, 5,
       4, 4, 7, 2, 8, 2, 2, 5, 7, 9, 5, 4, 8, 8, 4, 9, 0, 8, 9, 8, 0, 1,
       2, 3, 4, 5, 6, 7, 1, 9, 0, 1, 2, 3, 4, 5, 6, 9, 0, 1, 2, 3, 4, 5,
       6, 7, 8, 9, 4, 9, 5, 5, 6, 5, 0, 9, 8, 5, 8, 4, 1, 7, 7, 3, 5, 1,
       0, 0, 2, 2, 7, 8, 2, 0, 1, 2, 6, 3, 3, 7, 7, 8, 4, 6, 6, 6, 7, 9,
       1, 5, 0, 9, 5, 2, 8, 0, 1, 7, 6, 3, 2, 1, 7, 4, 6, 3, 1, 3, 9, 1,
       7, 6, 8, 4, 3, 1, 4, 0, 5, 3, 6, 9, 6, 1, 7, 5, 4, 4, 7, 2, 2, 5,
       7, 3, 5, 8, 4, 5, 0, 8, 9, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1,
       2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 5, 5,
       6, 5, 0, 9, 8, 9, 8, 4, 1, 7, 7, 3, 5, 1, 0, 0, 2, 2, 7, 8, 2, 0,
       1, 2, 6, 3, 2, 7, 3,

From this output, it's clear that the measurements' analyses are exactly where they should be. We can specifically look
at the outputs by selecting the appropriate dictionaries. Let's pull up only the accuracy.

In [11]:
# The analyses for measurement 'pred_true_values-0' are stored under the
# same key with an '-analysis' suffix.
print(results['pred_true_values-0-analysis']['accuracy'])

0.9638888888888889


Great, there are some outputs. This is good, but more would be better. Let's write a function that creates cycles, and use it to test a bunch of classifiers at once. The first thing we should do, though, is import some new classifiers.

For invertible $A$: $\det(\lambda A-B)=\det(A)\det(A^{-1})\det(\lambda A-B)=\det(A)\det(A^{-1}(\lambda A-B))=\det(A)\det(\lambda I-A^{-1}B)$


In [None]:
import sklearn.ensemble as ens
import sklearn.discriminant_analysis as da
import sklearn.linear_model as lm
import sklearn.naive_bayes as nb


def forstner_metric(a, b):
    """Distance between two flattened 8x8 images, built from their SVDs.

    Both inputs are reshaped to 8x8 and decomposed with an SVD. A matrix is
    formed from the first image's left singular vectors and reciprocal
    singular values, multiplied by the second image's left singular vectors
    and singular values. The result is the root of the summed squared logs
    of that matrix's squared singular values.

    NOTE(review): the textbook Forstner distance is defined on generalized
    eigenvalues of covariance matrices; this SVD-based variant is kept
    exactly as written - confirm it is the intended formula.

    Args:
        a: Array with 64 elements (reshaped to 8x8).
        b: Array with 64 elements (reshaped to 8x8).

    Returns:
        The distance as a numpy float.
    """
    a = a.reshape((8, 8))
    b = b.reshape((8, 8))
    a_left, a_singular, _ = np.linalg.svd(a)
    b_left, b_singular, _ = np.linalg.svd(b)
    a_term = np.matmul(a_left.T, np.diag(np.reciprocal(a_singular)))
    combined = np.matmul(a_term, np.matmul(b_left, np.diag(b_singular)))
    # Zero singular values produce inf/nan entries; replace them with 0.
    combined[~np.isfinite(combined)] = 0
    # Decompose once and reuse the result. No debug printing here: this
    # function is passed as a classifier metric and is called repeatedly.
    singular_values = np.linalg.svd(combined)[1]
    return np.sqrt(np.sum(np.log(singular_values**2)**2))

# Zero-argument callables; cycle_generator calls each one to build a fresh
# estimator. The final entry is a lambda so that the custom metric can be
# passed in; its __name__ is '<lambda>'.
model_list = [ens.RandomForestClassifier,
              da.LinearDiscriminantAnalysis,
              nb.GaussianNB,
              nn.KNeighborsClassifier,
              lm.LogisticRegression,
              lambda : nn.KNeighborsClassifier(metric=forstner_metric)]

Now to define the cycle generator.

In [None]:
def _make_step(model_factory):
    """Build a step function bound to one specific model factory."""
    def step(data):
        model = model_factory()
        # Fit once; a second identical fit only repeats the work.
        model.fit(data['train_data'], data['train_labels'])
        output_labels = model.predict(data['test_data'])
        yield ('pred_true_values', (output_labels, data['test_labels']))
    return step


def cycle_generator(models, analyzer, data_handler):
    """Yield one ``letp.Cycle`` per model factory.

    Args:
        models: Iterable of zero-argument callables, each returning an
            unfitted estimator with ``fit`` and ``predict``; each callable's
            ``__name__`` becomes the cycle name.
        analyzer: Analyzer passed through to every cycle.
        data_handler: Data handler passed through to every cycle.

    Yields:
        A ``letp.Cycle`` whose step trains and evaluates that model.
    """
    for model_factory in models:
        # The step is built by a helper so that it captures this iteration's
        # factory. A closure over the loop variable would see whichever
        # factory is current when the step finally runs (for example after
        # the cycles have been collected into a list).
        yield letp.Cycle(analyzer, data_handler, _make_step(model_factory),
                         name=model_factory.__name__)

To run everything and collect the outputs, we can do the following. We're going to see some warnings, but we can silence them.

In [None]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Run every generated cycle and store its output under the cycle's name.
results = {}
for model_cycle in cycle_generator(model_list, analyzer, data_handler):
    results[model_cycle._name] = model_cycle.run()

Now we can print the collected results. This won't be pretty.

In [None]:
# Raw dump of every cycle's results, keyed by cycle name.
print(results)

That was predictably ugly. Let's just print the accuracy, because that's usually the thing people are most interested in.

In [None]:
# Print the accuracy of every analysis entry, one line per model and entry.
for model_name, model_results in results.items():
    for entry_name, entry in model_results.items():
        if 'analysis' in entry_name:
            print(f'{model_name} accuracy: {entry["accuracy"]}')

Now let's try with a different data set.

In [None]:
# Load the second dataset; the file is decoded as Latin-1.
df = pd.read_csv('hopefully_this_good.csv', encoding='latin-1')

In [None]:
# List the available column names.
print(df.columns.values)

In [None]:
from sklearn.model_selection import KFold
def k_fold_partitioner(data):
    """Yield 10 train/test partitions of a DataFrame using ``KFold``.

    Args:
        data: DataFrame containing the feature columns listed in the
            module-level ``interesting_columns`` and an ``SC_VIOLATION``
            label column.
            NOTE(review): ``interesting_columns`` must be defined before
            this runs - confirm where it is set.

    Yields:
        One dict per fold with ``train_data``, ``train_labels``,
        ``test_data`` and ``test_labels``.
    """
    data_values = data[interesting_columns].values
    # Select the label column as a 1-D array. ``data[['SC_VIOLATION']]``
    # would give an (n, 1) column vector, which scikit-learn estimators
    # warn about and which makes every label a one-element array.
    data_labels = data['SC_VIOLATION'].values
    kfold = KFold(10)
    for train, test in kfold.split(data_labels):
        yield {
            'train_data': data_values[train],
            'train_labels': data_labels[train],
            'test_data': data_values[test],
            'test_labels': data_labels[test],
        }

In [None]:
# Re-run every model on the CSV data. The partitioner defined in this
# notebook is k_fold_partitioner; no `kyra_partitioner` is defined here.
data_handler = letp.DataHandler(df, partitioner=k_fold_partitioner)
results = dict()
for i in cycle_generator(model_list, analyzer, data_handler):
    results[i._name] = i.run()

In [None]:
# Build one table row per model and analysis entry, and print each model's
# mean score per metric. Metrics absent from an analysis entry (for example
# because the analysis function is not enabled) are skipped, not an error.
metric_names = ['accuracy', 'recall', 'f1', 'precision']
rows = []
for k in results:
    analyses = {
        name: entry
        for name, entry in results[k].items()
        if 'analysis' in name
    }
    if not analyses:
        # Nothing to average; avoid dividing by zero.
        print(f'{k}: no analysis results')
        continue
    for name, entry in analyses.items():
        row = {'model': k, 'test': name}
        row.update({m: entry[m] for m in metric_names if m in entry})
        rows.append(row)
    for m in metric_names:
        scores = [entry[m] for entry in analyses.values() if m in entry]
        if scores:
            print(f'{k} {m}: {sum(scores) / len(scores):.2f}')
# NOTE(review): `d` rebinds the `sklearn.datasets` alias imported at the top
# of the notebook; the name is kept in case later cells rely on it.
d = pd.DataFrame(rows, columns=['model', 'test'] + metric_names)