In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
"Load data from all_processed_data.csv"
df = pd.read_csv('all_processed_data.csv')
"Drop all the columns with NaN value"
df = df.dropna()
"Set the gap value to be zero if it's negative"
df.gap[df.gap<0] = 0

In [6]:
"""Analysis"""
shuffled = df.sample(frac=1,random_state=1234)

weekday_dict = {day:n for day,n in zip(['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],range(7))}
cols = ['time','height','width','length','volume','weight','angle','gap','velocity','weekday','LFT']
shuffled = shuffled[cols]
shuffled.weekday = shuffled.weekday.apply(lambda x: weekday_dict[x])
# might out put differently for Python v2 and v3
idx = np.floor(np.arange(0,len(shuffled),len(shuffled)/5)).astype(int)
# idx = list(idx)+[len(shuffled)]    # uncomment this if using Python v3
sets = {i:shuffled.iloc[idx[i]:idx[i+1]-1] for i in range(len(idx)-1)}



# function compute LDA
def lda(train_data, train_labels, test_data, test_labels):
    pos = np.array([train_data[i] for i in range(train_data.shape[0]) if train_labels[i] == 1])
    neg = np.array([train_data[i] for i in range(train_data.shape[0]) if train_labels[i] == 0])
    pos_mean = np.mean(pos,axis=0)
    neg_mean = np.mean(neg,axis=0)
    pos_data = pos - pos_mean
    neg_data = neg - neg_mean
    cov_all = np.cov(np.concatenate((pos_data, neg_data), axis=0).T)
    w = np.linalg.solve(cov_all,(pos_mean - neg_mean))
    # compute x_lda
    x_lda = test_data.dot(w)
    posm_transform = pos_mean.dot(w.T)
    negm_transform = neg_mean.dot(w.T)
    y_lda = []
    for element in x_lda:
        compare_pos = abs(element - posm_transform)
        compare_neg = abs(element - negm_transform)
        if (compare_pos < compare_neg):
            y_lda.append(1)
        else:
            y_lda.append(0)
    # calculate accuracy, precision / recall and F1 score
    accuracy = 0
    true_positive = 0
    predicted_positive = 0
    positive = sum(test_labels)
    for index, value in enumerate(y_lda):
        if value == test_labels[index]:
            accuracy += 1
        if value == 1:
            predicted_positive += 1
            if (test_labels[index] == 1):
                true_positive += 1
    accuracy  = float(accuracy) / len(y_lda)
    precision = float(true_positive) / predicted_positive
    recall    = float(true_positive) / positive
    F1_score  = 2 * (precision * recall) / (precision + recall)
    #print("accuracy  = {}".format(accuracy))
    #print("precision = {}".format(precision))
    #print("recall    = {}".format(recall))
    #print("F1_score  = {}\n".format(F1_score))
    return accuracy, precision, recall, F1_score

In [7]:
#Cross validation
accuracy_mean  = 0
precision_mean = 0
recall_mean    = 0
F1_score_mean  = 0
for i in range(len(sets)):
    #Training set (leaving out current test set)
    train = pd.concat([sets[s] for s in range(len(sets)) if s!=i])
    train_data = train[cols]
    train_data = np.array(train_data)
    train_data = np.delete(train_data, 10, 1)
    train_labels = train.LFT
    train_labels = np.array(train_labels)
    
    #Test set
    test = sets[i]
    test_data = test[cols]
    test_data = np.array(test_data)
    test_data = np.delete(test_data, 10, 1)
    test_labels = test.LFT
    test_labels = np.array(test_labels)
    
    
    #Run LDA
    print("Result for iteration {}:".format(i))
    accuracy, precision, recall, F1_score = lda(train_data, train_labels, test_data, test_labels)
    print("accuracy  = {}".format(accuracy))
    print("precision = {}".format(precision))
    print("recall    = {}".format(recall))
    print("F1_score  = {}\n".format(F1_score))
    accuracy_mean  += accuracy
    precision_mean += precision
    recall_mean    += recall
    F1_score_mean  += F1_score
print("accuracy_mean  = {}".format(accuracy_mean / 5))
print("precision_mean = {}".format(precision_mean / 5))
print("recall_mean    = {}".format(recall_mean / 5))
print("F1_score_mean  = {}\n".format(F1_score_mean / 5))

Result for iteration 0:
accuracy  = 0.692113400032
precision = 0.958122524502
recall    = 0.700270996984
F1_score  = 0.80915103294

Result for iteration 1:
accuracy  = 0.690567394044
precision = 0.95702788885
recall    = 0.699334249181
F1_score  = 0.808135328292

Result for iteration 2:
accuracy  = 0.692536828423
precision = 0.957091638352
recall    = 0.701335292749
F1_score  = 0.809492576107

Result for iteration 3:
accuracy  = 0.692522057665
precision = 0.957885925413
recall    = 0.70101503005
F1_score  = 0.809563016497

Result for iteration 4:
accuracy  = 0.689464510793
precision = 0.9580297797
recall    = 0.697398515701
F1_score  = 0.807197204755

accuracy_mean  = 0.691440838191
precision_mean = 0.957631551363
recall_mean    = 0.699870816933
F1_score_mean  = 0.808707831718

