In [3]:
import pickle
import os
import numpy
from sklearn import decomposition
from sklearn import preprocessing
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

master_path = '../CSV'
categories = next(os.walk(master_path))[1]
categories.sort()
labels = numpy.empty((0, 0))
column_to_drop = ["Frames#", "score_overall", "nose_score", "leftEye_score", "rightEye_score", "leftEar_score",
                  "rightEar_score",
                  "leftShoulder_score", "rightShoulder_score", "leftElbow_score", "rightElbow_score", "leftWrist_score",
                  "rightWrist_score", "leftHip_score", "rightHip_score", "leftKnee_score", "rightKnee_score",
                  "leftAnkle_score",
                  "rightAnkle_score"]
columns_to_retain = ["nose_x", "nose_y", "leftEye_x", "leftEye_y", "rightEye_x", "rightEye_y", "leftEar_x", "leftEar_y",
                     "rightEar_x", "rightEar_y", "leftShoulder_x", "leftShoulder_y", "rightShoulder_x",
                     "rightShoulder_y", "leftElbow_x", "leftElbow_y", "rightElbow_x", "rightElbow_y", "leftWrist_x",
                     "leftWrist_y", "rightWrist_x", "rightWrist_y", "leftHip_x", "leftHip_y", "rightHip_x",
                     "rightHip_y", "leftKnee_x", "leftKnee_y", "rightKnee_x", "rightKnee_y", "leftAnkle_x",
                     "leftAnkle_y", "rightAnkle_x", "rightAnkle_y"]
arr = numpy.empty((0, 52), float)
total_rows = 150
label_count = 0
final_data = pd.DataFrame()


def progress(total, count, folder_name):
    percent = (count / total) * 100
    print('\r', "%.2f" % round(percent, 2) + "% '" + folder_name + "' completed", end=' ')


for category in categories:
    temp_data = pd.DataFrame(columns=columns_to_retain)
    path = os.path.join(master_path, category)
    total_files = len(os.listdir(str(path)))
    i = 1
    progress(1, 100, category)
    for file in os.listdir(str(path)):
        data = pd.read_csv(os.path.join(path, file))
        data.drop(column_to_drop, axis=1, inplace=True)
        while data.shape[0] < total_rows:
            data = data.append(data, ignore_index=True)
        shape = data.shape
        if shape[0] > total_rows:
            data = data[:150]
        temp_data = temp_data.append(data, ignore_index=True)
        percent = (i / total_files) * 100
        print('\r', "%.2f" % round(percent, 2) + "% '" + category + "' completed", end=' ')
        i += 1
    print()

    result = temp_data.values

    labels = numpy.append(labels, numpy.full((1, result.shape[0]), label_count))
    # print(pca_result.shape)
    final_data = final_data.append(pd.DataFrame(result), ignore_index=True)
    label_count += 1
    # print(labels.shape)

print()
print("Applying PCA")
result = final_data.values
scaler = preprocessing.StandardScaler()
scaler.fit(result)
scaled_result = scaler.transform(result)
pca = decomposition.PCA(n_components=25)
pca.fit(scaled_result)
pca_result = pca.transform(scaled_result)
print("Applied PCA")

print("Preparing the test data")
test_size = 0.33
# seed = 7
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(pca_result, labels, test_size=test_size,
                                                                    random_state=4)
print("Test data prepared. Go ahead and train you model")

 100.00% 'buy' completed d 
 100.00% 'communicate' completed d 
 100.00% 'fun' completed d 
 100.00% 'hope' completed d 
 100.00% 'mother' completed d 
 100.00% 'really' completed d 

Applying PCA
Applied PCA
Preparing the test data
Test data prepared. Go ahead and train you model


In [4]:
print("Training using Logistic Regression...")

model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)
lrPickle = open('../models/logistic_regression_file', 'wb')
# source, destination
pickle.dump(model, lrPickle)
result = model.score(X_test, Y_test)
print("Logistic Regression Accuracy: %.3f%%" % (result * 100.0))
print("Training Complete")

Training using Logistic Regression...
Logistic Regression Accuracy: 76.221%
Training Complete


In [5]:
print("Training using KNeighbour Classifier...")

knn = KNeighborsClassifier(n_neighbors=200)
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_test)
knnPickle = open('../models/knnpickle_file', 'wb')
# source, destination
pickle.dump(knn, knnPickle)

print("KNN Accuracy: %.3f%%" % (metrics.accuracy_score(Y_test, y_pred) * 100))
print("Training Complete")

Training using KNeighbour Classifier...
KNN Accuracy: 87.538%
Training Complete


In [13]:
print("Training using Random Forest Classifier...")

rf = RandomForestClassifier(max_depth=10, random_state=0)
rf.fit(X_train, Y_train)
y_pred = rf.predict(X_test)
rfPickle = open('../models/rfpickle_file', 'wb')
# source, destination
pickle.dump(rf, rfPickle)

print("Random Forest Accuracy: %.3f%%" % (metrics.accuracy_score(Y_test, y_pred) * 100))
print("Training Complete")

Training using Random Forest Classifier...
Random Forest Accuracy: 96.519%
Training Complete
