In [1]:
import numpy as np
from os import listdir
from os.path import join

# Root folder of the preprocessed dataset. Every entry directly inside it is
# used as a class label and is listed as a directory of sample files.
in_dir = "preprocessed_data"

labels = listdir(in_dir)

# One matrix row per sample file, so total the file counts over all labels.
count = sum(len(listdir(join(in_dir, name))) for name in labels)

# Preallocate the feature matrix (256 columns per sample) and the target vector.
# NOTE(review): int16 caps stored values at 32767 and int8 caps class indices
# at 127 -- confirm the preprocessed data stays within those ranges.
n_features = 256
mat = np.zeros((count, n_features), dtype=np.int16)
targets = np.zeros(count, dtype=np.int8)

print(f"Shape of Matrix: {mat.shape}, Targets: {targets.shape}")

Shape of Matrix: (28962, 256), Targets: (28962,)


In [2]:
# Fill the preallocated arrays: one row of `mat` per sample file, and the
# position of the sample's label within `labels` as its entry in `targets`.
index = 0  # next free row in mat / targets; reset here so the cell can be re-run

for i, label in enumerate(labels):
    print(f"Loading '{label}'...")

    label_dir = join(in_dir, label)
    for filename in listdir(label_dir):
        with open(join(label_dir, filename)) as file:
            # Each record has the form "<column>:<value>". Iterate the file
            # lazily rather than loading every line with readlines().
            for line in file:
                record = line.strip()
                if not record:
                    # Blank (e.g. trailing) lines carry no data; skipping them
                    # avoids a ValueError from int("").
                    continue
                tokens = record.split(":")
                mat[index, int(tokens[0])] = int(tokens[1])

        targets[index] = i
        index += 1

Loading 'image'...
Loading 'text'...
Loading 'audio'...
Loading 'executable'...


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. The split is seeded so that
# Restart & Run All reproduces the same partition (and therefore the same
# scores); without random_state every run shuffles the data differently.
SPLIT_SEED = 42
data_train, data_test, targets_train, targets_test = train_test_split(
    mat, targets, test_size=0.3, random_state=SPLIT_SEED
)
print(f"Shape of Training Data: {data_train.shape}, Testing Data: {data_test.shape}")
print(f"Shape of Training Targets: {targets_train.shape}, Testing Targets: {targets_test.shape}")

Shape of Training Data: (20273, 256), Testing Data: (8689, 256)
Shape of Training Targets: (20273,), Testing Targets: (8689,)


In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Candidate models, keyed by the display name used in the report below.
classifiers = {
    "Nearest Neighbors (k=3)": KNeighborsClassifier(3),
    "Nearest Neighbors (k=5)": KNeighborsClassifier(5),
    "Nearest Neighbors (k=10)": KNeighborsClassifier(10),
    # Seeded: the tree permutes features randomly at each split, so an
    # unseeded tree can score differently from one run to the next.
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Linear SVM": SVC(kernel="linear"),
}

# Track the highest accuracy seen so far and the name of the model that got it.
max_score = 0.0
best_classifier = ""

# NOTE(review): the winner is chosen by accuracy on data_test, i.e. the same
# data used to report the scores, so the winning score is optimistically
# biased. Consider selecting on a separate validation split.
for clf_name, clf in classifiers.items():
    print(f"{clf_name} -> ", end="")

    # Train on the training split, then score on the held-out split.
    clf.fit(data_train, targets_train)
    predictions = clf.predict(data_test)

    accuracy = 100.0 * accuracy_score(targets_test, predictions)  # percent
    print(f"accuracy: {accuracy}")

    # Strict '>' keeps the earliest model when two tie.
    if accuracy > max_score:
        max_score = accuracy
        best_classifier = clf_name

print(f"Best Model: {best_classifier}")

Nearest Neighbors (k=3) -> accuracy: 99.15985729082747
Nearest Neighbors (k=5) -> accuracy: 99.3094717458856
Nearest Neighbors (k=10) -> accuracy: 99.17136609506272
Decision Tree -> accuracy: 99.0792956611808
Linear SVM -> accuracy: 98.98722522729888


In [15]:
from os import makedirs

from joblib import dump

model_dir = "model"

# Create the output folder up front: dump() and open() below do not create
# missing directories and would fail with FileNotFoundError on a fresh
# checkout. exist_ok=True makes this a no-op when the folder already exists.
makedirs(model_dir, exist_ok=True)

# Persist the fitted estimator that achieved the best accuracy.
print(f"Saving {best_classifier}")
dump(classifiers[best_classifier], join(model_dir, "model.dat"))

# Store the label names on one comma-separated line, in the same order used
# to number the classes, so predicted class indices can be mapped back to names.
print("Writing Labels")
with open(join(model_dir, "labels.csv"), "w") as file:
    file.write(",".join(labels))

Saving Nearest Neighbors (k=5)...
Writing Labels...
