In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import decomposition

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from skimage.feature import hog

In [3]:
# Column layout of the training CSV: one label column followed by 784 pixel
# columns named p_0 .. p_783.
letter_name = 'letter_idx'
feature_names = [f"p_{i}" for i in range(784)]

# Number of leading rows used for fitting; every row after this index is held
# out for validation. Named once here instead of being repeated in each slice.
N_TRAIN = 71040

# header=None: the first line of the file is read as data and the column names
# are supplied explicitly through `names`.
X_train_df = pd.read_csv('data/emnist-letters-train.csv', header=None,
                     names=[letter_name] + feature_names)

# Convert each column selection to NumPy once, then slice it into the two
# splits (previously every split repeated the selection and the conversion).
features_all = X_train_df[feature_names].to_numpy()
labels_all = X_train_df[letter_name].to_numpy()

X_train, X_val = features_all[:N_TRAIN], features_all[N_TRAIN:]
y_train, y_val = labels_all[:N_TRAIN], labels_all[N_TRAIN:]

# The test CSV is read with its own header row; it has to provide the same
# p_0 .. p_783 columns because they are selected by name below.
X_test_df = pd.read_csv('data/features-test.csv')
X_test = X_test_df[feature_names].to_numpy()


def char_position(letter):
    """Return the offset of ``letter`` from ``'a'`` in code points.

    ``'a'`` maps to 0 and ``'z'`` to 25. No case folding is applied, so a
    character that sorts before ``'a'`` yields a negative number.
    """
    base = ord('a')
    return ord(letter) - base

def pos_to_char(pos):
    """Return the character located ``pos`` code points after ``'a'``.

    Inverse of ``char_position``: 0 maps to ``'a'`` and 25 to ``'z'``.
    """
    base = ord('a')
    return chr(base + pos)

def disp_image(img_arr, str_idx):
    """Plot one flattened image with its letter as the figure title.

    Parameters
    ----------
    img_arr : array with a ``reshape`` method holding 28 * 28 = 784 values.
    str_idx : integer offset passed to ``pos_to_char`` to build the title.
    """
    grid = img_arr.reshape(28, 28)
    # The grid is transposed before drawing; presumably the stored pixel order
    # is rotated relative to the reading orientation -- confirm against the data.
    plt.imshow(grid.T)
    plt.title(pos_to_char(str_idx))
    plt.show()

# HOG feature extraction
def extract_hog_features(images):
    """Compute one HOG descriptor per flattened image.

    Each element of ``images`` is reshaped to 28x28 and handed to
    ``skimage.feature.hog`` with 9 orientations, 2x2-pixel cells, 2x2-cell
    blocks and 'L2-Hys' block normalisation. The per-image descriptors are
    stacked into a single NumPy array, in input order.
    """
    return np.array([
        hog(
            flat.reshape(28, 28),
            orientations=9,
            pixels_per_cell=(2, 2),
            cells_per_block=(2, 2),
            block_norm='L2-Hys',
            visualize=False,
        )
        for flat in images
    ])

# HOG descriptors for every split; the same extractor settings are applied to
# train, validation and test so the feature spaces line up.
X_train_hog = extract_hog_features(X_train)
X_val_hog = extract_hog_features(X_val)
X_test_hog = extract_hog_features(X_test)

# Largest number of components PCA can return for this matrix
# (bounded by both the sample count and the feature count).
n_components = min(X_train_hog.shape[0], X_train_hog.shape[1])
# Aim for 200 dimensions but never ask for more than the data allows;
# previously the bound above was computed and then ignored.
pca_dims = min(200, n_components)
# random_state pins the randomized SVD that PCA's default solver policy can
# select for a matrix of this size, so the projection (and every metric
# computed from it) is repeatable across runs, like the seeded classifier.
pca = decomposition.PCA(n_components=pca_dims, random_state=0)
# Fit on the training split only; validation and test are just projected.
pca.fit(X_train_hog)

X_train_pca = pca.transform(X_train_hog)
X_val_pca = pca.transform(X_val_hog)
X_test_pca = pca.transform(X_test_hog)

print("Original HOG feature shape:", X_train_hog.shape)
print("Reduced feature shape after PCA:", X_train_pca.shape)

# L2-penalised logistic regression on the PCA-reduced HOG features.
clf = LogisticRegression(random_state=0, penalty='l2', max_iter=10000)
clf.fit(X_train_pca, y_train)

# Predictions on the data the model was fitted on and on the held-out rows.
train_preds = clf.predict(X_train_pca)
val_preds = clf.predict(X_val_pca)

# Display names 'a' .. 'z' used as row labels in the reports.
letters = list(map(pos_to_char, range(26)))

# Training report first, validation report second.
train_report = classification_report(y_train, train_preds, target_names=letters)
print(train_report)
val_report = classification_report(y_val, val_preds, target_names=letters)
print(val_report)

# Predict the test split and write one predicted label per row, keyed by the
# test frame's index under the 'sample-id' header.
test_preds = clf.predict(X_test_pca)
pred_df = pd.DataFrame({'letter_idx': test_preds}, index=X_test_df.index)
pred_df.to_csv("data/logistic_baseline.csv", index_label='sample-id')

Original HOG feature shape: (71040, 6084)
Reduced feature shape after PCA: (71040, 200)
              precision    recall  f1-score   support

           a       0.82      0.83      0.82      2696
           b       0.92      0.92      0.92      2730
           c       0.92      0.93      0.93      2707
           d       0.90      0.90      0.90      2755
           e       0.91      0.91      0.91      2742
           f       0.93      0.92      0.93      2714
           g       0.80      0.75      0.77      2708
           h       0.90      0.91      0.90      2752
           i       0.73      0.72      0.72      2730
           j       0.91      0.90      0.90      2752
           k       0.93      0.93      0.93      2716
           l       0.73      0.76      0.75      2737
           m       0.95      0.95      0.95      2734
           n       0.88      0.89      0.89      2689
           o       0.94      0.97      0.95      2764
           p       0.95      0.95      0.95    