In [98]:
import pandas as pd
import numpy as np

from scipy import stats
from sklearn.model_selection import train_test_split

In [191]:
 from kaggle.api.kaggle_api_extended import KaggleApi
 kaggle = KaggleApi()
 kaggle.authenticate()

In [91]:
df_test = pd.read_csv("emnist-letters-train.csv", header=None)

In [92]:
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
def get_train_test_data(filename: str):
    data = pd.read_csv(filename, header=None).values
    assert data.shape[1] == 785
    train, test = train_test_split(data, random_state=42)
    x_train = train[:, 1:]
    y_train = train[:, 0]
    x_test = test[:, 1:]
    y_test = test[:, 0]
    return x_train, y_train, x_test, y_test

In [185]:
def get_test_submit_data(filename: str):
    data = pd.read_csv(filename, header=None).values
    assert data.shape[1] == 784
    return data

In [115]:
def score(y_pred, y_test):
    assert len(y_pred) == len(y_test)
    return np.sum(y_pred == y_test) / len(y_pred)

In [214]:
def kaggle_submit(kaggle, name: str, y_pred):
    COMPETITION="jds3"
    submit_df = pd.DataFrame(y_submit, columns=["Category",], index=range(1, len(y_submit) + 1))
    submit_df.index = submit_df.index.rename("Id")
    submit_df.to_csv("submission.csv")
    kaggle.competition_submit("submission.csv", name, COMPETITION)

In [95]:
x_train, y_train, x_test, y_test = get_train_test_data("emnist-letters-train.csv")

In [186]:
x_submit = get_test_submit_data("emnist-letters-test-sh.csv")

In [100]:
stats.describe(y_train)

DescribeResult(nobs=66600, minmax=(1, 26), mean=13.523723723723723, variance=56.303555856942545, skewness=-0.00229399597296468, kurtosis=-1.205403415936032)

In [112]:
model_results = []

# Constant Output

In [116]:
y_pred = np.ones(y_test.shape)
model_name = "Constant Output"
model_score = score(y_pred, y_test)
model_results.append((model_score, model_name))

In [120]:
print("{} model score {:.4f} ".format(model_name, model_score))

Constant Output model score 0.0394 


# Linear Regression for classification

In [122]:
from sklearn.linear_model import LinearRegression

In [123]:
lin_reg = LinearRegression()

In [124]:
lin_reg.fit(x_train, y_train)

LinearRegression()

In [127]:
y_pred = lin_reg.predict(x_test).round().astype(np.uint8)

In [130]:
model_name = "Linear Regression"
model_score = score(y_pred, y_test)
model_results.append((model_score, model_name))
print("{} model score {:.4f} ".format(model_name, model_score))

Linear Regression model score 0.0467 


# KNN classification with reduced dimentionality

In [140]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [187]:
def reduce_dim(x):
    return x.reshape(x.shape[0], 28, -1)[:,::2,::2].reshape(x.shape[0], -1)
x_train_reduced = reduce_dim(x_train)
x_test_reduced = reduce_dim(x_test)

In [188]:
knn = KNeighborsClassifier()
knn.fit(x_train_reduced, y_train)

KNeighborsClassifier()

In [189]:
y_pred = knn.predict(x_test_reduced)

In [153]:
score(y_pred, y_test)

0.844954954954955

In [154]:
model_name = "KNN Regression with 14x14"
model_score = score(y_pred, y_test)
model_results.append((model_score, model_name))
print("{} model score {:.4f} ".format(model_name, model_score))

KNN Regression with 14x14 model score 0.8450 


In [193]:
x_submit_reduced = reduce_dim(x_submit)
y_submit = knn.predict(x_submit_reduced)

In [215]:
kaggle_submit(kaggle, "KNN Regression with 14x14", y_pred)

100%|██████████| 112k/112k [00:03<00:00, 32.8kB/s] 


## KNN with block reduce 4x4

In [160]:
import skimage.measure

In [178]:
def reduce_dim_block(x):
    im = x.reshape(x.shape[0], 28, -1)
    im_reduced = skimage.measure.block_reduce(im, (1, 4,4), np.max)
    return im_reduced.reshape(im_reduced.shape[0], -1)
    
x_train_reduced = reduce_dim_block(x_train)
x_test_reduced = reduce_dim_block(x_test)

In [172]:
knn = KNeighborsClassifier()
knn.fit(x_train_reduced, y_train)

KNeighborsClassifier()

In [173]:
y_pred = knn.predict(x_test_reduced)

In [174]:
score(y_pred, y_test)

0.7995045045045045

In [175]:
model_name = "KNN Regression with 7x7"
model_score = score(y_pred, y_test)
model_results.append((model_score, model_name))
print("{} model score {:.4f} ".format(model_name, model_score))

KNN Regression with 7x7 model score 0.7995 


In [179]:
x_train_reduced[x_train_reduced < 128] = 0
x_train_reduced[x_train_reduced >= 128] = 1
x_test_reduced[x_test_reduced < 128] = 0
x_test_reduced[x_test_reduced >= 128] = 1

In [180]:
knn = KNeighborsClassifier()
knn.fit(x_train_reduced, y_train)

KNeighborsClassifier()

In [181]:
y_pred = knn.predict(x_test_reduced)

In [182]:
score(y_pred, y_test)

0.7254054054054054

In [183]:
model_name = "KNN Regression with 7x7 1bit"
model_score = score(y_pred, y_test)
model_results.append((model_score, model_name))
print("{} model score {:.4f} ".format(model_name, model_score))

KNN Regression with 7x7 1bit model score 0.7254 
