In [1]:
from os import listdir
from os.path import isfile, join
import numpy as np

In [3]:
def save_pred(pred, filename, classes):
    """Write predictions to `filename` as a two-column CSV.

    The file starts with the header ``ID,CATEGORY``. Each following row holds
    the 0-based position of a prediction and the class name it maps to.

    Parameters
    ----------
    pred : sequence of numbers
        Predicted class indices. Each entry goes through ``int()`` before it
        is used as an index, so float labels such as ``3.0`` are accepted.
    filename : str
        Path of the file to create; an existing file is overwritten.
    classes : sequence of str
        Class names, indexed by predicted class index.

    Returns
    -------
    None
    """
    # `with` closes the handle even if an out-of-range index in `pred`
    # raises half-way through writing.
    with open(filename, "w") as out:
        out.write("ID,CATEGORY\n")
        for i, label in enumerate(pred):
            out.write(str(i) + ',' + classes[int(label)] + '\n')
    return

In [4]:
def read_train_data(files, data_dir='train'):
    """Load one ``.npy`` array per class and stack them into a labelled matrix.

    Each file is expected to hold a 2-D array (rows = samples). The position
    of a file in `files` becomes the class index of all its rows and is
    stored in an extra last column.

    Parameters
    ----------
    files : sequence of str
        File names inside `data_dir`, one per class.
    data_dir : str, optional
        Directory that contains the files (default ``'train'``).

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(total_rows, n_features + 1)``; the last
        column is the class index.

    Raises
    ------
    ValueError
        If `files` is empty.
    """
    if len(files) == 0:
        raise ValueError("read_train_data needs at least one file")

    # Collect the per-class blocks and concatenate once at the end; growing
    # the result with np.append inside the loop copies the whole array on
    # every iteration.
    blocks = []
    for label, name in enumerate(files):
        x = np.load(join(data_dir, name))
        labelled = np.empty((x.shape[0], x.shape[1] + 1))
        labelled[:, :-1] = x
        labelled[:, -1] = label
        blocks.append(labelled)

    return np.concatenate(blocks, axis=0)

In [5]:
def read_test_data(path='test/test.npy'):
    """Load the test array from a ``.npy`` file.

    Parameters
    ----------
    path : str, optional
        Location of the file (default ``'test/test.npy'``).

    Returns
    -------
    numpy.ndarray
        The array exactly as stored in the file.
    """
    return np.load(path)

In [6]:
# Sort the listing: `listdir` returns entries in arbitrary order, and the
# position of a file in `files` is used as its class index, so sorting keeps
# the label-to-class-name mapping stable across runs and machines.
files = sorted(f for f in listdir('train') if isfile(join('train', f)))

In [7]:
whole_data = read_train_data(files)
# Seed NumPy's global RNG so the row order (and everything derived from it)
# is the same on every Restart & Run All.
np.random.seed(0)
np.random.shuffle(whole_data)       # shuffles the rows in place
X = whole_data[:, :-1] / 255        # features: every column except the last, scaled by 1/255
y = whole_data[:, -1].astype(int)   # class index stored in the last column

In [8]:
# Scale the test features by the same 1/255 factor that is applied to the training features.
X_test = read_test_data() / 255

In [9]:
# Class name for label i: the name of files[i] up to (not including) its first dot.
classes = [name.partition('.')[0] for name in files]

# Sklearn - Kmeans

In [10]:
from sklearn.cluster import KMeans

In [11]:
def get_cluster_labels(pred_clusters, n_clusters, y, n_classes):
    """Give every cluster the class that occurs most often among its members.

    Parameters
    ----------
    pred_clusters : sequence of int
        Cluster index assigned to each sample.
    n_clusters : int
        Total number of clusters.
    y : sequence of int
        True class index of each sample; its length drives the iteration.
    n_classes : int
        Total number of classes.

    Returns
    -------
    numpy.ndarray
        Length-`n_clusters` array of class indices. Ties, and clusters that
        received no sample, resolve to the lowest class index (argmax picks
        the first maximum).
    """
    # counts[c, k] = number of samples of class k that landed in cluster c
    counts = np.zeros((n_clusters, n_classes))
    for idx, true_class in enumerate(y):
        counts[pred_clusters[idx], true_class] += 1

    return counts.argmax(axis=1)

In [12]:
def pred_kmeans(cluster_pred, cluster_labels):
    """Translate cluster indices into class labels.

    Parameters
    ----------
    cluster_pred : iterable of int
        Cluster index predicted for each sample.
    cluster_labels : sequence
        Lookup table: ``cluster_labels[c]`` is the class assigned to cluster c.

    Returns
    -------
    numpy.ndarray
        Class label for each entry of `cluster_pred`, in the same order.
    """
    mapped = []
    for cluster_id in cluster_pred:
        mapped.append(cluster_labels[cluster_id])

    return np.array(mapped)

In [13]:
# K-Means with 20 clusters, 10 initialisations, and at most 25 iterations per run.
# NOTE(review): `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0;
# on a recent scikit-learn this call raises TypeError — confirm the pinned version.
kmeans = KMeans(n_clusters=20, n_init=10, max_iter = 25, n_jobs=2)

In [None]:
# Fit the cluster centres on the training features (the labels y are not used here).
kmeans.fit(X)

In [None]:
# Label each cluster with the majority training class among the rows assigned to it.
cluster_labels = get_cluster_labels(kmeans.predict(X), kmeans.n_clusters, y, len(classes))
print(cluster_labels)

In [None]:
# Training accuracy: fraction of training rows whose cluster-derived label equals y.
pred = pred_kmeans(kmeans.predict(X), cluster_labels)
np.count_nonzero(pred == y) / len(y)

In [None]:
# Test predictions: assign each test row to a cluster, then map the cluster to its class label.
pred_test = pred_kmeans(kmeans.predict(X_test), cluster_labels)

In [None]:
# Write the K-Means test predictions to disk.
# NOTE(review): the file name says 300, but the model is built with
# max_iter = 25 — confirm which run this file is meant to hold.
save_pred(pred_test, 'kmeans_300.txt', classes)

In [None]:
# Sweep `max_iter`: for each setting, fit K-Means, print the training accuracy
# and write one test-prediction file named after the setting.
max_iters = [1, 2, 5, 10, 20, 30, 40, 50]
for iters in max_iters:
    kmeans = KMeans(n_clusters=20, n_init=10, max_iter=iters, n_jobs=-1)
    kmeans.fit(X)
    # Assign the training rows to clusters once and reuse the result for both
    # the cluster labelling and the accuracy computation.
    train_clusters = kmeans.predict(X)
    cluster_labels = get_cluster_labels(train_clusters, kmeans.n_clusters, y, len(classes))
    pred = pred_kmeans(train_clusters, cluster_labels)
    print(len(np.nonzero(pred == y)[0]) / len(y))
    pred_test = pred_kmeans(kmeans.predict(X_test), cluster_labels)
    save_pred(pred_test, 'kmeans_' + str(iters) + '.txt', classes)

# 0.27171
# 0.29411
# 0.34562
# 0.33659
# 0.35671
# 0.35638
# 0.3493
# 0.36156

# PCA + Libsvm

In [None]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from svmutil import *

In [None]:
# Fit a 50-component PCA on the training features and project them.
# `pca` keeps the fitted projection, so other data can be mapped with pca.transform.
pca = PCA(n_components=50)
reduced_X = pca.fit_transform(X)

In [None]:
# Candidate values for the SVM cost parameter.
C = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]

# libsvm options: -t 0 = linear kernel, -c = cost, -v 5 = 5-fold cross-validation, -q = quiet.
# In -v mode svm_train returns the cross-validation accuracy instead of a model.
prob = svm_problem(y.tolist(), reduced_X.tolist())
for c in C:
    print(c)
    print(svm_train(prob, '-t 0 -c ' + str(c) + ' -v 5 -q'))

In [None]:
# Train the final linear SVM (cost 1) on all PCA-reduced training rows.
model = svm_train(y.tolist(), reduced_X.tolist(), '-t 0 -c 1 -q')

In [None]:
# Evaluate on the same rows the SVM was trained on: this is a training score,
# not a held-out estimate.
p_labs, p_acc, p_vals = svm_predict(y.tolist(), reduced_X.tolist(), model)
print(p_acc)
# recorded result: 78.180

In [None]:
# Project the test set with the PCA that was fitted on the training data.
# Fitting a second PCA on X_test would produce a different basis, so the test
# features would not line up with the ones the SVM was trained on.
reduced_X_test = pca.transform(X_test)
# svm_predict requires a label list; the true test labels are unknown, so
# zeros are passed as placeholders (the accuracy it reports is meaningless).
p_labs, p_acc, p_vals = svm_predict([0] * len(X_test), reduced_X_test.tolist(), model)
save_pred(p_labs, 'pca-svm_1.txt', classes)

# Neural Networks - Keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
# One-hot encode the integer class labels, as required by the
# categorical_crossentropy loss used by the Keras models.
dummy_y = np_utils.to_categorical(y)

In [None]:
# Hidden-layer width read by nn_model(); reassign it before building a model.
h_units = 1

def nn_model():
    """Build and compile a network with a single sigmoid hidden layer.

    Reads three module-level names at call time: `h_units` (hidden width),
    `X` (its second dimension is the input width) and `classes` (its length
    is the softmax output width).

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam optimiser, categorical
    cross-entropy loss, accuracy metric).
    """
    net = Sequential([
        Dense(h_units, input_dim=X.shape[1], activation='sigmoid'),
        Dense(len(classes), activation='softmax'),
    ])
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

In [None]:
# 5-fold cross-validation of the dense network for each hidden-layer width.
hidden_units = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 4000]
for units in hidden_units:
    h_units = units  # nn_model() reads this global when it builds the network
    estimator = KerasClassifier(build_fn=nn_model, epochs=10, batch_size=100, verbose=1)
    kfold = KFold(n_splits=5, shuffle=True, random_state=0)
    results = cross_val_score(estimator, X, dummy_y, cv=kfold)
    # Print the setting next to its score so each result line can be attributed.
    print(units)
    print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# Final dense network: 1000 sigmoid hidden units, softmax output over the classes.
model = Sequential([
    Dense(1000, input_dim=X.shape[1], activation='sigmoid'),
    Dense(len(classes), activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on all training rows for 10 epochs with batches of 100; no validation data is passed.
model.fit(X, dummy_y, epochs=10, batch_size=100, verbose=1)

In [None]:
# Evaluate on the data the model was trained on. score[0] is the loss and
# score[1] the accuracy metric requested at compile time.
score = model.evaluate(X, dummy_y)
print(score[1] * 100)
# recorded training accuracy: 97.168

In [None]:
# Predicted class = index of the largest softmax output in each row.
y_pred = model.predict(X_test).argmax(axis=1)

In [None]:
# Write the dense network's test predictions to disk.
save_pred(y_pred, 'nn_1000.txt', classes)

# Convolutional Neural Network - Keras

## Cross - Validation

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
# One-hot encode the integer class labels for the categorical_crossentropy loss.
# Recomputed here so the CNN section does not depend on the dense-network cells.
dummy_y = np_utils.to_categorical(y)

In [None]:
# Reshape every flat row into a 28 x 28 x 1 array for Conv2D.
# Requires each row of X to hold exactly 28 * 28 = 784 values; reshape raises otherwise.
X_s = X.reshape(X.shape[0], 28, 28, 1)

In [None]:
# Hyper-parameters read by cnn_model(); reassign them before building a model.
k_size, ch_units, n_kernel = 2, 1, 1
def cnn_model():
    """Build and compile a one-convolution CNN.

    Architecture: Conv2D (`n_kernel` filters of size `k_size` x `k_size`,
    'same' padding, no activation argument) -> 2x2 max pooling -> flatten ->
    Dense(`ch_units`, relu) -> Dense(len(classes), softmax).

    Reads the module-level `k_size`, `ch_units`, `n_kernel`, `X_s` (for the
    input shape) and `classes` at call time.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam optimiser, categorical
    cross-entropy loss, accuracy metric).
    """
    # Per-sample shape: dimensions 1..3 of X_s.
    sample_shape = X_s.shape[1:4]
    net = Sequential([
        Conv2D(n_kernel, kernel_size=(k_size, k_size), padding='same', input_shape=sample_shape),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(ch_units, activation='relu'),
        Dense(len(classes), activation='softmax'),
    ])
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return net

In [None]:
# Grid search over dense width, kernel size and number of filters, scored
# with 4-fold cross-validation (2 epochs per fit).
hidden_units = [20, 50, 100, 200, 500, 1000]
# Each kernel size appears once: a repeated value would re-run every
# cross-validation for that size without adding information.
k_sizes = [2, 3, 4, 5, 7]
n_kernels = [8, 16, 32, 64, 128]
for units in hidden_units:
    ch_units = units  # cnn_model() reads these globals when it builds the network
    for k in k_sizes:
        k_size = k
        for n_k in n_kernels:
            n_kernel = n_k
            estimator = KerasClassifier(build_fn=cnn_model, epochs=2, batch_size=100, verbose=1)
            kfold = KFold(n_splits=4, shuffle=True, random_state=0)
            results = cross_val_score(estimator, X_s, dummy_y, cv=kfold)
            print(units, k, n_k)
            print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# Final CNN: 64 filters of size 5x5, 2x2 max pooling, 500 relu units, softmax output.
model = Sequential([
    Conv2D(64, kernel_size=(5,5), padding='same', input_shape=X_s.shape[1:4]),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(500, activation='relu'),
    Dense(len(classes), activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on all training images for 2 epochs with batches of 100; no validation data is passed.
model.fit(X_s, dummy_y, epochs=2, batch_size=100, verbose=1)

In [None]:
# Evaluate on the data the model was trained on. score[0] is the loss and
# score[1] the accuracy metric requested at compile time.
score = model.evaluate(X_s, dummy_y)
print(score[1] * 100)
# recorded training accuracy: 94.772

In [None]:
# Reshape the test rows to 28 x 28 x 1, then take the index of the largest
# softmax output in each row as the predicted class.
y_pred = model.predict(X_test.reshape(X_test.shape[0], 28, 28, 1)).argmax(axis=1)

In [None]:
# Write the CNN's test predictions to disk.
save_pred(y_pred, 'cnn_500.txt', classes)