In [59]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, concatenate, Input, Reshape, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import load_model
import csv
import os
import errno
import operator
import sys
import pickle
from PIL import Image
import random
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans

## clustering images

#### load images

In [2]:
# Load the image dataset from disk; each row is one flattened 28x28 image.
images = np.load('data/image.npy')

In [3]:
# Sanity check: (number of images, pixels per image).
images.shape

(140000, 784)

In [14]:
# Export the first 100 images as PNG files so they can be inspected by eye.
preview_dir = 'data/image_output'
# Image.save() does not create missing folders, so make sure the target exists.
os.makedirs(preview_dir, exist_ok=True)

# Slicing (instead of range(0, 100)) keeps the loop valid when fewer than
# 100 images were loaded.
for i, flat_image in enumerate(images[:100]):
    img = Image.fromarray(flat_image.reshape(28,28))
    img.save('{}/{}.png'.format(preview_dir, i))

### Dimension Reduction

#### PCA

In [119]:
# Fit a whitened PCA basis on the raw pixel vectors, keeping 700 components.
pca = PCA(
    n_components=700,
    svd_solver='arpack',
    whiten=True,
)
pca.fit(images)

PCA(copy=True, iterated_power='auto', n_components=700, random_state=None,
  svd_solver='arpack', tol=0.0, whiten=True)

In [120]:
# One row per principal component, one column per input pixel.
print(pca.components_.shape)

(700, 784)


In [121]:
# Save every principal component ("eigen-image") as a 28x28 grayscale PNG.
eigen_dir = 'data/eigen_images'
os.makedirs(eigen_dir, exist_ok=True)  # Image.save() needs an existing folder

for i, component in enumerate(pca.components_):
    # Components contain negative weights; casting them straight to uint8
    # wraps around and produces garbage. Stretch each component linearly
    # onto the full 0..255 range instead so its structure is visible.
    lowest = component.min()
    value_range = component.max() - lowest
    if value_range > 0:
        scaled = (component - lowest) / value_range * 255
    else:
        scaled = np.zeros_like(component)  # constant component -> black image
    img = Image.fromarray(scaled.astype('uint8').reshape(28,28))
    img.save('{}/{}.png'.format(eigen_dir, i))

In [122]:
# Project every image onto the fitted principal components.
pca_images = pca.transform(images)
# Map the reduced vectors back to pixel space for a visual check of the loss.
# NOTE(review): the trailing remark refers to a run with 784 components, while
# the PCA fitted in this notebook keeps 700 — confirm it still applies.
reconstructed_images = pca.inverse_transform(pca_images) # with n_components = 784, looks the same to me

In [123]:
# Sanity check: (number of images, number of PCA components kept).
pca_images.shape

(140000, 700)

In [124]:
# Display one reconstructed image (index 6) as a 28x28 picture for a visual check.
Image.fromarray(reconstructed_images[6].reshape(28,28)).show()

#### NMF

In [78]:
# Factorise the images into 40 non-negative components (seeded for repeatability).
# NOTE(review): max_iter=20 is a small iteration budget — confirm the fit converges.
nmf = NMF(n_components=40, init='random', random_state=0, max_iter=20).fit(images)

In [79]:
# One row per NMF component, one column per input pixel.
nmf.components_.shape

(40, 784)

In [66]:
# Save every NMF component as a 28x28 grayscale PNG.
nmf_dir = 'data/nmf_images'
os.makedirs(nmf_dir, exist_ok=True)  # Image.save() needs an existing folder

# Iterate over the NMF components themselves; indexing pca.components_ here
# would silently export the PCA basis under the NMF file names.
for i, component in enumerate(nmf.components_):
    # NMF weights are non-negative but unbounded, so rescale each component
    # to 0..255 before the uint8 cast to avoid overflow or near-black output.
    peak = component.max()
    scaled = component / peak * 255 if peak > 0 else component
    img = Image.fromarray(scaled.astype('uint8').reshape(28,28))
    img.save('{}/{}.png'.format(nmf_dir, i))

#### autoencoder

In [None]:
# Single-hidden-layer autoencoder: 784 inputs -> 32-value code -> 784 outputs.
encoding_dim = 32  # bottleneck width; 784 / 32 = 24.5x compression

# Encoder half: input placeholder feeding a ReLU bottleneck layer.
input_img = Input(shape=(784,))
encoded = Dense(encoding_dim, activation='relu')(input_img)

# Decoder half: sigmoid layer producing the lossy 784-value reconstruction.
decoded = Dense(784, activation='sigmoid')(encoded)

# Full model (input -> reconstruction) and the encoder-only view of it.
autoencoder = Model(inputs=input_img, outputs=decoded)
encoder = Model(inputs=input_img, outputs=encoded)

# Stand-alone decoder: a fresh placeholder for a code vector, wired through
# the autoencoder's last layer so both models share that layer.
encoded_input = Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

### Clustering: K-means

In [127]:
# Split the PCA-reduced images into two clusters.
# - `precompute_distances`, `n_jobs` and `algorithm='auto'` are omitted: they
#   only restated old defaults and are rejected by newer scikit-learn releases.
# - `n_init=10` pins the historical default number of restarts.
# - A fixed `random_state` makes the cluster labels (and everything derived
#   from them, such as the pickled labels and the submission) reproducible.
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300,
                tol=0.0001, verbose=0, random_state=0,
                copy_x=True).fit(pca_images)

In [149]:
# Print the cluster assigned to each of the first 100 images.
for idx in range(100):
    cluster_id = kmeans.labels_[idx]
    print(idx, cluster_id)

'for idx in range(0,100):\n    print(idx, kmeans.labels_[idx])'

In [129]:
# Sanity check: there should be one cluster label per image.
len(kmeans.labels_)

140000

In [137]:
# Persist the per-image cluster labels to disk with the newest pickle protocol.
with open('labels.pickle', 'wb') as label_file:
    cluster_labels = kmeans.labels_
    pickle.dump(cluster_labels, label_file, protocol=pickle.HIGHEST_PROTOCOL)

## rule-based

In [150]:
def label_by_non_black_ratio(pixel_rows, threshold=0.3):
    """Label each image by the share of its pixels that are not black.

    Args:
        pixel_rows: 2-D array-like, one flattened image per row.
        threshold: an image whose fraction of pixels with value > 0 is
            strictly greater than this gets label 0; otherwise label 1.

    Returns:
        A list of Python ints (0 or 1), one per row of ``pixel_rows``.
        An empty input yields an empty list.
    """
    pixel_rows = np.asarray(pixel_rows)
    if pixel_rows.shape[0] == 0:
        return []
    # Count the non-black pixels of all images at once instead of looping
    # over every pixel in Python (which took minutes on the full dataset).
    non_black_counts = np.count_nonzero(pixel_rows > 0, axis=1)
    non_black_ratio = non_black_counts / pixel_rows.shape[1]
    return np.where(non_black_ratio > threshold, 0, 1).tolist()


rule_predictions = label_by_non_black_ratio(images) # 1 for MNIST, 0 for fashionMNIST
count = len(rule_predictions)  # number of images processed
print('finished generating rule_predictions!')

"rule_predictions = [] # 1 for MNIST, 0 for fashionMNIST\ncount = 0\n\nfor idx, image in enumerate(images): # estimate 3 minutes\n    if idx % 5000 == 0:\n        print('finished processing', idx/len(images))\n        \n    pixel_count = 0\n    non_black_count = 0\n    near_white_count = 0\n    \n    for pixel in image:\n        pixel_count += 1\n        if pixel > 0:\n            non_black_count += 1\n            if pixel > 200:\n                near_white_count += 1\n                \n    # if near_white_count / non_black_count > 0.5:\n    if non_black_count / pixel_count > 0.3:\n        # predictions.append(1)\n        rule_predictions.append(0)\n    else:\n        # predictions.append(0)\n        rule_predictions.append(1)\n    count += 1\nprint('finished generating rule_predictions!')"

In [151]:
# Print the rule-based label of each of the first 100 images.
for i in range(0, 100):
    rule_label = rule_predictions[i]
    print(i, rule_label)

'for i in range(100):\n    print(i, rule_predictions[i])'

## Submission

### read test file

In [130]:
# Read the test pairs: columns 1 and 2 of every data row hold two image ids.
with open('data/test_case.csv', 'rt') as testfile:
    reader = csv.reader(testfile, delimiter=',')
    next(reader) # skip headings
    x_submission = [[int(row[1]), int(row[2])] for row in reader]
print('finished reading file')

finished reading file


In [131]:
# Number of image-id pairs read from the test file.
len(x_submission)

1980000

In [132]:
# Peek at one pair to confirm each entry is a two-element list of integer ids.
x_submission[2]

[68922, 34890]

#### PCA

In [135]:
# Pairwise answer from the k-means clustering: 1 when both images of a pair
# were assigned to the same cluster, otherwise 0.
predictions = [
    1 if kmeans.labels_[id_pair[0]] == kmeans.labels_[id_pair[1]] else 0
    for id_pair in x_submission
]
print(len(predictions))

1980000


#### rule-based

In [147]:
# Pairwise answer from the rule-based labels: 1 when both images of a pair
# received the same label, otherwise 0. Rebinds `predictions`, so whichever
# prediction cell ran last is what gets written to the submission file.
predictions = []

for id_pair in x_submission:
    first_label = rule_predictions[id_pair[0]]
    second_label = rule_predictions[id_pair[1]]
    predictions.append(1 if first_label == second_label else 0)
print(len(predictions))

1980000


### write submission

In [148]:
# Write the submission file: a header row followed by one "ID,Ans" row per
# prediction, with IDs counting up from 0 in prediction order.
# newline='' hands line-ending control to the csv module; without it the
# writer's "\r\n" terminator is translated again on Windows, leaving a blank
# line after every row.
with open('nonblack_0.3.csv', 'wt', newline='') as outfile:
    test_writer = csv.writer(outfile)
    test_writer.writerow(['ID','Ans'])
    test_writer.writerows(
        [row_id, int(answer)] for row_id, answer in enumerate(predictions)
    )

print('finished writing submission!')

finished writing submission!
