In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gzip
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import csv

  from numpy.core.umath_tests import inner1d


In [2]:
fImageTrain = gzip.open('data/train-images-idx3-ubyte.gz','r')
fImageTest = gzip.open('data/t10k-images-idx3-ubyte.gz','r')

image_size = 28
no_train = 60000
no_val = 15000
no_test = 10000

import numpy as np

fImageTrain.read(16)
buf_trainImage = fImageTrain.read(image_size * image_size * no_train)
train_img = np.frombuffer(buf_trainImage, dtype=np.uint8).astype(np.float32)
train_img = train_img.reshape(no_train, image_size*image_size)

trainval_img = train_img
val_img = train_img[no_train-no_val:no_train, :]
train_img = train_img[0:no_train-no_val, :]

fImageTest.read(16)
buf_testImage = fImageTest.read(image_size * image_size * no_test)
test_img = np.frombuffer(buf_testImage, dtype=np.uint8).astype(np.float32)
test_img = test_img.reshape(no_test, image_size*image_size)


print(train_img.shape)
print(val_img.shape)
print(test_img.shape)

(45000, 784)
(15000, 784)
(10000, 784)


In [3]:
fLabelTrain = gzip.open('data/train-labels-idx1-ubyte.gz','r')
fLabelTest = gzip.open('data/t10k-labels-idx1-ubyte.gz','r')

fLabelTrain.read(8)
buf_trainLabel = fLabelTrain.read(1 * 32 * no_train)
train_lbl = np.frombuffer(buf_trainLabel, dtype=np.uint8).astype(np.int64)
train_lbl = train_lbl.reshape(no_train)

trainval_lbl = train_lbl
val_lbl = train_lbl[no_train-no_val:no_train]
train_lbl = train_lbl[0:no_train-no_val]

fLabelTest.read(8)
buf_testLabel = fLabelTest.read(1 * 32 * no_test)
test_lbl = np.frombuffer(buf_testLabel, dtype=np.uint8).astype(np.int64)
test_lbl = test_lbl.reshape(no_test)

print(train_lbl.shape)
print(val_lbl.shape)
print(test_lbl.shape)

(45000,)
(15000,)
(10000,)


In [4]:
# all parameters not specified are set to their defaults
# default solver is incredibly slow thats why we change it
estimator_list = [150, 200, 250, 300, 350, 400]
score_list = np.zeros(len(estimator_list))
j = 0
for estimator in estimator_list:
    print(estimator)
    RandomFor = RandomForestClassifier(n_estimators = estimator, n_jobs = -1)
    RandomFor.fit(train_img, train_lbl)
    predictions = RandomFor.predict(val_img)
    score = RandomFor.score(val_img, val_lbl)
    score_list[j] = score
    j= j+1
    print('Finished iteration ', j)
print(score_list)

150
Finished iteration  1
200
Finished iteration  2
250
Finished iteration  3
300
Finished iteration  4
350
Finished iteration  5
400
Finished iteration  6
[0.96726667 0.96793333 0.96846667 0.96906667 0.96933333 0.96846667]


In [5]:
score_index = np.argmax(score_list)
n_estimators = estimator_list[score_index]
print(n_estimators)

350


In [6]:
RandomFor = RandomForestClassifier(n_estimators = n_estimators, n_jobs = -1)
RandomFor.fit(trainval_img, trainval_lbl)
predictions = RandomFor.predict(test_img)
score = RandomFor.score(test_img, test_lbl)
print(score)

0.9709


In [7]:
index = 0
misclassifiedIndexes = []
for label, predict in zip(val_lbl, predictions):
    if label != predict: 
        misclassifiedIndexes.append(index)
        index +=1
print(index)

8970


In [15]:
onehot_encoder = OneHotEncoder(sparse=False)
predictions = predictions.reshape(len(predictions), 1)
x = onehot_encoder.fit_transform(predictions)
print(x)

with open('results/rf.csv', 'w') as csvFile:
    writer = csv.writer(csvFile, lineterminator='\n')
    writer.writerows(x)

csvFile.close()

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
