In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Augmentation

In [2]:
from scipy.ndimage.interpolation import shift
from skimage.transform import rotate

class AugmentData:
    def __init__(self, X, y, classes):
        self.X = X
        self.y = y
        self.classes = classes
        self.side = 28
    
    def plot_image(self, image):       
        print(classes[self.y[image]])
        plt.imshow(self.X[image].reshape(self.side, self.side), cmap='gray_r',
               interpolation='nearest')
        plt.show()
    
    def shift_image(self, image, dx, dy):
        image = image.reshape((self.side, self.side))
        shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
        return shifted_image.reshape([-1])
    
    def add_shifted_images(self, deltas=[1]):
        l_X = self.X.tolist()
        l_y = self.y.tolist()
        for delta in deltas:
            for dx, dy in ((delta, 0), (-delta, 0), (0, delta), (0, -delta)):
                for image in self.X:
                    l_X.append(self.shift_image(image, dx, dy))
                l_y += self.y.tolist()
        
        self.X = np.array(l_X)
        self.y = np.array(l_y)
        
    def add_flipped_images(self):
        flipped = [np.fliplr(image.reshape(self.side, self.side)).flatten()
                   for image in self.X]
        self.X = np.array(flipped + self.X.tolist())
        self.y = np.tile(self.y, 2)
        
    def add_rotated_images(self, angles=[-10, 10]):
        l_X = self.X.tolist()
        l_y = self.y.tolist()
        for angle in angles:
            rotated = [rotate(image.reshape(self.side, self.side), angle).flatten() 
                       for image in self.X]
            l_X += rotated
            l_y += self.y.tolist()
        
        self.X = np.array(l_X)
        self.y = np.array(l_y)
        
    def return_data(self):
        return self.X, self.y

# Parameter Tuning on Original Dataset

In [3]:
from sklearn.model_selection import train_test_split

# load data
train = np.load('data/train.npz')
X, y = train['arr_0'], train['arr_1']
classes = ['ant', 'spider', 'flower', 'dolphin', 'lobster', 'bulldozer']

X_train, X_test, y_train, y_test = train_test_split(
    train['arr_0'], train['arr_1'], test_size=0.40, random_state=42)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier()

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 100)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(2, 120, num = 30)]
max_depth.append(None)
min_samples_split = range(2, 20)
min_samples_leaf = range(2, 20)
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

clf = RandomizedSearchCV(rf, random_grid, n_iter=500, verbose=2, random_state=0, 
                         n_jobs=100)
clf.fit(X_train, y_train)

print(f'accuracy: {clf.best_score_:.3f}',
      f'\nbest parameters: { clf.best_params_}')

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done 165 tasks      | elapsed:   12.3s
[Parallel(n_jobs=100)]: Done 448 tasks      | elapsed:   26.5s
[Parallel(n_jobs=100)]: Done 813 tasks      | elapsed:   41.6s
[Parallel(n_jobs=100)]: Done 1258 tasks      | elapsed:  1.1min
[Parallel(n_jobs=100)]: Done 1785 tasks      | elapsed:  1.5min
[Parallel(n_jobs=100)]: Done 2500 out of 2500 | elapsed:  2.1min finished


accuracy: 0.504 
best parameters: {'n_estimators': 1672, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 91, 'bootstrap': False}


# Fit on Augmented Dataset
Rotations do not seem to increase accuracy.

In [6]:
aug_data = AugmentData(X_train, y_train, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_train_aug, y_train_aug = aug_data.return_data()

fitted_rf = RandomForestClassifier(**clf.best_params_, n_jobs=100)
fitted_rf.fit(X_train_aug, y_train_aug)

RandomForestClassifier(bootstrap=False, max_depth=91, max_features='sqrt',
                       min_samples_leaf=3, n_estimators=1672, n_jobs=100)

In [7]:
from sklearn.metrics import accuracy_score
y_pred = fitted_rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.5916666666666667

# Final Estimation

In [8]:
from sklearn.model_selection import train_test_split

# load data
train = np.load('data/train.npz')
X, y = train['arr_0'], train['arr_1']
classes = ['ant', 'spider', 'flower', 'dolphin', 'lobster', 'bulldozer']

aug_data = AugmentData(X_train, y_train, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_aug, y_aug = aug_data.return_data()

fitted_rf = RandomForestClassifier(**clf.best_params_, n_jobs=100)
fitted_rf.fit(X_aug, y_aug)

RandomForestClassifier(bootstrap=False, max_depth=91, max_features='sqrt',
                       min_samples_leaf=3, n_estimators=1672, n_jobs=100)

# Submission

In [46]:
# load data
test = np.load('data/test.npz')
X_test = test['arr_0']
y_pred_test = fitted_rf.predict(X_test)
submission = pd.DataFrame({'Id': range(len(y_pred_test)), 'Category': y_pred_test})
submission.head()

Unnamed: 0,Id,Category
0,0,2
1,1,2
2,2,1
3,3,5
4,4,2


In [47]:
submission.to_csv('submissions/rf1.csv', index=False)