In [16]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

# Data Augmentation

In [17]:
from scipy.ndimage.interpolation import shift
from skimage.transform import rotate

class AugmentData:
    """Grow a set of flattened square grayscale images with simple transforms.

    Every ``add_*`` method rebinds ``self.X`` / ``self.y`` to enlarged arrays,
    so calls compound: each call transforms whatever the previous calls
    produced. The arrays handed to the constructor are never modified in place.

    Parameters
    ----------
    X : np.ndarray
        One flattened image per row; each row must hold ``side * side`` values.
    y : np.ndarray
        One label per row of ``X``. ``plot_image`` uses the label as an index
        into ``classes``.
    classes : sequence of str
        Class names, looked up by label in ``plot_image``.
    side : int, default 28
        Edge length in pixels of the square images.
    """

    def __init__(self, X, y, classes, side=28):
        self.X = X
        self.y = y
        self.classes = classes
        self.side = side

    def plot_image(self, image):
        """Print the class name of sample number ``image`` and display it.

        ``image`` is a row index into ``self.X`` / ``self.y``.
        """
        # Look the name up on the instance, not in a notebook-level global.
        print(self.classes[self.y[image]])
        plt.imshow(self.X[image].reshape(self.side, self.side), cmap='gray_r',
                   interpolation='nearest')
        plt.show()

    def shift_image(self, image, dx, dy):
        """Return a flattened copy of ``image`` translated by ``(dx, dy)`` pixels.

        ``dx`` is applied along the column axis and ``dy`` along the row axis;
        pixels shifted in from outside the frame are filled with 0.
        """
        image = image.reshape((self.side, self.side))
        shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
        return shifted_image.reshape([-1])

    def _apply_to_each(self, transform):
        """Return an array shaped like ``self.X`` holding ``transform(row)`` per row."""
        transformed = [transform(row) for row in self.X]
        # Reshaping to X's own shape also keeps an empty data set two-dimensional.
        return np.array(transformed).reshape(self.X.shape)

    def add_shifted_images(self, deltas=(1,)):
        """Append copies of every current image shifted right, left, down and up.

        For each ``delta`` in ``deltas`` four shifted copies of the current
        data are appended (in the order +x, -x, +y, -y) after the current rows;
        labels are repeated accordingly.
        """
        X_parts = [self.X]
        y_parts = [self.y]
        for delta in deltas:
            for dx, dy in ((delta, 0), (-delta, 0), (0, delta), (0, -delta)):
                X_parts.append(
                    self._apply_to_each(lambda img: self.shift_image(img, dx, dy)))
                y_parts.append(self.y)

        # One concatenation instead of round-tripping every pixel through
        # Python lists.
        self.X = np.concatenate(X_parts)
        self.y = np.concatenate(y_parts)

    def add_flipped_images(self):
        """Double the data with left-right mirrored copies.

        The mirrored copies come first, followed by the current rows; the
        labels are tiled twice to match.
        """
        images = self.X.reshape(len(self.X), self.side, self.side)
        # Reversing the last (column) axis mirrors every image left-to-right.
        flipped = images[:, :, ::-1].reshape(self.X.shape)
        self.X = np.concatenate([flipped, self.X])
        self.y = np.tile(self.y, 2)

    def add_rotated_images(self, angles=(-10, 10)):
        """Append one rotated copy of every current image per angle (degrees).

        ``preserve_range=True`` is required: without it skimage converts the
        input with ``img_as_float``, which would put the rotated copies on a
        different intensity scale than the untouched originals they are mixed
        with.
        """
        X_parts = [self.X]
        y_parts = [self.y]
        for angle in angles:
            X_parts.append(self._apply_to_each(
                lambda img: rotate(img.reshape(self.side, self.side), angle,
                                   preserve_range=True).flatten()))
            y_parts.append(self.y)
        self.X = np.concatenate(X_parts)
        self.y = np.concatenate(y_parts)

    def return_data(self):
        """Return the current ``(X, y)`` pair."""
        return self.X, self.y

# Parameter Tuning on Original Dataset

In [18]:
from sklearn.model_selection import train_test_split

# Load the labelled training arrays. np.load on an .npz archive returns a lazy
# NpzFile that keeps the file open, so read both arrays inside a `with` block.
with np.load('data/train.npz') as train:
    X, y = train['arr_0'], train['arr_1']
# Class names; elsewhere in this notebook a label is used as an index into this list.
classes = ['ant', 'spider', 'flower', 'dolphin', 'lobster', 'bulldozer']

# Hold out 33% of the labelled data for evaluation (fixed seed for a
# repeatable split). Reuse X and y instead of indexing the archive again,
# which would decode both arrays from disk a second time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

# Pipeline: divide pixel values by 255 -> polynomial features -> PCA ->
# logistic regression.
pipe = Pipeline([('transformer', FunctionTransformer(lambda x: x / 255)),
                 ('polynomials', PolynomialFeatures(include_bias=False)),
                 ('pca', PCA()),
                 ('logistic', LogisticRegression())])

parameters = {'polynomials__degree': [2],
              'pca__n_components': [600, 650, 700],
              'logistic__penalty': ['l2'],
              'logistic__C': [0.001, 0.005, 0.01]}

# Every entry above is a finite list, so the search space is a small grid.
# Asking for more iterations than the grid holds (previously n_iter=100 for
# 9 combinations) only makes scikit-learn warn and fall back to the grid size,
# so request exactly that many candidates.
n_candidates = int(np.prod([len(values) for values in parameters.values()]))

# NOTE(review): n_jobs=60 is sized for the original machine; adjust as needed.
clf = RandomizedSearchCV(pipe, parameters, n_iter=n_candidates, n_jobs=60, verbose=2)

clf.fit(X_train, y_train)

In [28]:
# Report the best mean cross-validated score of the search and the
# hyper-parameter combination that achieved it.
print(
    f'accuracy: {clf.best_score_:.3f}',
    f'\nbest parameters: { clf.best_params_}',
    sep=' ',
)

accuracy: 0.531 
best parameters: {'polynomials__degree': 2, 'pca__n_components': 650, 'logistic__penalty': 'l2', 'logistic__C': 0.005}


# Fit on Augmented Dataset
Rotated images are not added below: in earlier experiments, rotations did not increase accuracy (they actually decreased it).

In [40]:
# Enlarge the training split only: every image is shifted by 1, 2 and 3 pixels
# in each of the four directions, then the whole enlarged set is mirrored.
aug_data = AugmentData(X_train, y_train, classes)
aug_data.add_shifted_images(deltas=[1, 2, 3])
aug_data.add_flipped_images()
X_train_aug, y_train_aug = aug_data.return_data()

# Same steps as the search pipeline, with the hyper-parameters pinned to
# degree 2, 650 PCA components and C=0.005.
fitted_pipe = Pipeline(steps=[
    ('transformer', FunctionTransformer(func=lambda x: x / 255)),
    ('polynomials', PolynomialFeatures(degree=2, include_bias=False)),
    ('pca', PCA(n_components=650)),
    ('logistic', LogisticRegression(C=0.005, n_jobs=100)),
])

fitted_pipe.fit(X_train_aug, y_train_aug)

Pipeline(steps=[('transformer',
                 FunctionTransformer(func=<function <lambda> at 0x7f5c1541d0d0>)),
                ('polynomials', PolynomialFeatures(include_bias=False)),
                ('pca', PCA(n_components=650)),
                ('logistic', LogisticRegression(C=0.005, n_jobs=100))])

In [41]:
from sklearn.metrics import accuracy_score
# Score the pipeline fitted on the augmented training split against the
# untouched hold-out split.
y_pred = fitted_pipe.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6323232323232323

In [43]:
def show_confusion_matrix(y_test, y_pred, classes):
    """Print the confusion matrix normalised over the true labels.

    Each row corresponds to a true class and, because of
    ``normalize='true'``, holds the fraction of that class's samples assigned
    to each predicted class (rounded to 3 decimals). ``classes`` supplies the
    row and column labels, so it must list one name per label present in the
    data, in the order scikit-learn sorts the labels.

    Relies on ``confusion_matrix`` from ``sklearn.metrics`` being imported in
    the notebook's import cell.
    """
    matrix = confusion_matrix(y_test, y_pred, normalize='true')
    df_report = pd.DataFrame(matrix, index=classes, columns=classes).round(3)
    print(df_report)


show_confusion_matrix(y_test, y_pred, classes)

             ant  spider  flower  dolphin  lobster  bulldozer
ant        0.539   0.171   0.053    0.066    0.145      0.026
spider     0.125   0.573   0.031    0.094    0.083      0.094
flower     0.028   0.028   0.861    0.000    0.056      0.028
dolphin    0.045   0.091   0.011    0.682    0.034      0.136
lobster    0.200   0.212   0.062    0.038    0.388      0.100
bulldozer  0.036   0.048   0.012    0.048    0.084      0.771


# Final Estimation

In [44]:
# Reload the complete labelled set (no hold-out this time). np.load on an .npz
# archive keeps the file open, so read the arrays inside a `with` block.
with np.load('data/train.npz') as train:
    X, y = train['arr_0'], train['arr_1']
classes = ['ant', 'spider', 'flower', 'dolphin', 'lobster', 'bulldozer']

# Apply the same augmentation that was used for the hold-out evaluation.
aug_data = AugmentData(X, y, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_aug, y_aug = aug_data.return_data()

# Refit on the augmented arrays. Fitting on the raw X, y (as before) silently
# discarded the augmentation computed just above.
fitted_pipe.fit(X_aug, y_aug)

Pipeline(steps=[('transformer',
                 FunctionTransformer(func=<function <lambda> at 0x7f5c1541d0d0>)),
                ('polynomials', PolynomialFeatures(include_bias=False)),
                ('pca', PCA(n_components=650)),
                ('logistic', LogisticRegression(C=0.005, n_jobs=100))])

# Submission
No submission was made to Kaggle.

In [45]:
# Load the competition images (only 'arr_0' is read from this archive).
# They get their own name so the hold-out X_test from the train/test split is
# not overwritten: re-running the accuracy cell afterwards would otherwise
# pair these images with y_test.
with np.load('data/test.npz') as test:
    X_submission = test['arr_0']

y_pred_test = fitted_pipe.predict(X_submission)

# One row per image: running index as Id, predicted label as Category.
submission = pd.DataFrame({'Id': range(len(y_pred_test)), 'Category': y_pred_test})
submission.head()

Unnamed: 0,Id,Category
0,0,2
1,1,2
2,2,1
3,3,0
4,4,4


In [46]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout may not have it).
submission_path = Path('submissions/logistic.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)