In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from data_augmentation import AugmentData

# Logistic Regression

## Parameter Tuning on Original Dataset

In [2]:
from sklearn.model_selection import train_test_split

# load data
train = np.load('data/train.npz')
X, y = train['arr_0'], train['arr_1']
classes = ['ant', 'spider', 'flower', 'dolphin', 'lobster', 'bulldozer']

X_train, X_test, y_train, y_test = train_test_split(
    train['arr_0'], train['arr_1'], test_size=0.33, random_state=42)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

pipe = Pipeline([('transformer', FunctionTransformer(lambda x: x / 255)),
                 ('polynomials', PolynomialFeatures(include_bias=False)),
                 ('pca', PCA()),
                 ('logistic', LogisticRegression())])

parameters = {'polynomials__degree': [2],
              'pca__n_components': [600, 650, 700],
              'logistic__penalty': ['l2'],
              'logistic__C': [0.001, 0.005, 0.01]}

clf = RandomizedSearchCV(pipe, parameters, n_iter=100, n_jobs=60, verbose=2,
                         random_state=12)

clf.fit(X_train, y_train)

print(f'accuracy: {clf.best_score_:.3f}',
      f'\nbest parameters: { clf.best_params_}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=60)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=60)]: Done  18 out of  45 | elapsed:  2.1min remaining:  3.2min
[Parallel(n_jobs=60)]: Done  41 out of  45 | elapsed:  2.5min remaining:   14.5s
[Parallel(n_jobs=60)]: Done  45 out of  45 | elapsed:  2.6min finished


accuracy: 0.534 
best parameters: {'polynomials__degree': 2, 'pca__n_components': 600, 'logistic__penalty': 'l2', 'logistic__C': 0.01}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Fit on Augmented Dataset
Rotations do not seem to increase accuracy (they actually decrease it).

In [None]:
aug_data = AugmentData(X_train, y_train, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_train_aug, y_train_aug = aug_data.return_data()

fitted_pipe = Pipeline([('transformer', FunctionTransformer(lambda x: x / 255)),
                        ('polynomials', PolynomialFeatures(degree=2, include_bias=False)),
                        ('pca', PCA(n_components=650)),
                        ('logistic', LogisticRegression(C=0.005, n_jobs=100))])

fitted_pipe.fit(X_train_aug, y_train_aug)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def show_confusion_matrix(y_test, y_pred, classes):
    df_report = pd.DataFrame(confusion_matrix(y_test, y_pred, normalize='true'), columns=classes).round(3)
    df_report.index = classes
    display(df_report)

y_pred = fitted_pipe.predict(X_test)
print(f'accuracy: {accuracy_score(y_test, y_pred):.3f}')
          
show_confusion_matrix(y_test, y_pred, classes)

## Final Estimation

In [None]:
# load data
train = np.load('data/train.npz')
X, y = train['arr_0'], train['arr_1']
classes = ['ant', 'spider', 'flower', 'dolphin', 'lobster', 'bulldozer']

aug_data = AugmentData(X, y, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_aug, y_aug = aug_data.return_data()

fitted_pipe.fit(X, y)

## Submission
no submission was made to kaggle

In [None]:
# load data
test = np.load('data/test.npz')
X_test = test['arr_0']
y_pred_test = fitted_pipe.predict(X_test)
submission = pd.DataFrame({'Id': range(len(y_pred_test)), 'Category': y_pred_test})
submission.to_csv('submissions/logistic.csv', index=False)

# Random Forest

## Parameter Tuning on Original Dataset

In [None]:
from sklearn.model_selection import train_test_split

# load data
train = np.load('data/train.npz')
X, y = train['arr_0'], train['arr_1']
classes = ['ant', 'spider', 'flower', 'dolphin', 'lobster', 'bulldozer']

X_train, X_test, y_train, y_test = train_test_split(
    train['arr_0'], train['arr_1'], test_size=0.40, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 100)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(2, 120, num = 30)]
max_depth.append(None)
min_samples_split = range(2, 20)
min_samples_leaf = range(2, 20)
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

clf = RandomizedSearchCV(rf, random_grid, n_iter=500, verbose=2, random_state=0, 
                         n_jobs=100)
clf.fit(X_train, y_train)

print(f'accuracy: {clf.best_score_:.3f}',
      f'\nbest parameters: {clf.best_params_}')

## Fit on Augmented Dataset
Rotations do not seem to increase accuracy.

In [None]:
aug_data = AugmentData(X_train, y_train, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_train_aug, y_train_aug = aug_data.return_data()

fitted_rf = RandomForestClassifier(**clf.best_params_, n_jobs=100)
fitted_rf.fit(X_train_aug, y_train_aug)

In [None]:
y_pred = fitted_rf.predict(X_test)
print(f'accuracy: {accuracy_score(y_test, y_pred):.3f}')
          
show_confusion_matrix(y_test, y_pred, classes)

## Final Estimation

In [None]:
# load data
train = np.load('data/train.npz')
X, y = train['arr_0'], train['arr_1']

aug_data = AugmentData(X_train, y_train, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_aug, y_aug = aug_data.return_data()

fitted_rf = RandomForestClassifier(**clf.best_params_, n_jobs=100)
fitted_rf.fit(X_aug, y_aug)

## Submission

In [None]:
# load data
test = np.load('data/test.npz')
X_test = test['arr_0']
y_pred_test = fitted_rf.predict(X_test)
submission = pd.DataFrame({'Id': range(len(y_pred_test)), 'Category': y_pred_test})
submission.to_csv('submissions/rf1.csv', index=False)

# Support Vector Classifier (SVC) with Histogram of Oriented Gradients (HOG)

## Parameter Tuning on Original Dataset

In [None]:
# load data
train = np.load('data/train.npz')
X, y = train['arr_0'], train['arr_1']
X_train, X_test, y_train, y_test = train_test_split(
    train['arr_0'], train['arr_1'], test_size=0.33, random_state=42)

### Linear SVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV
     
parameters = {'C': np.logspace(-4, 0, 5)}

clf = RandomizedSearchCV(LinearSVC(), parameters, n_iter=100, n_jobs=60, verbose=2)
clf.fit(X_train, y_train)

print(f'accuracy: {clf.best_score_:.3f}',
      f'\nbest parameters: { clf.best_params_}')

### SVC

In [None]:
from sklearn.svm import SVC
     
parameters = {'C': np.linspace(0, 5, 10)}

clf = RandomizedSearchCV(SVC(), parameters, n_iter=100, n_jobs=60, verbose=2)
clf.fit(X_train, y_train)

print(f'accuracy: {clf.best_score_:.3f}',
      f'\nbest parameters: { clf.best_params_}')

### SVC with HOG

In [None]:
from sklearn.preprocessing import FunctionTransformer
from skimage.feature import hog
from skimage.transform import resize

def transform_hog(X):
    return np.array([hog(resize(x.reshape(28, 28), (32, 32))) for x in X])
    
pipe = Pipeline(
    [('transformer', FunctionTransformer(transform_hog)),
     ('SVC', SVC())])

parameters = {'SVC__kernel': ['rbf', 'linear'],
              'SVC__C': np.linspace(0, 2, 10)}

clf = RandomizedSearchCV(pipe, parameters, n_iter=100, n_jobs=60, verbose=2)
clf.fit(X_train, y_train)

print(f'accuracy: {clf.best_score_:.3f}',
      f'\nbest parameters: { clf.best_params_}')

## Fit on Augmented Dataset
Rotations do not seem to increase accuracy (they actually decrease it).

In [None]:
aug_data = AugmentData(X_train, y_train, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_train_aug, y_train_aug = aug_data.return_data()

fitted_pipe = Pipeline(
    [('transformer', FunctionTransformer(transform_hog)),
     ('SVC', SVC(C=1.77, kernel='rbf'))])

fitted_pipe.fit(X_train_aug, y_train_aug)

from sklearn.metrics import accuracy_score
y_pred = fitted_pipe.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [None]:
y_pred = fitted_pipe.predict(X_test)
print(f'accuracy: {accuracy_score(y_test, y_pred):.3f}')
          
show_confusion_matrix(y_test, y_pred, classes)

## Final Estimation

In [None]:
# load data
train = np.load('data/train.npz')
X, y = train['arr_0'], train['arr_1']

aug_data = AugmentData(X, y_train, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_aug, y_aug = aug_data.return_data()

fitted_pipe.fit(X_train_aug, y_train_aug)

## Submission

In [None]:
# load data
test = np.load('data/test.npz')
X_test = test['arr_0']
y_pred_test = fitted_pipe.predict(X_test)
submission = pd.DataFrame({'Id': range(len(y_pred_test)), 'Category': y_pred_test})
submission.to_csv('submissions/svm_hog.csv', index=False)

# CNN with augmented dataset

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from data_augmentation import AugmentData


In [None]:
dfile = "data/train.npz"
images = np.load(dfile)["arr_0"]
labels = np.load(dfile)["arr_1"]
classes = ['ant', 'spider', 'flower', 'dolphin', 'lobster', 'bulldozer']

In [None]:
# augment dataset
aug_data = AugmentData(images, labels, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_train_aug, y_train_aug = aug_data.return_data()

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import tensorflow.keras.layers.experimental.preprocessing as preprocessing

In [None]:
### Submission .84 with this model: b=100 x 200, b= 600 x 200
model = keras.Sequential([
    
    # Preprocessing to add random variations to the training data
    preprocessing.RandomFlip('horizontal'), # flip left-to-right
    preprocessing.RandomContrast(0.5), # contrast change by up to 50%
    preprocessing.Normalization(),
    preprocessing.RandomZoom(height_factor=(-0.2,0.2), width_factor=(-0.2,0.2)),
    preprocessing.RandomRotation(factor=0.05),
    preprocessing.RandomTranslation(height_factor=0.2, width_factor=0.2),

    # based on this model: https://github.com/ck090/Google_Quick_Draw/blob/master/Myquickdraw.ipynb
    # These values were chosen empirically. We found keeping a small number of filters with an extra layer lowered overfitting and gave us better validation accuracy, although further improvements could still be made.
    layers.Conv2D(32, (3, 3), input_shape=(28, 28,1), activation='relu'), 
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(3,3)),
    layers.Dropout(0.3),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(50, activation='relu'),
    layers.Dense(6, activation='softmax')

])

In [None]:
from keras.utils import np_utils
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# format the data before training
y_train_aug_cnn = np_utils.to_categorical(y_train_aug)
X_train_aug_cnn = X_train_aug.reshape(X_train_aug.shape[0], 28, 28, 1).astype('float32')

history = model.fit(X_train_aug_cnn, y_train_aug_cnn, batch_size=100, epochs=200, verbose=1)

In [4]:
# load test data

test = np.load('data/test.npz')
test_images = test['arr_0']
X_test_cnn = test_images.reshape(test_images.shape[0], 28, 28, 1).astype('float32')

In [None]:
# predict

y_preds = model.predict(X_test_cnn)
y_cats = np.argmax(y_preds, axis=1)

In [None]:
# Get some csv

import csv
with open('pred_file.csv', mode='w') as file:
    wr = csv.writer(file, delimiter=',')
    i = 0
    wr.writerow(['Id','Category'])
    for cat in y_cats:
      wr.writerow([i,cat])
      i+=1