In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from data_augmentation import AugmentData

# Logistic Regression

## Parameter Tuning on Original Dataset

In [2]:
from sklearn.model_selection import train_test_split

# Load the training archive. np.load on an .npz returns a lazily-read NpzFile,
# so both arrays are read inside a context manager that closes the file
# afterwards.
with np.load('data/train.npz') as train:
    X, y = train['arr_0'], train['arr_1']
# Class names; presumably ordered to match the labels stored in y — TODO confirm.
classes = ['ant', 'spider', 'flower', 'dolphin', 'lobster', 'bulldozer']

# Hold out 33% of the samples for evaluation; the fixed seed keeps the split
# reproducible. X and y are reused instead of indexing the archive a second time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

# Model under tuning: divide the inputs by 255, expand them with polynomial
# features, reduce them with PCA, then classify with logistic regression.
pipe = Pipeline(steps=[
    ('transformer', FunctionTransformer(lambda x: x / 255)),
    ('polynomials', PolynomialFeatures(include_bias=False)),
    ('pca', PCA()),
    ('logistic', LogisticRegression()),
])

# Search space, keyed as <step name>__<parameter name>.
parameters = {
    'polynomials__degree': [2],
    'pca__n_components': [600, 650, 700],
    'logistic__penalty': ['l2'],
    'logistic__C': [0.001, 0.005, 0.01],
}

# NOTE(review): the space above holds only 9 combinations, fewer than n_iter.
clf = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    n_iter=100,
    n_jobs=60,
    verbose=2,
    random_state=12,
)

clf.fit(X_train, y_train)

# best_score_ is the cross-validated score of the best candidate.
print(f'accuracy: {clf.best_score_:.3f}',
      f'\nbest parameters: { clf.best_params_}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=60)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=60)]: Done  18 out of  45 | elapsed:  2.2min remaining:  3.3min
[Parallel(n_jobs=60)]: Done  41 out of  45 | elapsed:  2.4min remaining:   14.2s
[Parallel(n_jobs=60)]: Done  45 out of  45 | elapsed:  2.7min finished


accuracy: 0.531 
best parameters: {'polynomials__degree': 2, 'pca__n_components': 650, 'logistic__penalty': 'l2', 'logistic__C': 0.005}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Fit on Augmented Dataset
Rotations do not seem to increase accuracy (they actually decrease it), so only shifted and flipped images are added.

In [4]:
# Build the augmented training split with AugmentData: shifted images for
# [1, 2, 3] plus flipped images (exact semantics live in data_augmentation,
# which is not shown here).
aug_data = AugmentData(X_train, y_train, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_train_aug, y_train_aug = aug_data.return_data()

# Same pipeline as in the search, with the best parameters it reported
# (degree 2, 650 PCA components, C = 0.005) fixed.
fitted_pipe = Pipeline(steps=[
    ('transformer', FunctionTransformer(lambda x: x / 255)),
    ('polynomials', PolynomialFeatures(degree=2, include_bias=False)),
    ('pca', PCA(n_components=650)),
    ('logistic', LogisticRegression(C=0.005, n_jobs=100)),
])

fitted_pipe.fit(X_train_aug, y_train_aug)

Pipeline(steps=[('transformer',
                 FunctionTransformer(func=<function <lambda> at 0x7f3f9c2b29d0>)),
                ('polynomials', PolynomialFeatures(include_bias=False)),
                ('pca', PCA(n_components=650)),
                ('logistic', LogisticRegression(C=0.005, n_jobs=100))])

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def show_confusion_matrix(y_test, y_pred, classes):
    """Display a normalized confusion matrix as a labelled table.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    classes : names used for both the row index and the column headers.

    The matrix is computed with ``normalize='true'`` and rounded to three
    decimals before being rendered with ``display``. Nothing is returned.
    """
    matrix = confusion_matrix(y_test, y_pred, normalize='true')
    table = pd.DataFrame(matrix, index=classes, columns=classes).round(3)
    display(table)

# Score the fitted pipeline on the held-out split, then show the per-class
# breakdown.
y_pred = fitted_pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f'accuracy: {test_accuracy:.3f}')

show_confusion_matrix(y_test, y_pred, classes)

accuracy: 0.636


Unnamed: 0,ant,spider,flower,dolphin,lobster,bulldozer
ant,0.526,0.171,0.053,0.066,0.158,0.026
spider,0.115,0.615,0.031,0.073,0.062,0.104
flower,0.014,0.014,0.889,0.0,0.042,0.042
dolphin,0.045,0.102,0.023,0.67,0.023,0.136
lobster,0.15,0.188,0.075,0.075,0.425,0.088
bulldozer,0.06,0.036,0.024,0.06,0.108,0.711


## Final Estimation

In [None]:
# Reload the complete training set; the context manager closes the .npz
# archive once both arrays have been read.
with np.load('data/train.npz') as train:
    X, y = train['arr_0'], train['arr_1']

# Augment the full training data with the same shifted/flipped calls that
# were used for the train split.
aug_data = AugmentData(X, y, classes)
aug_data.add_shifted_images([1, 2, 3])
aug_data.add_flipped_images()
X_aug, y_aug = aug_data.return_data()

# Refit on the augmented arrays. Fitting on the raw X, y would leave
# X_aug / y_aug unused and train the final model without augmentation.
fitted_pipe.fit(X_aug, y_aug)

## Submission
No submission was made to Kaggle.

In [None]:
# Load the submission inputs under their own name so the hold-out X_test
# (paired with y_test by the train/test split) is not overwritten; the
# context manager closes the .npz archive after the array is read.
with np.load('data/test.npz') as test:
    X_submission = test['arr_0']

# Predict with the fitted pipeline and write one row per sample:
# 'Id' is the sample's position, 'Category' is the predicted label.
y_pred_test = fitted_pipe.predict(X_submission)
submission = pd.DataFrame({'Id': range(len(y_pred_test)), 'Category': y_pred_test})
submission.to_csv('submissions/logistic.csv', index=False)