# Preamble

In [1]:
%load_ext autoreload
%autoreload 2

# disable parallelization for BLAS and co.
from nalabtools.utils.parallelization import set_threads_for_external_libraries
set_threads_for_external_libraries(n_threads=1)

# general
import re
import collections
import pickle

# data
import numpy as np
import pandas as pd

# ml / stats
import sklearn as sk
import statsmodels.stats.multitest

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# nalab
import nalabtools

# init notebook files
import nalabtools.utils.misc
nbfile = nalabtools.utils.misc.init_notebook_file(None)

# Load data

In [7]:
import h5py
file = h5py.File('../data/deepinsight/dataset2.mat', "r")

In [252]:
X_train_raw = file["dset"]["Xtrain"][...]
X_test_raw = file["dset"]["Xtest"][...]

In [9]:
y_train = np.repeat([0,1], file["dset"]["num_tr"][...].flatten().astype(np.int))
y_test = np.repeat([0,1], file["dset"]["num_tst"][...].flatten().astype(np.int))

In [264]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(6660, 20) (6660,) (740, 20) (740,)


# Normalization

In [256]:
preprocessing = sklearn.preprocessing.MinMaxScaler().fit(X_train_raw)
X_train = preprocessing.transform(X_train_raw)
X_test = preprocessing.transform(X_test_raw)

# Derive images and dataset

In [257]:
import butterfly.deepinsight.album2

In [258]:
at = butterfly.deepinsight.album2.AlbumTransformer(40)
at.fit(X_train)
X_train_album = at.transform_parallel(X_train, n_jobs=None)

In [282]:
X_test_album = at.transform_parallel(X_test, n_jobs=None)

In [259]:
import torch.utils.data
dataset_train = butterfly.deepinsight.album2.AlbumDataset(X_train_album[:,1,:,:], y_train)

# Network

In [260]:
import torch.optim as optim
import butterfly.deepinsight.deepinsight

In [266]:
m = butterfly.deepinsight.deepinsight.DeepInsight(
    input_dim=X_train_album.shape[2:],
    kernel_size1=1, kernel_size2=2, n_initial_filters=1)
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(m.parameters(), lr=0.001, momentum=0.9, )
dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=64, shuffle=True)

for epoch in range(100):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, (inputs, labels) in enumerate(dataloader_train):
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = m(inputs.reshape(-1, 1, *inputs.shape[1:]))
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        every = 30
        if i % every == every - 1:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / every))
            running_loss = 0.0

print('Finished Training')

[1,    30] loss: 0.701
[1,    60] loss: 0.698
[1,    90] loss: 0.697
[2,    30] loss: 0.696
[2,    60] loss: 0.694
[2,    90] loss: 0.696
[3,    30] loss: 0.693
[3,    60] loss: 0.692
[3,    90] loss: 0.690
[4,    30] loss: 0.690
[4,    60] loss: 0.690
[4,    90] loss: 0.687
[5,    30] loss: 0.686
[5,    60] loss: 0.685
[5,    90] loss: 0.687
[6,    30] loss: 0.682
[6,    60] loss: 0.683
[6,    90] loss: 0.683
[7,    30] loss: 0.679
[7,    60] loss: 0.682
[7,    90] loss: 0.680
[8,    30] loss: 0.678
[8,    60] loss: 0.677
[8,    90] loss: 0.678
[9,    30] loss: 0.675
[9,    60] loss: 0.676
[9,    90] loss: 0.673
[10,    30] loss: 0.672
[10,    60] loss: 0.674
[10,    90] loss: 0.670
[11,    30] loss: 0.670
[11,    60] loss: 0.669
[11,    90] loss: 0.669
[12,    30] loss: 0.667
[12,    60] loss: 0.666
[12,    90] loss: 0.667
[13,    30] loss: 0.664
[13,    60] loss: 0.665
[13,    90] loss: 0.665
[14,    30] loss: 0.662
[14,    60] loss: 0.661
[14,    90] loss: 0.663
[15,    30] loss: 0

In [291]:
y_pred_proba = m(torch.tensor(X_train_album[:,[1],:,:]).float())
y_pred = np.argmax(y_pred_proba.detach().numpy(), axis=1)
sklearn.metrics.accuracy_score(y_pred, y_train)

0.8322822822822823

In [290]:
y_pred_proba = m(torch.tensor(X_test_album[:,[1],:,:]).float())
y_pred = np.argmax(y_pred_proba.detach().numpy(), axis=1)
sklearn.metrics.accuracy_score(y_pred, y_test)

0.8405405405405405

In [285]:
import sklearn.ensemble
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
rf.fit(X_train_raw, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [288]:
sklearn.metrics.accuracy_score(rf.predict(X_train_raw), y_train)

1.0

In [289]:
sklearn.metrics.accuracy_score(rf.predict(X_test_raw), y_test)

0.9581081081081081