In [None]:
# CODE:
# https://scikit-learn.org/stable/modules/grid_search.html#grid-search
# https://github.com/skorch-dev/skorch/issues/451
# https://tomaugspurger.github.io <- check

In [None]:
import h5py
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

from tensorboardX import SummaryWriter

from sklearn.preprocessing import LabelEncoder

In [None]:
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
# Notebook-wide matplotlib defaults: wide figures and extra-large text for
# the legend, axis labels, titles and tick labels.
params = dict.fromkeys(
    ['legend.fontsize',
     'axes.labelsize',
     'axes.titlesize',
     'xtick.labelsize',
     'ytick.labelsize'],
    'x-large')
params['figure.figsize'] = (15, 5)
pylab.rcParams.update(params)

## Explore

In [None]:
# Read the 'train' and 'test' groups of the USPS file. The `[:]` slices are
# taken while the file is still open, so the results remain usable after the
# `with` block closes it.
with h5py.File('../data/usps.h5', 'r') as usps_file:
    train_group = usps_file.get('train')
    test_group = usps_file.get('test')
    X_train, y_train = train_group.get('data')[:], train_group.get('target')[:]
    X_test, y_test = test_group.get('data')[:], test_group.get('target')[:]

# Displayed as the cell output: shapes of the training features and labels.
X_train.shape, y_train.shape

In [None]:
# Sanity check: shape of the feature matrix with the labels appended as an
# extra column.
np.c_[X_train, y_train].shape

In [None]:
# How many training samples carry each label value.
label_df = pd.DataFrame(y_train)
label_counts = label_df[0].value_counts()
print(label_counts)

In [None]:
# Class balance of the training labels: one bar per label value.
counts = np.bincount(y_train)
# Derive the x positions from the data instead of assuming exactly 10 classes;
# a fixed range(10) fails in ax.bar whenever bincount returns another length.
label_values = range(len(counts))
fig, ax = plt.subplots()
ax.bar(label_values, counts, width=0.8, align='center', color='#86bf91')
ax.set(xticks=label_values, xlim=[-1, len(counts)],
       xlabel='label', ylabel='number of training samples',
       title='Training label distribution')

plt.show()

In [None]:
# Per-column summary statistics of the raw training features.
train_df = pd.DataFrame(X_train)
train_df.describe()

## Dataset

In [None]:
# Mini-batch size passed to the DataLoaders.
BATCH_SIZE = 128
# Length of one input vector fed to the autoencoder.
INPUT_SIZE = 256
# Widths of the two encoder layers: (hidden width, latent width).
ENCODE_DIM = (64, 16)

# Number of epochs for the autoencoder training loop.
FEAT_EPOCHS = 1000

# NOTE(review): the visible cells call `.cpu()` directly and never read
# `device` -- confirm whether it is still needed.
device = torch.device("cpu")

In [None]:
class _USPSSplitDataset(Dataset):
    """In-memory dataset over one top-level group of a USPS HDF5 file.

    Subclasses set ``split`` to the name of the group to read. The group's
    ``data`` and ``target`` entries are sliced with ``[:]`` inside
    ``__init__``, while the file is open; the file is closed again before any
    item is requested.

    Attributes:
        X: the group's ``data`` entry; indexed as ``X[index, :]``.
        y: the group's ``target`` entry; indexed as ``y[index]``.
    """

    # Name of the HDF5 group to read; overridden by each concrete subclass.
    split = None

    def __init__(self, filename):
        """Load ``data`` and ``target`` of group ``self.split`` from ``filename``."""
        with h5py.File(filename, 'r') as hf:
            group = hf.get(self.split)
            self.X = group.get('data')[:]
            self.y = group.get('target')[:]

    def __len__(self):
        """Return the number of samples (rows of ``X``)."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(vector, label)`` pair stored at ``index``."""
        vector = self.X[index, :]
        label = self.y[index]

        return vector, label


class USPSTrainDataset(_USPSSplitDataset):
    """Samples and labels from the ``train`` group of the HDF5 file."""

    split = 'train'


class USPSTestDataset(_USPSSplitDataset):
    """Samples and labels from the ``test`` group of the HDF5 file."""

    split = 'test'

## Feature Extraction - Vanilla Autoencoder

In [None]:
train_dataset = USPSTrainDataset('../data/usps.h5')
test_dataset = USPSTestDataset('../data/usps.h5')

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# The test loader must wrap the *test* split; wrapping train_dataset here made
# every "test" evaluation run on training data. Shuffling is not needed for
# evaluation, so keep a fixed order.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a two-layer encoder and decoder.

    Args:
        input_size: width of the input (and of the reconstruction).
        encode_dim: indexable pair ``(hidden width, latent width)``.
    """

    def __init__(self, input_size, encode_dim):
        super().__init__()
        hidden_dim = encode_dim[0]
        latent_dim = encode_dim[1]

        # input -> hidden -> latent, each Linear followed by an in-place ReLU
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(inplace=True),
        )

        # latent -> hidden -> input; the final layer has no activation
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, input_size),
        )

    def forward(self, x):
        """Return ``(reconstruction, latent code)`` for the batch ``x``."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        return reconstruction, latent

In [None]:
def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Relies on the notebook-level ``model``, ``distance`` (the reconstruction
    criterion), ``optimizer``, ``train_loader`` and ``FEAT_EPOCHS``.

    Args:
        epoch: zero-based epoch index; only used in the progress message.

    Returns:
        float: loss of the last mini-batch of the pass (not an epoch average).
    """
    # to_latent() puts the model in eval mode; make sure training runs in
    # train mode even when this cell is re-executed afterwards.
    model.train()

    for vec, _labels in train_loader:
        # Plain tensors suffice: the Variable wrapper is deprecated, and the
        # input needs no gradient -- only the model parameters are optimised.
        vec = vec.cpu()

        # ===================forward=====================
        dec, _enc = model(vec)
        loss = distance(dec, vec)

        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    last_loss = loss.item()
    print('epoch [{}/{}], loss: {:.4f}'.format(epoch + 1, FEAT_EPOCHS, last_loss))
    return last_loss

In [None]:
# Train the autoencoder and log the loss returned by train() once per epoch.
writer = SummaryWriter()
# Autoencoder.__init__ has two required arguments; calling it with none
# raises TypeError. Build it from the notebook-level constants instead.
model = Autoencoder(INPUT_SIZE, ENCODE_DIM).cpu()
distance = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 1, momentum = 0.9)

for epoch in range(FEAT_EPOCHS):
    loss = train(epoch)
    writer.add_scalar('usps/mse', loss, epoch)

writer.close()

In [None]:
# NOTE(review): the training cell already ends with writer.close(); this
# second call looks redundant -- presumably kept for interrupted runs, confirm.
writer.close()

In [None]:
# Persist the autoencoder's state_dict in the current working directory.
# NOTE(review): a later cell loads './USPS_Feat_1000_lr_01', not this file --
# confirm which checkpoint is meant to be used downstream.
torch.save(model.state_dict(), './USPS_Feat_1000_lr_10')

## Model

In [None]:
# Rebuild the network and restore previously saved weights.
# NOTE(review): the save cell writes './USPS_Feat_1000_lr_10', while this
# loads './USPS_Feat_1000_lr_01' -- the file must already exist on disk;
# confirm this is the intended checkpoint.
model = Autoencoder(INPUT_SIZE, ENCODE_DIM)
model.load_state_dict(torch.load('./USPS_Feat_1000_lr_01'))

In [None]:
def to_latent(dataset_loader):
    """Encode every batch of ``dataset_loader`` with the notebook-level ``model``.

    Args:
        dataset_loader: iterable yielding ``(vectors, labels)`` tensor batches.

    Returns:
        tuple: ``(encs, labels)`` where ``encs`` is a NumPy array holding the
        encoder output for all samples in loader order, and ``labels`` is a
        Python list with the matching labels.
    """
    model.eval()
    enc_batches = []
    labels = []

    # Pure inference: no_grad avoids building an autograd graph per batch.
    with torch.no_grad():
        for vec, label in dataset_loader:
            _, enc = model(vec.cpu())
            enc_batches.append(enc)
            labels.extend(label.tolist())

    # Concatenate once at the end instead of re-copying the growing tensor on
    # every batch; an empty loader still yields an empty tensor as before.
    encs = torch.cat(enc_batches) if enc_batches else torch.Tensor([])
    return encs.detach().numpy(), labels

In [None]:
# Latent codes and labels for the training set. train_loader shuffles, so the
# row order differs from X_train; encs and labels stay aligned with each other.
encs, labels = to_latent(train_loader)

In [None]:
# One row per sample: the latent features followed by the label as last column.
latent_df = pd.DataFrame(data=np.c_[encs, labels])

In [None]:
# Export the latent table as a tab-separated, UTF-8 encoded file.
latent_df.to_csv('../data/usps_latent.csv', sep='\t', encoding='utf-8')

In [None]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

In [None]:
# 7-fold cross-validation of an SVM trained on the latent features.
clf = svm.SVC(gamma='auto')
latent_scores = cross_val_score(estimator=clf, X=encs, y=labels, cv=7)

In [None]:
# Mean and standard deviation of the latent-feature CV scores, followed by the
# individual fold scores as the cell output.
print(latent_scores.mean(), latent_scores.std())
latent_scores

In [None]:
# Same 7-fold cross-validation, this time on the raw input features.
clf = svm.SVC(gamma='auto')
raw_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=7)

In [None]:
# Mean and standard deviation of the raw-feature CV scores, followed by the
# individual fold scores as the cell output.
print(raw_scores.mean(), raw_scores.std())
raw_scores

In [None]:
# Latent codes and labels for the samples served by test_loader.
test_encs, test_labels = to_latent(test_loader)

In [None]:
# Fit a fresh SVM on the latent codes of the training data.
clf = svm.SVC(gamma='auto')
clf.fit(encs, labels)

In [None]:
# Score of the latent-feature SVM on the codes obtained from test_loader.
clf.score(test_encs, test_labels)

In [None]:
# Baseline: the same SVM configuration fit directly on the raw features.
clf = svm.SVC(gamma='auto')
clf.fit(X_train, y_train)

In [None]:
# Score of the raw-feature SVM on the test arrays loaded from the HDF5 file.
clf.score(X_test, y_test)

## Model selection

In [None]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from skorch import NeuralNet

In [None]:
# Grid search over the SVM regularisation strength C on the raw features.
clf = svm.SVC(gamma='auto')
param_grid = {
    'svm__C': [0.1, 1, 10, 100]
}
ppl = Pipeline([
    ('svm', clf)
])
# `iid` is intentionally not passed: the argument was deprecated and then
# removed from GridSearchCV, where it raises TypeError. Omitting it matches
# the second grid search in this notebook and the modern behaviour
# (plain mean of the per-fold scores).
search = GridSearchCV(ppl,
                      param_grid,
                      cv=StratifiedKFold(n_splits=7),
                      return_train_score=False,
                      verbose=10)

In [None]:
# Run the grid search over C on the raw training features.
search.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it.
best_cv_score = search.best_score_
print("Best parameter (CV score=%0.3f):" % best_cv_score)
print(search.best_params_)

In [None]:
# Score of the best pipeline found by the search on the raw test arrays.
search.best_estimator_.score(X_test, y_test)

In [None]:
class AutoencoderPipeline(NeuralNet):
    """skorch ``NeuralNet`` wrapper that exposes the autoencoder as a transformer.

    The wrapped module is expected to return a ``(decoded, encoded)`` pair;
    the loss is computed on the decoded part and ``transform`` returns the
    encoded part.
    """

    def get_loss(self, y_pred, y_true, *args, **kwargs):
        """Compute the criterion on the decoded output only."""
        reconstruction, _latent = y_pred
        return super().get_loss(reconstruction, y_true, *args, **kwargs)

    def fit(self, X, y=None, **fit_params):
        """Train the net to reproduce ``X``; ``y`` is accepted but ignored."""
        # Re-initialise unless a warm start on an already initialised net
        # was requested.
        if not (self.warm_start and self.initialized_):
            self.initialize()

        # The input doubles as the training target.
        self.partial_fit(X, X, **fit_params)
        return self

    def transform(self, X):
        """Return the encoded representation of ``X`` as a NumPy array."""
        _reconstruction, latent = super().forward(X)
        return latent.numpy()

In [None]:
# Autoencoder feature extractor for the pipeline. The `module__*` and
# `optimizer__*` keywords are routed to the module and optimizer constructors.
autoenc = AutoencoderPipeline(
    module=Autoencoder,
    module__input_size=256,
    module__encode_dim=(64, 16),
    criterion=nn.MSELoss,
    optimizer=torch.optim.SGD,
    optimizer__lr=1,
    optimizer__momentum=0.9,
    max_epochs=100,
    verbose=0,
)

In [None]:
clf = svm.SVC(gamma='auto')

# Wider search space, kept for reference; the search below uses `best_grid`.
# (A learning-rate axis, 'autofeat__optimizer__lr': [0.1, 1], is disabled.)
param_grid = dict(
    autofeat__module__encode_dim=[(128, 64), (128, 32), (96, 24), (64, 16), (64, 8)],
    svm__C=[10, 100],
)

# Reduced grid that is actually searched.
best_grid = dict(
    autofeat__module__encode_dim=[(128, 64), (128, 32)],
    svm__C=[10, 100],
)

# Autoencoder features feeding an SVM classifier.
ppl = Pipeline(steps=[('autofeat', autoenc), ('svm', clf)])

gs = GridSearchCV(estimator=ppl,
                  param_grid=best_grid,
                  cv=StratifiedKFold(n_splits=7),
                  verbose=10)

In [None]:
# Fit the autoencoder + SVM pipeline for every candidate in best_grid,
# using the 7-fold stratified split configured on `gs`.
gs.fit(X_train, y_train)

In [None]:
from dask.distributed import Client
client = Client()

# `sklearn.externals.joblib` only exists in older scikit-learn releases; fall
# back to the standalone `joblib` package when the vendored path is gone.
# The vendored path is tried first so that old installations keep using the
# same joblib instance as scikit-learn itself.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Re-run the grid search with joblib dispatching work to the dask cluster.
with joblib.parallel_backend('dask'):
    gs.fit(X_train, y_train)