In [None]:
# import packages

import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

import torch
from torch import nn, optim
from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring 
import matplotlib.pyplot as plt
import pandas as pd
import time

In [None]:
# Global configuration: RNG seed, compute device name and default plot font size.
seed = 7
device = 'cpu'

# Seed the NumPy and PyTorch global random number generators with the same value.
torch.manual_seed(seed)
np.random.seed(seed)

# Default font size for matplotlib figures created from here on.
plt.rcParams['font.size'] = 14

In [None]:
# define neural network class, code adapted from https://github.com/jlm429/pyperch/blob/master/pyperch/neural/backprop_nn.py

class BackpropModule(nn.Module):
    """
    Fully connected feed-forward network built from a stack of nn.Linear layers.

    The stack held in `self.layers` is, in order: one input projection
    (input_dim -> hidden_units), `hidden_layers` square layers
    (hidden_units -> hidden_units) and one output layer
    (hidden_units -> output_dim).
    """

    def __init__(self, input_dim, output_dim, hidden_units=10, hidden_layers=1,
                 dropout_percent=0, activation=nn.ReLU(), output_activation=nn.Softmax(dim=-1)):
        """
        Build the layer stack.

        PARAMETERS:

        input_dim {int}:
            Number of input features. Must be greater than 0.

        output_dim {int}:
            Number of classes / size of the output. Must be greater than 0.

        hidden_units {int}:
            Width shared by every hidden layer.

        hidden_layers {int}:
            Number of hidden_units -> hidden_units layers placed after the
            input projection, so the network applies `hidden_layers + 1`
            activated layers before the output layer.

        dropout_percent {float}:
            Probability of an element to be zeroed by dropout.

        activation {torch.nn.modules.activation}:
            Activation applied after every layer except the output layer.

        output_activation {torch.nn.modules.activation}:
            Activation applied to the output layer. The default is a softmax
            over the last dimension, so the module returns probabilities.
            NOTE(review): a criterion that expects raw logits (e.g.
            nn.CrossEntropyLoss) would then receive already-softmaxed values;
            confirm that pairing is intended, or pass nn.Identity().
        """
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_units = hidden_units
        self.hidden_layers = hidden_layers
        self.device = "cpu"
        self.dropout = nn.Dropout(dropout_percent)
        self.activation = activation
        self.output_activation = output_activation

        # (in_features, out_features) for every linear layer, input -> output.
        # The layers are instantiated in this order, which also fixes the order
        # in which their weights are randomly initialised.
        layer_shapes = (
            [(self.input_dim, self.hidden_units)]
            + [(self.hidden_units, self.hidden_units)] * self.hidden_layers
            + [(self.hidden_units, self.output_dim)]
        )
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out, device=self.device) for n_in, n_out in layer_shapes]
        )

    def forward(self, X, **kwargs):
        """
        Run the forward pass.

        PARAMETERS:

        X {torch.tensor}:
            NN input data. Shape (batch_size, input_dim).

        RETURNS:

        X {torch.tensor}:
            NN output data. Shape (batch_size, output_dim).
        """
        # Input projection: linear -> activation -> dropout.
        hidden = self.dropout(self.activation(self.layers[0](X)))
        # Hidden layers occupy indices 1 .. hidden_layers of the stack.
        for depth in range(1, self.hidden_layers + 1):
            hidden = self.dropout(self.activation(self.layers[depth](hidden)))
        # The output layer sits right after the hidden layers; no dropout here.
        return self.output_activation(self.layers[self.hidden_layers + 1](hidden))

In [None]:
# Load the Dry Bean spreadsheet; the last column is used as the target and
# every other column as a feature.
bean = pd.read_excel('.//datasets//bean//Dry_Bean_Dataset.xlsx')
X_bean, y_bean = bean.iloc[:, :-1], bean.iloc[:, -1]

# Encode the class labels as integer codes.
encoder = preprocessing.LabelEncoder()
y_bean = encoder.fit_transform(y_bean)

# Stratified 80/20 train/test split, seeded for repeatability.
X_bean_train, X_bean_test, y_bean_train, y_bean_test = model_selection.train_test_split(
    X_bean, y_bean, test_size=0.2, random_state=seed, stratify=y_bean)

# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler_bean = preprocessing.StandardScaler()
X_bean_train = scaler_bean.fit_transform(X_bean_train)
X_bean_test = scaler_bean.transform(X_bean_test)

# Cast to float32 features / int64 labels for the PyTorch-based classifier.
X_train, X_test = (part.astype(np.float32) for part in (X_bean_train, X_bean_test))
y_train, y_test = (part.astype(np.int64) for part in (y_bean_train, y_bean_test))

In [None]:
# baseline results: train the network on the full standardised feature set,
# save the per-epoch history and evaluate on the held-out test split.

num_features = X_train.shape[1]
num_classes = len(np.unique(y_train))

# Network and optimiser settings for the baseline run.
base_params = dict(
    module=BackpropModule,
    module__input_dim=num_features,
    module__output_dim=num_classes,
    module__hidden_units=16,
    module__hidden_layers=1,
    module__dropout_percent=0.1,
    max_epochs=100,
    verbose=0,
    # Recorded under the name 'train_acc', but the score is macro-F1 on the training data.
    callbacks=[EpochScoring(scoring='f1_macro', name='train_acc', on_train=True)],
    criterion=nn.CrossEntropyLoss,
    optimizer=optim.SGD,
    optimizer__weight_decay=0,
    optimizer__momentum=0.9,
    lr=0.01,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)
NN_base = NeuralNetClassifier(**base_params)

# Wall-clock time of the fit.
t1 = time.time()
NN_base.fit(X_train, y_train)
t2 = time.time()
print('Baseline training time is: {}'.format(t2-t1))

# One row per epoch, one column per tracked quantity.
history_columns = ('train_loss', 'valid_loss', 'train_acc', 'valid_acc')
results_base = pd.DataFrame({name: NN_base.history[:, name] for name in history_columns})
results_base.to_excel('NN_base.xlsx')

# Test-set predictions: the most probable class for each sample.
y_test_prob = NN_base.predict_proba(X_test)
y_test_pred = y_test_prob.argmax(axis=1)

NN_test_acc = metrics.accuracy_score(y_test, y_test_pred)
NN_test_recall = metrics.recall_score(y_test, y_test_pred, average='macro')
NN_test_f1 = metrics.f1_score(y_test, y_test_pred, average='macro')
print('Neural network baseline, test accuracy is: {:.3f}'.format(NN_test_acc))
print('Neural network baseline, test recall is: {:.3f}'.format(NN_test_recall))
print('Neural network baseline, test f1 score is: {:.3f}'.format(NN_test_f1))

# Confusion matrix on the test split, saved as an image.
cm = metrics.confusion_matrix(y_test, y_test_pred)
cm_plot = metrics.ConfusionMatrixDisplay(cm).plot()
cm_plot.figure_.savefig('NN_base_conf.png', dpi=500)

In [None]:
# baseline learning curve: macro-F1 versus training-set size, 5-fold stratified CV
num_features = X_train.shape[1]
num_classes = len(np.unique(y_train))

# Fresh, unfitted classifier with the baseline hyper-parameters.
lc_params = dict(
    module=BackpropModule,
    module__input_dim=num_features,
    module__output_dim=num_classes,
    module__hidden_units=16,
    module__hidden_layers=1,
    module__dropout_percent=0.1,
    max_epochs=100,
    verbose=0,
    callbacks=[EpochScoring(scoring='f1_macro', name='train_acc', on_train=True)],
    criterion=nn.CrossEntropyLoss,
    optimizer=optim.SGD,
    optimizer__weight_decay=0,
    optimizer__momentum=0.9,
    lr=0.01,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)
NN_base = NeuralNetClassifier(**lc_params)

folds = StratifiedKFold(n_splits=5)
# 20 evenly spaced training fractions from 2.5% to 100% of each CV training fold.
train_sizes, train_scores, val_scores = learning_curve(
    NN_base, X_train, y_train,
    train_sizes=np.linspace(0.025, 1, 20),
    cv=folds,
    scoring='f1_macro',
)

# Aggregate across folds (axis 1) for every training size.
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_scores_mean, val_scores_std = val_scores.mean(axis=1), val_scores.std(axis=1)

results_lc = pd.DataFrame({
    'train_size': train_sizes,
    'train_score_mean': train_scores_mean,
    'train_score_std': train_scores_std,
    'val_score_mean': val_scores_mean,
    'val_score_std': val_scores_std,
})
results_lc.to_excel('NN_base_lc.xlsx')

In [None]:
# DR results: for each dimensionality-reduction method, project the data, train
# the same network on the projection, evaluate on the test split and compute a
# learning curve.

n_components = {'PCA': 5, 'ICA': 6, 'RP': 6}
folds = StratifiedKFold(n_splits=5)

# One factory per method; each builds an unfitted reducer for a given number of components.
DR_factories = {
    'PCA': lambda k: PCA(n_components=k, random_state=seed),
    'ICA': lambda k: FastICA(n_components=k, random_state=seed, max_iter=2000),
    'RP': lambda k: GaussianRandomProjection(n_components=k, random_state=seed),
}
history_columns = ('train_loss', 'valid_loss', 'train_acc', 'valid_acc')

for DR in ['PCA', 'ICA', 'RP']:
    DR_algo = DR_factories[DR](n_components[DR])

    # The DR time covers fitting the reducer and projecting the training set;
    # projecting the test set is deliberately outside the timed region.
    t1 = time.time()
    DR_algo.fit(X_train)
    X_DR = DR_algo.transform(X_train)
    t2 = time.time()
    X_DR_test = DR_algo.transform(X_test)

    num_features = X_DR.shape[1]
    num_classes = len(np.unique(y_train))

    # Same architecture and optimiser as the baseline; only the input width differs.
    dr_params = dict(
        module=BackpropModule,
        module__input_dim=num_features,
        module__output_dim=num_classes,
        module__hidden_units=16,
        module__hidden_layers=1,
        module__dropout_percent=0.1,
        max_epochs=100,
        verbose=0,
        callbacks=[EpochScoring(scoring='f1_macro', name='train_acc', on_train=True)],
        criterion=nn.CrossEntropyLoss,
        optimizer=optim.SGD,
        optimizer__weight_decay=0,
        optimizer__momentum=0.9,
        lr=0.01,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
    )
    NN_DR = NeuralNetClassifier(**dr_params)

    t3 = time.time()
    NN_DR.fit(X_DR, y_train)
    t4 = time.time()

    print('NN with {} training time is: {} + {} (DR time + NN fit time)'.format(DR, t2-t1, t4-t3))

    # Per-epoch training history, one spreadsheet per method.
    results_DR = pd.DataFrame({name: NN_DR.history[:, name] for name in history_columns})
    results_DR.to_excel('NN_{}.xlsx'.format(DR))

    # Test-set evaluation on the projected test data.
    y_test_prob = NN_DR.predict_proba(X_DR_test)
    y_test_pred = y_test_prob.argmax(axis=1)
    NN_test_acc = metrics.accuracy_score(y_test, y_test_pred)
    NN_test_recall = metrics.recall_score(y_test, y_test_pred, average='macro')
    NN_test_f1 = metrics.f1_score(y_test, y_test_pred, average='macro')
    print('Neural network after {}, test accuracy is: {:.3f}'.format(DR, NN_test_acc))
    print('Neural network after {}, test recall is: {:.3f}'.format(DR, NN_test_recall))
    print('Neural network after {}, test f1 score is: {:.3f}'.format(DR, NN_test_f1))

    cm = metrics.confusion_matrix(y_test, y_test_pred)
    cm_plot = metrics.ConfusionMatrixDisplay(cm).plot()
    cm_plot.figure_.savefig('NN_{}_conf.png'.format(DR), dpi=500)

    # Learning curve on the already-projected training data (the reducer above
    # was fitted on the whole training set, not per CV fold).
    train_sizes, train_scores, val_scores = learning_curve(
        NN_DR, X_DR, y_train,
        train_sizes=np.linspace(0.025, 1, 20),
        cv=folds,
        scoring='f1_macro',
    )

    train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
    val_scores_mean, val_scores_std = val_scores.mean(axis=1), val_scores.std(axis=1)

    results_lc = pd.DataFrame({
        'train_size': train_sizes,
        'train_score_mean': train_scores_mean,
        'train_score_std': train_scores_std,
        'val_score_mean': val_scores_mean,
        'val_score_std': val_scores_std,
    })
    results_lc.to_excel('NN_{}_lc.xlsx'.format(DR))

In [None]:
# baseline alt ("covar"): same network as the baseline, trained on the feature
# set that remains after dropping a hand-picked list of redundant columns.

redundant = ['Perimeter', 'MajorAxisLength', 'Area', 'ConvexArea', 'EquivDiameter', 'MinorAxisLength', 'Eccentricity', 'Compactness', 'ShapeFactor2', 'ShapeFactor3']
# X_train / X_test are NumPy arrays, so translate the column names into
# positional indices using the original DataFrame's column order.
col_idx = [X_bean.columns.get_loc(c) for c in redundant]
retain = np.delete(np.arange(X_train.shape[1]), col_idx)
X_train_redu = X_train[:, retain]
X_test_redu = X_test[:, retain]

num_features = X_train_redu.shape[1]
num_classes = len(np.unique(y_train))

NN_alt = NeuralNetClassifier(
    module=BackpropModule,
    module__input_dim=num_features,
    module__output_dim=num_classes,
    module__hidden_units=16,
    module__hidden_layers=1,
    module__dropout_percent=0.1,
    max_epochs=100,
    verbose=0,
    # Recorded under the name 'train_acc', but the score is macro-F1 on the training data.
    callbacks=[EpochScoring(scoring='f1_macro', name='train_acc', on_train=True),],
    criterion=nn.CrossEntropyLoss,
    optimizer=optim.SGD,
    optimizer__weight_decay=0,
    optimizer__momentum=0.9,
    lr=0.01,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

t1 = time.time()
NN_alt.fit(X_train_redu, y_train)
t2 = time.time()

# The messages below name the reduced-feature model explicitly so its output
# cannot be mistaken for the full-feature baseline's.
print('Reduced-feature (covar) baseline training time is: {}'.format(t2-t1))

# Stored under its own name so the full-feature baseline's history frame is
# not overwritten by this cell.
results_alt = {'train_loss': NN_alt.history[:, 'train_loss'], 'valid_loss': NN_alt.history[:, 'valid_loss'], 'train_acc': NN_alt.history[:, 'train_acc'], 'valid_acc': NN_alt.history[:, 'valid_acc']}
results_alt = pd.DataFrame(results_alt)
results_alt.to_excel('NN_alt.xlsx')

y_test_prob = NN_alt.predict_proba(X_test_redu)
y_test_pred = np.argmax(y_test_prob, axis=1)
NN_test_acc = metrics.accuracy_score(y_test, y_test_pred)
NN_test_recall = metrics.recall_score(y_test, y_test_pred, average='macro')
NN_test_f1 = metrics.f1_score(y_test, y_test_pred, average='macro')
print('Neural network reduced-feature (covar) baseline, test accuracy is: {:.3f}'.format(NN_test_acc))
print('Neural network reduced-feature (covar) baseline, test recall is: {:.3f}'.format(NN_test_recall))
print('Neural network reduced-feature (covar) baseline, test f1 score is: {:.3f}'.format(NN_test_f1))

# Learning curve: macro-F1 versus training-set size under 5-fold stratified CV.
folds = StratifiedKFold(n_splits=5)
train_sizes, train_scores, val_scores = learning_curve(NN_alt, X_train_redu, y_train, train_sizes=np.linspace(0.025, 1, 20), cv=folds, scoring='f1_macro')

# Aggregate across folds (axis 1) for every training size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
val_scores_mean = val_scores.mean(axis=1)
val_scores_std = val_scores.std(axis=1)

results_lc = {'train_size': train_sizes, 'train_score_mean': train_scores_mean, 'train_score_std': train_scores_std, 'val_score_mean': val_scores_mean, 'val_score_std': val_scores_std}
results_lc = pd.DataFrame(results_lc)
results_lc.to_excel('NN_alt_lc.xlsx')

In [None]:
# Compare the baseline network with the reduced-feature ("covar") and
# dimensionality-reduced variants, using the spreadsheets written by the
# training cells.
base = pd.read_excel('NN_base.xlsx', index_col=0)
alt = pd.read_excel('NN_alt.xlsx', index_col=0)
pca = pd.read_excel('NN_PCA.xlsx', index_col=0)
ica = pd.read_excel('NN_ICA.xlsx', index_col=0)
rp = pd.read_excel('NN_RP.xlsx', index_col=0)

base_lc = pd.read_excel('NN_base_lc.xlsx', index_col=0)
alt_lc = pd.read_excel('NN_alt_lc.xlsx', index_col=0)
pca_lc = pd.read_excel('NN_PCA_lc.xlsx', index_col=0)
ica_lc = pd.read_excel('NN_ICA_lc.xlsx', index_col=0)
rp_lc = pd.read_excel('NN_RP_lc.xlsx', index_col=0)

# rcParams are read when artists are created, so set the font size before
# building the figure rather than after it.
plt.rcParams.update({'font.size': 14})

fig, ax = plt.subplots(1, 2, figsize=(12, 5.5))
ax = ax.flatten()

# Left panel: per-epoch validation curve of each variant.
ax[0].plot(base.loc[:, 'valid_acc'], label='baseline')
ax[0].plot(alt.loc[:, 'valid_acc'], label='covar')
ax[0].plot(pca.loc[:, 'valid_acc'], label='after PCA')
ax[0].plot(ica.loc[:, 'valid_acc'], label='after ICA')
ax[0].plot(rp.loc[:, 'valid_acc'], label='after RP')
ax[0].set_ylim([0, 1.05])
ax[0].set_xlabel('Epochs')
# 'valid_acc' is the validation accuracy recorded by skorch's classifier; the
# only macro-F1 callback registered by the training cells scores the training
# split. The axis is therefore labelled as accuracy, not F1.
ax[0].set_ylabel('Validation accuracy')
ax[0].legend()

# Right panel: cross-validated macro-F1 as a function of training-set size.
ax[1].plot(base_lc.loc[:, 'train_size'], base_lc.loc[:, 'val_score_mean'], label='baseline')
ax[1].plot(alt_lc.loc[:, 'train_size'], alt_lc.loc[:, 'val_score_mean'], label='covar')
ax[1].plot(pca_lc.loc[:, 'train_size'], pca_lc.loc[:, 'val_score_mean'], label='after PCA')
ax[1].plot(ica_lc.loc[:, 'train_size'], ica_lc.loc[:, 'val_score_mean'], label='after ICA')
ax[1].plot(rp_lc.loc[:, 'train_size'], rp_lc.loc[:, 'val_score_mean'], label='after RP')
ax[1].set_ylim([0.05, 1.05])
ax[1].set_xlabel('Training size')
ax[1].set_ylabel('Validation f1 score')
ax[1].legend()

plt.tight_layout()
# plt.savefig('NN_wDR.png', dpi=500, bbox_inches='tight')

# Convergence heuristic: smooth the validation curve with a rolling mean over
# `window` epochs and report the first epoch at which the smoothed curve
# improves by less than `thresh` relative to the previous epoch.
window = 15
thresh = 0.001

base_window = base['valid_acc'].rolling(window=window).mean()
alt_window = alt['valid_acc'].rolling(window=window).mean()
pca_window = pca['valid_acc'].rolling(window=window).mean()
ica_window = ica['valid_acc'].rolling(window=window).mean()
rp_window = rp['valid_acc'].rolling(window=window).mean()

for case, smoothed in (('Base', base_window), ('Covar', alt_window), ('PCA', pca_window),
                       ('ICA', ica_window), ('RP', rp_window)):
    plateau = smoothed.diff() < thresh
    # idxmax() on an all-False mask returns the first index, which would be
    # reported as a (bogus) convergence epoch; check that a plateau exists.
    if plateau.any():
        print('{} case converges at epoch {}'.format(case, plateau.idxmax()))
    else:
        print('{} case does not converge within {} epochs'.format(case, len(smoothed)))

In [None]:
# Clustering as feature augmentation: fit a clustering model on the training
# features, append the one-hot encoded cluster assignment to the original
# features, then train and evaluate the same network on the augmented data.
n_clusters = {'GMM': 5, 'KM': 5}
folds = StratifiedKFold(n_splits=5)

for C in ['GMM', 'KM']:
    if C == 'GMM':
        gmm = GaussianMixture(n_components=n_clusters['GMM'], random_state=seed, max_iter=500, n_init=25, covariance_type='full', tol=0.0001)
        cluster_model = gmm
    elif C == 'KM':
        km = KMeans(n_clusters=n_clusters['KM'], max_iter=500, random_state=seed, n_init=500)
        cluster_model = km

    # The clustering model is fitted on the training split only.
    cluster_model.fit(X_train)
    clusters = cluster_model.predict(X_train).reshape(-1, 1)
    clusters_test = cluster_model.predict(X_test).reshape(-1, 1)

    # Use ONE encoder for both splits, with the full list of cluster ids given
    # explicitly. Fitting a second encoder on the test assignments would yield
    # fewer or shifted columns whenever a cluster is absent from the test
    # split, misaligning the test features with what the network was trained on.
    onehot_train = preprocessing.OneHotEncoder(categories=[np.arange(n_clusters[C])], sparse_output=False)
    clusters = onehot_train.fit_transform(clusters)
    clusters_test = onehot_train.transform(clusters_test)

    # Cast back to float32 after concatenation for the PyTorch-based classifier.
    X_train_add = np.concatenate([X_train, clusters], axis=1).astype(np.float32)
    X_test_add = np.concatenate([X_test, clusters_test], axis=1).astype(np.float32)

    num_features = X_train_add.shape[1]
    num_classes = len(np.unique(y_train))

    # Same architecture and optimiser as the baseline; only the input width differs.
    NN_C = NeuralNetClassifier(
        module=BackpropModule,
        module__input_dim=num_features,
        module__output_dim=num_classes,
        module__hidden_units=16,
        module__hidden_layers=1,
        module__dropout_percent=0.1,
        max_epochs=100,
        verbose=0,
        # Recorded under the name 'train_acc', but the score is macro-F1 on the training data.
        callbacks=[EpochScoring(scoring='f1_macro', name='train_acc', on_train=True),],
        criterion=nn.CrossEntropyLoss,
        optimizer=optim.SGD,
        optimizer__weight_decay=0,
        optimizer__momentum=0.9,
        lr=0.01,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
    )

    # Only the network fit is timed; the clustering step above is not included.
    t1 = time.time()
    NN_C.fit(X_train_add, y_train)
    t2 = time.time()

    print('NN with {} training time is: {}'.format(C, t2-t1))

    # Per-epoch training history, one spreadsheet per clustering method.
    results_C = {'train_loss': NN_C.history[:, 'train_loss'], 'valid_loss': NN_C.history[:, 'valid_loss'], 'train_acc': NN_C.history[:, 'train_acc'], 'valid_acc': NN_C.history[:, 'valid_acc']}
    results_C = pd.DataFrame(results_C)
    results_C.to_excel('NN_{}.xlsx'.format(C))

    # Test-set evaluation on the augmented test features.
    y_test_prob = NN_C.predict_proba(X_test_add)
    y_test_pred = np.argmax(y_test_prob, axis=1)
    NN_test_acc = metrics.accuracy_score(y_test, y_test_pred)
    NN_test_recall = metrics.recall_score(y_test, y_test_pred, average='macro')
    NN_test_f1 = metrics.f1_score(y_test, y_test_pred, average='macro')
    print('Neural network after {}, test accuracy is: {:.3f}'.format(C, NN_test_acc))
    print('Neural network after {}, test recall is: {:.3f}'.format(C, NN_test_recall))
    print('Neural network after {}, test f1 score is: {:.3f}'.format(C, NN_test_f1))

    cm = metrics.confusion_matrix(y_test, y_test_pred)
    cm_plot = metrics.ConfusionMatrixDisplay(cm)
    cm_plot.plot()
    cm_plot.figure_.savefig('NN_{}_conf.png'.format(C), dpi=500)

    # Learning curve on the augmented training data (the clustering model above
    # was fitted on the whole training set, not per CV fold).
    train_sizes, train_scores, val_scores = learning_curve(NN_C, X_train_add, y_train, train_sizes=np.linspace(0.025, 1, 20), cv=folds, scoring='f1_macro')

    # Aggregate across folds (axis 1) for every training size.
    train_scores_mean = train_scores.mean(axis=1)
    train_scores_std = train_scores.std(axis=1)
    val_scores_mean = val_scores.mean(axis=1)
    val_scores_std = val_scores.std(axis=1)

    results_C_lc = {'train_size': train_sizes, 'train_score_mean': train_scores_mean, 'train_score_std': train_scores_std, 'val_score_mean': val_scores_mean, 'val_score_std': val_scores_std}
    results_C_lc = pd.DataFrame(results_C_lc)
    results_C_lc.to_excel('NN_{}_lc.xlsx'.format(C))

In [None]:
# Compare the baseline network with the cluster-augmented variants, using the
# spreadsheets written by the training cells.
base = pd.read_excel('NN_base.xlsx', index_col=0)
gmm = pd.read_excel('NN_GMM.xlsx', index_col=0)
km = pd.read_excel('NN_KM.xlsx', index_col=0)

base_lc = pd.read_excel('NN_base_lc.xlsx', index_col=0)
gmm_lc = pd.read_excel('NN_GMM_lc.xlsx', index_col=0)
km_lc = pd.read_excel('NN_KM_lc.xlsx', index_col=0)

# rcParams are read when artists are created, so set the font size before
# building the figure rather than after it.
plt.rcParams.update({'font.size': 14})

fig, ax = plt.subplots(1, 2, figsize=(12, 5.5))
ax = ax.flatten()

# Left panel: per-epoch validation curve of each variant.
ax[0].plot(base.loc[:, 'valid_acc'], label='baseline')
ax[0].plot(gmm.loc[:, 'valid_acc'], label='after GMM')
ax[0].plot(km.loc[:, 'valid_acc'], label='after KM')
ax[0].set_ylim([0.05, 1.05])
ax[0].set_xlabel('Epochs')
# 'valid_acc' is the validation accuracy recorded by skorch's classifier; the
# only macro-F1 callback registered by the training cells scores the training
# split. The axis is therefore labelled as accuracy, not F1.
ax[0].set_ylabel('Validation accuracy')
ax[0].legend()

# Right panel: cross-validated macro-F1 as a function of training-set size.
ax[1].plot(base_lc.loc[:, 'train_size'], base_lc.loc[:, 'val_score_mean'], label='baseline')
ax[1].plot(gmm_lc.loc[:, 'train_size'], gmm_lc.loc[:, 'val_score_mean'], label='after GMM')
ax[1].plot(km_lc.loc[:, 'train_size'], km_lc.loc[:, 'val_score_mean'], label='after KM')
ax[1].set_ylim([0.05, 1.05])
ax[1].set_xlabel('Training size')
ax[1].set_ylabel('Validation f1 score')
ax[1].legend()

plt.tight_layout()
# plt.savefig('NN_wC.png', dpi=500, bbox_inches='tight')

# Convergence heuristic: first epoch at which the rolling mean of the
# validation curve improves by less than `thresh`. `window` and `thresh` are
# the values assigned in the DR comparison cell earlier in the notebook.
base_window = base['valid_acc'].rolling(window=window).mean()
gmm_window = gmm['valid_acc'].rolling(window=window).mean()
km_window = km['valid_acc'].rolling(window=window).mean()

for case, smoothed in (('Base', base_window), ('GMM', gmm_window), ('KM', km_window)):
    plateau = smoothed.diff() < thresh
    # idxmax() on an all-False mask returns the first index, which would be
    # reported as a (bogus) convergence epoch; check that a plateau exists.
    if plateau.any():
        print('{} case converges at epoch {}'.format(case, plateau.idxmax()))
    else:
        print('{} case does not converge within {} epochs'.format(case, len(smoothed)))