In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statistics
import csv
import copy

from math import log

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

from hanMaskingPackage import weight_perc
import warnings
warnings.filterwarnings('ignore')

In [2]:
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# device = "cpu"

print(device)

cuda:0


In [3]:
k_folds = 5
num_of_epoch = 100
output_size = 1
binary_classification = True
torch.manual_seed(42)
np.random.seed(42)

<torch._C.Generator at 0x7fa828175d30>

In [4]:
hidden_Layer_1 = 256
hidden_Layer_2 = 128
hidden_Layer_3 = 64

In [5]:
def normalize_dataset(train_df, test_df, y_column_name):
    y_train = train_df[y_column_name]
    X_train = train_df.drop([y_column_name], axis=1)

    y_test = test_df[y_column_name]
    X_test = test_df.drop([y_column_name], axis=1)

    min_max_scaler = preprocessing.MinMaxScaler()
    scale = min_max_scaler.fit(X_train)
    
    x_train_transformed = scale.transform(X_train)
    x_test_transformed = scale.transform(X_test)

    train_df_standardized = pd.DataFrame(x_train_transformed, columns = X_train.columns)
    test_df_standardized = pd.DataFrame(x_test_transformed, columns = X_test.columns)

    train_df = pd.concat([train_df_standardized.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
    test_df = pd.concat([test_df_standardized.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

    return train_df, test_df

In [6]:
def standardize_dataset(train_df, test_df, y_column_name):

    y_train = train_df[y_column_name]
    X_train = train_df.drop([y_column_name], axis=1)

    y_test = test_df[y_column_name]
    X_test = test_df.drop([y_column_name], axis=1)

    # standardize
    sc = StandardScaler()
    scale = sc.fit(X_train)
    
    x_train_transformed = scale.transform(X_train)
    x_test_transformed = scale.transform(X_test)

    train_df_standardized = pd.DataFrame(x_train_transformed, columns = X_train.columns)
    
    test_df_standardized = pd.DataFrame(x_test_transformed, columns = X_test.columns)

    train_df = pd.concat([train_df_standardized.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
    test_df = pd.concat([test_df_standardized.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

    return train_df, test_df

In [7]:
#Read files
def file_reader(file_path):
    '''Input = file path (str)
       Output = numpy array of items in files
    '''
    
    data = []
    with open(file_path) as f:
        reader = csv.reader(f, delimiter='\n')
        for row in reader:
            for x in row:
                x=x.split(' ')
                example = []
                for item in x:
                    if item:
                        item = int(item) #convert to int
                        example.append(item)
                data.append(example)
        data = np.asarray(data)
    return data

In [8]:
## arcene
dbName = 'arcene'

arcene_train_X = file_reader('../hd-datasets/ARCENE/arcene_train.data')
arcene_test_X = file_reader('../hd-datasets/ARCENE/arcene_valid.data')

arcene_train_y = file_reader('../hd-datasets/ARCENE/arcene_train.labels')
arcene_train_y = np.ravel(arcene_train_y)
arcene_test_y = file_reader('../hd-datasets/ARCENE/arcene_valid.labels')
arcene_test_y = np.ravel(arcene_test_y)

arcene_train = np.column_stack( (arcene_train_X,arcene_train_y) )
arcene_test = np.column_stack( (arcene_test_X,arcene_test_y) )
arcene = np.row_stack( (arcene_train,arcene_test) )

data_df = pd.DataFrame.from_records(arcene)
y_column_name = 10000

le = LabelEncoder()
data_df[y_column_name] = le.fit_transform(data_df[y_column_name])

In [9]:
data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
0,0,71,0,95,0,538,404,20,0,0,...,570,86,0,36,0,80,0,0,524,1
1,0,41,82,165,60,554,379,0,71,0,...,605,69,7,473,0,57,0,284,423,0
2,0,0,1,40,0,451,402,0,0,0,...,593,28,0,24,0,90,0,34,508,1
3,0,56,44,275,14,511,470,0,0,0,...,600,0,26,86,0,102,0,0,469,1
4,105,0,141,348,0,268,329,0,0,1,...,0,0,0,0,190,301,0,0,354,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,24,73,0,436,92,400,0,0,139,261,...,540,0,86,130,365,58,17,3,37,0
196,11,58,50,332,109,393,122,0,75,134,...,355,156,77,26,277,265,0,36,261,0
197,93,32,137,319,0,264,231,21,0,0,...,9,0,0,0,244,309,0,276,312,1
198,119,12,198,339,0,289,410,0,0,4,...,0,37,0,0,256,402,0,0,350,1


In [10]:
def average_multiple_lists(multiple_lists):
    data = np.array(multiple_lists)
    return np.average(data, axis=0), np.std(data,axis=0)

In [11]:
# basic auto encoder with three layers
class Basic_DNN_3(nn.Module):

    def __init__(self, input_size, output_size = 1):

        hidden_L_1 = hidden_Layer_1
        hidden_L_2 = hidden_Layer_2
        hidden_L_3 = hidden_Layer_3
        

        super().__init__()        
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_L_1),
            nn.ReLU(),
            nn.Linear(hidden_L_1, hidden_L_2),
            nn.ReLU(),
            # nn.Dropout(0.20),
            nn.Linear(hidden_L_2, hidden_L_3),
            nn.ReLU(),
            # nn.Dropout(0.20),
            nn.Linear(hidden_L_3, output_size),
            nn.ReLU(),
            # nn.Dropout(0.20)
        )

        if binary_classification:
            self.activation = nn.Sigmoid()

    # fine-tuning
    def forward(self, x):
        intermediate = self.encoder(x)
        if binary_classification:
            return self.activation(intermediate)
        else:
            # no activation function is used for multiclass classification
            # cross entrophy loss automatically applies softmax in pytorch
            return intermediate

In [12]:
model_compressions = []

def calculate_model_compression(model):
    perc_weight = []
    state_dict = model.state_dict()

    for name, param in state_dict.items():

        # ignore biases and the decoder weights
        if not "weight" in name or "decoder" in name:
            continue

        W =  param.cpu().numpy()
        cnt_zero = len(np.ravel(W))-np.count_nonzero(W)
        perc = (( cnt_zero)*100)/len(np.ravel(W))
        perc_weight.append(perc)
        param.requires_grad = False

    model_compressions.append(sum(perc_weight)/len(perc_weight))
    return model
    # print('Achieved model compression: ',sum(perc_weight)/len(perc_weight), '%')

In [13]:
def weight_perturbation(model, epoch, perc_weight, threshold, Wold_dict):

    state_dict = model.state_dict()
    

    for name, param in state_dict.items():

        # ignore biases and the decoder weights
        if not "weight" in name or "decoder" in name:
            continue


        W =  param.cpu().numpy()
        if name not in Wold_dict:
            Wold = np.ones(W.shape)
        else:
            Wold = Wold_dict[name]

        Wold, perc = weight_perc(Wold,W,threshold)
        Wold_dict[name] = Wold
        perc_weight.append(perc)
        wm = np.multiply(Wold,W)
        wm = torch.from_numpy(wm)
        state_dict[name].copy_(wm)

        # print('The average of weights perturbation: ',sum(perc_weight)/len(perc_weight), '%')

    return perc_weight, Wold_dict

In [14]:
def model_finetuning(model, train_tensor, num_of_epoch, train_x, train_y, test_x, test_y, threshold):

    model = model.to(device)

    finetuning_loss = []
    train_accuracy_scores = []
    test_accuracy_scores = []

    data_loader = torch.utils.data.DataLoader(dataset=train_tensor,
                                        batch_size=64,
                                        shuffle=True)

    criterion = nn.CrossEntropyLoss()
    if binary_classification:
        criterion = nn.BCELoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    minimum_loss = None

    limit_counter = 0
    for epoch in range(num_of_epoch):
        model = model.train()
        for train,target in data_loader:
            optimizer.zero_grad()

            # to make sure all variables are float type
            train = train.float()
            target = target.float()

            prediction = model(train)
            # target = target.unsqueeze(1)
            classification_loss = criterion(prediction, target.unsqueeze(1))
            optimizer.zero_grad()
            classification_loss.backward()
            optimizer.step()

            
        finetuning_loss.append(classification_loss.item())

        # train accuracy after each epoch
        model = model.eval()
        prediction_train = model(train_x)
        prediction_train = prediction_train.flatten()
        train_accuracy = accuracy_score(train_y.cpu().detach().numpy(),np.round(prediction_train.cpu().detach().numpy()))
        train_accuracy_scores.append(train_accuracy)

        # test accuracy after each epoch
        prediction_test = model(test_x)
        prediction_test = prediction_test.flatten()
        test_accuracy = accuracy_score(test_y.cpu().detach().numpy(),np.round(prediction_test.cpu().detach().numpy()))
        test_accuracy_scores.append(test_accuracy)
        
    # perturbation variables
    perc_weight = []
    Wold_dict = {}
    perc_weight, Wold_dict = weight_perturbation(model, epoch, perc_weight, threshold, Wold_dict)


    # load and return the model with minimum loss at the end
    model = calculate_model_compression(model)
    
    return model, finetuning_loss, train_accuracy_scores ,test_accuracy_scores, prediction

In [15]:
def convert_df_to_tensor(df):
    y_train = df[y_column_name]
    X_train = df.drop([y_column_name], axis=1)
    y_tensor = torch.tensor(y_train.values)
    x_tensor = torch.tensor(X_train.values)
    x_tensor = x_tensor.to(device)
    y_tensor = y_tensor.to(device)

    # commented out normalization here because data is standardized
    # x_tensor_norm = torch.nn.functional.normalize(x_tensor, p=2.0, dim=1, eps=1e-12, out=None)
    
    dataset_tensor = data_utils.TensorDataset(x_tensor, y_tensor)
    return x_tensor, y_tensor, dataset_tensor

In [16]:
def ae_DNN_training(model_name, dataset, num_of_epoch, k_folds, threshold):

    kfold = KFold(n_splits = k_folds, random_state= 42, shuffle = True)

    finetuning_loss_fold = []
    train_accuracy_fold = []
    test_accuracy_fold = []
    classification_accuracy_fold = []
    f1_fold = []
    
    fold_counter = 0
    # run k fold in loop
    for train, test in list(kfold.split(dataset)):

        fold_counter+=1

        # divide the data for train and test
        train_df = dataset.iloc[train]
        test_df =  dataset.iloc[test]

        train_df, test_df = standardize_dataset(train_df, test_df, y_column_name)
        # train_df= normalize_dataset(train_df, y_column_name)
        # test_df= normalize_dataset(test_df, y_column_name)

        # convert dataframe into train and test tensor
        train_x, train_y, train_tensor = convert_df_to_tensor(train_df)
        test_x, test_y, test_tensor = convert_df_to_tensor(test_df)

        x_dim = train_x.shape[1]
        
        model = Basic_DNN_3(input_size=x_dim)

        model, finetuning_loss, train_accuracy, test_accuracy, prediction = model_finetuning(model, train_tensor, num_of_epoch, train_x.float(), train_y.float(), test_x.float(), test_y.float(), threshold)

        finetuning_loss_fold.append(finetuning_loss)
        train_accuracy_fold.append(train_accuracy)
        test_accuracy_fold.append(test_accuracy)

        # calculate final test accuracy, confusion matrix etc
        test_x = test_x.float()
        prediction_tensor = model(test_x)
        classification_accuracy_fold.append(accuracy_score(test_y.cpu().detach().numpy(),np.round(prediction_tensor.cpu().detach().numpy().flatten())))
        f1_fold.append(f1_score(test_y.cpu().detach().numpy(),np.round(prediction_tensor.cpu().detach().numpy().flatten()), average='weighted'))
        report = classification_report(test_y.cpu().detach().numpy(),np.round_(prediction_tensor.cpu().detach().numpy().flatten()), output_dict=True)
        report_df = pd.DataFrame(report).transpose()
        report_df.to_csv(dbName+"_"+model_name+"_"+str(fold_counter), index= True)
            
        finetuning_loss = average_multiple_lists(finetuning_loss_fold)
        train_accuracy_curve = average_multiple_lists(train_accuracy_fold)
        test_accuracy_curve = average_multiple_lists(test_accuracy_fold)

    classification_accuracy_mean = np.round(statistics.mean(classification_accuracy_fold),4)
    classification_accuracy_std = np.round(statistics.pstdev(classification_accuracy_fold),4)
    print("test accuracy:",classification_accuracy_mean," (",classification_accuracy_std,")")
    
    f1_mean = np.round(statistics.mean(f1_fold),4)
    f1_std = np.round(statistics.pstdev(f1_fold),4)
    print("test f1:",f1_mean," (",f1_std,")")
    
    return finetuning_loss, train_accuracy_curve, test_accuracy_curve, classification_accuracy_mean, f1_mean, f1_std

In [17]:
def shaded_plot(pair, label, x_label, y_label, filename):
    mean = pair[0]
    std = pair[1]
    plt.figure(figsize=(9, 6))
    x = np.arange(len(mean))
    plt.plot(x, mean, label=label)
    plt.fill_between(x, mean - std, mean + std, alpha=0.2)
    plt.legend(prop={'size': 13})
    plt.xlabel(x_label,fontsize=13)
    plt.ylabel(y_label,fontsize=13)
    plt.savefig(filename)
    plt.show()


def shaded_plot_multiple(pairs, labels, x_label, y_label, filename):
    
    plt.figure(figsize=(9, 6))

    for pair, label in zip(pairs, labels):
        mean = pair[0]
        std = pair[1]
        x = np.arange(len(mean))
        plt.plot(x, mean, label=label)
        plt.fill_between(x, mean - std, mean + std, alpha=0.2)
    
    plt.legend(prop={'size': 13})
    plt.xlabel(x_label,fontsize=13)
    plt.ylabel(y_label,fontsize=13)
    plt.savefig(filename)
    plt.show()

In [18]:
import pickle
dbName = "dnn/"+dbName +"/"
print(dbName)

info = []
basic_f1s = []
basic_compressions = []
basic_stds = []

for threshold in range(2, 80,2):
    print("Threshold: ",threshold)
    #training basic AE
    model_compressions = []
    BFT_Error, basic_train_accuracy, basic_test_accuracy, basic_accuracy, basic_f1, basic_std = ae_DNN_training('basic',data_df, num_of_epoch, k_folds, threshold)
    print('Basic average compressions: ', np.mean(model_compressions))
    basic_f1s.append(basic_f1)
    basic_compressions.append(np.mean(model_compressions))
    basic_stds.append(basic_std)

dnn/arcene/
Threshold:  2
test accuracy: 0.65  ( 0.0791 )
test f1: 0.5856  ( 0.1297 )
Basic average compressions:  2.2822343750000003
Threshold:  4
test accuracy: 0.685  ( 0.0644 )
test f1: 0.6766  ( 0.0795 )
Basic average compressions:  4.173069091796876
Threshold:  6
test accuracy: 0.665  ( 0.0682 )
test f1: 0.6024  ( 0.1392 )
Basic average compressions:  6.064668701171875
Threshold:  8
test accuracy: 0.565  ( 0.1722 )
test f1: 0.5462  ( 0.1854 )
Basic average compressions:  8.34613232421875
Threshold:  10
test accuracy: 0.525  ( 0.0707 )
test f1: 0.4343  ( 0.1253 )
Basic average compressions:  10.236970947265625
Threshold:  12
test accuracy: 0.55  ( 0.1214 )
test f1: 0.4474  ( 0.1954 )
Basic average compressions:  12.125520751953125
Threshold:  14
test accuracy: 0.45  ( 0.0758 )
test f1: 0.3325  ( 0.1133 )
Basic average compressions:  14.016361328125
Threshold:  16
test accuracy: 0.455  ( 0.1077 )
test f1: 0.3478  ( 0.1226 )
Basic average compressions:  16.297826904296876
Threshold:

In [20]:
import pickle
with open(dbName+'compressions.pkl', 'wb') as f:
    pickle.dump(basic_f1s, f)
    pickle.dump(basic_compressions, f)
    pickle.dump(basic_stds, f)