In [12]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as CountV
from sklearn.feature_selection import VarianceThreshold

import numpy as np
import os
# import pandas as pd
import matplotlib.pyplot as plt
import time, random
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
import datetime
import argparse
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
from ember_utils import *
from ember_model import *
from ember_pjr_utils import *



In [16]:
raw_path = '/home/mr6564/continual_research/AZ_Data/Domain/'
years = ['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']


def _load_npz_arrays(npz_path, keys):
    """Read the arrays stored under ``keys`` from the ``.npz`` archive at ``npz_path``.

    The archive is opened in a ``with`` block so its file handle is released
    as soon as the requested arrays have been read.

    :param npz_path: Path to the ``.npz`` archive.
    :param keys: Names of the arrays to read, in the order they are returned.
    :return: List of arrays, one per key.
    """
    # NOTE(review): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code. Only load archives that come from a trusted source.
    with np.load(npz_path, allow_pickle=True) as archive:
        return [archive[key] for key in keys]


train_keys = ('X_train', 'Y_train', 'Y_tr_family')
test_keys = ('X_test', 'Y_test', 'Y_te_family')

# One list of per-year pieces for each output array. Every list starts with an
# empty float array, so that seed takes part in NumPy's dtype promotion when the
# pieces are joined (e.g. integer arrays come out as float64).
train_parts = [[np.array([])] for _ in train_keys]
test_parts = [[np.array([])] for _ in test_keys]

for year in years:
    tr_file = os.path.join(raw_path, year + '_Domain_AZ_Train.npz')
    te_file = os.path.join(raw_path, year + '_Domain_AZ_Test.npz')

    X_tr_year, Y_tr_year, Y_tr_family_year = _load_npz_arrays(tr_file, train_keys)
    X_te_year, Y_te_year, Y_te_family_year = _load_npz_arrays(te_file, test_keys)

    for parts, year_array in zip(train_parts, (X_tr_year, Y_tr_year, Y_tr_family_year)):
        parts.append(year_array)
    for parts, year_array in zip(test_parts, (X_te_year, Y_te_year, Y_te_family_year)):
        parts.append(year_array)

# Join each group once after the loop instead of re-copying the growing
# arrays on every iteration.
all_X_tr, all_Y_tr, all_Y_tr_family = (np.concatenate(parts) for parts in train_parts)
all_X_te, all_Y_te, all_Y_te_family = (np.concatenate(parts) for parts in test_parts)

#print(len(np.where(Y_tr_family_year == 'goodware')[0]), len(np.where(Y_tr_year == 0)[0]))

In [18]:
from scipy.sparse import lil_matrix

def build_vocabulary(data):
    """Collect every distinct word across all samples and return them sorted.

    :param data: Iterable of samples; each sample is an iterable of words.
    :return: Sorted list of the unique words (sorted so column order is stable).
    """
    unique_words = set()
    for sample in data:
        unique_words.update(sample)
    return sorted(unique_words)

def vectorize_samples(data, vocabulary):
    """Encode each sample as a dense 0/1 presence vector over ``vocabulary``.

    :param data: Sequence of samples; each sample is an iterable of words.
    :param vocabulary: Sequence of words; a word's position is its column index.
    :return: Integer NumPy array of shape ``(len(data), len(vocabulary))`` where
             entry ``[i, j]`` is 1 if ``vocabulary[j]`` occurs in ``data[i]``.
             Words that are not in the vocabulary are ignored.
    """
    column_of = dict(zip(vocabulary, range(len(vocabulary))))
    presence = np.zeros((len(data), len(vocabulary)), dtype=int)

    for row, sample in enumerate(data):
        hits = [column_of[word] for word in sample if word in column_of]
        if hits:
            presence[row, hits] = 1

    return presence




def transform_with_training_vocab(training_vocab_list, data_samples):
    """
    Encode samples as dense 0/1 presence vectors over a fixed training vocabulary.
    :param training_vocab_list: List of words from the training data; position = column index
    :param data_samples: List of data samples (each sample is a list of words)
    :return: Integer NumPy array of shape (number of samples, vocabulary size);
             words absent from the vocabulary are skipped
    """
    # Word -> column lookup built from the training vocabulary order.
    lookup = {}
    for position, token in enumerate(training_vocab_list):
        lookup[token] = position

    n_rows, n_cols = len(data_samples), len(training_vocab_list)
    encoded = np.zeros((n_rows, n_cols), dtype=int)

    for row_idx, tokens in enumerate(data_samples):
        for token in tokens:
            col_idx = lookup.get(token)
            if col_idx is not None:
                encoded[row_idx, col_idx] = 1

    return encoded



def vectorize_samples_sparse(data, vocabulary):
    """Count vocabulary-word occurrences per sample and return a sparse CSR matrix.

    The CSR arrays are assembled directly from per-row counts rather than by
    incrementing individual sparse-matrix cells, which is far slower on large
    inputs.

    :param data: Sequence of samples; each sample is an iterable of words.
    :param vocabulary: Sequence of words; a word's position is its column index.
    :return: ``scipy.sparse`` CSR matrix of shape ``(len(data), len(vocabulary))``
             with integer dtype. Entry ``[i, j]`` is the number of times
             ``vocabulary[j]`` occurs in ``data[i]``; words that are not in the
             vocabulary are ignored.
    """
    from scipy.sparse import csr_matrix

    vocab_index = {word: idx for idx, word in enumerate(vocabulary)}

    indptr = [0]   # indptr[i]:indptr[i + 1] delimits row i in `indices`/`counts`
    indices = []   # column index of every stored entry
    counts = []    # occurrence count of every stored entry

    for sample in data:
        row_counts = {}
        for word in sample:
            col = vocab_index.get(word)
            if col is not None:
                row_counts[col] = row_counts.get(col, 0) + 1
        indices.extend(row_counts.keys())
        counts.extend(row_counts.values())
        indptr.append(len(indices))

    matrix = csr_matrix(
        (
            np.asarray(counts, dtype=int),
            np.asarray(indices, dtype=np.int64),
            np.asarray(indptr, dtype=np.int64),
        ),
        shape=(len(data), len(vocabulary)),
        dtype=int,
    )
    # Columns were appended in first-seen order; put each row in ascending order.
    matrix.sort_indices()
    return matrix



def transform_with_training_vocab_sparse(training_vocab_list, data_samples):
    """Count training-vocabulary word occurrences per sample as a sparse CSR matrix.

    The CSR arrays are assembled directly from per-row counts rather than by
    incrementing individual sparse-matrix cells, which is far slower on large
    inputs.

    :param training_vocab_list: List of words from the training data; a word's
                                position is its column index.
    :param data_samples: Sequence of samples; each sample is an iterable of words.
    :return: ``scipy.sparse`` CSR matrix of shape
             ``(len(data_samples), len(training_vocab_list))`` with integer dtype.
             Entry ``[i, j]`` is the number of times ``training_vocab_list[j]``
             occurs in ``data_samples[i]``; unknown words are ignored.
    """
    from scipy.sparse import csr_matrix

    training_vocab_dict = {word: idx for idx, word in enumerate(training_vocab_list)}

    indptr = [0]   # indptr[i]:indptr[i + 1] delimits row i in `indices`/`counts`
    indices = []   # column index of every stored entry
    counts = []    # occurrence count of every stored entry

    for sample in data_samples:
        row_counts = {}
        for word in sample:
            col = training_vocab_dict.get(word)
            if col is not None:
                row_counts[col] = row_counts.get(col, 0) + 1
        indices.extend(row_counts.keys())
        counts.extend(row_counts.values())
        indptr.append(len(indices))

    vectorized = csr_matrix(
        (
            np.asarray(counts, dtype=int),
            np.asarray(indices, dtype=np.int64),
            np.asarray(indptr, dtype=np.int64),
        ),
        shape=(len(data_samples), len(training_vocab_list)),
        dtype=int,
    )
    # Columns were appended in first-seen order; put each row in ascending order.
    vectorized.sort_indices()
    return vectorized





# all_X_tr, all_Y_tr, all_Y_tr_family
# all_X_te, all_Y_te, all_Y_te_family


# Build the vocabulary from the training samples only, so the test set
# contributes no columns of its own.
data = all_X_tr
vocabulary = build_vocabulary(data)
# Sparse count matrix for the training samples (rows = samples, columns = vocabulary words).
vectorized_data = vectorize_samples_sparse(data, vocabulary)

print("Vectorized Data shape:", vectorized_data.shape)

# Encode the test samples against the same training vocabulary; words that
# only occur in the test set are dropped by the transform.
vectorized_test_data = transform_with_training_vocab_sparse(vocabulary, all_X_te)
print(vectorized_test_data.shape)

Vectorized Data shape: (682598, 3858791)
(75848, 3858791)


In [27]:
# Learn the variance filter on the training matrix only, then apply the same
# column selection to both the training and the test matrix.
selector = VarianceThreshold(threshold=0.001)
X_train_selected = selector.fit(vectorized_data).transform(vectorized_data)
X_test_selected = selector.transform(vectorized_test_data)

print('after variance thresholding')
print(X_train_selected.shape, X_test_selected.shape)

# Densify the reduced sparse matrices and pair them with their label arrays.
X_train, X_test = X_train_selected.toarray(), X_test_selected.toarray()
Y_train, Y_test = all_Y_tr, all_Y_te

print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

after variance thresholding
(682598, 1789) (75848, 1789)
(682598, 1789) (75848, 1789) (682598,) (75848,)


In [28]:
class Ember_MLP_Net(nn.Module):
    """Four-hidden-layer MLP that outputs one sigmoid probability per sample.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(p=0.5) with
    widths 1024, 512, 256 and 128, followed by a Linear(128, 1) and a Sigmoid.
    Sub-modules are registered as ``fc{i}``, ``fc{i}_bn``, ``act{i}`` and
    ``fc{i}_drop`` (i = 1..4) plus ``fc_last`` and ``out``.
    """

    def __init__(self, input_features):
        """Create the layers for inputs with ``input_features`` values per sample."""
        super().__init__()

        widths = (input_features, 1024, 512, 256, 128)
        # Register the stages in order so attribute names and state_dict keys
        # follow the fc1 ... fc4 sequence.
        for stage, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f'fc{stage}', nn.Linear(n_in, n_out))
            setattr(self, f'fc{stage}_bn', nn.BatchNorm1d(n_out))
            setattr(self, f'act{stage}', nn.ReLU())
            setattr(self, f'fc{stage}_drop', nn.Dropout(p=0.5))

        self.fc_last = nn.Linear(128, 1)
        self.out = nn.Sigmoid()

    def forward(self, x):
        """Flatten ``x`` to ``(batch, -1)`` and return a ``(batch, 1)`` tensor of sigmoid outputs."""
        hidden = x.view(x.size(0), -1)

        for stage in range(1, 5):
            hidden = getattr(self, f'fc{stage}')(hidden)
            hidden = getattr(self, f'fc{stage}_bn')(hidden)
            hidden = getattr(self, f'act{stage}')(hidden)
            hidden = getattr(self, f'fc{stage}_drop')(hidden)

        return self.out(self.fc_last(hidden))

    


def _latest_file(directory):
    """Return the path of the most recently modified entry in ``directory``.

    ``os.listdir`` returns names in arbitrary order, so taking element ``[0]``
    does not reliably pick the newest checkpoint when the directory holds
    several files (earlier epochs, or earlier experiments).
    """
    candidates = [os.path.join(directory, name) for name in os.listdir(directory)]
    return max(candidates, key=os.path.getmtime)


# Two experiment repetitions, each with its own randomly drawn seed.
# NOTE(review): no seed for `random` is visible in this notebook, so the drawn
# seeds differ between kernel sessions.
exp_seeds = [random.randint(1, 99999) for _ in range(2)]


accs_all = []
rocauc_all = []

num_epoch = 50
batch_size = 128
patience = 10


input_features = X_train.shape[1]

replay_type, current_task = 'azdomain', 'azdomain'

for exp in exp_seeds:

    start_time = time.time()
    use_cuda = True
    print('Torch', torch.__version__, 'CUDA', torch.version.cuda)
    use_cuda = use_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    torch.manual_seed(exp)

    model = Ember_MLP_Net(input_features)
    #optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.000001)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    model = model.to(device)
    print(f'Model has {count_parameters(model)/1000000}m parameters')
    # The model already ends in a Sigmoid, so plain BCELoss is used.
    criterion = nn.BCELoss()


#     standardization = StandardScaler()
#     standard_scaler = standardization.fit(X_train)

#     X_train = standard_scaler.transform(X_train)
#     X_test = standard_scaler.transform(X_test)

#     X_train, Y_train = np.array(X_train, np.float32), np.array(Y_train, np.int32)
#     X_test, Y_test = np.array(X_test, np.float32), np.array(Y_test, np.int32)


    model_save_dir = '../az_model/model/'
    create_parent_folder(model_save_dir)

    opt_save_path = '../az_model/opt/'
    create_parent_folder(opt_save_path)

    results_save_dir = '../az_model/res/'
    create_parent_folder(results_save_dir)

    print(f'X_train {X_train.shape} Y_train {Y_train.shape}')
    print(f'X_test {X_test.shape} Y_test {Y_test.shape}')

    # NOTE(review): the test split is passed as the validation data, so early
    # stopping and checkpoint selection are driven by the test set.
    task_training_time, epoch_ran, training_loss, validation_loss = training_early_stopping(
        model, model_save_dir, opt_save_path, X_train, Y_train,
        X_test, Y_test, patience, batch_size, device, optimizer, num_epoch,
        criterion, replay_type, current_task, exp, earlystopping=True)

    end_time = time.time()
    print(f'Elapsed time {(end_time - start_time)/60} mins.')

    # Reload the newest checkpoint. This presumes training_early_stopping writes
    # one only when the validation loss improves, which would make the newest
    # file the best model of this run -- TODO confirm against the helper.
    best_model_path = _latest_file(model_save_dir)
    print(f'loading best model {best_model_path}')
    model.load_state_dict(torch.load(best_model_path, map_location=device))

    #optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.000001)
    best_optimizer = _latest_file(opt_save_path)
    print(f'loading best optimizer {best_optimizer}')
    optimizer.load_state_dict(torch.load(best_optimizer, map_location=device))

    acc, rocauc, precision, recall, f1score = testing_aucscore(model, X_test, Y_test, batch_size, device)
    print()
    del model_save_dir
    del opt_save_path
    del results_save_dir

    accs_all.append(acc)
    rocauc_all.append(rocauc)


Torch 2.0.1 CUDA 11.8
Model has 2.525953m parameters
X_train (682598, 1789) Y_train (682598,)
X_test (75848, 1789) Y_test (75848,)
Epoch 1 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:31<00:00, 168.33it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 336.07it/s]


Train Loss: 0.1480, Train Acc: 0.9477
Val Loss: 0.1010, Val Acc: 0.9635
Validation loss decreased (inf --> 0.100977).  Saving model ...
../az_model/model/best_model_epoch_1.pt
../az_model/opt/best_optimizer_epoch_1.pt
Epoch 2 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:32<00:00, 162.43it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 327.51it/s]


Train Loss: 0.1111, Train Acc: 0.9620
Val Loss: 0.0925, Val Acc: 0.9675
Validation loss decreased (0.100977 --> 0.092510).  Saving model ...
../az_model/model/best_model_epoch_2.pt
../az_model/opt/best_optimizer_epoch_2.pt
Epoch 3 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:31<00:00, 167.68it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 327.11it/s]


Train Loss: 0.0971, Train Acc: 0.9676
Val Loss: 0.0905, Val Acc: 0.9684
Validation loss decreased (0.092510 --> 0.090538).  Saving model ...
../az_model/model/best_model_epoch_3.pt
../az_model/opt/best_optimizer_epoch_3.pt
Epoch 4 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:36<00:00, 147.06it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 327.28it/s]


Train Loss: 0.0866, Train Acc: 0.9715
Val Loss: 0.0867, Val Acc: 0.9701
Validation loss decreased (0.090538 --> 0.086743).  Saving model ...
../az_model/model/best_model_epoch_4.pt
../az_model/opt/best_optimizer_epoch_4.pt
Epoch 5 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:33<00:00, 160.81it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 331.02it/s]


Train Loss: 0.0800, Train Acc: 0.9737
Val Loss: 0.0928, Val Acc: 0.9688
EarlyStopping counter: 1 out of 10
Epoch 6 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:35<00:00, 149.88it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 333.18it/s]


Train Loss: 0.0752, Train Acc: 0.9752
Val Loss: 0.0823, Val Acc: 0.9719
Validation loss decreased (0.086743 --> 0.082280).  Saving model ...
../az_model/model/best_model_epoch_6.pt
../az_model/opt/best_optimizer_epoch_6.pt
Epoch 7 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:31<00:00, 168.97it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 331.58it/s]


Train Loss: 0.0720, Train Acc: 0.9765
Val Loss: 0.0832, Val Acc: 0.9730
EarlyStopping counter: 1 out of 10
Epoch 8 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:29<00:00, 180.16it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:02<00:00, 262.35it/s]


Train Loss: 0.0680, Train Acc: 0.9780
Val Loss: 0.0787, Val Acc: 0.9754
Validation loss decreased (0.082280 --> 0.078710).  Saving model ...
../az_model/model/best_model_epoch_8.pt
../az_model/opt/best_optimizer_epoch_8.pt
Epoch 9 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:35<00:00, 152.04it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 305.18it/s]


Train Loss: 0.0650, Train Acc: 0.9789
Val Loss: 0.0817, Val Acc: 0.9747
EarlyStopping counter: 1 out of 10
Epoch 10 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:29<00:00, 180.18it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:02<00:00, 284.16it/s]


Train Loss: 0.0636, Train Acc: 0.9795
Val Loss: 0.0828, Val Acc: 0.9748
EarlyStopping counter: 2 out of 10
Epoch 11 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:38<00:00, 139.38it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 310.53it/s]


Train Loss: 0.0614, Train Acc: 0.9802
Val Loss: 0.0843, Val Acc: 0.9748
EarlyStopping counter: 3 out of 10
Epoch 12 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:33<00:00, 161.46it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 312.54it/s]


Train Loss: 0.0596, Train Acc: 0.9808
Val Loss: 0.0865, Val Acc: 0.9748
EarlyStopping counter: 4 out of 10
Epoch 13 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:35<00:00, 151.11it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 311.93it/s]


Train Loss: 0.0577, Train Acc: 0.9816
Val Loss: 0.0817, Val Acc: 0.9752
EarlyStopping counter: 5 out of 10
Epoch 14 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:38<00:00, 138.56it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 316.55it/s]


Train Loss: 0.0561, Train Acc: 0.9821
Val Loss: 0.0807, Val Acc: 0.9755
EarlyStopping counter: 6 out of 10
Epoch 15 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:35<00:00, 148.47it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:02<00:00, 292.83it/s]


Train Loss: 0.0555, Train Acc: 0.9823
Val Loss: 0.0831, Val Acc: 0.9739
EarlyStopping counter: 7 out of 10
Epoch 16 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:39<00:00, 134.41it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:02<00:00, 279.90it/s]


Train Loss: 0.0547, Train Acc: 0.9827
Val Loss: 0.0802, Val Acc: 0.9748
EarlyStopping counter: 8 out of 10
Epoch 17 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:38<00:00, 138.81it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:01<00:00, 299.50it/s]


Train Loss: 0.0537, Train Acc: 0.9830
Val Loss: 0.0797, Val Acc: 0.9756
EarlyStopping counter: 9 out of 10
Epoch 18 of 50


100%|██████████████████████████████████████████████████████████| 5332/5332 [00:35<00:00, 151.90it/s]
100%|████████████████████████████████████████████████████████████| 593/593 [00:02<00:00, 282.50it/s]


Train Loss: 0.0526, Train Acc: 0.9833
Val Loss: 0.0823, Val Acc: 0.9754
EarlyStopping counter: 10 out of 10
Early stopping
Training time: 10.942 minutes
Elapsed time 10.954123024145762 mins.
loading best model ../az_model/model/best_model_epoch_8.pt


100%|████████████████████████████████████████████████████████████| 593/593 [00:02<00:00, 251.28it/s]

test accuracy 0.963860783267383 and ROC-AUC 0.9639617336301808






NameError: name 'rocauc' is not defined

In [None]:


class Ember_CNN(nn.Module):
    """1-D CNN that outputs one sigmoid probability per sample.

    Three Conv1d layers (1 -> 64 -> 128 -> 256 channels, kernel 3, padding 1),
    each followed by ReLU and max-pooling with window 2, feed three fully
    connected layers (flattened size -> 256 -> 128 -> 1).
    """

    def __init__(self, num_features):
        """Build the layers for inputs with ``num_features`` values per sample."""
        super().__init__()
        self.conv1 = nn.Conv1d(1, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(128, 256, kernel_size=3, stride=1, padding=1)

        # Push one dummy sample through the conv stack so that convs() records
        # the flattened feature size needed by the first linear layer.
        self._to_linear = None
        self.convs(torch.zeros(1, 1, num_features))

        self.fc1 = nn.Linear(self._to_linear, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)

    def convs(self, x):
        """Apply conv -> ReLU -> max-pool(2) three times and return the result.

        On the first call this also stores channels * length of a single
        sample in ``self._to_linear``.
        """
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.max_pool1d(F.relu(conv(x)), 2)

        if self._to_linear is None:
            per_sample = x[0]
            self._to_linear = per_sample.shape[0] * per_sample.shape[1]
        return x

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of sigmoid outputs for ``x``."""
        # Conv1d needs a channel axis: (batch, 1, features).
        features = self.convs(x.view(x.size(0), 1, -1))
        flat = features.view(-1, self._to_linear)
        hidden = F.relu(self.fc2(F.relu(self.fc1(flat))))
        return torch.sigmoid(self.fc3(hidden))




num_samples, num_features = X_train.shape

# Conv1d takes (batch, channels, length) input, so add a singleton channel axis.
X_train_reshaped = X_train.reshape(num_samples, 1, num_features)
X_test_reshaped = X_test.reshape(X_test.shape[0], 1, num_features)

# Copy features and labels into float32 tensors (BCELoss needs float targets).
X_train_tensor, Y_train_tensor = (
    torch.tensor(array, dtype=torch.float32) for array in (X_train_reshaped, Y_train)
)
X_test_tensor, Y_test_tensor = (
    torch.tensor(array, dtype=torch.float32) for array in (X_test_reshaped, Y_test)
)

# Pair features with labels for the train and test splits.
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

# Only the training batches are shuffled. The validation loader is built from
# the test split.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Model, optimizer, and loss function
model = Ember_CNN(num_features)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5):
    """Train ``model`` for ``num_epochs`` epochs, printing train/validation loss per epoch.

    :param model: Module called as ``model(inputs)``; its output must hold one
                  value per label in the batch.
    :param train_loader: Iterable of ``(inputs, labels)`` batches used for optimization.
    :param val_loader: Iterable of ``(inputs, labels)`` batches used for evaluation only.
    :param criterion: Loss callable invoked as ``criterion(outputs, labels)``.
    :param optimizer: Optimizer that updates ``model``'s parameters.
    :param num_epochs: Number of passes over ``train_loader``.
    :return: None. The reported losses are the mean of the per-batch losses.
    """
    for epoch in range(num_epochs):
        model.train()  # enable training-mode behavior

        running_loss = 0.0
        for inputs, labels in train_loader:
            # Reshape to the label shape instead of calling squeeze(): a bare
            # squeeze() turns a (1, 1) output from a final batch of size 1 into
            # a 0-d tensor, which no longer matches the (1,) labels.
            outputs = model(inputs).reshape(labels.shape)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        val_loss = 0.0
        model.eval()  # switch to evaluation-mode behavior
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs).reshape(labels.shape)
                val_loss += criterion(outputs, labels).item()

        # max(..., 1) avoids dividing by zero when a loader yields no batches.
        train_avg = running_loss / max(len(train_loader), 1)
        val_avg = val_loss / max(len(val_loader), 1)
        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {train_avg}, Validation Loss: {val_avg}")

    print("Finished Training")

# Train the CNN for 10 epochs; the per-epoch validation loss is computed on
# val_loader.
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)

