In [11]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as CountV
from sklearn.feature_selection import VarianceThreshold
import logging
from scipy import sparse

import numpy as np
import os
# import pandas as pd
import matplotlib.pyplot as plt
import time, random
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
import datetime
import argparse
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
from az_utils import *
from ember_model import *
#from ember_pjr_utils import *

In [12]:
# Directory that holds the raw train/test .npz archives.
raw_path = '/home/mr6564/continual_research/AZ_Data/Family/'

# Full paths of the two archives inside raw_path.
tr_file = os.path.join(raw_path, 'Family_AZ_Train.npz')
te_file = os.path.join(raw_path, 'Family_AZ_Test.npz')

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code -- only point these paths at trusted files.
tr_data = np.load(tr_file, allow_pickle=True)
te_data = np.load(te_file, allow_pickle=True)

In [13]:

# Pull the individual arrays out of the training archive.
X_tr = tr_data['X_train']
Y_tr = tr_data['Y_train']
Y_tr_family = tr_data['Y_tr_family']  # presumably per-sample family labels -- TODO confirm

# Same three arrays for the test archive.
X_te = te_data['X_test']
Y_te = te_data['Y_test']
Y_te_family = te_data['Y_te_family']

In [15]:
from scipy.sparse import lil_matrix

def build_vocabulary(data):
    """Return every distinct word found in ``data`` as a sorted list.

    ``data`` is an iterable of samples, each sample being an iterable of
    words. Sorting makes the word -> column mapping reproducible across runs.
    """
    distinct_words = set()
    for sample in data:
        distinct_words.update(sample)
    return sorted(distinct_words)

def vectorize_samples(data, vocabulary):
    """Encode samples as a dense binary presence matrix.

    Row ``i`` corresponds to ``data[i]`` and column ``j`` to ``vocabulary[j]``;
    a cell is 1 when the word occurs in the sample at least once and 0
    otherwise. Words that are not in ``vocabulary`` are ignored.

    Returns a NumPy integer array of shape ``(len(data), len(vocabulary))``.
    """
    column_of = {token: position for position, token in enumerate(vocabulary)}
    n_rows, n_cols = len(data), len(vocabulary)
    presence = np.zeros((n_rows, n_cols), dtype=int)

    for row, sample in enumerate(data):
        for token in sample:
            if token not in column_of:
                continue  # out-of-vocabulary word
            presence[row, column_of[token]] = 1

    return presence




def transform_with_training_vocab(training_vocab_list, data_samples):
    """
    Encode samples as dense binary vectors over the training vocabulary.
    :param training_vocab_list: List of words from the training data; position j
        in the list becomes column j of the result
    :param data_samples: List of data samples (each sample is a list of words)
    :return: NumPy integer array of shape (len(data_samples), len(training_vocab_list))
        holding 1 where the word occurs in the sample and 0 elsewhere; words
        missing from the vocabulary are skipped
    """
    # word -> column lookup
    lookup = {word: column for column, word in enumerate(training_vocab_list)}

    shape = (len(data_samples), len(training_vocab_list))
    vectorized = np.zeros(shape, dtype=int)

    for row_idx, sample in enumerate(data_samples):
        # Columns of the words in this sample that the vocabulary knows about.
        known_columns = [lookup[word] for word in sample if word in lookup]
        if known_columns:
            vectorized[row_idx, known_columns] = 1

    return vectorized



def vectorize_samples_sparse(data, vocabulary):
    """Encode samples as a sparse word-count matrix in CSR format.

    Row ``i`` corresponds to ``data[i]`` and column ``j`` to ``vocabulary[j]``.
    Each cell holds the number of times the word occurs in the sample (note
    that the dense ``vectorize_samples`` stores 0/1 presence instead). Words
    that are not in ``vocabulary`` are ignored.

    The matrix is assembled from coordinate lists in a single constructor call
    rather than by incrementing individual cells of a ``lil_matrix``, which is
    far slower for large inputs.

    :param data: sequence of samples, each an iterable of words
    :param vocabulary: sequence of words defining the column order
    :return: ``scipy.sparse.csr_matrix`` of integer counts with shape
        ``(len(data), len(vocabulary))``
    """
    vocab_index = {word: idx for idx, word in enumerate(vocabulary)}

    # One (row, column) pair per in-vocabulary word occurrence.
    row_ids, col_ids = [], []
    for i, sample in enumerate(data):
        for word in sample:
            idx = vocab_index.get(word)
            if idx is not None:
                row_ids.append(i)
                col_ids.append(idx)

    ones = np.ones(len(row_ids), dtype=int)
    coords = (np.asarray(row_ids, dtype=np.int64), np.asarray(col_ids, dtype=np.int64))
    # Duplicate coordinates are summed by the constructor, which turns the
    # per-occurrence ones into per-cell counts.
    return sparse.csr_matrix((ones, coords), shape=(len(data), len(vocabulary)), dtype=int)

def transform_with_training_vocab_sparse(training_vocab_list, data_samples):
    """
    Encode samples as a sparse word-count matrix over the training vocabulary.

    The matrix is assembled from coordinate lists in a single constructor call
    rather than by incrementing individual cells of a ``lil_matrix``, which is
    far slower for large inputs.

    :param training_vocab_list: List of words from the training data; position j
        in the list becomes column j of the result
    :param data_samples: List of data samples (each sample is a list of words)
    :return: scipy.sparse.csr_matrix of integer counts with shape
        (len(data_samples), len(training_vocab_list)); each cell is the number
        of occurrences of the word in the sample, and words missing from the
        vocabulary are skipped
    """
    training_vocab_dict = {word: idx for idx, word in enumerate(training_vocab_list)}

    # One (row, column) pair per in-vocabulary word occurrence.
    row_ids, col_ids = [], []
    for i, sample in enumerate(data_samples):
        for word in sample:
            idx = training_vocab_dict.get(word)
            if idx is not None:
                row_ids.append(i)
                col_ids.append(idx)

    ones = np.ones(len(row_ids), dtype=int)
    coords = (np.asarray(row_ids, dtype=np.int64), np.asarray(col_ids, dtype=np.int64))
    # Duplicate coordinates are summed by the constructor, which turns the
    # per-occurrence ones into per-cell counts.
    return sparse.csr_matrix(
        (ones, coords),
        shape=(len(data_samples), len(training_vocab_list)),
        dtype=int,
    )



#X_train, X_test, y_train, y_test
#X_tr, Y_tr, Y_tr_family, X_te, Y_te, Y_te_family 

# The vocabulary is built from the training samples only, so the test split
# cannot contribute columns to the feature space.
data = X_tr
vocabulary = build_vocabulary(data)

# Dense alternative, kept for reference (a dense matrix of the shape printed
# below would have roughly 2.7e11 cells):
#vectorized_data = vectorize_samples(data, vocabulary)
vectorized_data = vectorize_samples_sparse(data, vocabulary)

print("Vectorized Data shape:", vectorized_data.shape)

# Encode the test samples against the training vocabulary; test words that are
# not in the vocabulary are skipped by the transform.
vectorized_test_data = transform_with_training_vocab_sparse(vocabulary, X_te)
print("Vectorized Test Data shape:", vectorized_test_data.shape)

Vectorized Data shape: (257023, 1067550)
Vectorized Test Data shape: (28559, 1067550)


In [14]:
#X_tr, Y_tr, Y_tr_family
#X_te, Y_te, Y_te_family

In [35]:
# Drop near-constant columns: the selector is fitted on the training matrix
# only and the same column mask is then applied to the test matrix.
selector = VarianceThreshold(threshold=0.001)
X_train_selected = selector.fit_transform(vectorized_data)
X_test_selected = selector.transform(vectorized_test_data)

# Labels are passed through unchanged.
Y_train, Y_test = Y_tr, Y_te

# Densify the reduced matrices for the later cells that expect NumPy arrays.
X_train = X_train_selected.toarray()
X_test = X_test_selected.toarray()

print(vectorized_data.shape, X_train_selected.shape, X_test_selected.shape)

(257023, 1067550) (257023, 2439) (28559, 2439)


In [None]:
#(257023, 1064974) (257023, 2441) (28559, 2441)

In [39]:
# Directory that receives the transformed (feature-selected) arrays.
save_path = '/home/mr6564/continual_research/AZ_Data/Family_Transformed/'

# np.savez does not create missing directories; make sure the target exists so
# the expensive preprocessing above is not lost to a FileNotFoundError.
os.makedirs(save_path, exist_ok=True)

tr_file_save = os.path.join(save_path, 'Family_AZ_Train_Transformed.npz')
te_file_save = os.path.join(save_path, 'Family_AZ_Test_Transformed.npz')

# Each archive stores the features plus both label arrays under the same keys
# the raw archives use (X_train / Y_train / Y_tr_family and the test equivalents).
np.savez(tr_file_save, X_train=X_train, Y_train=Y_tr, Y_tr_family=Y_tr_family)
np.savez(te_file_save, X_test=X_test, Y_test=Y_te, Y_te_family=Y_te_family)


In [3]:
# Number of distinct label values in the train and test splits.
np.unique(Y_train).size, np.unique(Y_test).size

(100, 100)

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class AZ_MLP_Net(nn.Module):
    """Fully connected classifier with five hidden stages.

    Every hidden stage applies Linear -> BatchNorm1d -> ReLU -> Dropout(p=0.5),
    with output widths 2048, 1024, 512, 256 and 128. A final linear layer maps
    the 128 features to ``n_classes`` raw scores (no softmax is applied).

    Args:
        input_features: number of features per sample after flattening.
        n_classes: number of output classes.
    """

    # (attribute prefix, output width) of each hidden stage, in forward order.
    # For prefix "fcX" the layers are stored as fcX, fcX_bn and fcX_drop.
    _HIDDEN_STAGES = (
        ("fc01", 2048),
        ("fc1", 1024),
        ("fc2", 512),
        ("fc3", 256),
        ("fc4", 128),
    )

    def __init__(self, input_features, n_classes):
        super().__init__()
        self.input_feats_length = input_features
        self.output_classes = n_classes

        # Register the layers stage by stage so that the module order is
        # fc01, fc01_bn, fc01_drop, fc1, ... exactly as the attribute names imply.
        width_in = self.input_feats_length
        for prefix, width_out in self._HIDDEN_STAGES:
            setattr(self, prefix, nn.Linear(width_in, width_out))
            setattr(self, prefix + "_bn", nn.BatchNorm1d(width_out))
            setattr(self, prefix + "_drop", nn.Dropout(p=0.5))
            width_in = width_out

        self.fc_last = nn.Linear(width_in, self.output_classes)

        self.activate = nn.ReLU()

    def forward(self, x):
        """Flatten ``x`` to (batch, features) and return the class scores."""
        out = x.view(x.size(0), -1)

        for prefix, _ in self._HIDDEN_STAGES:
            linear = getattr(self, prefix)
            norm = getattr(self, prefix + "_bn")
            drop = getattr(self, prefix + "_drop")
            out = drop(self.activate(norm(linear(out))))

        return self.fc_last(out)

    

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device=None):
    """Train ``model`` and print training/validation metrics after every epoch.

    Args:
        model: ``nn.Module`` to optimise. It is put in ``train()`` mode for the
            optimisation pass and ``eval()`` mode for the evaluation pass.
        train_loader: iterable of ``(inputs, labels)`` batches used for the
            optimisation steps.
        val_loader: iterable of ``(inputs, labels)`` batches that are only
            evaluated (inside ``torch.no_grad()``).
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer that updates the model's parameters.
        num_epochs: number of passes over ``train_loader``.
        device: device every batch is moved to. When ``None`` (the default) the
            device of the model's first parameter is used, falling back to CPU
            for a model without parameters. This removes the dependency on a
            module-level ``device`` variable having been defined beforehand.

    Returns:
        None. Per epoch, one line is printed with the mean per-batch training
        loss, the mean per-batch validation loss and the validation accuracy in
        percent; a metric is reported as ``nan`` when its loader yielded no data.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        # ---- optimisation pass ----
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # ---- evaluation pass ----
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()

                # Predicted class = index of the largest score in each row.
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Guard the averages so an empty loader reports nan instead of raising
        # ZeroDivisionError.
        n_train_batches = len(train_loader)
        n_val_batches = len(val_loader)
        train_loss_avg = running_loss / n_train_batches if n_train_batches else float("nan")
        val_loss_avg = val_loss / n_val_batches if n_val_batches else float("nan")
        accuracy = 100 * correct / total if total else float("nan")

        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss_avg}, "
              f"Validation Loss: {val_loss_avg}, Accuracy: {accuracy}%")

    print("Finished Training")


# Convert sparse matrix to dense
# X_train_dense = X_train.toarray()
# X_test_dense = X_test.toarray()

### exps trial

from sklearn.model_selection import train_test_split


# X_train_small, X_test_small, Y_train_small, Y_test_small = train_test_split(X_train, Y_train, test_size=0.10, random_state=42)
# X_train_small, Y_train_small = np.array(X_train_small, np.float32), np.array(Y_train_small, np.int32)
# X_test_small, Y_test_small = np.array(X_test_small, np.float32), np.array(Y_test_small, np.int32)


# Cast features to float32 and labels to int32. Despite the "_small" suffix
# used below, the full train/test arrays are used (the subsampling experiment
# above is commented out).
X_train, Y_train = np.array(X_train, np.float32), np.array(Y_train, np.int32)
X_test, Y_test = np.array(X_test, np.float32), np.array(Y_test, np.int32)

# Wrap the arrays as tensors: float32 features, int64 labels (the label dtype
# used for the CrossEntropyLoss targets below).
X_train_small = torch.from_numpy(X_train).float()
X_test_small = torch.from_numpy(X_test).float()
Y_train_small = torch.from_numpy(Y_train).long()
Y_test_small = torch.from_numpy(Y_test).long()

# Create TensorDatasets.
# NOTE: val_dataset is built from the test split, so the "Validation" numbers
# printed during training are measured on the test data.
train_dataset = TensorDataset(X_train_small, Y_train_small)
val_dataset = TensorDataset(X_test_small, Y_test_small)


# Create DataLoaders
batch_size = 2048  # Modify this based on your requirements

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)


# Model dimensions
input_features = X_train.shape[1]  # number of columns kept by the feature selection
n_classes = 100        # Replace with the number of classes in your dataset

# The output layer has n_classes units, so every label must lie in [0, n_classes).
assert min(Y_train.min(), Y_test.min()) >= 0 and max(Y_train.max(), Y_test.max()) < n_classes, \
    "labels must be integers in [0, n_classes)"

# Seed BEFORE the model is built: the weights are drawn from torch's global RNG
# at construction time, so seeding afterwards leaves the initialisation
# unseeded and the run irreproducible.
torch.manual_seed(42)
model = AZ_MLP_Net(input_features, n_classes)


use_cuda = True
print('Torch', torch.__version__, 'CUDA', torch.version.cuda)
use_cuda = use_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

model = model.to(device)
# count_parameters is not defined in this notebook; presumably it is provided
# by one of the star imports at the top -- TODO confirm.
print(f'Model has {count_parameters(model)/1000000}m parameters')

# Hyperparameters
# NOTE: learning_rate is only referenced by the commented-out Adam optimizer;
# the SGD optimizer below uses its own lr=0.01.
learning_rate = 0.001
num_epochs = 200

# Criterion and Optimizer
criterion = nn.CrossEntropyLoss()
#optimizer = optim.Adam(model.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.000001)

train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=num_epochs)


Torch 2.0.1 CUDA 11.8
Model has 7.805156m parameters
Epoch 1/200, Training Loss: 3.622315753073919, Validation Loss: 2.5260693516050066, Accuracy: 40.28152246227109%
Epoch 2/200, Training Loss: 2.45676122582148, Validation Loss: 1.9002162303243364, Accuracy: 52.55436114709899%
Epoch 3/200, Training Loss: 2.046163354601179, Validation Loss: 1.6110456032412392, Accuracy: 59.36482369830876%
Epoch 4/200, Training Loss: 1.8162015250750951, Validation Loss: 1.4227586558886938, Accuracy: 64.30897440386568%
Epoch 5/200, Training Loss: 1.6598900575486442, Validation Loss: 1.2884441273553031, Accuracy: 68.32522147134003%
Epoch 6/200, Training Loss: 1.5380630814839924, Validation Loss: 1.1828156837395258, Accuracy: 70.13901046955425%
Epoch 7/200, Training Loss: 1.4432591966220312, Validation Loss: 1.099480471440724, Accuracy: 71.80223397177772%
Epoch 8/200, Training Loss: 1.363521036647615, Validation Loss: 1.0248821675777435, Accuracy: 74.06421793480163%
Epoch 9/200, Training Loss: 1.29433857637

Epoch 72/200, Training Loss: 0.607700548001698, Validation Loss: 0.42886711763484137, Accuracy: 87.03736125214468%
Epoch 73/200, Training Loss: 0.6026300385830894, Validation Loss: 0.4280679928404944, Accuracy: 86.95332469624286%
Epoch 74/200, Training Loss: 0.5980445331051236, Validation Loss: 0.425756325679166, Accuracy: 87.09688714590847%
Epoch 75/200, Training Loss: 0.596948821866323, Validation Loss: 0.4228223306792123, Accuracy: 87.1213978080465%
Epoch 76/200, Training Loss: 0.5942659084759061, Validation Loss: 0.41945355172668186, Accuracy: 87.24745264189923%
Epoch 77/200, Training Loss: 0.5930625455720084, Validation Loss: 0.4188381869878088, Accuracy: 87.31048005882559%
Epoch 78/200, Training Loss: 0.5881222458112807, Validation Loss: 0.41727856014456066, Accuracy: 87.16341608599741%
Epoch 79/200, Training Loss: 0.5886180240010458, Validation Loss: 0.41753321832844187, Accuracy: 87.26145873454954%
Epoch 80/200, Training Loss: 0.5870973434713151, Validation Loss: 0.416715859302

Epoch 143/200, Training Loss: 0.4809772015090973, Validation Loss: 0.3464017989380019, Accuracy: 89.2258132287545%
Epoch 144/200, Training Loss: 0.4774222593931925, Validation Loss: 0.3489548159497125, Accuracy: 89.02272488532512%
Epoch 145/200, Training Loss: 0.4767075251965296, Validation Loss: 0.3448065748172147, Accuracy: 89.1277705802024%
Epoch 146/200, Training Loss: 0.47438331894458285, Validation Loss: 0.34293236477034433, Accuracy: 89.2258132287545%
Epoch 147/200, Training Loss: 0.47343260948620147, Validation Loss: 0.3431073161108153, Accuracy: 89.1837949508036%
Epoch 148/200, Training Loss: 0.4725368216870323, Validation Loss: 0.34104112748588833, Accuracy: 89.2258132287545%
Epoch 149/200, Training Loss: 0.4717510136820021, Validation Loss: 0.3411356844007969, Accuracy: 89.25732693721768%
Epoch 150/200, Training Loss: 0.46973671421172125, Validation Loss: 0.3394964805671147, Accuracy: 89.22931475191709%
Epoch 151/200, Training Loss: 0.4710973437343325, Validation Loss: 0.339