# Classification of a vectors of random variables

### Introduction
For the demonstration, I choose to use synthetic data to simplify things and save time
### Predictors and targets
- The predictor $X$ is a 10-dimensional vector where the entries are comming from a normal distribution $N \left( \mu, \sigma \right)$
I choose to fix $\mu = 100$ and to have instances where the standard deviation $\sigma \in \{1, 2, 3, 4, 5\}$
- So for a given instance, the predictor is $X$ and the target $Y = \sigma$
### Goal
Our goal is to classify i.e. to predict the standard deviation $\sigma$ based on the vector $X$
### Approach
- We split data into training, validation and test data
- We use a feedforward neural network, there is no need to have a sequence-oriented model such as RNN, because the entries of the vector are independant
- We choose to fix epochs in 10, and tune other hyperparameters using the optuna framework, which uses a bayesian optimization approach. An approach that is similar to the popular idea in reinforecement learning, explore and exploit. So it explore the hyperparameter space in a non-exaustive way, but uses previous experiences to guide future explorations.
- I have used the accuaracy metric as the data is balanced

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import optuna
from optuna.trial import TrialState

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
num_instances = 10000 #number of instances
num_features = 10 #number of features
s_values = [1, 2, 3, 4, 5] #the classes, i.e. the standard deviation of the sampling distribution

In [3]:
X = []
Y = []

In [4]:
# Creating the data
for _ in range(num_instances):
    s = np.random.choice(s_values)
    x = np.random.normal(loc=100, scale=s, size=num_features)#loc : mean, scale:standard deviation, size: the vector shape
    y = s
    X.append(x)
    Y.append(y)

In [5]:
X = np.array(X)
Y = np.array(Y)

In [6]:
print(X.shape)
print(Y.shape)

(10000, 10)
(10000,)


In [7]:
print(X[:3])

[[106.97252557 101.53964421  98.79761814 108.08184519  94.68424195
   96.52768656  99.57866773 100.79549978 100.51348269  94.14896118]
 [ 98.44498226  93.82214874 104.06849009  99.28200967 105.18988478
   98.12541208 105.29406747 105.04727376  98.89638568  96.26732825]
 [ 99.45452167  99.62915417 100.99268478 100.04924124  98.85569226
   99.31898887  99.71365697 100.64984672 100.20770955  99.31519275]]


In [8]:
print(Y[:3])

[5 4 1]


In [9]:
# verifying if the resulting instances are balanced over classes
for s in s_values:
    print('For s =', s, '-->',len(Y[Y==s])/len(Y))

For s = 1 --> 0.2045
For s = 2 --> 0.1918
For s = 3 --> 0.1984
For s = 4 --> 0.2041
For s = 5 --> 0.2012


In [10]:
# splitting with making sure that the resulting splits have the same distribution as the original data
# This is important especialy for the test data, so to be representative of real external data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True, random_state=42, stratify=Y)

In [11]:
# verifying balance after splitting
for s in s_values:
    print('For s =', s, '-->',len(Y_train[Y_train==s])/len(Y_train))

For s = 1 --> 0.2045
For s = 2 --> 0.19175
For s = 3 --> 0.198375
For s = 4 --> 0.204125
For s = 5 --> 0.20125


In [12]:
# splitting the training data into one for training and one for validating
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, shuffle=True, random_state=42, stratify=Y_train)

In [13]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(6400, 10)
(1600, 10)
(2000, 10)


In [14]:
class dataset(Dataset):
    def __init__(self, X, Y):
        self.X = torch.tensor(X, dtype = torch.float32)
        self.Y = torch.tensor(Y, dtype = torch.int64)

    def __len__(self):
        return len(self.X)
    def __getitem__(self, index):
        return self.X[index], self.Y[index]

In [15]:
training_data = dataset(X_train, Y_train)
validation_data = dataset(X_val, Y_val)
testing_data = dataset(X_test, Y_test)

In [16]:
trainloader = DataLoader(training_data, batch_size=256)

In [17]:
EPOCHS = 10 #epochs are fixed to simplify the tuning

#defining the hyperparameter space of the model hyperparameters
def define_model(trial):

    n_layers = trial.suggest_int("n_layers", 1, 3) #layers can take any number between 1 and 3
    layers = []

    in_features = num_features
    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i), 4, 128) #number of unites/layer can take any number between 4 and 128
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        p = trial.suggest_float("dropout_l{}".format(i), 0.2, 0.5) #the dropoout probability can take any float number between 0.2 and 0.5
        layers.append(nn.Dropout(p))

        in_features = out_features
    layers.append(nn.Linear(in_features, len(s_values)))

    return nn.Sequential(*layers)

#defining the hyperparameter space of the objective function hyperparameters
def objective(trial):
    
    model = define_model(trial)

    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    l2 = trial.suggest_float("l2", 1e-5, 1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, weight_decay=l2)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(EPOCHS):
        model.train()
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            
            optimizer.zero_grad()
            outputs = model(inputs)
            targets = targets - 1 #substracting one to be consistent with what Cross Entropy Loss in PyTorch expect
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        with torch.no_grad():
            y_pred = model(validation_data.X)
            y_pred = torch.argmax(y_pred, dim=1).numpy() # Transformation of the dummy variable to the class index
            y_true = validation_data.Y - 1
        
        acc = accuracy_score(y_true, y_pred)

        trial.report(acc, epoch)

        #Pruning unpromising trials to save time
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return acc # returning the metric that direct the bayesian optimization


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize") #The metric is to be maximized
    study.optimize(objective, n_trials=500) #The number of trials is 500

    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2025-01-03 20:04:14,597] A new study created in memory with name: no-name-74cdd1c8-162e-4a68-a56b-4604af64a165
[I 2025-01-03 20:04:17,001] Trial 0 finished with value: 0.223125 and parameters: {'n_layers': 1, 'n_units_l0': 18, 'dropout_l0': 0.46757447677315583, 'optimizer': 'Adam', 'lr': 0.0003927493984493093, 'l2': 0.03322835901401022}. Best is trial 0 with value: 0.223125.
[I 2025-01-03 20:04:17,746] Trial 1 finished with value: 0.204375 and parameters: {'n_layers': 3, 'n_units_l0': 108, 'dropout_l0': 0.28481278795827947, 'n_units_l1': 127, 'dropout_l1': 0.3712138942672482, 'n_units_l2': 30, 'dropout_l2': 0.4168533690452827, 'optimizer': 'SGD', 'lr': 0.0003439331845395555, 'l2': 0.419824542805444}. Best is trial 0 with value: 0.223125.
[I 2025-01-03 20:04:18,531] Trial 2 finished with value: 0.200625 and parameters: {'n_layers': 3, 'n_units_l0': 113, 'dropout_l0': 0.28867890329649737, 'n_units_l1': 16, 'dropout_l1': 0.20515925287217637, 'n_units_l2': 99, 'dropout_l2': 0.4653915363

Study statistics: 
  Number of finished trials:  500
  Number of pruned trials:  426
  Number of complete trials:  74
Best trial:
  Value:  0.47125
  Params: 
    n_layers: 1
    n_units_l0: 90
    dropout_l0: 0.4801244864190618
    optimizer: SGD
    lr: 0.002455759433335553
    l2: 0.6012728116515451


In [18]:
#Training the model with the best parameters on the training data
#The corect way is to train on the training data including the validation data
#But here I didn't include it for time reasons
best_model = define_model(trial)
best_optimizer = getattr(optim, trial.params["optimizer"])(
    best_model.parameters(),
    lr=trial.params["lr"],
    weight_decay=trial.params["l2"]
)
criterion = torch.nn.CrossEntropyLoss()

best_model.train()
for inputs, targets in trainloader:
    best_optimizer.zero_grad()
    outputs = best_model(inputs)
    targets = targets - 1 
    loss = criterion(outputs, targets)
    loss.backward()
    best_optimizer.step()

In [19]:
#evaluating on the test data gives 0.259, which show that the model perform slightly better than a uniform guesser because each class has the proportion 1/5 = 0.2

best_model.eval()
with torch.no_grad():
    test_outputs = best_model(testing_data.X)
    test_pred = torch.argmax(test_outputs, dim=1).numpy()
    test_true = testing_data.Y - 1
    test_accuracy = accuracy_score(test_true, test_pred)
    print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.259
