# Supervised learning methods for classification
Methods: 
- Neural networks

Compared to classical methods, neural networks take too long to train, and doesn't work well on low-error-rate data, such as on float `4903052`

## Global settings

In [2]:
import os
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score, roc_auc_score, average_precision_score, confusion_matrix

import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [40]:
split_type = 'random'

TRAIN_DIR = f'../data/{split_type}split/train/'
VAL_DIR = f'../data/{split_type}split/val/'
TEST_DIR = f'../data/{split_type}split/test/'


float_numbers = [
    '4903052',
    '4903054',
    '4903058',
    '4903215',
    '4903217',
    '4903218',
    '4903220'
]

# float_number = '4903217' # high
# float_number = '4903218' # low1
# float_number = '4903220' # low2
# float_number = '4903052' # low3
float_number = '4903054' # low4


TRAIN_FILE = os.path.join(TRAIN_DIR, f'PR_PF_{float_number}.csv')
VAL_FILE = os.path.join(VAL_DIR, f'PR_PF_{float_number}.csv')
TEST_FILE = os.path.join(TEST_DIR, f'PR_PF_{float_number}.csv')

RESULT_DIR = f'../results/{split_type}split/{float_number}_NN'

os.makedirs(RESULT_DIR, exist_ok=True)

In [41]:
def comp_ratio(dataset):
    ''' Compute anomaly ratio
    '''
    instance = dataset[(dataset['Label']==1)]
    rate=len(instance)/len(dataset)*100
    return round(rate,2), len(instance)

## Classificatioin

### Cost-sensitive learning
```
              Predicted
              '0'   '1'  
Actual  '0' |  0  |  1  |
        '1' |  10  |  0  |

```
'1' stands for anomalies. 



In [42]:
# class CostSensitiveLoss(nn.Module):
#     def __init__(self, cost_matrix):
#         super(CostSensitiveLoss, self).__init__()
#         self.cost_matrix = cost_matrix
        
#     def forward(self, outputs, targets):
#         # Calculate the loss using the cost matrix and predicted probabilities
#         # For example, you can use the cross-entropy loss and apply the cost matrix
#         _, predicted = torch.max(outputs.data, 1)
#         costs = [self.cost_matrix[int(a), int(b)] for a, b in zip(targets, predicted)]
#         costs = torch.tensor(costs)
        
#         ce_loss = nn.CrossEntropyLoss(reduction='none')(outputs, targets)
#         weighted_loss = torch.sum(ce_loss * costs)
#         return weighted_loss

# cost_matrix = torch.tensor([[1, 1], [10, 1]])  # Example cost matrix
# loss_fn = CostSensitiveLoss(cost_matrix)
weights = torch.tensor([1, 1], dtype=torch.float)
loss_fn = nn.CrossEntropyLoss(weight=weights)


### Define model architecture

In [3]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        # out = self.fc2(out)
        # out = self.relu(out)
        out = self.fc3(out)
        # out = self.softmax(out)
        return out



In [7]:
from torchsummary import summary
model = NeuralNetwork(6, 32, 2)
summary(model, (6,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 32]             224
              ReLU-2                   [-1, 32]               0
            Linear-3                    [-1, 2]              66
Total params: 290
Trainable params: 290
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


### Define training and inference pipeline

In [44]:
def create_model_nn(input_dim, hidden_dim, output_dim):
    model = NeuralNetwork(input_dim, hidden_dim, output_dim)
    return model

def fit_model_nn(model, criterion, X_train, y_train, X_val=None, y_val=None, epochs=10, batch_size=32, learning_rate=0.001):
    # Normalize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    # Convert numpy arrays to tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

    # Define the optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create a DataLoader for batch processing
    dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Training loop
    train_loss_values = []
    val_loss_values = []
    metric_values = []
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in dataloader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            # loss = criterion(outputs, labels)
            loss = loss_fn(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        # Calculate the average loss for the epoch
        epoch_loss = running_loss / len(X_train)
        
        # print(f'Epoch: {epoch}; Loss:{epoch_loss}')

        # Store the loss value for this epoch
        train_loss_values.append(epoch_loss)

        # # ------- Validate the results ---------
        # if X_val is not None and y_val is not None: 
        #     # Normalize the features
        #     scaler = StandardScaler()
        #     X_val = scaler.fit_transform(X_val)

        #     # Convert numpy arrays to tensors
        #     X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
        #     y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)

        #     val_outputs = model(X_val_tensor)
        #     val_loss = criterion(val_outputs, y_val_tensor)
        #     val_loss_values.append(val_loss.item())
            
        #     metric = list(evaluate_model(model, X_val, y_val, 'NN'))
        #     metric_values.append(metric)
    return train_loss_values, val_loss_values, metric_values

def evaluate_model_nn(model, X_test):
    # Normalize the features
    scaler = StandardScaler()
    X_test = scaler.fit_transform(X_test)

    # Convert numpy arrays to tensors
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

    # Make predictions
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs.data, 1)
        probabilities = F.softmax(outputs, dim=1).numpy()[:, 1]

    return predicted, probabilities

In [45]:
def create_model(model_name, input_dim, hidden_dim, output_dim): 
    if model_name.startswith('NN'):
        model = create_model_nn(input_dim, hidden_dim, output_dim)
    else:
        raise ValueError(f"Invalid model name: {model_name}")
    return model

def evaluate_model(model, test_features, test_labels):
    # Test the model on the test set
    predictions, probabilities = evaluate_model_nn(model, test_features)

    precision = precision_score(test_labels, predictions, zero_division=0)
    recall = recall_score(test_labels, predictions, zero_division=0)
    f1 = f1_score(test_labels, predictions, zero_division=0)
    kappa = cohen_kappa_score(test_labels, predictions)
    roc_auc = roc_auc_score(test_labels, probabilities)
    prc_auc = average_precision_score(test_labels, probabilities)
    confusion = confusion_matrix(test_labels, predictions)

    return precision, recall, f1, kappa, roc_auc, prc_auc, confusion

### Load data

In [46]:
def random_sampling(train_data, label_column, sampling_ratio):
    # Separate features and labels
    train_labels = train_data[label_column]
    train_features = train_data.drop(label_column, axis=1)

    # Randomly select a subset of the train set
    train_features_sample, _, train_labels_sample, _ = train_test_split(train_features, train_labels, train_size=sampling_ratio, random_state=42)

    return train_features_sample, train_labels_sample

In [47]:
train_data = pd.read_csv(TRAIN_FILE)
val_data = pd.read_csv(VAL_FILE)
test_data = pd.read_csv(TEST_FILE)
train_data = train_data.drop(['ID', 'Date'], axis=1)
val_data = val_data.drop(['ID', 'Date'], axis=1)
test_data = test_data.drop(['ID', 'Date'], axis=1)
test_data.tail()

Unnamed: 0,Normalized_date,Latitude,Longitude,Pressure,Salinity,Temperature,Label
59147,-0.751492,1.210129,-1.065537,0.346135,-0.918097,-0.735714,0
59148,0.733271,0.720008,0.275434,-0.887326,1.013378,0.790247,0
59149,0.138937,2.164069,-0.913114,-0.888951,1.126371,1.205522,0
59150,-0.391029,0.645212,-0.926159,-0.8812,1.094591,1.124348,0
59151,1.685923,-1.635168,2.098484,-0.713296,0.605543,0.518622,0


In [48]:
train_data.shape, val_data.shape, test_data.shape

((177455, 7), (59152, 7), (59152, 7))

In [49]:
print(f'\tError rate\tAnomalies')
print(f'---------------------------------')
print(f'Train:\t{comp_ratio(train_data)[0]}%\t\t{comp_ratio(train_data)[1]}'), 
print(f'Test:\t{comp_ratio(test_data)[0]}%\t\t{comp_ratio(test_data)[1]}')

	Error rate	Anomalies
---------------------------------
Train:	0.23%		400
Test:	0.22%		133


### Train

In [53]:
# Usage 
label_column = 'Label'  # Name of the label column in the CSV files
sampling_ratio = 0.99 # Sampling ratio of 0.5 (50%)
input_dim = 6
hidden_dim = 32
output_dim = 2
num_epoches = 10
learning_rate = 0.001

# Perform random sampling
train_features_sample, train_labels_sample = random_sampling(train_data, label_column, sampling_ratio)
val_labels = val_data[label_column]
val_features = val_data.drop(label_column, axis=1)
test_labels = test_data[label_column]
test_features = test_data.drop(label_column, axis=1)


model_names = ['NN', 'NN+CS2', 'NN+CS5']  # Model names to evaluate
# model_names = ['NN+CS5']  # Model names to evaluate

results = []
for model_name in model_names: 
    # Fit a model on the sampled train set
    model = create_model(model_name, input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
    
    if model_name == 'NN': 
        weights = torch.tensor([1, 1], dtype=torch.float)
    elif model_name.startswith("NN+CS"): 
        cs = int(model_name.strip('NN+CS'))
        weights = torch.tensor([1, cs], dtype=torch.float)
    else: 
        raise ValueError(f"Invalid model name: {model_name}")
    
    print(f'Cost sensitive weights: {weights.data.numpy()}')

    criterion = nn.CrossEntropyLoss(weight=weights)
    train_loss_values, val_loss_values, metric_values = fit_model_nn(model, 
        criterion, 
        train_features_sample, 
        train_labels_sample, 
        val_features, 
        val_labels, 
        epochs=num_epoches, 
        learning_rate=learning_rate
        )
    # Evaluate the model on the test set
    precision, recall, f1, kappa, roc_auc, prc_auc, confusion = evaluate_model(model, test_features, test_labels)
    result = {'Model': model_name, 'Sampling Ratio': sampling_ratio,
                                            'Precision': precision, 'Recall': recall, 'F1-score': f1, "Cohen's Kappa": kappa, 'ROC-AUC': roc_auc, 'PRC-AUC': prc_auc, 'Confusion Matrix': confusion}

    results.append(result)

results_df = pd.DataFrame(results)

results_df


Cost sensitive weights: [1. 1.]
Cost sensitive weights: [1. 2.]
Cost sensitive weights: [1. 5.]


Unnamed: 0,Model,Sampling Ratio,Precision,Recall,F1-score,Cohen's Kappa,ROC-AUC,PRC-AUC,Confusion Matrix
0,NN,0.99,0.0,0.0,0.0,0.0,0.945504,0.166161,"[[59019, 0], [133, 0]]"
1,NN+CS2,0.99,0.0,0.0,0.0,0.0,0.947849,0.129854,"[[59019, 0], [133, 0]]"
2,NN+CS5,0.99,0.0,0.0,0.0,0.0,0.956821,0.22661,"[[59019, 0], [133, 0]]"


In [54]:
filtered_results = results_df[results_df['Sampling Ratio'] == 0.99]

# Select the desired columns
selected_columns = ['Model', 'Precision', 'Recall', 'F1-score', "Cohen's Kappa", 'PRC-AUC']
filtered_results = filtered_results[selected_columns]

# Round numerical values to 4 decimals
filtered_results = filtered_results.round(4)

# Convert the results to LaTeX table format
latex_table = filtered_results.to_latex(index=False, escape=False)

# Print the LaTeX table
print(f"Float: {float_number}")

print(latex_table)


Float: 4903054
\begin{tabular}{lrrrrr}
\toprule
 Model &  Precision &  Recall &  F1-score &  Cohen's Kappa &  PRC-AUC \\
\midrule
    NN &        0.0 &     0.0 &       0.0 &            0.0 &   0.1662 \\
NN+CS2 &        0.0 &     0.0 &       0.0 &            0.0 &   0.1299 \\
NN+CS5 &        0.0 &     0.0 &       0.0 &            0.0 &   0.2266 \\
\bottomrule
\end{tabular}



  latex_table = filtered_results.to_latex(index=False, escape=False)
