# Test diffrent models for NN 

In this notebook we will test the machine learning frameworks TensorFlow, PyTorch, and Scikit-Learnâ€”on our datasets. The models will iterates over six different datasets: non normalized, normalized, and data grouped by slices, annotations, nodules. To observe the impact of data normalization and dataset structure on model performance and also which model we should proceed with.

The notebook implements three neural network models:

TensorFlow Model: A sequential neural network with two hidden layers of 60 neurons each, using ReLU activations and trained with categorical cross-entropy loss.
<br>
<br>
PyTorch Model: A custom neural network with two fully connected layers, using ReLU activation functions and trained using cross-entropy loss.
<br>
<br>
Scikit-Learn MLP Model: A multi-layer perceptron classifier with two hidden layers of 60 neurons, utilizing ReLU activations, trained through the Scikit-Learn library.

In [30]:
#import libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from sklearn.neural_network import MLPClassifier

In [31]:
# list of datasets
datasets = [
    "final_by_slices.csv",
    "final_by_slices_normalized.csv",
    "final_by_annotations.csv",
    "final_by_annotations_normalized.csv",
    "final_by_nodules.csv",
    "final_by_nodules_normalized.csv"
]

## Load and preprocess dataset

In [32]:
# function to load and preprocess the dataset
def preprocess_dataset(filename, label_column="malignancy"):
    # load the dataset
    df = pd.read_csv(filename)
    
    # separate features and labels
    X = df.drop(label_column, axis=1)
    y = df[label_column]
    
    # one-hot encode the labels for TensorFlow and PyTorch 
    encoder = OneHotEncoder(sparse=False)
    y_onehot = encoder.fit_transform(y.values.reshape(-1, 1))
    
    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.3, random_state=42, stratify=y)
    
    return X_train, X_test, y_train, y_test

## PyTorch Model

In [33]:
# function to create and train PyTorch model
class SimpleNN(nn.Module):
    def __init__(self, input_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 60)
        self.fc2 = nn.Linear(60, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

def train_pytorch(X_train, y_train, X_test, y_test):
    model = SimpleNN(X_train.shape[1], y_train.shape[1])
    criterion = nn.CrossEntropyLoss()  
    optimizer = optim.Adam(model.parameters())
    
    # convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(np.argmax(y_train, axis=1), dtype=torch.long)

    # train the model
    for epoch in range(100):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    # evaluate the model
    with torch.no_grad():
        X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
        y_test_tensor = torch.tensor(np.argmax(y_test, axis=1), dtype=torch.long)
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs.data, 1)
        accuracy = accuracy_score(y_test_tensor, predicted)
        print(f'PyTorch Model Accuracy: {accuracy:.4f}')



## TensorFlow Model

In [34]:
# function to create and train TensorFlow model
def train_tensorflow(X_train, y_train, X_test, y_test):
    model = Sequential([
        Dense(60, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(60, activation='relu'),
        Dense(y_train.shape[1], activation='softmax')  
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, verbose=0)
    _, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f'TensorFlow Model Accuracy: {accuracy:.4f}')


## Scikit_Learn Model

In [35]:
# function to create and train Scikit-Learn MLP model
def train_sklearn(X_train, y_train, X_test, y_test):
    model = MLPClassifier(
    hidden_layer_sizes=(60),
    activation='relu',
    learning_rate_init=0.01,
    max_iter=1000, 
    random_state=42
)
    model.fit(X_train, np.argmax(y_train, axis=1))
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(np.argmax(y_test, axis=1), y_pred)
    print(f'Scikit-Learn Model Accuracy: {accuracy:.4f}')

## Print the results

In [36]:
# loop through each dataset and train/evaluate the models
for dataset in datasets:
    print(f"Processing dataset: {dataset}")
    X_train, X_test, y_train, y_test = preprocess_dataset(dataset)
    
    print("Training TensorFlow model...")
    train_tensorflow(X_train, y_train, X_test, y_test)
    
    print("Training PyTorch model...")
    train_pytorch(X_train, y_train, X_test, y_test)
    
    print("Training Scikit-Learn model...")
    train_sklearn(X_train, y_train, X_test, y_test)
    
    print(f"Finished processing dataset: {dataset}\n")

Processing dataset: final_by_slices.csv


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training TensorFlow model...
TensorFlow Model Accuracy: 0.5005
Training PyTorch model...
PyTorch Model Accuracy: 0.5362
Training Scikit-Learn model...
Scikit-Learn Model Accuracy: 0.7481
Finished processing dataset: final_by_slices.csv

Processing dataset: final_by_slices_normalized.csv


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training TensorFlow model...
TensorFlow Model Accuracy: 0.9186
Training PyTorch model...
PyTorch Model Accuracy: 0.8340
Training Scikit-Learn model...
Scikit-Learn Model Accuracy: 0.8795
Finished processing dataset: final_by_slices_normalized.csv

Processing dataset: final_by_annotations.csv
Training TensorFlow model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow Model Accuracy: 0.7184
Training PyTorch model...
PyTorch Model Accuracy: 0.5831
Training Scikit-Learn model...
Scikit-Learn Model Accuracy: 0.6704
Finished processing dataset: final_by_annotations.csv

Processing dataset: final_by_annotations_normalized.csv
Training TensorFlow model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow Model Accuracy: 0.8630
Training PyTorch model...
PyTorch Model Accuracy: 0.8431
Training Scikit-Learn model...
Scikit-Learn Model Accuracy: 0.8571
Finished processing dataset: final_by_annotations_normalized.csv

Processing dataset: final_by_nodules.csv
Training TensorFlow model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow Model Accuracy: 0.4869
Training PyTorch model...
PyTorch Model Accuracy: 0.6176
Training Scikit-Learn model...
Scikit-Learn Model Accuracy: 0.7255
Finished processing dataset: final_by_nodules.csv

Processing dataset: final_by_nodules_normalized.csv
Training TensorFlow model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow Model Accuracy: 0.8840
Training PyTorch model...
PyTorch Model Accuracy: 0.8529
Training Scikit-Learn model...
Scikit-Learn Model Accuracy: 0.8922
Finished processing dataset: final_by_nodules_normalized.csv

