In [1]:
import torch
import mlflow

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np

from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA

In [2]:
### Data Exploration with the 28x28 image csv [FUNCTIONAL]

print("Hello")
project_df = pd.read_csv('data/hmnist_28_28_L.csv')
project_df.head(n=4)
print(project_df.info())

Hello
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Columns: 785 entries, pixel0000 to label
dtypes: int64(785)
memory usage: 60.0 MB
None


In [3]:
### Do Train-Validation-Test Split on the Data (eg. 60/20/20)

#read csv
df = pd.read_csv('./data/skin_cancer_dataset.csv')

#separate the features and labels
X = df.drop(columns=['label'])
y = df['label']

#split 60-40 for training dataset and a temp dataset that will be split to 50-50
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Second split of the temporary set: 50% for validation and 50% for test set
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

#sanity check
sizes = (len(X_train), len(X_val), len(X_test))
sizes

(1135, 378, 379)

In [4]:
### Apply PCA (fit the PCA matrix using ONLY the training portion of the dataset). Transform each separately
# fit pca on the training set only
pca = PCA(n_components=0.80, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
X_val_pca = pca.transform(X_val)

#sanity check
X_train_pca.shape

(1135, 9)

In [5]:
# Convert the PCA-transformed features and labels into PyTorch tensors
train_features = torch.tensor(X_train_pca, dtype=torch.float)
val_features = torch.tensor(X_val_pca, dtype=torch.float)
test_features = torch.tensor(X_test_pca, dtype=torch.float)

train_labels = torch.tensor(y_train.values, dtype=torch.long)  # Assuming y_train is a pandas Series
val_labels = torch.tensor(y_val.values, dtype=torch.long)  # Assuming y_val is a pandas Series
test_labels = torch.tensor(y_test.values, dtype=torch.long)  # Assuming y_test is a pandas Series

# Create TensorDatasets
train_dataset = TensorDataset(train_features, train_labels)
val_dataset = TensorDataset(val_features, val_labels)
test_dataset = TensorDataset(test_features, test_labels)

In [19]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes=7):
        super(MLP, self).__init__()
        # Define the network layers as before
        layers = []
        prev_size = input_size
        for size in hidden_sizes:
            layers.append(nn.Linear(prev_size, size))
            # layers.append(nn.LayerNorm(size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.5))
            prev_size = size
        layers.append(nn.Linear(prev_size, num_classes))
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.layers(x)

In [12]:

def train_validate_model(train_dataset, val_dataset, device, params):
    # Model initialization
    model = MLP(input_size=params['input_size'], hidden_sizes=params['hidden_sizes'], num_classes=7).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=params['lr'])
    train_loader = DataLoader(dataset=train_dataset, batch_size=params['batch_size'], shuffle=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=params['batch_size'], shuffle=False)
    
    # Training loop
    model.train()
    for epoch in range(params['epochs']):
        for batch_idx, (data, targets) in enumerate(train_loader):
            data, targets = data.to(device), targets.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')
    
    # Validation
    model.eval()
    val_preds, val_targets = [], []
    with torch.no_grad():
        for data, targets in val_loader:
            data, targets = data.to(device), targets.to(device)
            output = model(data)
            preds = torch.max(output, 1)[1]
            val_preds.extend(preds.cpu().numpy())
            val_targets.extend(targets.cpu().numpy())
    
    val_accuracy = accuracy_score(val_targets, val_preds)
    print(f'Validation Accuracy: {val_accuracy:.4f}')
    return val_accuracy, model

def test_model(model, test_loader, device):
    model.eval()  # Set the model to evaluation mode
    total = 0
    correct = 0
    predictions = []
    actuals = []
    
    with torch.no_grad():  # Inference mode, no gradients needed
        for data, labels in test_loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            predictions.extend(predicted.cpu().numpy())
            actuals.extend(labels.cpu().numpy())
    
    accuracy = 100 * correct / total
    return accuracy, predictions, actuals

In [16]:
hyperparameter_grid = {
    'lr': [0.001, 0.0005, 0.0001],
    'hidden_sizes': [
        [128, 64],
        [128, 64, 32],
        [128, 64, 32, 16],
        [64,128,32]
    ],
    'batch_size': [16, 32, 64, 128]  # Example of adding batch size to the grid
}

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_accuracy = 0.0
best_params = None
best_model = None

for lr in hyperparameter_grid['lr']:
    for hidden_sizes in hyperparameter_grid['hidden_sizes']:
        for batch_size in hyperparameter_grid['batch_size']:
            print(f"Training with lr={lr}, hidden_sizes={hidden_sizes}, batch_size={batch_size}")
            params = {
                'input_size': X_train_pca.shape[1],
                'hidden_sizes': hidden_sizes,
                'lr': lr,
                'epochs': 50,  # Keeping epochs fixed for this example
                'batch_size': batch_size,
            }
            
            # Train and validate the model with the current set of parameters
            val_accuracy, model = train_validate_model(train_dataset, val_dataset, device, params)
            
            print(f"Validation accuracy: {val_accuracy}")
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_params = params
                best_model = model

print(f"Best parameters: {best_params}")
print(f"Best validation accuracy: {best_accuracy}")

Training with lr=0.001, hidden_sizes=[128, 64], batch_size=16
Epoch 1, Loss: 8.058518409729004
Epoch 2, Loss: 5.722539901733398
Epoch 3, Loss: 5.417480945587158
Epoch 4, Loss: 2.0664422512054443
Epoch 5, Loss: 2.603742837905884
Epoch 6, Loss: 1.9799559116363525
Epoch 7, Loss: 2.3556976318359375
Epoch 8, Loss: 2.5933165550231934
Epoch 9, Loss: 1.760725498199463
Epoch 10, Loss: 2.187429189682007
Epoch 11, Loss: 1.9119384288787842
Epoch 12, Loss: 1.4926857948303223
Epoch 13, Loss: 1.9542561769485474
Epoch 14, Loss: 1.6452908515930176
Epoch 15, Loss: 1.7254005670547485
Epoch 16, Loss: 2.078263521194458
Epoch 17, Loss: 1.8188679218292236
Epoch 18, Loss: 1.6525006294250488
Epoch 19, Loss: 1.713840126991272
Epoch 20, Loss: 1.8216408491134644
Epoch 21, Loss: 1.7838021516799927
Epoch 22, Loss: 1.5797027349472046
Epoch 23, Loss: 1.809943675994873
Epoch 24, Loss: 1.6712509393692017
Epoch 25, Loss: 1.75022292137146
Epoch 26, Loss: 1.7486965656280518
Epoch 27, Loss: 1.7453519105911255
Epoch 28, Los

In [21]:
test_loader = DataLoader(dataset=test_dataset, batch_size=best_params['batch_size'], shuffle=False)
test_accuracy, test_predictions, test_actuals = test_model(best_model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.2f}%")

Test Accuracy: 28.50%
