In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
import seaborn as sns
import wandb
import yaml

from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
""""Configuration"""

with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

# random_seed = random.randint(0, 100000)
random_seed = 123

print_graphs = config["print_graphs"]
features = config["features"]
target = config["target"]
num_epochs = config["num_epochs"]
learning_rate = config["learning_rate"]
dropout_rate = config["dropout_rate"]

np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
"""Classes"""


class MLP(nn.Module):
    def __init__(self, input_size, output_size, dropout_rate):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.ln1 = nn.LayerNorm(256)
        self.fc2 = nn.Linear(256, 128)
        self.ln2 = nn.LayerNorm(128)
        self.fc3 = nn.Linear(128, 64)
        self.ln3 = nn.LayerNorm(64)
        self.fc4 = nn.Linear(64, 32)
        self.ln4 = nn.LayerNorm(32)
        self.output_layer = nn.Linear(32, output_size)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.relu(self.ln1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.ln2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.ln3(self.fc3(x)))
        x = self.dropout(x)
        x = self.relu(self.ln4(self.fc4(x)))
        x = self.dropout(x)
        x = self.output_layer(x)
        return x


class EarlyStopping:
    def __init__(self, patience=10, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0

In [4]:
"""Functions"""


def cap_outliers_percentiles(df, feature, lower_percentile=5, upper_percentile=95):
    lower_limit = np.percentile(df[feature], lower_percentile)
    upper_limit = np.percentile(df[feature], upper_percentile)

    df[feature] = np.where(df[feature] < lower_limit, lower_limit, df[feature])
    df[feature] = np.where(df[feature] > upper_limit, upper_limit, df[feature])


def plot_feature_distributions(data, data_imputed, features):
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(features):
        plt.subplot(4, 3, i + 1)
        sns.histplot(
            data[feature].dropna(),
            kde=True,
            label="Before Imputation",
            color="blue",
            bins=30,
            alpha=0.5,
        )
        sns.histplot(
            data_imputed[feature].dropna(),
            kde=True,
            label="After Imputation",
            color="red",
            bins=30,
            alpha=0.5,
        )
        plt.title(f"{feature}")
        plt.xlabel("")
        plt.ylabel("")
        plt.legend()
    plt.tight_layout()
    plt.show()


def plot_box_plots_comparison(data, data_imputed, features):
    plt.figure(figsize=(15, 12))

    for i, feature in enumerate(features):
        plt.subplot(4, 3, i + 1)

        df_before_plot = data[[feature]].copy()
        df_before_plot["Imputation Status"] = "Before Imputation"

        df_after_plot = data_imputed[[feature]].copy()
        df_after_plot["Imputation Status"] = "After Imputation"

        df_plot = pd.concat([df_before_plot, df_after_plot], ignore_index=True)

        sns.boxplot(
            x="Imputation Status",
            y=feature,
            data=df_plot,
            hue="Imputation Status",
            palette="Set2",
            showfliers=True,
        )

        plt.title(f"{feature}")
        plt.xlabel("")
        plt.ylabel("")

    plt.tight_layout()
    plt.show()


def plot_correlation_heatmaps(data, data_imputed, features):
    corr_before = data[features].corr()
    corr_after = data_imputed[features].corr()

    plt.figure(figsize=(16, 8))

    plt.subplot(1, 2, 1)
    sns.heatmap(corr_before, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Before Imputation")

    plt.subplot(1, 2, 2)
    sns.heatmap(corr_after, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation After Imputation")

    plt.tight_layout()
    plt.show()


def single_plot_feature_distributions(data, features):
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(features):
        plt.subplot(4, 3, i + 1)
        sns.histplot(
            data[feature].dropna(),
            kde=True,
            label="Normalized",
            color="blue",
            bins=30,
            alpha=1,
        )
        plt.title(f"{feature}")
        plt.xlabel("")
        plt.ylabel("")
        plt.legend()
    plt.tight_layout()
    plt.show()


def trainer(
    model, criterion, optimizer, early_stopping, x_train_tensor, y_train_tensor, x_val_tensor, y_val_tensor, num_epochs
):
    for epoch in range(num_epochs):
        model.train()

        outputs = model(x_train_tensor)
        loss = criterion(outputs, y_train_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss = loss.item()

        val_accuracy, val_loss = evaluate_model(model, x_val_tensor, y_val_tensor, criterion)

        if (epoch + 1) % 100 == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}"
            )

        early_stopping(val_loss)
        if early_stopping.early_stop:
            print(f"Early stopping triggered, Epoch [{epoch + 1}/{num_epochs}]")

            return train_loss, val_loss, val_accuracy, (epoch+1)


def evaluate_model(model, x_tensor, y_tensor, criterion):
    model.eval()
    total_samples = len(y_tensor)

    with torch.no_grad():
        outputs = model(x_tensor)
        loss = criterion(outputs, y_tensor).item()
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == y_tensor).sum().item() / total_samples

    return accuracy, loss


def grid_search(learning_rates, dropout_rates, input_size, output_size, x, y_encoded, num_epochs, k_folds=5):
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    
    for lr in learning_rates:
        for dr in dropout_rates:
                
                fold_train_losses = []
                fold_val_losses = []
                fold_val_accuracies = []
                
                for fold, (train_idx, val_idx) in enumerate(kf.split(x)):
                    print(f"Learning Rate {lr}, Dropout Rate {dr}, Fold {fold + 1}/{k_folds}")

                    x_train, x_val = x[train_idx], x[val_idx]
                    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

                    x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
                    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
                    x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
                    y_val_tensor = torch.tensor(y_val, dtype=torch.long)

                    model = MLP(input_size, output_size, dr)
                    criterion = nn.CrossEntropyLoss()
                    optimizer = optim.Adam(model.parameters(), lr=lr)

                    early_stopping = EarlyStopping(patience=50, delta=0.001)

                    final_train_loss, final_val_loss, final_val_accuracy, total_epochs = trainer(
                        model, criterion, optimizer, early_stopping,
                        x_train_tensor, y_train_tensor, x_val_tensor, y_val_tensor, num_epochs
                    )

                    fold_train_losses.append(final_train_loss)
                    fold_val_losses.append(final_val_loss)
                    fold_val_accuracies.append(final_val_accuracy)

                # Calculate the average results across all folds
                avg_train_loss = np.mean(fold_train_losses)
                avg_val_loss = np.mean(fold_val_losses)
                avg_val_accuracy = np.mean(fold_val_accuracies)
                std_train_loss = np.std(fold_val_losses)
                std_val_loss = np.std(fold_val_losses)
                std_val_accuracy = np.std(fold_val_losses)

                # Log the results to wandb
                wandb.log({
                    'learning_rate': lr,
                    'dropout_rate': dr,
                    'avg_train_loss': avg_train_loss,
                    'avg_val_loss': avg_val_loss,
                    'avg_val_accuracy': avg_val_accuracy,
                    'std_train_loss': std_train_loss,
                    'std_val_loss': std_val_loss,
                    'std_val_accuracy': std_val_accuracy,
                    'total_epochs': total_epochs
                })
                
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


In [5]:
"""Read Data"""

data = pd.read_csv("almond_data.csv")

data = data.iloc[:, 1:]

data.head()

data.columns = data.columns.str.strip()

In [6]:
"""Impute Features"""

impute_features = [
    "Length (major axis)",
    "Width (minor axis)",
    "Thickness (depth)",
    "Area",
    "Perimeter",
    "Solidity",
    "Compactness",
    "Extent",
    "Convex hull(convex area)",
]

knn_imputer = KNNImputer(n_neighbors=53)  # n_neighbors = root of dataset size

data_imputed = data.copy()

types = data[target].unique()
for almond_type in types:

    type_data = data[data[target] == almond_type].copy()

    type_features = type_data[impute_features]

    imputed_values = knn_imputer.fit_transform(type_features)

    type_data[impute_features] = imputed_values

    data_imputed.update(type_data)

data_imputed["Roundness"] = (4 * data_imputed["Area"]) / (
    np.pi * data_imputed["Length (major axis)"] ** 2
)

data_imputed["Aspect Ratio"] = (
    data_imputed["Length (major axis)"] / data_imputed["Width (minor axis)"]
)

data_imputed["Eccentricity"] = np.sqrt(
    1 - (data_imputed["Width (minor axis)"] / data_imputed["Length (major axis)"]) ** 2
)

In [7]:
"""Cap Outliers"""

for feature in features:
    cap_outliers_percentiles(data_imputed, feature)

In [8]:
if print_graphs:
    plot_feature_distributions(data, data_imputed, features)

In [9]:
if print_graphs:
    plot_box_plots_comparison(data, data_imputed, features)

In [10]:
if print_graphs:
    plot_correlation_heatmaps(data, data_imputed, features)

In [11]:
"""Normalize Data"""

scaler = StandardScaler()

normalized_data = data_imputed.copy()
normalized_data[features] = scaler.fit_transform(data_imputed[features])

# print(normalized_data.head())

In [12]:
if print_graphs:
    single_plot_feature_distributions(normalized_data, features)

In [13]:
''''Data setup'''

x = normalized_data[features].values
y = normalized_data[target].values

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

input_size = len(features)
output_size = len(label_encoder.classes_)


In [14]:
'''Grid search with k fold cross validation using seed 123'''

learning_rates = [0.001, 0.01, 0.1]
dropout_rates = [0.0, 0.2, 0.5]

os.environ["WANDB_NOTEBOOK_NAME"] = "COS711_Assignment_2.ipynb"
wandb.init(project="COS 711_Assignment 2")

set_seed(123)
grid_search(learning_rates, dropout_rates, input_size, output_size, x, y_encoded, num_epochs, k_folds=5)

wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mu21432962[0m ([33mu21432962-university-of-pretoria[0m). Use [1m`wandb login --relogin`[0m to force relogin


Learning Rate 0.001, Dropout Rate 0.0, Fold 1/5
Epoch [100/10000], Training Loss: 0.1260, Validation Loss: 0.1936, Validation Accuracy: 0.9323
Epoch [200/10000], Training Loss: 0.1365, Validation Loss: 0.2036, Validation Accuracy: 0.9216
Early stopping triggered, Epoch [205/10000]
Learning Rate 0.001, Dropout Rate 0.0, Fold 2/5
Epoch [100/10000], Training Loss: 0.0491, Validation Loss: 0.1522, Validation Accuracy: 0.9554
Early stopping triggered, Epoch [164/10000]
Learning Rate 0.001, Dropout Rate 0.0, Fold 3/5
Epoch [100/10000], Training Loss: 0.0548, Validation Loss: 0.1995, Validation Accuracy: 0.9394
Early stopping triggered, Epoch [196/10000]
Learning Rate 0.001, Dropout Rate 0.0, Fold 4/5
Epoch [100/10000], Training Loss: 0.0566, Validation Loss: 0.1431, Validation Accuracy: 0.9464
Epoch [200/10000], Training Loss: 0.0130, Validation Loss: 0.1270, Validation Accuracy: 0.9554
Early stopping triggered, Epoch [237/10000]
Learning Rate 0.001, Dropout Rate 0.0, Fold 5/5
Epoch [100/100

Widget Javascript not detected.  It may not be installed or enabled properly. Reconnecting the current kernel may help.


0,1
avg_train_loss,▁▂▃▃▁▃▂▃█
avg_val_accuracy,██▇▆██▇▇▁
avg_val_loss,▁▁▁▃▁▁▃▂█
dropout_rate,▁▄█▁▄█▁▄█
learning_rate,▁▁▁▂▂▂███
std_train_loss,▁▁▁▄▁▁▂▃█
std_val_accuracy,▁▁▁▄▁▁▂▃█
std_val_loss,▁▁▁▄▁▁▂▃█
total_epochs,▁▇█▁▂▄▂▃▃

0,1
avg_train_loss,0.7697
avg_val_accuracy,0.69288
avg_val_loss,0.65699
dropout_rate,0.5
learning_rate,0.1
std_train_loss,0.2573
std_val_accuracy,0.2573
std_val_loss,0.2573
total_epochs,335.0


In [13]:
"""Train, validation and test set splitting"""

set_seed(random_seed)


x_test, x_temp, y_test, y_temp = train_test_split(
    x, y_encoded, train_size=0.1, random_state=1
)

x_train, x_val, y_train, y_val = train_test_split(
    x_temp, y_temp, train_size=0.8, random_state=random_seed
)

x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

print("x_train_tensor size:", x_train_tensor.size())
print("y_train_tensor size:", y_train_tensor.size())
print("x_val_tensor size:", x_val_tensor.size())
print("y_val_tensor size:", y_val_tensor.size())
print("x_test_tensor size:", x_test_tensor.size())
print("y_test_tensor size:", y_test_tensor.size())



x_train_tensor size: torch.Size([2018, 12])
y_train_tensor size: torch.Size([2018])
x_val_tensor size: torch.Size([505, 12])
y_val_tensor size: torch.Size([505])
x_test_tensor size: torch.Size([280, 12])
y_test_tensor size: torch.Size([280])


In [None]:
model = MLP(input_size, output_size, dropout_rate)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

early_stopping = EarlyStopping(patience=50, delta=0.001)

trainer(
    model,
    criterion,
    optimizer,
    early_stopping,
    x_train_tensor,
    y_train_tensor,
    x_val_tensor,
    y_val_tensor,
    num_epochs,
)