In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import copy
import math

from scipy.stats import kurtosis 

from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import FastICA, PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import SparseRandomProjection


from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import InterclusterDistance

import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
#from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [17]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from yellowbrick.cluster import SilhouetteVisualizer, InterclusterDistance
from ucimlrepo import fetch_ucirepo 

# Load UCI AIDS crinical dataset - https://archive.ics.uci.edu/dataset/890/aids+clinical+trials+group+study+175

# fetch dataset 
aids_clinical_trials_group_study_175 = fetch_ucirepo(id=890) 
  
# data (as pandas dataframes) 
X = aids_clinical_trials_group_study_175.data.features 
y = aids_clinical_trials_group_study_175.data.targets 
y=y.cid

X_raw = X
y_raw = y

In [18]:
# Helper function to train one model
def model_train(model, X_train, y_train, X_val, y_val, n_epochs = 100, lr=0.001):
#def model_train(model, X_train, y_train, X_val, y_val):
    # loss function and optimizer
    loss_fn = nn.BCELoss()  # binary cross entropy
    #optimizer = optim.Adam(model.parameters(), lr=0.0001)
    #optimizer = optim.Adam(model.parameters(), lr=lr)  # modified 
    optimizer = optim.Adam(model.parameters(), lr=0.001)  # modified 
 
    #n_epochs = 300   # number of epochs to run
    #n_epochs = n_epochs   # number of epochs to run # modified
    n_epochs = 100   # number of epochs to run # modified
    batch_size = 10  # size of each batch
    batch_start = torch.arange(0, len(X_train), batch_size)
 
    # Hold the best model
    best_acc = - np.inf   # init to negative infinity
    best_weights = None
 
    for epoch in range(n_epochs):
        model.train()
        with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=True) as bar:
            bar.set_description(f"Epoch {epoch}")
            for start in bar:
                # take a batch
                X_batch = X_train[start:start+batch_size]
                y_batch = y_train[start:start+batch_size]
                # forward pass
                y_pred = model(X_batch)
                loss = loss_fn(y_pred, y_batch)
                # backward pass
                optimizer.zero_grad()
                loss.backward()
                # update weights
                optimizer.step()
                # print progress
                acc = (y_pred.round() == y_batch).float().mean()
                bar.set_postfix(
                    loss=float(loss),
                    acc=float(acc)
                )
        # evaluate accuracy at end of each epoch
        model.eval()
        y_pred = model(X_val)
        acc = (y_pred.round() == y_val).float().mean()
        acc = float(acc)
        if acc > best_acc:
            best_acc = acc
            best_weights = copy.deepcopy(model.state_dict())
    # restore model and return best accuracy
    model.load_state_dict(best_weights)
    return best_acc

## NN + PCA

In [19]:
##
## PCA
##

num_clusters = 6

model = PCA(n_components = num_clusters)
X_raw_pca = model.fit_transform(X_raw)

X_raw_tensor = torch.tensor(X_raw_pca, dtype=torch.float32)
y_raw_tensor = torch.tensor(y_raw, dtype=torch.float32).reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X_raw_tensor, y_raw_tensor, stratify=y_raw_tensor, test_size=0.2, random_state=42)

print(X_train.shape)

torch.Size([1711, 6])


In [22]:
## 
## Neural Network + PCA
##

class Model (nn.Module):
    def __init__(self, x):      
        super().__init__()
        self.layer1 = nn.Linear(6, x)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(x, x)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(x, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.sigmoid(self.output(x))
        return x

In [23]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)
cv_scores  = []

start_time = time.perf_counter()
for train, validate in kfold.split(X_train, y_train):
    model = Model(36)
    acc = model_train(model, X_train[train], y_train[train], X_train[validate], y_train[validate], 100, 0.001)
    cv_scores.append(acc)
training_time = time.perf_counter() - start_time
print("Training Time: ", training_time)

acc_mean = np.mean(cv_scores)
    
with torch.no_grad():
    y_pred = model(X_test)
    
acc_test = metrics.accuracy_score(y_test.numpy(), np.rint(y_pred.numpy()))    

print("Cross Validation Score: " + str(acc_mean))
print("Test Accuracy: " + str(acc_test))

Training Time:  37.52898621300119
Cross Validation Score: 0.8585613608360291
Test Accuracy: 0.8387850467289719


## NN + SVD

In [24]:
##
## TruncatedSVD
##

num_clusters = 6

model = TruncatedSVD(n_components=num_clusters)
X_raw_truncsvd = model.fit_transform(X_raw)

X_raw_tensor = torch.tensor(X_raw_truncsvd, dtype=torch.float32)
y_raw_tensor = torch.tensor(y_raw, dtype=torch.float32).reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X_raw_tensor, y_raw_tensor, stratify=y_raw_tensor, test_size=0.2, random_state=42)

In [26]:
## 
## Neural Network + Randomized Projections
##

class Model (nn.Module):
    def __init__(self, x):      
        super().__init__()
        self.layer1 = nn.Linear(6, x)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(x, x)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(x, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.sigmoid(self.output(x))
        return x

In [27]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)
cv_scores  = []

start_time = time.perf_counter()
for train, validate in kfold.split(X_train, y_train):
    model = Model(36)
    acc = model_train(model, X_train[train], y_train[train], X_train[validate], y_train[validate], 100, 0.001)
    cv_scores.append(acc)
training_time = time.perf_counter() - start_time
print("Training Time: ", training_time)

acc_mean = np.mean(cv_scores)
    
with torch.no_grad():
    y_pred = model(X_test)
    
acc_test = metrics.accuracy_score(y_test.numpy(), np.rint(y_pred.numpy()))    

print("Cross Validation Score: " + str(acc_mean))
print("Test Accuracy: " + str(acc_test))

Training Time:  37.9156664070033
Cross Validation Score: 0.8392921209335327
Test Accuracy: 0.8037383177570093
