# Assignment 2

## Weight Optimization - RHC

In [1]:
#
# 0. Import Packages
#

# han: check if random seed is necessary
RANDOM_SEED = 7641

# Math tools for ML
import numpy as np
import pandas as pd
import math
import time
import copy
from numpy import arange

# Randomized Optimization 
import mlrose_hiive

# Progress bar
#from tqdm import tqdm

# Graph visualization
import matplotlib.pyplot as plt

# Data Preparation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import torch
import torch.nn as nn
#import torch.optim as optim
#import tqdm

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold

# Model 
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import AdaBoostClassifier # Boosted Decision Tree
from sklearn.svm import SVC # SVM
from sklearn.neighbors import KNeighborsClassifier # KNN

#from sklearn import tree

import sklearn.metrics as mt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler


In [2]:
# Load UCI AIDS crinical dataset - https://archive.ics.uci.edu/dataset/890/aids+clinical+trials+group+study+175

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
aids_clinical_trials_group_study_175 = fetch_ucirepo(id=890) 
  
# data (as pandas dataframes) 
X = aids_clinical_trials_group_study_175.data.features 
y = aids_clinical_trials_group_study_175.data.targets 
y=y.cid

X_raw = X
y_raw = y

print("x_raw dimension: ", X_raw.shape)
print("y_raw dimension: ", y_raw.shape)

x_raw dimension:  (2139, 23)
y_raw dimension:  (2139,)


In [3]:
#
# 1.1 Tensor Data Type (Pytorch)
#

#X_raw = torch.tensor(X_raw, dtype=torch.float32)
#y_raw = torch.tensor(y_raw, dtype=torch.float32).reshape(-1, 1)

In [4]:
#
# 1.2 Split train and test sets
#

# data type change 
#y_raw = y_raw.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, stratify=y_raw, test_size=0.2, random_state=RANDOM_SEED)

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

one_hot = OneHotEncoder()

y_train_hot = one_hot.fit_transform(y_train.values.reshape(-1, 1)).todense()
y_test_hot = one_hot.transform(y_test.values.reshape(-1, 1)).todense()

# print # of X_train
print(y_train_hot.shape)
print(y_test_hot.shape)

(1711, 2)
(428, 2)


In [11]:
#
# 2. Learning Curve
#

max_attempts = [ 1, 10, 100]
max_iters = [10, 100, 1000]
restarts = [0, 10, 100]

results_column = ["max_attempts", "max_iters", "restart", "accuracy_train", "accuracy_test", "train_time"]
results_list = []
results_df = pd.DataFrame(columns=results_column)

for max_attempt in max_attempts:
    for max_iter in max_iters:
        for restart in restarts: 
            
            kfold = KFold(n_splits=5, shuffle=True)
            cv_scores  = []
            
            start_time = time.perf_counter()
            nn_model = mlrose_hiive.NeuralNetwork(hidden_nodes = [2], 
                                                  activation = 'relu' ,
                                                  algorithm = 'random_hill_climb',
                                                  is_classifier = True,
                                                  early_stopping = True,
                                                  random_state = RANDOM_SEED,
                                                  max_attempts = max_attempt,
                                                  max_iters = max_iter,
                                                  restarts = restart)

            for train, validate in kfold.split(X_train_scaled, y_train_hot):
                nn_model.fit(X_train_scaled[train], y_train_hot[train])
                y_pred_hot = nn_model.predict(X_train_scaled[validate])
                accuracy = mt.accuracy_score(np.asarray(y_pred_hot), np.asarray(y_train_hot[validate]))
                cv_scores.append(accuracy)
            
            train_time = time.perf_counter() - start_time

            accuracy_train = np.mean(cv_scores)
    
            with torch.no_grad():
                y_pred_hot = nn_model.predict(X_test_scaled)
            accuracy_test = mt.accuracy_score(np.asarray(y_test_hot), np.asarray(y_pred_hot))
            
            results_df.loc[len(results_df.index)] = [max_attempt, max_iter, restart, accuracy_train, accuracy_test, train_time]
            print(max_attempt, max_iter, restart, accuracy_train, accuracy_test, train_time)

1 10 0 0.7469200211412886 0.7429906542056075 0.015476392999971722
1 10 10 0.7294085554021107 0.7266355140186916 0.14985384800002066
1 10 100 0.7440130939593883 0.7406542056074766 1.5427550819999851
1 100 0 0.7469149063134026 0.7429906542056075 0.010777537999729248
1 100 10 0.7294085554021107 0.7266355140186916 0.14035427099997833
1 100 100 0.7440130939593883 0.7406542056074766 1.381783538000036
1 1000 0 0.7469149063134026 0.7429906542056075 0.010540342000240344
1 1000 10 0.7294085554021107 0.7266355140186916 0.13488021300008768
1 1000 100 0.7440130939593883 0.7406542056074766 1.3497192700001506
10 10 0 0.7445842497399963 0.7453271028037384 0.05319821500006583
10 10 10 0.7387516410072801 0.7429906542056075 0.646781547000046
10 10 100 0.7381549110872421 0.7336448598130841 6.303099796999959
10 100 0 0.7411095766627452 0.7476635514018691 0.5453115380000781
10 100 10 0.7609670434589876 0.7686915887850467 6.034232010999858
10 100 100 0.7586295671150667 0.7710280373831776 56.82627205400013
10