# Assignment 2

## Weight Optimization - GA

In [20]:
#
# 0. Import Packages
#

# Single seed reused below as `random_state` for the train/test split and the
# mlrose neural network, so those steps are repeatable across runs.
RANDOM_SEED = 7641

# Math tools for ML
import numpy as np
import pandas as pd
import math
import time
import copy
from numpy import arange

# Randomized Optimization 
import mlrose_hiive

# Progress bar
#from tqdm import tqdm

# Graph visualization
import matplotlib.pyplot as plt

# Data Preparation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import torch
import torch.nn as nn
#import torch.optim as optim
#import tqdm

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold

# Model 
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import AdaBoostClassifier # Boosted Decision Tree
from sklearn.svm import SVC # SVM
from sklearn.neighbors import KNeighborsClassifier # KNN

#from sklearn import tree

import sklearn.metrics as mt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler


In [21]:
# Load the UCI AIDS clinical trials dataset (ACTG Study 175)
# Source: https://archive.ics.uci.edu/dataset/890/aids+clinical+trials+group+study+175

from ucimlrepo import fetch_ucirepo

# Download the dataset by its UCI repository id.
aids_clinical_trials_group_study_175 = fetch_ucirepo(id=890)

# Features and targets are exposed on the `.data` attribute of the fetched
# object; the `cid` column of the targets is used as the label.
X = aids_clinical_trials_group_study_175.data.features
y = aids_clinical_trials_group_study_175.data.targets.cid

# Aliases (not copies) used by the downstream split cell.
X_raw, y_raw = X, y

# Report the dimensions of the raw features and labels.
for label, shape in (("x_raw dimension: ", X_raw.shape),
                     ("y_raw dimension: ", y_raw.shape)):
    print(label, shape)

x_raw dimension:  (2139, 23)
y_raw dimension:  (2139,)


In [22]:
#
# 1.2 Split train and test sets
#

# Stratified 80/20 split on the label so both splits keep the class balance;
# seeded with RANDOM_SEED for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y_raw, stratify=y_raw, test_size=0.2, random_state=RANDOM_SEED
)

# Fit the scaler on the training split only, then apply the same transform to
# the test split so no test statistics leak into training.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One-hot encode the labels. The encoder returns a sparse matrix; `.toarray()`
# converts it to a plain ndarray (`.todense()` would return the legacy
# np.matrix type, which newer scikit-learn utilities refuse to accept).
one_hot = OneHotEncoder()
y_train_hot = one_hot.fit_transform(y_train.values.reshape(-1, 1)).toarray()
y_test_hot = one_hot.transform(y_test.values.reshape(-1, 1)).toarray()

# Shapes of the one-hot encoded label arrays (rows, classes)
print(y_train_hot.shape)
print(y_test_hot.shape)

(1711, 2)
(428, 2)


In [None]:
#
# 2. Hyperparameter grid search for the GA-trained neural network
#    (each configuration: 5-fold CV on the training split + hold-out test score)
#

max_attempts = [1, 10, 100]
max_iters = [10, 100, 1000]
pop_sizes = [200, 400, 800]  # default: 200
mutation_props = [0.1, 0.2, 0.4]  # mutation probabilities passed to the GA

# NOTE: "accuracy_train" holds the mean cross-validation (validation-fold)
# accuracy, not accuracy on the data the model was fitted on.
results_column = ["max_attempts", "max_iters", "pop_sizes", "mutation_probs", "accuracy_train", "accuracy_test", "train_time"]
results_list = []
results_df = pd.DataFrame(columns=results_column)

for max_attempt in max_attempts:
    for max_iter in max_iters:
        for pop_size in pop_sizes:
            for mutation_prop in mutation_props:
                # Seed the shuffle so every configuration is evaluated on the
                # same folds and the whole grid is repeatable.
                kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
                cv_scores = []

                # The timer covers model construction, all five fits and the
                # validation-fold predictions.
                start_time = time.perf_counter()
                nn_model = mlrose_hiive.NeuralNetwork(hidden_nodes=[2],
                                                      activation='relu',
                                                      algorithm='genetic_alg',
                                                      is_classifier=True,
                                                      early_stopping=True,
                                                      random_state=RANDOM_SEED,
                                                      max_attempts=max_attempt,
                                                      max_iters=max_iter,
                                                      pop_size=pop_size,
                                                      mutation_prob=mutation_prop)

                for train, validate in kfold.split(X_train_scaled, y_train_hot):
                    nn_model.fit(X_train_scaled[train], y_train_hot[train])
                    y_pred_hot = nn_model.predict(X_train_scaled[validate])
                    # np.asarray guards against np.matrix inputs, which
                    # scikit-learn metrics may reject.
                    accuracy = mt.accuracy_score(np.asarray(y_train_hot[validate]), np.asarray(y_pred_hot))
                    cv_scores.append(accuracy)

                train_time = time.perf_counter() - start_time

                accuracy_train = np.mean(cv_scores)

                # NOTE(review): the test score comes from the model as fitted
                # on the last CV fold, not from a refit on the full training
                # split -- confirm this is intended.
                y_pred_hot = nn_model.predict(X_test_scaled)
                accuracy_test = mt.accuracy_score(np.asarray(y_test_hot), np.asarray(y_pred_hot))

                # Record the loop variable `mutation_prop`; the previous
                # `mutation_prob` was not defined in this cell and only
                # resolved through leftover kernel state.
                results_df.loc[len(results_df.index)] = [max_attempt, max_iter, pop_size, mutation_prop, accuracy_train, accuracy_test, train_time]
                print(max_attempt, max_iter, pop_size, mutation_prop, accuracy_train, accuracy_test, train_time)

1 10 200 0.1 0.7580601162770872 0.7570093457943925 2.904514019000999
1 10 200 0.2 0.7562937957137743 0.7570093457943925 3.0903144889998657
1 10 200 0.4 0.7562920907711456 0.7570093457943925 3.757933213999422
1 10 400 0.1 0.7591956080677885 0.7616822429906542 4.778446497999539
1 10 400 0.2 0.7562903858285169 0.7570093457943925 6.5589747019985225
1 10 400 0.4 0.7557141152200229 0.7570093457943925 7.236601389999123
1 10 800 0.1 0.7562903858285168 0.7546728971962616 14.235517421000623
1 10 800 0.2 0.7557175251052802 0.7570093457943925 12.871057225000186
1 10 800 0.4 0.7556868361379638 0.7570093457943925 9.84422169500067
1 100 200 0.1 0.7562886808858882 0.7570093457943925 3.3481952140009525
1 100 200 0.2 0.756268221574344 0.7570093457943925 2.952405684000041
1 100 200 0.4 0.7562920907711456 0.7570093457943925 3.7349137510009314
1 100 400 0.1 0.7591956080677885 0.7616822429906542 4.79681159699976
1 100 400 0.2 0.7562903858285169 0.7570093457943925 6.44613337300143
1 100 400 0.4 0.75571411522