# Data training and tuning

# Set up library

In [None]:
github_name = 'maguirr4-uo'
repo_name = 'cis423'
source_file = 'library.py'
url = f'https://raw.githubusercontent.com/{github_name}/{repo_name}/main/{source_file}'
!rm $source_file
!wget $url
%run -i $source_file

## Get future_df to view

In [None]:
# Download the employee dataset from the course repository.
# Note: this rebinds `url`, which the setup cell used for library.py.
url = 'https://raw.githubusercontent.com/maguirr4-uo/cis423/main/EmployeeFuture.csv'
future_df = pd.read_csv(url)
future_df

# Separate the predictor columns from the 'LeaveOrNot' target column.
future_features = future_df.drop(columns=['LeaveOrNot'])
labels = future_df['LeaveOrNot'].to_list()
labels[:5]  # peek at the first five labels

# Preprocessing pipeline applied to the raw feature columns. The transformer
# classes are not defined in this file (presumably they come from library.py).
future_transformer = Pipeline(steps=[
    # Replace categorical values with small integers via explicit dictionaries.
    ('education', MappingTransformer('Education', {'Bachelors': 0, 'Masters': 1, 'PHD': 2})),
    # NOTE(review): these keys are strings. If read_csv parsed JoiningYear as
    # integers, none of them would match the column values - confirm the dtype.
    ('year', MappingTransformer('JoiningYear', {'2012': 6, '2013': 5, '2014': 4, '2015': 3, '2016': 2, '2017': 1, '2018': 0,})),
    ('gender', MappingTransformer('Gender', {'Male': 0, 'Female': 1})),
    ('benched', MappingTransformer('EverBenched', {'No': 0, 'Yes': 1})),
    # Presumably one-hot encodes 'City' - confirm against OHETransformer.
    ('ohe', OHETransformer('City')),
    # Presumably Tukey outlier handling using the 'outer' fence - TODO confirm.
    ('age', TukeyTransformer('Age', 'outer')),
    ('exp', TukeyTransformer('ExperienceInCurrentDomain', 'outer')),
    # Final step; takes no column argument, so presumably scales every column.
    ('scale', MinMaxTransformer()), 
    ], verbose=True)


In [None]:
%%capture

# Earlier manual split, kept for reference only; the dataset_setup call below
# is what actually runs.
#X_train, X_test, y_train, y_test = train_test_split(future_features, labels, test_size=0.2, shuffle=True,
#                                                   random_state=9, stratify=future_df['LeaveOrNot'])

# dataset_setup is not defined in this file (presumably from library.py). It
# receives the features, labels and transformer; rs=9 and ts=.2 mirror the
# random_state and test_size of the commented-out split above.
# NOTE(review): the unpack order here (X_train, y_train, X_test, y_test) differs
# from train_test_split's (X_train, X_test, y_train, y_test) - confirm it
# matches what dataset_setup returns.
X_train, y_train, X_test, y_test = dataset_setup(future_features, labels, future_transformer, rs=9, ts=.2)

## KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Hyperparameter search space for KNeighborsClassifier.
knn_grid = {
    'n_neighbors': range(5, 100, 10),  # 5, 15, ..., 95
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],  # Minkowski power parameter
}

In [None]:
%%capture
# halving_search is not defined in this file (presumably from library.py).
# It is asked to score candidates by ROC AUC with factor=3.
grid_result = halving_search(KNeighborsClassifier(), knn_grid, X_train, y_train, factor=3, scoring='roc_auc')

In [None]:
grid_result.best_params_

In [None]:
best_knn_model = grid_result.best_estimator_
# Score on the held-out test split (sklearn classifiers report mean accuracy here).
best_knn_model.score(X_test, y_test)

In [None]:
# Column 1 of predict_proba: the probability of the second class
# (presumably LeaveOrNot == 1).
yraw = best_knn_model.predict_proba(X_test)[:,1]

In [None]:
# 19 evenly spaced thresholds from 0 to 1 inclusive (step 1/18).
# threshold_results is not defined in this file (presumably from library.py).
result_df = threshold_results(np.linspace(0,1,19,endpoint=True), y_test, yraw)
result_df

In [None]:
# Persist the threshold table without the index column.
result_df.to_csv('knn_thresholds.csv', index=False)

In [None]:
# Serialize the fitted model to disk.
from joblib import dump
dump(best_knn_model, 'knn_model.joblib')

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV

In [None]:
# Three search spaces, one per penalty type, because each penalty is paired
# with its own list of solvers. All share max_iter = 50, 100, ..., 950 and
# the same class_weight options.
penalty_L2 = {
    'penalty': ['l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': range(50, 1000, 50),
    'class_weight': ['balanced', None],
}

penalty_L1 = {
    'penalty': ['l1'],
    'solver': ['liblinear', 'saga'],
    'max_iter': range(50, 1000, 50),
    'class_weight': ['balanced', None],
}

penalty_elasticnet = {
    'penalty': ['elasticnet'],
    'solver': ['saga'],
    'max_iter': range(50, 1000, 50),
    'class_weight': ['balanced', None],
    # Each candidate is itself a one-element list of ratios.
    'l1_ratios': [[0.2], [0.5], [0.8]],
}

In [None]:
%%capture
logreg_model = LogisticRegressionCV()

# The three penalty-specific grids are passed together as a list.
grids = [penalty_L1, penalty_L2, penalty_elasticnet]

# NOTE(review): unlike the KNN search, no scoring= or factor= is passed here,
# so halving_search's own defaults apply - confirm they are the intended ones.
# This rebinds grid_result from the KNN section.
grid_result = halving_search(logreg_model, grids, X_train, y_train)

In [None]:
grid_result.best_params_  

In [None]:
best_logreg_model = grid_result.best_estimator_
# Score on the held-out test split using the estimator's default scorer.
best_logreg_model.score(X_test,y_test)

In [None]:
# Column 1 of predict_proba: the probability of the second class.
yraw = best_logreg_model.predict_proba(X_test)[:,1]
# Same 19-point threshold sweep (0 to 1 inclusive) used for the KNN model.
result_df = threshold_results(np.linspace(0,1,19,endpoint=True), y_test, yraw)
result_df

In [None]:
# Persist the threshold table without the index column.
result_df.to_csv('logreg_thresholds.csv', index=False)

In [None]:
# Serialize the fitted model to disk.
from joblib import dump
dump(best_logreg_model, 'logreg_model.joblib')

## XGBoost

In [None]:
# Hyperparameter search space for XGBClassifier.
xgb_grid = dict(
    n_estimators=range(10, 201, 10),        # number of trees: 10, 20, ..., 200
    max_depth=range(1, 15),                 # max tree depth: 1 through 14
    learning_rate=[0.1, 0.2, 0.3, 0.4],
    subsample=[.25, .5, 0.75],              # row-sampling fractions to try
    booster=['dart', 'gbtree', 'gblinear'],
)

In [None]:
from xgboost import XGBClassifier
# Fixed random_state so repeated runs build the same model.
xgb_model = XGBClassifier(random_state=1234)

In [None]:
%%capture
# NOTE(review): no scoring= or factor= is passed, so halving_search's own
# defaults apply (the KNN search set both explicitly) - confirm.
# This rebinds grid_result from the logistic-regression section.
grid_result = halving_search(xgb_model, xgb_grid, X_train, y_train)
best_model = grid_result.best_estimator_

In [None]:
# Score on the held-out test split using the estimator's default scorer.
best_model.score(X_test, y_test)

In [None]:
grid_result.best_params_ 

In [None]:
# Column 1 of predict_proba: the probability of the second class.
yraw = best_model.predict_proba(X_test)[:,1]
# Same 19-point threshold sweep (0 to 1 inclusive) used for the other models.
result_df = threshold_results(np.linspace(0,1,19,endpoint=True), y_test, yraw)
result_df

In [None]:
# Persist the threshold table without the index column.
result_df.to_csv('xgb_thresholds.csv', index=False)

In [None]:
# Serialize the fitted model to disk.
from joblib import dump
dump(best_model, 'xgb_model.joblib')

# Artificial Neural Network

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
import tensorflow as tf
from tensorflow import keras

In [None]:
# Early-stopping callback shared by the hyperparameter search and the final fit.
# It watches the training loss; no validation data is passed to fit() anywhere
# in this file.
early_stop_cb = tf.keras.callbacks.EarlyStopping(
    monitor='loss',  #or binary_accuracy
    min_delta=0,     # any decrease counts as an improvement
    patience=5,      # stop after 5 epochs without improvement
    verbose=0
)

In [None]:
# These two objects become the default `metrics` and `loss` arguments of
# ann_build_binary_model. from_logits=False matches a model whose output layer
# already applies a sigmoid.
auc = tf.keras.metrics.AUC(from_logits=False)  #using roc auc to be consistent with prior models https://www.tensorflow.org/api_docs/python/tf/keras/metrics/AUC
loss=tf.keras.losses.BinaryCrossentropy(from_logits=False)  #https://www.tensorflow.org/api_docs/python/tf/keras/losses/BinaryCrossentropy, https://towardsdatascience.com/cross-entropy-loss-function-f38c4ec8643e
# assumes X_train[0] is the first sample row (array/list, not a DataFrame) - TODO confirm
feature_n = len(X_train[0])  # Number of features

In [None]:
def ann_build_binary_model(*, n:int, architecture, metrics=auc, loss=loss, learning_rate=.02):
  """Assemble and compile a feed-forward binary classifier.

  All arguments are keyword-only.

  Args:
    n: number of input features per sample.
    architecture: non-empty list or tuple with one entry per hidden layer.
      The first entry must be a list. Each entry is read positionally as
      [units, dropout rate, activation].
    metrics: single metric object, wrapped in a list for compile().
    loss: loss passed to compile().
    learning_rate: learning rate for the Adam optimizer.

  Returns:
    The compiled Sequential model: one Dense + Dropout pair per architecture
    entry, followed by a single sigmoid output unit.

  Raises:
    AssertionError: if n or architecture fail the checks below.
  """
  assert isinstance(n, int), f'n is an int, the number of columns/features of each sample. Instead got {type(n)}'
  assert isinstance(architecture, (list, tuple)), f'architecture is a list or tuple, the number of nodes per layer. Instead got {type(architecture)}'
  assert architecture, f'architecture is empty'
  assert isinstance(architecture[0], list), f'architecture should be list of one or more lists but instead {architecture}'

  # One regularizer and one initializer instance are shared by every hidden layer.
  # The L2 penalty is attached as an activity regularizer (on layer outputs).
  activity_penalty = tf.keras.regularizers.L2(0.01)
  he_init = tf.keras.initializers.HeNormal(seed=1234)  #works best with Relu: https://machinelearningmastery.com/weight-initialization-for-deep-learning-neural-networks/

  model = Sequential()

  for position, spec in enumerate(architecture):
    units, dropout_rate, activation = spec[0], spec[1], spec[2]
    # Only the first hidden layer is told the input width.
    first_layer_args = {'input_dim': n} if position == 0 else {}
    model.add(Dense(units=units,
                    activation=activation,
                    activity_regularizer=activity_penalty,
                    kernel_initializer=he_init,
                    **first_layer_args))
    model.add(Dropout(dropout_rate))

  # Output layer: a single sigmoid unit giving a value in (0, 1).
  model.add(Dense(units=1, activation='sigmoid'))

  model.compile(loss=loss,
                optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                metrics=[metrics])
  return model

In [None]:
# NOTE(review): this wrapper module is no longer shipped in recent TensorFlow
# releases (SciKeras is the usual replacement) - confirm the TensorFlow version
# this notebook is pinned to.
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# build_fn is called with the sampled hyperparameters to create each candidate model.
model_builder = KerasClassifier(build_fn=ann_build_binary_model, verbose=0)  # Wrap the model to use Sklearn on it

In [None]:
from sklearn.model_selection import RandomizedSearchCV

## ANN architectures

In [None]:
# Seed NumPy and TensorFlow so the random hyperparameter choices drawn below
# and the network training are repeatable.
np.random.seed(seed=1234)
tf.random.set_seed(1234)

In [None]:
# NOTE(review): `af` is not referenced anywhere else in the visible source;
# the architectures below hard-code 'relu'.
af = 'elu' # Worked in prior testing

In [None]:
# Candidate network layouts. Each architecture is a list with one inner list
# per hidden layer, in the form [units, dropout rate, activation].
architecture_1 = [[16, .2, 'relu']]
architecture_2 = [[16, .2, 'relu'], [8, .4, 'relu']]
architecture_3 = [[4, .2, 'relu']]
architecture_4 = [[16, .4, 'relu'], [8, .2, 'relu']]
architecture_5 = [[8, .4, 'relu'], [4, .6, 'relu']]
architecture_6 = [[16, .8, 'relu']]
# NOTE(review): architecture_7 is identical to architecture_2, so that layout
# is twice as likely to be sampled - confirm this is intended.
architecture_7 = [[16, .2, 'relu'], [8, .4, 'relu']]
architecture_8 = [[4, .4, 'relu']]

architectures = [architecture_1, architecture_2, architecture_3, architecture_4, architecture_5, architecture_6, architecture_7, architecture_8]

# Generating random numbers from range
# low must not exceed high: NumPy documents high < low as undefined behavior
# that may raise in future. The bounds were previously reversed; the sampled
# range (1e-4 to 1e-2) is unchanged, but the specific draws for a given seed differ.
learn_rate = np.random.uniform(low=1e-4, high=1e-2, size=5)  #generate 5 choices in low/high range
batch_size = np.random.randint(10,200,5)  #generate 5 choices in [10, 200)
epochs = np.random.randint(10,50,5)  #generate 5 choices in [10, 50)

# Search space for RandomizedSearchCV. `n` is a one-element list so every
# candidate is built with the same input width.
param_grid = dict(n=[feature_n],
                  architecture=architectures,
                  batch_size=batch_size,
                  epochs=epochs,
                  learning_rate=learn_rate)
param_grid

In [None]:
# Randomized hyperparameter search over param_grid, ranking candidates by ROC AUC.
searcher_model = RandomizedSearchCV(
    estimator=model_builder,
    param_distributions=param_grid,
    n_iter=50,          #number of random samples to try
    cv=5,               #does stratification by default
    scoring="roc_auc",
    random_state=1234,
    n_jobs=1,           #errors with n_jobs=-1
    verbose=1,
)

searcher_model

In [None]:
# Run the search. `callbacks` is passed as an extra fit argument, presumably
# forwarded to each candidate's Keras fit so early stopping applies - confirm.
searchResults = searcher_model.fit(X_train, y_train, callbacks=[early_stop_cb])

In [None]:
# Mean cross-validated score (ROC AUC, per scoring= above) of the best
# candidate; computed on the training split only, not on X_test.
searchResults.best_score_  #from training

In [None]:
# Winning hyperparameter combination, reused below to rebuild the model.
bestParams = searchResults.best_params_
bestParams

## Rebuild model with the best parameters

In [None]:
# Re-seed both libraries before the final build and fit.
np.random.seed(seed=1234)
tf.random.set_seed(1234)

In [None]:
# Build a fresh, untrained network from the winning hyperparameters.
# metrics and loss are not passed, so the function's defaults are used.
ann_model = ann_build_binary_model(n=bestParams['n'],
                                   architecture=bestParams['architecture'],
                                   learning_rate=bestParams['learning_rate'])

In [None]:
# Train on the whole training split with the batch size and epoch count the
# search selected. Early stopping may end training before `epochs` is reached.
training = ann_model.fit(x=X_train,
                        y=y_train,
                         batch_size=bestParams['batch_size'],
                         epochs=bestParams['epochs'],
                         verbose=0,
                         callbacks=[early_stop_cb],
)

## Evaluate the model on the testing set

In [None]:
# The model has a single sigmoid output unit; take column 0 of the prediction
# to get one flat score per test sample.
yraw = ann_model.predict(X_test)[:,0]  #replaces predict_proba
yraw[:5]  # peek at the first five scores

In [None]:
# Hard 0/1 predictions at the usual 0.5 cutoff, then accuracy = matches / total.
binary = [int(score > .5) for score in yraw]
sum(pred == truth for pred, truth in zip(binary, y_test)) / len(binary)

## Compute the threshold table

In [None]:
# Keras predict returns one column per output unit. Take column 0 so yraw is a
# flat vector of scores, the same shape the other models' predict_proba(...)[:,1]
# hands to threshold_results.
yraw = ann_model.predict(X_test)[:,0]
# Same 19-point threshold sweep (0 to 1 inclusive) used for the other models.
result_df = threshold_results(np.linspace(0,1,19,endpoint=True), y_test, yraw)
result_df

### After this step I saved the models to my local Google Drive storage. All models are still there.