
# deepFried Nets: Model tuning


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

## Load dataset

In [None]:
# Absolute path of the training CSV read by this notebook.
TRAIN_CSV_PATH = '/home/cdsw/Data/EPI/Raw/train_sub1M_imputed_dd.csv'

# Load the whole file into a DataFrame.
df = pd.read_csv(TRAIN_CSV_PATH)

### Drop rows with missing values and draw a 40% sample


In [None]:
       
# Drop every row that contains at least one missing value.
df = df.dropna()

# Draw a 40% random subsample; it feeds the hyperparameter search further down.
# random_state pins the draw so a fresh "Restart & Run All" yields the same rows.
df_sample = df.sample(frac=0.4, random_state=0)

### Define feature and target tables 

In [None]:
def _features_and_target(frame, target_position):
    """Slice `frame` positionally into (features, target) NumPy arrays.

    Features are the columns from position 1 up to, but excluding,
    `target_position`; the target is the last column of `frame`.
    """
    features = frame.iloc[:, 1:target_position].values
    target = frame.iloc[:, -1].values
    return features, target


# Column position of 'patient_type'; the feature slice stops just before it.
# NOTE(review): the target is taken from the LAST column, so this presumably
# relies on 'patient_type' being the last column of the frame -- confirm.
targ_index = df.columns.get_loc('patient_type')

X_sample, y_sample = _features_and_target(df_sample, targ_index)
X, y = _features_and_target(df, targ_index)

### Class balancing with random undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from collections import Counter

# Randomly discard majority-class rows until minority/majority == 0.1.
# `sampling_strategy` is the replacement for the `ratio` keyword, which
# imbalanced-learn deprecated and later removed; passing `ratio` fails on
# current releases.
rus = RandomUnderSampler(random_state=0, sampling_strategy=0.1)

Xsample_resampled, ysample_resampled = rus.fit_resample(X_sample, y_sample)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Class counts after resampling: full set first, then the subsample.
print(sorted(Counter(y_resampled).items()))
print(sorted(Counter(ysample_resampled).items()))

# One-hot encode the labels: one indicator column per class.
y_resampled = pd.get_dummies(y_resampled)
ysample_resampled = pd.get_dummies(ysample_resampled)

### One-hot encode gender and split into train/test sets


In [None]:
# Cell-local import: ColumnTransformer is only needed here.
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 (gender) and pass all other columns through unchanged.
# This replaces OneHotEncoder(categorical_features=[0]), a keyword that no longer
# exists in the scikit-learn releases this notebook otherwise requires.
# The encoded columns come first, followed by the untouched ones.
onehotencoder = ColumnTransformer(
    transformers=[('gender', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array (no .toarray() needed)
)

# Fit once on the full resampled set and reuse the fitted encoder for the
# subsample, so both matrices are guaranteed to have identical columns
# (create_model reads the input width from X_resampled).
onehotencoder.fit(X_resampled)
Xsample_resampled = onehotencoder.transform(Xsample_resampled)
X_resampled = onehotencoder.transform(X_resampled)

# 67/33 shuffled splits; random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.33, shuffle=True, random_state=0)
Xsample_train, Xsample_test, ysample_train, ysample_test = train_test_split(
    Xsample_resampled, ysample_resampled, test_size=0.33, shuffle=True, random_state=0)

### Feature scaling

In [None]:
# Standardise features with statistics learned from the TRAINING split only.
# The test split is transformed with those same statistics; refitting on the
# test data would scale it differently from the data the model was trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The subsample gets its own scaler so the full-data scaler stays intact.
sample_scaler = StandardScaler()
Xsample_train = sample_scaler.fit_transform(Xsample_train)
Xsample_test = sample_scaler.transform(Xsample_test)

## Hyperparameter and architecture scanning (`RandomizedSearchCV`)

Scan over hyperparameter values using the scikit-learn wrapper from `keras.wrappers.scikit_learn` and a parameter dictionary.

In [None]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.constraints import unit_norm
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD, Adadelta
from keras.callbacks import EarlyStopping
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import f1_score, average_precision_score
from scipy.stats import randint
from sklearn.utils.fixes import loguniform

### Model definition

Define the function that builds the model used by the wrapper. Modify it according to the type of model and the hyperparameters to scan over.

In [None]:
def create_model(n_hidden=1, n_neurons=30, learning_rate = 0.01, drop_rate = 0.5, act_func = 'LeakyReLU',
                act_func_out = 'sigmoid',kernel_init = 'uniform', opt= 'Adadelta'):
    """Build and compile a fully connected Keras classifier.

    The network is an input Dense block followed by `n_hidden` further Dense
    blocks; every block is Dense -> BatchNormalization -> Dropout. The output
    layer has 2 units. The input width is read from the notebook-level
    `X_resampled`, so that array must exist before this function is called.

    Parameters
    ----------
    n_hidden : int
        Number of Dense blocks added after the input block.
    n_neurons : int
        Units in every Dense block.
    learning_rate : float
        Learning rate handed to the optimizer.
    drop_rate : float
        Dropout rate applied after every block.
    act_func : str
        Activation identifier for the Dense blocks.
        NOTE(review): the default 'LeakyReLU' is presumably not a valid
        activation *string* in Keras 2.x -- confirm; the search below always
        passes 'relu'.
    act_func_out : str
        Activation identifier for the output layer.
    kernel_init : str
        Kernel initializer identifier used by every Dense layer.
    opt : str
        Optimizer name, matched case-insensitively: 'sgd', 'adadelta',
        'adagrad', 'adam', 'adamax', 'nadam' or 'rmsprop'.

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy loss and accuracy metric.

    Raises
    ------
    ValueError
        If `opt` does not name a supported optimizer.
    """
    # Function-scope import: the notebook's import cell only provides SGD and Adadelta.
    from keras.optimizers import Adagrad, Adam, Adamax, Nadam, RMSprop

    optimizer_classes = {
        'sgd': SGD,
        'adadelta': Adadelta,
        'adagrad': Adagrad,
        'adam': Adam,
        'adamax': Adamax,
        'nadam': Nadam,
        'rmsprop': RMSprop,
    }
    # Resolve the optimizer first so a bad name fails before any layer is built.
    # Previously `opt` was overwritten by a hard-coded Adadelta, which made the
    # optimizer entry of the search grid a no-op.
    optimizer_key = str(opt).lower()
    if optimizer_key not in optimizer_classes:
        raise ValueError('Unknown optimizer %r; expected one of %s'
                         % (opt, sorted(optimizer_classes)))
    optimizer = optimizer_classes[optimizer_key](lr=learning_rate)

    model = Sequential()
    # Input block.
    model.add(Dense(n_neurons, input_shape=(X_resampled.shape[1],), activation=act_func,
                   kernel_initializer = kernel_init))
    model.add(BatchNormalization())
    model.add(Dropout(drop_rate))
    # Add as many further blocks as specified in n_hidden.
    for _ in range(n_hidden):
        model.add(Dense(n_neurons, activation=act_func, kernel_initializer = kernel_init))
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))
    # Two output units, matching the two-column one-hot target.
    model.add(Dense(2, activation=act_func_out, kernel_initializer = kernel_init))
    model.compile(loss='binary_crossentropy',optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
# Wrap the model factory in a scikit-learn compatible estimator so it can be
# passed to scikit-learn's model-selection tools.
# (A plain K-fold evaluation with cross_val_score(model, X, y, cv=5) is an
# alternative that is currently not run.)
model = KerasClassifier(
    build_fn=create_model,
)

### Hyperparameter dictionary
Optimise for:
- number of hidden layers
- number of neurons
- activation func and optimizers
- number of epochs
- learning rate
- kernel initialiser
- batch size
- dropout rate

Define dictionary and pass a range or set of values.

In [None]:
# Search space: every key is either an argument of create_model or a Keras
# fit argument (epochs, batch_size); every value lists the candidates to draw from.
params = {
    'n_hidden': [16, 24, 32, 64, 128],
    'epochs': [100, 20, 30],
    'n_neurons': [8, 16, 32, 64, 128, 256, 512],
    'act_func': ['relu'],
    'act_func_out': ['softmax'],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'opt': ['adam', 'Adadelta', 'Adagrad', 'Rmsprop'],
    'kernel_init': ['uniform', 'normal', 'glorot_uniform'],
    'batch_size': [256, 512, 1024, 2048],
    'drop_rate': [0.1, 0.2, 0.3, 0.5],
}
# Scoring ('accuracy', 'average_precision') is not part of this grid; it is
# chosen on the search object itself.

### RandomizedSearchCV implementation

In the dictionary above, parameters are presented as lists. It is highly recommended to **use continuous distributions for continuous parameters** using `loguniform` or `randint`, e.g.:

```python
    {'C': loguniform(1e0, 1e3),
    'gamma': loguniform(1e-4, 1e-3)
    }
```
If all parameters are presented as a list, sampling without replacement is performed. If at least one parameter is given as a distribution, sampling with replacement is used. 

The number of parameter settings that are tried is given by `n_iter`. For continuous parameters, increasing n_iter will lead to a finer search.

In [None]:
# Evaluate 10 randomly drawn hyperparameter combinations with 5-fold
# cross-validation, ranked by average precision. random_state makes the drawn
# combinations repeatable between runs (network training itself stays stochastic).
random_search = RandomizedSearchCV(model, params, n_iter=10, scoring='average_precision',
                                   cv=5, random_state=0)

# if running on CPUs --> add n_jobs=-1 arg to RandomizedSearchCV
# The extra keyword arguments are fit parameters that are handed on to the
# wrapped Keras model's fit call.
# NOTE(review): early stopping monitors the held-out sample split, so that
# split influences training and is no longer an independent test set -- confirm
# this is intended.
random_search_results = random_search.fit(Xsample_train, ysample_train,
                                          validation_data=(Xsample_test, ysample_test),
                                          callbacks=[EarlyStopping(patience=15)])

print('best score (average precision):', random_search_results.best_score_)
print('best parameters:', random_search_results.best_params_)