In [None]:
import os

# Step up one level so that relative paths resolve from the parent of the current folder.
# Note: every run of this cell moves the working directory up one more level.
parent_dir = os.path.dirname(os.getcwd())
os.chdir(parent_dir)

Purpose of KFold: select the best model with respect to the validation (val) data.

Usually a machine learning algorithm involves multiple hyperparameters (sometimes a lot). Here is an example from logistic regression:


class sklearn.linear_model.LogisticRegression(penalty='l2', \*,  dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)


Choosing the right combination can significantly improve the performance of your model. In order to find the optimal hyperparameters, we need to test the trained model on a dataset it has not seen, to avoid selecting an overfitted model.

In order to do so, we use cross-validation, which can be achieved in the following ways:

1. KFold: KFold divides all the samples into $k$ groups of samples, called folds (if $k = n$, this is equivalent to the Leave One Out strategy), of equal sizes (if possible). The prediction function is learned using $k - 1$ folds, and the fold left out is used for test.

2. StratifiedKFold: StratifiedKFold is a variation of k-fold which returns stratified folds: each set contains approximately the same percentage of samples of each target class as the complete set.

3. LeaveOneOut (LOO): use $n - 1$ samples for training and validate your model on the single remaining sample. Extremely expensive.

In [1]:
import os

In [2]:
# Display the directory the kernel is currently working in
os.getcwd()

'/Users/mling/OutSource/deepLearningTutorial/notebook'

In [3]:
# Display the parent of the current working directory (the path minus its last component)
os.path.dirname(os.getcwd())

'/Users/mling/OutSource/deepLearningTutorial'

os.chdir 更改目前作業用資料夾到指定資料夾下面

In [None]:
# Column names of the iris features.
# NOTE(review): in the cells visible here `iris` is only assigned further down
# (datasets.load_iris()), so this cell raises NameError on a fresh top-to-bottom run.
iris.feature_names

[(0, 'c1'), (1, 'c2'), ..... (n-1, 'cm')]

In [4]:
# Example: put the iris dataset into a DataFrame
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import datasets

iris = datasets.load_iris()

# Feature matrix and target vector, as shipped with the dataset
X, y = iris.data, iris.target

# One column per feature, plus the target stored under 'class'
iris_df = pd.DataFrame(X, columns=iris.feature_names).assign(**{'class': y})

In [5]:
# Preview the first five rows (head() returns five rows by default)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
# Three shuffled folds; the fixed seed makes the split repeatable
kfold = KFold(n_splits=3, shuffle=True, random_state=0)

for train_index, val_index in kfold.split(iris_df):
    # The indices are positional, hence .iloc
    iris_train, iris_test = iris_df.iloc[train_index], iris_df.iloc[val_index]

    # ... #

In [None]:
# Same as above, but the folds are stratified on the class labels passed as `y`
skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

for train_index, val_index in skfold.split(iris_df, y):
    # The indices are positional, hence .iloc
    iris_train, iris_test = iris_df.iloc[train_index], iris_df.iloc[val_index]

    # ... #

## How to combine kfold into neural net?

1. Let tensorflow API solve all the problems.

In [None]:
# Simple way: let TensorFlow reserve part of the images for validation.
# NOTE(review): `tf` is not imported in the cells visible here — confirm that
# `import tensorflow as tf` runs before this cell.

tf.keras.preprocessing.image.ImageDataGenerator(
    # ... other arguments ...
    validation_split=0.0, # Float. Fraction of images reserved for validation (strictly between 0 and 1).
    # ... other arguments ...
)

2. Create a dataframe

In [None]:
import os

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# load your csv or excel with pandas 
# df = pd.read_csv('filename')

# Augmentation settings; they are passed to the training generator only
add_params_train = dict(
    rotation_range=90,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    preprocessing_function=preprocess_input,
)

train_datagen = ImageDataGenerator(**add_params_train)
# Validation images get the preprocessing function but no augmentation
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

kfold = KFold(n_splits=3, shuffle=True, random_state=0)

for train_index, val_index in kfold.split(df):
    train, test = df.iloc[train_index], df.iloc[val_index]

    # x_col = column of filename, y_col = 'column of label'
    train_dataset = train_datagen.flow_from_dataframe(
        train,
        target_size=(224, 224),
        batch_size=16,
        class_mode='categorical',
    )

    # x_col = column of filename, y_col = 'column of label'
    val_dataset = val_datagen.flow_from_dataframe(
        test,
        target_size=(224, 224),
        batch_size=16,
        class_mode='categorical',
        shuffle=False,
    )

In [None]:
# How to do this?
# Copy a folder of class-labelled images into separate train/ and val/ trees.

import os
from shutil import copyfile
from glob import glob

import numpy as np


def split_train_val(src_folder, dst_root, val_fraction=0.1, seed=0):
    """Copy every ``<src_folder>/<class>/*.jpg`` into a train or a val tree.

    Each image is copied to ``<dst_root>/val/<class>/`` with probability
    ``val_fraction`` and to ``<dst_root>/train/<class>/`` otherwise. The
    source files stay where they are (they are copied, not moved).

    Parameters
    ----------
    src_folder : str
        Folder that holds one sub-folder per class.
    dst_root : str
        Folder under which ``train`` and ``val`` are created.
    val_fraction : float
        Probability that an image is sent to ``val``.
    seed : int
        Seed of the random generator, so a re-run reproduces the same split.

    Returns
    -------
    dict
        Number of files copied, keyed by ``'train'`` and ``'val'``.
    """
    rng = np.random.default_rng(seed)
    counts = {'train': 0, 'val': 0}

    for subset in counts:
        os.makedirs(os.path.join(dst_root, subset), exist_ok=True)

    # Expected layout: <src_folder>/<class>/<image>.jpg
    # sorted() fixes the file order, so the same seed always yields the same split.
    jpg_files = sorted(glob(os.path.join(src_folder, "*", "*.jpg")))

    for src_path in jpg_files:
        basename = os.path.basename(src_path)  # file name without its folder
        # The class is the name of the folder that directly contains the image
        class_ = os.path.basename(os.path.dirname(src_path))

        subset = 'val' if rng.random() < val_fraction else 'train'

        target_folder = os.path.join(dst_root, subset, class_)
        os.makedirs(target_folder, exist_ok=True)

        copyfile(src_path, os.path.join(target_folder, basename))  # copy the file
        counts[subset] += 1

    return counts


current_dir = os.getcwd()

# '<資料夾名稱>' is a placeholder: replace it with the name of the image folder
data_folder = os.path.join(os.pardir, 'data', '<資料夾名稱>')
root_folder = os.path.join(os.pardir, 'data')

data_train = os.path.join(root_folder, 'train')
data_val = os.path.join(root_folder, 'val')

if os.path.isdir(data_folder):
    split_counts = split_train_val(data_folder, root_folder)
else:
    # Nothing to split yet; do not create empty train/val folders
    print(f'Source folder not found, nothing copied: {data_folder}')

In [6]:
## Example: look up the constructor arguments of the Adam optimizer

from tensorflow.keras.optimizers import Adam

# IPython help syntax: shows the signature and docstring of Adam
?Adam

Init signature:
Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    name='Adam',
    **kwargs,
)
Docstring:
Optimizer that implements the Adam algorithm.

Adam optimization is a stochastic gradient descent method that is based on
adaptive estimation of first-order and second-order moments.

According to
[Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
the method is "*computationally
efficient, has lit

In [8]:
import numpy as np

# 20 values spaced evenly on a log scale, from 10**-3 up to 10**-1
np.logspace(start=-3, stop=-1, num=20)

array([0.001     , 0.00127427, 0.00162378, 0.00206914, 0.00263665,
       0.00335982, 0.00428133, 0.00545559, 0.00695193, 0.00885867,
       0.01128838, 0.0143845 , 0.01832981, 0.02335721, 0.02976351,
       0.0379269 , 0.0483293 , 0.06158482, 0.078476  , 0.1       ])

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.optimizers import Adam

# Learning-rate search scored by 3-fold cross-validated accuracy.
# NOTE(review): `df`, `model`, `KFold`, `train_datagen` and `val_datagen` are not
# created in this cell; they have to be in scope already.

# Encode the label column as integer ids so it can be compared with argmax predictions.
# NOTE(review): assumes the generators number the classes in the same (sorted) order
# as LabelEncoder — confirm against `val_dataset.class_indices`.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])

best_params = {}
max_acc = 0   # min_mse = np.inf


learning_rates = np.logspace(-3, -1, 20)

# Starting weights, restored before every fold so that no fold (and no learning rate)
# continues training from a model that has already seen its validation rows.
initial_weights = model.get_weights()


for lr in learning_rates:

    # Out-of-fold labels and predictions: one integer slot per row of df.
    # (np.zeros_like(df) would copy the frame's 2-D shape and dtype instead.)
    y_true = np.zeros(len(df), dtype=int)
    y_pred = np.zeros(len(df), dtype=int)

    kfold = KFold(n_splits=3, shuffle=True, random_state=0)

    for train_index, val_index in kfold.split(df):

        train = df.iloc[train_index]
        test = df.iloc[val_index]

        train_dataset = train_datagen.flow_from_dataframe(train, # x_col = column of filename, y_col = 'column of label'
                                                          target_size = (224, 224),
                                                          batch_size=16,
                                                          class_mode='categorical')

        # shuffle=False keeps the predictions in the row order of `test`
        val_dataset = val_datagen.flow_from_dataframe(test, # x_col = column of filename, y_col = 'column of label'
                                                      target_size = (224, 224),
                                                      batch_size=16, class_mode='categorical',
                                                      shuffle=False)

        # Fresh start for this fold, trained with the learning rate under test.
        # Categorical cross-entropy matches the one-hot labels of class_mode='categorical';
        # adjust the loss here if the model was meant to use a different one.
        model.set_weights(initial_weights)
        model.compile(optimizer=Adam(learning_rate=lr),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        # fit/predict accept generators directly; fit_generator/predict_generator are deprecated
        model.fit(train_dataset)

        # In the end of training

        y_pred_proba = model.predict(val_dataset)
        y_pred[val_index] = y_pred_proba.argmax(axis=1)
        y_true[val_index] = label_encoder.transform(test['label'].values)

    # Accuracy over all rows, each predicted by the fold that held it out
    acc = accuracy_score(y_true, y_pred)

    if acc > max_acc:
        max_acc = acc
        best_params['learning_rate'] = lr
        best_params['accuracy'] = acc
        

1. 現有 train, val 資料夾檔案全部合在一起
2. dataframe: [path, class]
