## Create folds
Notebook that creates K train/validation splits of the dataset and saves each split as a CSV file.

The number of classes is detected automatically from the input data.

In [1]:
# Notebook configuration
#   DATA_FOLDER:           sub-folder (below ../csv/input/) that receives the generated splits
#   CSV_INPUT_ALL_CLASSES: path to the CSV file containing the whole dataset
#   OUTPUT_TRAIN_FOLDER:   folder where the TRAIN split of every fold is saved
#   OUTPUT_TEST_FOLDER:    folder where the VALIDATION split of every fold is saved
#   K_FOLDS:               number of folds to generate
DATA_FOLDER = 'covnet-5folds-rr/'
# Alternative input that was used previously: '../csv/input/all_classes_hmv_hcpa.csv'
CSV_INPUT_ALL_CLASSES = '../csv/input/hmv_hcpa-covid_others-decovnet.csv'
OUTPUT_TRAIN_FOLDER = f'../csv/input/{DATA_FOLDER}train/'
OUTPUT_TEST_FOLDER = f'../csv/input/{DATA_FOLDER}validation/'
K_FOLDS = 5

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import StratifiedKFold

import glob
import os
from datetime import datetime

In [3]:
# Make sure every folder the notebook writes to exists.
# `os.makedirs` creates all missing parent directories as well (so '../csv/'
# and '../csv/input/' no longer need separate calls), and `exist_ok=True`
# makes the call a no-op when the folder is already there, which removes the
# check-then-create race of the `os.path.exists` + `os.mkdir` pattern.
for folder in ('../csv/input/'+DATA_FOLDER, OUTPUT_TRAIN_FOLDER, OUTPUT_TEST_FOLDER):
    os.makedirs(folder, exist_ok=True)

In [4]:
# Load the CSV listing the whole dataset; the rendered output below shows the
# columns `Unnamed: 0`, `nome` and `covid` (the class label).
df = pd.read_csv(CSV_INPUT_ALL_CLASSES)
# Bare expression: displays the frame as the cell output.
df

Unnamed: 0,nome,covid
0,ATY-001,others
1,ATY-003,others
2,ATY-004,others
3,ATY-005,others
4,ATY-006,others
...,...,...
380,TYP-027,covid
381,TYP-028,covid
382,TYP-029,covid
383,TYP-030,covid


In [5]:
# Distinct class labels present in the `covid` column, in ascending order
ar_classes = np.sort(df['covid'].unique())
ar_classes

array(['covid', 'others'], dtype=object)

In [6]:
# Number of cases per class, printed together with the share of the dataset
n_total = len(df)
cov = int((df['covid'] == 'covid').sum())
oth = int((df['covid'] == 'others').sum())

for count in (cov, oth):
    print(count, count/n_total)

# Per-class size of a 200-case sample that follows the class ratio
print(200*0.316883) # 63
print(200*0.683116) # 137

122 0.3168831168831169
263 0.6831168831168831
63.3766
136.6232


Generate the train and validation splits for each fold.

In [9]:
# Encode the class labels as integers: covid -> 0, others -> 1.
# NOTE: this rewrites the `covid` column of `df` in place, so any cell that
# compares against the string labels has to run before this one.
df.loc[df["covid"] == "covid", "covid"] = 0
df.loc[df["covid"] == "others", "covid"] = 1

# Training cases drawn per class in every fold: 200 in total, split according
# to the class ratio computed earlier (200 * 0.3169 ~ 63, 200 * 0.6831 ~ 137).
N_TRAIN_COVID = 63
N_TRAIN_OTHERS = 137
# Fixed seed so that re-running the notebook regenerates the same CSV files.
SPLIT_SEED = 42
# Column names used in the exported CSV files.
OUTPUT_COLUMNS = {'nome': 'case', 'covid': 'label'}


def _split_class(df_class, n_train, rng):
    """Randomly split the rows of one class into a train and a validation part.

    Parameters
    ----------
    df_class : pd.DataFrame
        Rows belonging to a single class.
    n_train : int
        Number of rows to draw (without replacement) for training.
    rng : np.random.RandomState
        Random generator used for the draw.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The `n_train` drawn rows and all remaining rows, both re-indexed from 0.

    Raises
    ------
    ValueError
        Raised by `rng.choice` when `n_train` exceeds the number of rows.
    """
    chosen_idx = rng.choice(len(df_class), size=n_train, replace=False)
    # Positional mask of the drawn rows; its complement is the validation part.
    # This avoids the concat + drop_duplicates(keep=False) set difference,
    # which would silently drop rows that are duplicated in the input.
    in_train = np.zeros(len(df_class), dtype=bool)
    in_train[chosen_idx] = True
    df_train_part = df_class.iloc[chosen_idx].reset_index(drop=True)
    df_val_part = df_class[~in_train].reset_index(drop=True)
    return df_train_part, df_val_part


# One generator for all folds: every fold gets a different draw, but the whole
# sequence is reproducible.
# Each iteration draws an independent random split, so the validation sets of
# different folds may overlap.
rng = np.random.RandomState(SPLIT_SEED)
for i in range(K_FOLDS):
    fold = i+1
    df_train_cov, df_val_cov = _split_class(df[df["covid"] == 0], N_TRAIN_COVID, rng)
    df_train_oth, df_val_oth = _split_class(df[df["covid"] == 1], N_TRAIN_OTHERS, rng)

    df_train = pd.concat([df_train_cov, df_train_oth]).sort_values(by=['covid'])
    df_val = pd.concat([df_val_cov, df_val_oth]).sort_values(by=['covid'])

    # Both files get the same output column names. Previously the renamed
    # validation frame was assigned to an unused variable and the un-renamed
    # frame was written, so validation CSVs kept the `nome`/`covid` headers.
    df_train = df_train.rename(columns=OUTPUT_COLUMNS)
    df_val = df_val.rename(columns=OUTPUT_COLUMNS)
    df_train.to_csv(OUTPUT_TRAIN_FOLDER+'train'+str(fold)+'.csv', index=False)
    df_val.to_csv(OUTPUT_TEST_FOLDER+'validation'+str(fold)+'.csv', index=False)

    print('train: ', len(df_train_cov), len(df_train_oth), len(df_train))
    print('val: ', len(df_val_cov), len(df_val_oth), len(df_val))
    print()


train:  63 137 200
val:  59 126 185

train:  63 137 200
val:  59 126 185

train:  63 137 200
val:  59 126 185

train:  63 137 200
val:  59 126 185

train:  63 137 200
val:  59 126 185



# Test for 1 fold

In [45]:
# Draw a fixed number of random rows per class label (40 / 40 / 40 / 120) and
# keep only their index labels.
# NOTE(review): the labels compared here (ATYPICAL / INDETERMINATE / NEGATIVE /
# POSITIVE) are not among the values shown for `df['covid']` earlier in this
# notebook ('covid' / 'others'), and the split cell above rewrites that column
# to 0/1. This cell presumably ran against a different input CSV (its
# execution count is out of sequence) -- confirm before re-running.
s_a = df[df['covid'] == "ATYPICAL"].sample(40).index
s_i = df[df['covid'] == "INDETERMINATE"].sample(40).index
s_n = df[df['covid'] == "NEGATIVE"].sample(40).index
s_p = df[df['covid'] == "POSITIVE"].sample(120).index


# Sampled subset: the rows of `df` whose index is in the union of the four draws.
df2 = df.loc[s_a.union(s_i).union(s_n).union(s_p)]
# Bare expression in the middle of the cell: it is not displayed.
df2.index
print(len(df2))
print(df2.index)
df2.to_csv('../csv/input/all_classes_hmv_hcpa_SAMPLE.csv', index=False)

# Remaining rows: `df.isin(df2)` is True only where index, column and value
# all match, so after negating and masking, the sampled rows are all-NaN and
# `dropna()` removes them. Any row of `df` that already contains a NaN is
# dropped as well.
df_test = df[ ~df.isin(df2)].dropna()

print(len(df_test))
print(df_test.index)
df_test.to_csv('../csv/input/all_classes_hmv_hcpa_TEST.csv', index=False)

240
Int64Index([  1,   3,   6,   8,  11,  15,  16,  17,  18,  19,
            ...
            366, 368, 371, 372, 374, 376, 378, 380, 381, 382],
           dtype='int64', length=240)
143
Int64Index([  0,   2,   4,   5,   7,   9,  10,  12,  13,  14,
            ...
            359, 362, 363, 367, 369, 370, 373, 375, 377, 379],
           dtype='int64', length=143)
