# Dataset : 

- List of files to read
- File to save to (can be created or appended)
- Arbitrary label associated with the region, mask, or anything else

- Updates : 
    - Create train/test datasets for the final comparison, with 1'000 samples per class in both sets
    - Add folders to the training dataset


In [9]:
import numpy as np
import pandas as pd
import os, glob, sys

from sklearn.model_selection import train_test_split
from itertools import combinations

sys.path.append('../../Lamp/AttrDict/')

from AttrDict import *

### Train Laboratory

In [2]:
# YAML input for the laboratory training set.
# TODO: take this path from argv instead of hard-coding it.
ifile = 'dataset_lab_train.yaml'

# Reject anything that is not an existing file with a YAML extension.
has_yaml_suffix = os.path.splitext(ifile)[-1] in (".yaml", ".yml")
if not (os.path.isfile(ifile) and has_yaml_suffix):
    raise AssertionError("Wrong input type")

inputs = AttrDict.from_yaml_path(ifile)

In [3]:
# Collect every PNG found under each configured training folder.
# `paths`, `labels` and `folders` hold one sub-list per folder; the
# `list_of_*` names are the flattened, row-aligned versions.
paths = []
labels = []
folders = []

for _group, items in inputs.Train.items():
    for folder in items.Folder:
        # glob returns matches in filesystem-dependent order; sort them so the
        # seeded sampling done later selects the same files on every machine.
        path = sorted(glob.glob(inputs.rootData + folder + '*.png'))

        paths.append(path)
        folders.append([folder] * len(path))
        labels.append([items.Label] * len(path))

list_of_paths = [item for elem in paths for item in elem]
list_of_labels = [item for elem in labels for item in elem]
list_of_folders = [item for elem in folders for item in elem]

In [4]:
# One row per image: path, source folder and label.
# NOTE(review): going through np.array coerces the three lists to a common
# dtype, so numeric labels presumably end up stored as strings - confirm.
df_train = pd.DataFrame(
    np.array([list_of_paths, list_of_folders, list_of_labels]).T,
    columns=['Paths', 'Folders', 'Label'],
)

# Normalise Windows separators, keep what follows the ninth '/' and re-root it
# at '../'. regex=False pins the backslash to a literal match instead of
# relying on the pandas version's default for `regex`.
# NOTE(review): n=9 hard-codes the directory depth of the globbed paths.
posix_paths = df_train['Paths'].str.replace('\\', '/', regex=False)
df_train['Paths'] = '../' + posix_paths.str.split('/', expand=True, n=9)[9]

In [5]:
# Bare expression: the notebook renders this frame as the cell output.
df_train

Unnamed: 0,Paths,Folders,Label
0,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
1,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
2,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
3,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
4,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
...,...,...,...
18039,../Data/Train/OL7-3-DL/slice01320 (2020_05_17 ...,OL7-3-DL/,4
18040,../Data/Train/OL7-3-DL/slice01320 (2020_05_17 ...,OL7-3-DL/,4
18041,../Data/Train/OL7-3-DL/slice01320 (2020_05_17 ...,OL7-3-DL/,4
18042,../Data/Train/OL7-3-DL/slice01320 (2020_05_17 ...,OL7-3-DL/,4


In [6]:
# A row belongs to a subset when its tag ('bbox' or 'mar') occurs within the
# last 16 characters of the path.
path_column = df_train['Paths']
is_bbox = path_column.str.find('bbox', start=-16) >= 0
is_mar = path_column.str.find('mar', start=-16) >= 0

df_train_bbox = df_train[is_bbox].reset_index(drop=True)
df_train_mar = df_train[is_mar].reset_index(drop=True)

In [7]:
# Bare expression: the notebook renders the 'mar' subset as the cell output.
df_train_mar

Unnamed: 0,Paths,Folders,Label
0,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
1,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
2,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
3,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
4,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
...,...,...,...
9017,../Data/Train/OL7-3-DL/slice01310 (2020_05_17 ...,OL7-3-DL/,4
9018,../Data/Train/OL7-3-DL/slice01310 (2020_05_17 ...,OL7-3-DL/,4
9019,../Data/Train/OL7-3-DL/slice01320 (2020_05_17 ...,OL7-3-DL/,4
9020,../Data/Train/OL7-3-DL/slice01320 (2020_05_17 ...,OL7-3-DL/,4


In [8]:
# Build three folder-level train/test folds and write one CSV pair per fold.
# Within a fold, the test folders are all folders not used for training.
os.makedirs('lab', exist_ok=True)  # to_csv does not create missing directories

# Every 2-folder combination, taken label by label.
lists = [
    pair
    for folders_all in df_train_mar.groupby(by='Label')['Folders'].unique().to_list()
    for pair in combinations(folders_all, r=2)
]

for i in range(3):
    # NOTE(review): the stride of 3 selects the i-th pair of every label only
    # if each label yields exactly three pairs (three folders) - confirm.
    train_folders = [item for t in lists[i::3] for item in t]
    test_folders = list(set(df_train_mar['Folders'].unique()).difference(set(train_folders)))

    in_train = df_train_mar['Folders'].isin(train_folders)
    in_test = df_train_mar['Folders'].isin(test_folders)

    # Each (Label, Folders) group yields exactly 500 train / 200 test rows;
    # replace=True lets smaller groups be oversampled, the seed is fixed.
    train_sample = (
        df_train_mar.loc[in_train, :]
        .groupby(by=['Label', 'Folders'])
        .sample(500, replace=True, random_state=0)
        .reset_index(drop=True)
    )
    train_sample.to_csv(f'lab/train_mar_{i}.csv')

    test_sample = (
        df_train_mar.loc[in_test, :]
        .groupby(by=['Label', 'Folders'])
        .sample(200, replace=True, random_state=0)
        .reset_index(drop=True)
    )
    test_sample.to_csv(f'lab/test_mar_{i}.csv')

### Train Boreholes

In [9]:
# YAML input for the borehole training set.
# TODO: take this path from argv instead of hard-coding it.
ifile = 'dataset_borehole_train.yaml'

# Reject anything that is not an existing file with a YAML extension.
has_yaml_suffix = os.path.splitext(ifile)[-1] in (".yaml", ".yml")
if not (os.path.isfile(ifile) and has_yaml_suffix):
    raise AssertionError("Wrong input type")

inputs = AttrDict.from_yaml_path(ifile)

In [10]:
# Collect every PNG found under each configured training folder.
# `paths`, `labels` and `folders` hold one sub-list per folder; the
# `list_of_*` names are the flattened, row-aligned versions.
paths = []
labels = []
folders = []

for _group, items in inputs.Train.items():
    for folder in items.Folder:
        # glob returns matches in filesystem-dependent order; sort them so the
        # seeded sampling done later selects the same files on every machine.
        path = sorted(glob.glob(inputs.rootData + folder + '*.png'))

        paths.append(path)
        folders.append([folder] * len(path))
        labels.append([items.Label] * len(path))

list_of_paths = [item for elem in paths for item in elem]
list_of_labels = [item for elem in labels for item in elem]
list_of_folders = [item for elem in folders for item in elem]

In [11]:
# One row per image: path, source folder and label.
# NOTE(review): going through np.array coerces the three lists to a common
# dtype, so numeric labels presumably end up stored as strings - confirm.
df_train = pd.DataFrame(
    np.array([list_of_paths, list_of_folders, list_of_labels]).T,
    columns=['Paths', 'Folders', 'Label'],
)

# Normalise Windows separators, keep what follows the ninth '/' and re-root it
# at '../'. regex=False pins the backslash to a literal match instead of
# relying on the pandas version's default for `regex`.
# NOTE(review): n=9 hard-codes the directory depth of the globbed paths.
posix_paths = df_train['Paths'].str.replace('\\', '/', regex=False)
df_train['Paths'] = '../' + posix_paths.str.split('/', expand=True, n=9)[9]

In [12]:
# Bare expression: the notebook renders this frame as the cell output.
df_train

Unnamed: 0,Paths,Folders,Label
0,../Data/Test_Borehole/BL-DB-1/slice00200_bbox_...,BL-DB-1/,0
1,../Data/Test_Borehole/BL-DB-1/slice00200_bbox_...,BL-DB-1/,0
2,../Data/Test_Borehole/BL-DB-1/slice00200_bbox_...,BL-DB-1/,0
3,../Data/Test_Borehole/BL-DB-1/slice00200_bbox_...,BL-DB-1/,0
4,../Data/Test_Borehole/BL-DB-1/slice00200_bbox_...,BL-DB-1/,0
...,...,...,...
11583,../Data/Test_Borehole/OL-DB-3/slice00850_bbox_...,OL-DB-3/,4
11584,../Data/Test_Borehole/OL-DB-3/slice00850_bbox_...,OL-DB-3/,4
11585,../Data/Test_Borehole/OL-DB-3/slice00850_mar_r...,OL-DB-3/,4
11586,../Data/Test_Borehole/OL-DB-3/slice00850_mar_r...,OL-DB-3/,4


In [13]:
# Keep only rows whose path carries the 'mar' tag within its last 16 characters.
is_mar = df_train['Paths'].str.find('mar', start=-16) >= 0
df_train_mar = df_train[is_mar].reset_index(drop=True)

In [14]:
# Build three folder-level train/test folds and write one CSV pair per fold.
# Within a fold, the test folders are all folders not used for training.
os.makedirs('borehole', exist_ok=True)  # to_csv does not create missing directories

# Every 2-folder combination, taken label by label.
lists = [
    pair
    for folders_all in df_train_mar.groupby(by='Label')['Folders'].unique().to_list()
    for pair in combinations(folders_all, r=2)
]

for i in range(3):
    # NOTE(review): the stride of 3 selects the i-th pair of every label only
    # if each label yields exactly three pairs (three folders) - confirm.
    train_folders = [item for t in lists[i::3] for item in t]
    test_folders = list(set(df_train_mar['Folders'].unique()).difference(set(train_folders)))

    in_train = df_train_mar['Folders'].isin(train_folders)
    in_test = df_train_mar['Folders'].isin(test_folders)

    # Each (Label, Folders) group yields exactly 500 train / 200 test rows;
    # replace=True lets smaller groups be oversampled, the seed is fixed.
    train_sample = (
        df_train_mar.loc[in_train, :]
        .groupby(by=['Label', 'Folders'])
        .sample(500, replace=True, random_state=0)
        .reset_index(drop=True)
    )
    train_sample.to_csv(f'borehole/train_mar_{i}.csv')

    test_sample = (
        df_train_mar.loc[in_test, :]
        .groupby(by=['Label', 'Folders'])
        .sample(200, replace=True, random_state=0)
        .reset_index(drop=True)
    )
    test_sample.to_csv(f'borehole/test_mar_{i}.csv')

### Test Laboratory

In [16]:
# YAML input for the laboratory train/test dataset.
# TODO: take this path from argv instead of hard-coding it.
ifile = 'dataset_lab_train_test.yaml'

# Reject anything that is not an existing file with a YAML extension.
has_yaml_suffix = os.path.splitext(ifile)[-1] in (".yaml", ".yml")
if not (os.path.isfile(ifile) and has_yaml_suffix):
    raise AssertionError("Wrong input type")

inputs = AttrDict.from_yaml_path(ifile)

#### Train

In [17]:
# Collect every PNG found under each configured training folder.
# `paths`, `labels` and `folders` hold one sub-list per folder; the
# `list_of_*` names are the flattened, row-aligned versions.
paths = []
labels = []
folders = []

for _group, items in inputs.Train.items():
    for folder in items.Folder:
        # glob returns matches in filesystem-dependent order; sort them so the
        # seeded sampling done later selects the same files on every machine.
        path = sorted(glob.glob(inputs.rootData + folder + '*.png'))

        paths.append(path)
        folders.append([folder] * len(path))
        labels.append([items.Label] * len(path))

list_of_paths = [item for elem in paths for item in elem]
list_of_labels = [item for elem in labels for item in elem]
list_of_folders = [item for elem in folders for item in elem]

In [18]:
# One row per image: path, source folder and label.
# NOTE(review): going through np.array coerces the three lists to a common
# dtype, so numeric labels presumably end up stored as strings - confirm.
df_train = pd.DataFrame(
    np.array([list_of_paths, list_of_folders, list_of_labels]).T,
    columns=['Paths', 'Folders', 'Label'],
)

# Normalise Windows separators, keep what follows the ninth '/' and re-root it
# at '../'. regex=False pins the backslash to a literal match instead of
# relying on the pandas version's default for `regex`.
# NOTE(review): n=9 hard-codes the directory depth of the globbed paths.
posix_paths = df_train['Paths'].str.replace('\\', '/', regex=False)
df_train['Paths'] = '../' + posix_paths.str.split('/', expand=True, n=9)[9]

In [19]:
# Keep only rows whose path carries the 'mar' tag within its last 16 characters.
is_mar = df_train['Paths'].str.find('mar', start=-16) >= 0
df_train_mar = df_train[is_mar].reset_index(drop=True)

In [20]:
# Bare expression: the notebook renders the 'mar' subset as the cell output.
df_train_mar

Unnamed: 0,Paths,Folders,Label
0,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
1,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
2,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
3,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
4,../Data/Train/BL3-1-DL/slice00200 (2020_05_17 ...,BL3-1-DL/,0
...,...,...,...
6042,../Data/Train/OL4-3-DL/slice01280 (2020_05_17 ...,OL4-3-DL/,4
6043,../Data/Train/OL4-3-DL/slice01290 (2020_05_17 ...,OL4-3-DL/,4
6044,../Data/Train/OL4-3-DL/slice01290 (2020_05_17 ...,OL4-3-DL/,4
6045,../Data/Train/OL4-3-DL/slice01300 (2020_05_17 ...,OL4-3-DL/,4


In [21]:
# Write the training split of the lab train/test dataset: exactly 500 rows per
# (Label, Folders) group, drawn with replacement and a fixed seed.
os.makedirs('lab', exist_ok=True)  # to_csv does not create missing directories
train_sample = (
    df_train_mar.groupby(by=['Label', 'Folders'])
    .sample(500, replace=True, random_state=0)
    .reset_index(drop=True)
)
train_sample.to_csv('lab/train_test_train_mar.csv')

#### Test

In [22]:
# Collect every PNG found under each configured test folder.
# `paths`, `labels` and `folders` hold one sub-list per folder; the
# `list_of_*` names are the flattened, row-aligned versions.
paths = []
labels = []
folders = []

for _group, items in inputs.Test.items():
    for folder in items.Folder:
        # glob returns matches in filesystem-dependent order; sort them so the
        # seeded sampling done later selects the same files on every machine.
        path = sorted(glob.glob(inputs.rootData + folder + '*.png'))

        paths.append(path)
        folders.append([folder] * len(path))
        labels.append([items.Label] * len(path))

list_of_paths = [item for elem in paths for item in elem]
list_of_labels = [item for elem in labels for item in elem]
list_of_folders = [item for elem in folders for item in elem]

In [23]:
# One row per image: path, source folder and label.
# NOTE(review): going through np.array coerces the three lists to a common
# dtype, so numeric labels presumably end up stored as strings - confirm.
df_test = pd.DataFrame(
    np.array([list_of_paths, list_of_folders, list_of_labels]).T,
    columns=['Paths', 'Folders', 'Label'],
)

# Normalise Windows separators, keep what follows the ninth '/' and re-root it
# at '../'. regex=False pins the backslash to a literal match instead of
# relying on the pandas version's default for `regex`.
# NOTE(review): n=9 hard-codes the directory depth of the globbed paths.
posix_paths = df_test['Paths'].str.replace('\\', '/', regex=False)
df_test['Paths'] = '../' + posix_paths.str.split('/', expand=True, n=9)[9]

In [24]:
# Keep only rows whose path carries the 'mar' tag within its last 16 characters.
is_mar = df_test['Paths'].str.find('mar', start=-16) >= 0
df_test_mar = df_test[is_mar].reset_index(drop=True)

In [25]:
# Write the test split of the lab train/test dataset: exactly 200 rows per
# (Label, Folders) group, drawn with replacement and a fixed seed.
os.makedirs('lab', exist_ok=True)  # to_csv does not create missing directories
test_sample = (
    df_test_mar.groupby(by=['Label', 'Folders'])
    .sample(200, replace=True, random_state=0)
    .reset_index(drop=True)
)
test_sample.to_csv('lab/train_test_test_mar.csv')

### Test Boreholes

In [26]:
# YAML input for the borehole train/test dataset.
# TODO: take this path from argv instead of hard-coding it.
ifile = 'dataset_borehole_train_test.yaml'

# Reject anything that is not an existing file with a YAML extension.
has_yaml_suffix = os.path.splitext(ifile)[-1] in (".yaml", ".yml")
if not (os.path.isfile(ifile) and has_yaml_suffix):
    raise AssertionError("Wrong input type")

inputs = AttrDict.from_yaml_path(ifile)

#### Train

In [28]:
# Collect every PNG found under each configured training folder.
# `paths`, `labels` and `folders` hold one sub-list per folder; the
# `list_of_*` names are the flattened, row-aligned versions.
paths = []
labels = []
folders = []

for _group, items in inputs.Train.items():
    for folder in items.Folder:
        # glob returns matches in filesystem-dependent order; sort them so the
        # seeded sampling done later selects the same files on every machine.
        path = sorted(glob.glob(inputs.rootData + folder + '*.png'))

        paths.append(path)
        folders.append([folder] * len(path))
        labels.append([items.Label] * len(path))

list_of_paths = [item for elem in paths for item in elem]
list_of_labels = [item for elem in labels for item in elem]
list_of_folders = [item for elem in folders for item in elem]

In [29]:
# One row per image: path, source folder and label.
# NOTE(review): going through np.array coerces the three lists to a common
# dtype, so numeric labels presumably end up stored as strings - confirm.
df_train = pd.DataFrame(
    np.array([list_of_paths, list_of_folders, list_of_labels]).T,
    columns=['Paths', 'Folders', 'Label'],
)

# Normalise Windows separators, keep what follows the ninth '/' and re-root it
# at '../'. regex=False pins the backslash to a literal match instead of
# relying on the pandas version's default for `regex`.
# NOTE(review): n=9 hard-codes the directory depth of the globbed paths.
posix_paths = df_train['Paths'].str.replace('\\', '/', regex=False)
df_train['Paths'] = '../' + posix_paths.str.split('/', expand=True, n=9)[9]

In [30]:
# Keep only rows whose path carries the 'mar' tag within its last 16 characters.
is_mar = df_train['Paths'].str.find('mar', start=-16) >= 0
df_train_mar = df_train[is_mar].reset_index(drop=True)

In [31]:
# Write the training split of the borehole train/test dataset: exactly 500
# rows per (Label, Folders) group, drawn with replacement and a fixed seed.
os.makedirs('borehole', exist_ok=True)  # to_csv does not create missing directories
train_sample = (
    df_train_mar.groupby(by=['Label', 'Folders'])
    .sample(500, replace=True, random_state=0)
    .reset_index(drop=True)
)
train_sample.to_csv('borehole/train_test_train_mar.csv')

#### Test

In [32]:
# Collect every PNG found under each configured test folder.
# `paths`, `labels` and `folders` hold one sub-list per folder; the
# `list_of_*` names are the flattened, row-aligned versions.
paths = []
labels = []
folders = []

for _group, items in inputs.Test.items():
    for folder in items.Folder:
        # glob returns matches in filesystem-dependent order; sort them so the
        # seeded sampling done later selects the same files on every machine.
        path = sorted(glob.glob(inputs.rootData + folder + '*.png'))

        paths.append(path)
        folders.append([folder] * len(path))
        labels.append([items.Label] * len(path))

list_of_paths = [item for elem in paths for item in elem]
list_of_labels = [item for elem in labels for item in elem]
list_of_folders = [item for elem in folders for item in elem]

In [33]:
# One row per image: path, source folder and label.
# NOTE(review): going through np.array coerces the three lists to a common
# dtype, so numeric labels presumably end up stored as strings - confirm.
df_test = pd.DataFrame(
    np.array([list_of_paths, list_of_folders, list_of_labels]).T,
    columns=['Paths', 'Folders', 'Label'],
)

# Normalise Windows separators, keep what follows the ninth '/' and re-root it
# at '../'. regex=False pins the backslash to a literal match instead of
# relying on the pandas version's default for `regex`.
# NOTE(review): n=9 hard-codes the directory depth of the globbed paths.
posix_paths = df_test['Paths'].str.replace('\\', '/', regex=False)
df_test['Paths'] = '../' + posix_paths.str.split('/', expand=True, n=9)[9]

In [34]:
# Keep only rows whose path carries the 'mar' tag within its last 16 characters.
is_mar = df_test['Paths'].str.find('mar', start=-16) >= 0
df_test_mar = df_test[is_mar].reset_index(drop=True)

In [35]:
# Write the test split of the borehole train/test dataset: exactly 200 rows
# per (Label, Folders) group, drawn with replacement and a fixed seed.
os.makedirs('borehole', exist_ok=True)  # to_csv does not create missing directories
test_sample = (
    df_test_mar.groupby(by=['Label', 'Folders'])
    .sample(200, replace=True, random_state=0)
    .reset_index(drop=True)
)
test_sample.to_csv('borehole/train_test_test_mar.csv')