In [1]:
import os
import shutil

import pandas as pd
import numpy as np

from pathlib import Path
from sklearn.model_selection import train_test_split

In [531]:
# Make directories for each one of our data sets
%mkdir ../data/train
%mkdir ../data/valid
%mkdir ../data/test

mkdir: ../data/train: File exists
mkdir: ../data/valid: File exists
mkdir: ../data/test: File exists


In [532]:
DIR_DATA = "../data"

In [533]:
df_set1 = pd.read_csv(Path(DIR_DATA, 'set_a.csv'))
df_set2 = pd.read_csv(Path(DIR_DATA, 'set_b.csv'))

Cool! they have the same structure! to make things easy, we'll combine them together

In [535]:
df_files = pd.concat([df_set1, df_set2], axis = 0).reset_index()

df_files.shape

(832, 5)

In [536]:
df_files.isna().sum()

index         0
dataset       0
fname         0
label       247
sublabel    683
dtype: int64

In [537]:
# Too many unknown sublabels - going to drop them
df_files = df_files.drop('sublabel', axis = 1)

---

Lets look at the labels! To use `PyTorch` to train a CNN as a multiclass classifier, we need to structure our files so we can easily tell which files have which label, without relying on a CSV that could easily be corrupted or edited

In [30]:
# Combining extrastole and exrahls since they're distinct in both set a and set b.
df_test['label2'] = df_test['label']
df_test.loc[(df_test['label2'] == 'extrahls') | (df_test['label2'] == 'extrastole'), 'label2'] = 'extra'

## Getting our own labels
After much trial and error, turns out that the actual file names actually match the file names in the csvs. This makes it incredibly difficult to properly segment and organize them. Fortunately each filename has its classification at the begginging, we are just going to grab it directly

The unlabeled data will be left behind since we don't have it classified anyway

In [629]:
file_names = []
dataset = []

for i in ['set_a', 'set_b']:
    f = os.listdir(Path(DIR_DATA, i))
    file_names.extend(f)
    dataset.extend([i] * len(f))
    

In [630]:
df_classification = pd.DataFrame({
    'dataset': dataset,
    'file_name': file_names
})

In [631]:
# Drop unlabelled
df_classification = df_classification.loc[~df_classification['file_name'].str.contains('unlabelled')]

In [632]:
# Get label from filename
df_classification.loc[:, 'label'] = df_classification['file_name'].str.split('_').str[0].copy()

In [633]:
# Update 'extrahls' and 'extrastole' to 'extra'
df_classification.loc[df_classification['label'].str.contains('extra'), ['label']] = 'extra'

## Stratification
___

In [644]:
X = df_classification['label']
y = df_classification[['dataset', 'file_name', 'label']] # OK to have label - Not using in SkLearn

In [650]:
# Stratified split of our files
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.20,
                                                    stratify = y[['dataset', 'label']])

In [646]:
# Create subfolder for each label
for i in y['label'].unique():
    try:
        os.makedirs(Path(DIR_DATA,'train', i)) # Training
        os.makedirs(Path(DIR_DATA,'valid', i)) # Validation
    except FileExistsError:
        pass

In [657]:
# Training Files
for index, val in y_train.iterrows():
    
    orig_path = Path(DIR_DATA, val['dataset'], val['file_name'])
    dest_path = Path(DIR_DATA, 'train', val['label'], '_'.join([val['dataset'], val['file_name']]))

    try: 
        shutil.move(orig_path, dest_path)
    except FileNotFoundError:
        pass

In [658]:
# Validation Files
for index, val in y_test.iterrows():
    
    orig_path = Path(DIR_DATA, val['dataset'], val['file_name'])
    dest_path = Path(DIR_DATA, 'valid', val['label'], '_'.join([val['dataset'], val['file_name']]))

    try: 
        shutil.move(orig_path, dest_path)
    except FileNotFoundError:
        pass