In [None]:
# Mount Google Drive inside the Colab runtime; every project path used below
# lives under the '/content/drive' mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Packages

## Download packages

In [None]:
!pip install split_folders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting split_folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split_folders
Successfully installed split_folders-0.5.1


## Importing packages

In [None]:
import tensorflow as tf
import numpy as np
import os
import shutil 
import random
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from PIL import Image
from zipfile import ZipFile, error
from tqdm import tqdm
from pathlib import Path
import splitfolders
import math

# Short aliases for the Keras API and its layers module.
tfk = tf.keras
tfkl = tf.keras.layers

In [None]:
# Single seed shared by every random number generator used in the notebook
seed = 42

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts;
# confirm that exporting it here has the intended effect.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed the Python, NumPy and TensorFlow (v2 and v1-compat) generators
for _set_seed in (random.seed, np.random.seed, tf.random.set_seed, tf.compat.v1.set_random_seed):
  _set_seed(seed)

In [None]:
import warnings
import logging

# Ask TensorFlow's native logging for a minimum level of 2.
# NOTE(review): tensorflow is already imported at this point; confirm that the
# variable is still honoured when it is set after the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# `Warning` is the base class of every warning category (FutureWarning
# included), so a single filter silences all of them.
warnings.simplefilter(action='ignore', category=Warning)
tf.autograph.set_verbosity(0)

# Only errors from the TensorFlow Python loggers. The level is set once;
# the earlier code set it to INFO and then overrode it with ERROR twice.
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Splitting

In [None]:
# Every location is derived from the project root on the mounted Drive.
project_path = r'/content/drive/MyDrive/Courses/Applied AI for Biomedicine/Project'

# Original archive and the folder that holds the data
initial_zip_path = f'{project_path}/train_set.zip'
initial_folder_path = f'{project_path}/data'

# Training images and the two locations of the labels file
train_path = f'{initial_folder_path}/train'
csv_path = f'{train_path}/labels_train.csv'
csv2_path = f'{initial_folder_path}/labels_train.csv'

## Splitting by images

In [None]:
# Output folder of the image-level split (inside the project's data folder)
data_splitted_path = (
    r'/content/drive/MyDrive/Courses/Applied AI for Biomedicine/Project'
    r'/data/data_splitted'
)

In [None]:
# Image-level split of the class folders found in `train_path`;
# the result is written to `data_splitted_path`.
split_ratios = (.7, .15, .15)  # train / val / test
splitfolders.ratio(
    train_path,
    output=data_splitted_path,
    ratio=split_ratios,
    seed=1337, # dedicated seed; the notebook-wide `seed` prepared earlier could be used instead
    group_prefix=None,
    move=False,
)
# The first images of each class folder take longer to process


We count the number of images, and their total size on disk, for each class in the train, val and test splits.

In [None]:
# Per split and per class: number of regular files and their cumulative size.
split = ["train", "val", "test"]
for subset in split:
  print(subset, " has:")
  subset_dir = os.path.join(data_splitted_path, subset)
  subset_count = 0
  subset_bytes = 0
  for label in ["N","P","T"]:
    class_dir = os.path.join(subset_dir, label)
    # Keep regular files only; anything else in the folder is ignored
    entries = (os.path.join(class_dir, name) for name in os.listdir(class_dir))
    file_paths = [entry for entry in entries if os.path.isfile(entry)]
    n_images = len(file_paths)
    n_bytes = sum(os.path.getsize(file_path) for file_path in file_paths)
    print("   Class ", label, " has ", n_images, " images", "and the total size is:", n_bytes*1e-9, "GB")
    subset_count += n_images
    subset_bytes += n_bytes
  print(subset, " has ", subset_count, " images and", subset_bytes*1e-9, "GB of size\n")

train  has:
   Class  N  has  6547  images and the total size is: 1.742229352 GB
   Class  P  has  2975  images and the total size is: 0.835284938 GB
   Class  T  has  1306  images and the total size is: 0.706987785 GB
train  has  10828  images and 3.2845020750000002 GB of size

val  has:
   Class  N  has  1403  images and the total size is: 0.331561949 GB
   Class  P  has  637  images and the total size is: 0.169287922 GB
   Class  T  has  279  images and the total size is: 0.16079147200000002 GB
val  has  2319  images and 0.661641343 GB of size

test  has:
   Class  N  has  1404  images and the total size is: 0.361314511 GB
   Class  P  has  638  images and the total size is: 0.17924994300000002 GB
   Class  T  has  281  images and the total size is: 0.14848792700000002 GB
test  has  2323  images and 0.689052381 GB of size



In [None]:
def get_size(start_path = None):
    """Return the cumulative size, in bytes, of the files below a folder.

    The folder tree is walked recursively with ``os.walk``; symbolic links to
    files are skipped so that only files stored in the tree are counted.

    Parameters
    ----------
    start_path : str or None
        Root folder to measure. When omitted (``None``) the notebook-level
        ``project_path`` is used. It is looked up at call time, so re-running
        the cell that defines the paths is picked up without redefining this
        function.

    Returns
    -------
    int
        Total size in bytes (0 for an empty or non-existent folder tree).
    """
    if start_path is None:
        start_path = project_path

    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)

    return total_size

# Total size of the project folder (get_size() is called with its default start path).
print(get_size(), 'bytes')

## Splitting by patient

### Full split

We create the output directories.

In [None]:
# Root of the patient-level split. The trailing '/' is kept because the
# splitting cell builds paths by plain string concatenation with it.
base_outdir = '/content/drive/MyDrive/Courses/Applied AI for Biomedicine/Project/data_splitted_patient/'

# os.makedirs also creates the missing parent (the root folder itself).
# exist_ok=True makes the cell safe to re-run: without it the second
# execution stops with FileExistsError.
for subset in ('train', 'val', 'test'):
  os.makedirs(os.path.join(base_outdir, subset), exist_ok=True)

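We split the data by patient: the images of each class are shuffled and nominally assigned 60% / 20% / 20% to train / val / test, but an image whose patient (taken to be identified by the first 7 characters of the file name) already appears in a split is copied to that split instead, so that no patient is shared between splits.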

In [None]:
# Folder holding one sub-folder of images per class.
# NOTE(review): this differs from `train_path` ('.../Project/data/train')
# defined in the "Splitting" section - confirm which one is the intended source.
root_dir = '/content/drive/MyDrive/Courses/Applied AI for Biomedicine/Project/train'
classes = ['N', 'P', 'T']
folders = ['train', 'val', 'test']

# Portion of each class's shuffled file list nominally given to each split
# (60% train, 20% val, 20% test), as (start, end) fractions of the list.
split_bounds = {'train': (0.0, 0.6), 'val': (0.6, 0.8), 'test': (0.8, 1.0)}
# The first characters of a file name are taken as the patient identifier.
patient_prefix_len = 7

# Create every split/class folder up front. shutil.copy() writes to a *file*
# named like the destination when the destination is not an existing folder,
# so redirecting an image to a class folder that has not been created yet
# would produce one stray file instead of a folder of images.
for folder in folders:
  for clss in classes:
    os.makedirs(os.path.join(base_outdir, folder, clss), exist_ok=True)

# patient prefix -> split that already holds images of that patient.
# Seeded once from whatever is already in the output folders and then kept
# up to date in memory, instead of re-listing every output folder on Drive
# for every single image.
patient_split = {}
for folder in folders:
  split_dir = os.path.join(base_outdir, folder)
  for c in os.listdir(split_dir):
    class_dir = os.path.join(split_dir, c)
    if not os.path.isdir(class_dir):
      continue
    for existing in os.listdir(class_dir):
      patient_split.setdefault(existing[:patient_prefix_len], folder)

for clss in classes:
  dirtry = root_dir + '/' + clss
  # os.listdir() gives no ordering guarantee; sorting first makes the seeded
  # shuffle (and therefore the split) reproducible.
  files = sorted(os.listdir(dirtry))
  np.random.shuffle(files)
  n_files = len(files)

  for folder in folders:
    start, end = split_bounds[folder]
    images_to_pass = files[math.floor(start * n_files): math.floor(end * n_files)]
    for img in images_to_pass:
      # If this patient was already placed in a split, the image must follow
      # it there; otherwise the patient is assigned to the current split.
      patient = img[:patient_prefix_len]
      dest_split = patient_split.setdefault(patient, folder)
      shutil.copy(dirtry + '/' + img, os.path.join(base_outdir, dest_split, clss))

We check the number of images, and their total size on disk, for each class in each patient-level split.

In [None]:
# Per split and per class of the patient-level split: number of regular files
# and their cumulative size.
split = ["train", "val", "test"]
for subset in split:
  print(subset, " has:")
  subset_dir = os.path.join('/content/drive/MyDrive/Courses/Applied AI for Biomedicine/Project/data_splitted_patient', subset)
  subset_count = 0
  subset_bytes = 0
  for label in ["N","P","T"]:
    class_dir = os.path.join(subset_dir, label)
    # Keep regular files only; anything else in the folder is ignored
    entries = (os.path.join(class_dir, name) for name in os.listdir(class_dir))
    file_paths = [entry for entry in entries if os.path.isfile(entry)]
    n_images = len(file_paths)
    n_bytes = sum(os.path.getsize(file_path) for file_path in file_paths)
    print("   Class ", label, " has ", n_images, " images", "and the total size is:", n_bytes*1e-9, "GB")
    subset_count += n_images
    subset_bytes += n_bytes
  print(subset, " has ", subset_count, " images and", subset_bytes*1e-9, "GB of size\n")

train  has:
   Class  N  has  6647  images and the total size is: 1.7770441620000001 GB
   Class  P  has  3025  images and the total size is: 0.874228153 GB
   Class  T  has  1320  images and the total size is: 0.7481700910000001 GB
train  has  10992  images and 3.3994424060000004 GB of size

val  has:
   Class  N  has  1506  images and the total size is: 0.37489813200000005 GB
   Class  P  has  678  images and the total size is: 0.174640596 GB
   Class  T  has  303  images and the total size is: 0.14397784700000002 GB
val  has  2487  images and 0.6935165750000001 GB of size

test  has:
   Class  N  has  1201  images and the total size is: 0.283163518 GB
   Class  P  has  547  images and the total size is: 0.13495405400000002 GB
   Class  T  has  243  images and the total size is: 0.124119246 GB
test  has  1991  images and 0.542236818 GB of size

