In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read and written
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Packages

## Download packages

In [None]:
!pip install split_folders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting split_folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split_folders
Successfully installed split_folders-0.5.1


## Importing packages

In [None]:
import tensorflow as tf
import numpy as np
import os
import shutil 
import random
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from PIL import Image
from zipfile import ZipFile, error
from tqdm import tqdm
from pathlib import Path
import splitfolders
import math

# Short aliases for the Keras API and its layers namespace
tfk = tf.keras
tfkl = tfk.layers

In [None]:
# Use one seed for every random number generator so that runs are repeatable
seed = 42

os.environ['PYTHONHASHSEED'] = str(seed)

# Python's `random`, NumPy and both TensorFlow seeding entry points receive the same value
for set_seed in (random.seed, np.random.seed, tf.random.set_seed, tf.compat.v1.set_random_seed):
  set_seed(seed)

In [None]:
import warnings
import logging

# Ask TensorFlow's native logging to hide INFO and WARNING messages.
# NOTE(review): TensorFlow is already imported by an earlier cell; this variable may need to be
# set before `import tensorflow` to have any effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Ignoring the base Warning category also covers FutureWarning and every other warning subclass
warnings.simplefilter(action='ignore', category=Warning)

# Keep only errors from TensorFlow's Python-side loggers and silence AutoGraph output
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.autograph.set_verbosity(0)

# Creating data

In [None]:
# Everything lives under one Drive project folder; the other locations are derived from it
project_path = r'/content/drive/MyDrive/Courses/Applied AI for Biomedicine/Project'
initial_zip_path = f'{project_path}/train_set.zip'      # archive that gets extracted
initial_folder_path = f'{project_path}/data'            # extraction target
train_path = f'{initial_folder_path}/train'             # folder holding the images
csv_path = f'{train_path}/labels_train.csv'             # labels file as extracted (inside train)
csv2_path = f'{initial_folder_path}/labels_train.csv'   # labels file after it is moved one level up

### Preliminary

We unzip the data.

In [None]:
# Extract the archive member by member so that progress can be shown.
# Extraction stays best-effort (one bad member does not stop the run), but failures
# are recorded and reported instead of being silently discarded.
failed_members = []
with ZipFile(initial_zip_path) as zf:
  for member in tqdm(zf.infolist(), desc='Extracting '):
    try:
      zf.extract(member, initial_folder_path)
    except error as e:
      failed_members.append((member.filename, e))
# The `with` statement closes the archive; no explicit close() is needed

if failed_members:
  print(len(failed_members), 'member(s) could not be extracted:')
  for failed_name, failure in failed_members:
    print('  ', failed_name, '->', failure)

Extracting : 100%|██████████| 15471/15471 [02:58<00:00, 86.63it/s] 


After unzipping, the 'data' folder contains a 'train' folder that holds both the images and the CSV file with the targets. We move the CSV file one directory up, so that 'data' contains the CSV file with the targets next to the 'train' folder with the images.

In [None]:
# Move the labels CSV from the train folder into its parent directory.
# The move is skipped when the file is already at the target, so the cell can be re-run safely.
p = Path(csv_path).absolute()
parent_dir = p.parents[1]
moved_csv = parent_dir / p.name
if p.exists():
  p.rename(moved_csv)
elif not moved_csv.exists():
  # Neither location has the file: fail loudly rather than continue without labels
  raise FileNotFoundError(f'Labels file found neither at {p} nor at {moved_csv}')
# Display where the labels file now lives
moved_csv

PosixPath('/content/drive/MyDrive/Courses/Applied AI for Biomedicine/Project/data/labels_train.csv')

### Make train folder

We count how many images and other docs we have in the train folder.

In [None]:
dir_path = train_path

# Only regular files directly inside the folder are considered; sub-directories are ignored
file_names = [
    entry for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
]

# Images are recognised by their extension, documents by the .csv extension
count_img = sum(1 for entry in file_names if entry.endswith((".png", ".jpeg")))
count_docs = sum(1 for entry in file_names if entry.endswith(".csv"))
print('Images count:', count_img, ". Docs count:", count_docs)


We load the targets from the CSV file, skipping the first row since it is the header.

In [None]:
# Load the ground-truth table: the header row is skipped and the two columns are named explicitly
df = pd.read_csv(
    csv2_path,
    skiprows=1,
    names=['file', 'label'],
)
# Number of labelled rows
len(df)

15470

In [None]:
# Copy of the table with its rows ordered by label
labels = df.sort_values(by='label')

# Distinct labels, in the order they appear in the sorted table
class_names = list(labels['label'].unique())

For each class we create a folder inside the train folder, and then we copy each image into the folder corresponding to its class.

In [None]:
# One sub-folder per class inside the train folder.
# exist_ok=True lets the cell be re-run after the folders have already been created.
for name_class in class_names:
    os.makedirs(os.path.join(train_path, name_class), exist_ok=True)

In [None]:
# For every class, copy the images listed for it in the table into that class's sub-folder
for label in class_names:
  class_dir = os.path.join(train_path, label)
  class_files = df.loc[df['label'] == label, 'file']
  for file_name in class_files:
    # Images that already have a copy in the class folder are left alone
    if os.path.exists(os.path.join(class_dir, file_name)):
      continue
    shutil.copy(os.path.join(train_path, file_name), class_dir)

We check the number of images in the new folders.

In [None]:
# Count the image files (by extension) found in each class folder
for i in class_names:
  dir_path = os.path.join(train_path, i)
  count = sum(
      1
      for entry in os.listdir(dir_path)
      if os.path.isfile(os.path.join(dir_path, entry))
      and entry.endswith((".png", ".jpeg"))
  )
  print("Class ", i, " has ", count, " images")

Class  N  has  9354  images
Class  P  has  4250  images
Class  T  has  1866  images


We check that the images have been placed in the right folders and we count them.

In [None]:
# Cross-check the table against the disk: every file listed for a class must exist in its folder
for label in class_names:
  expected_files = df.loc[df['label'] == label, 'file']
  count = 0
  for file_name in expected_files:
    if os.path.exists(os.path.join(train_path, label, file_name)):
      count += 1
    else:
      print("error")
  print("Class ", label, " has ", count, " images")

Class  N  has  9354  images
Class  P  has  4250  images
Class  T  has  1866  images


We delete the original images and we keep only the ones in the class folders.

In [None]:
# The original images sit directly inside the train folder, next to the class sub-folders
# (they were copied from there into the class folders), so that is the folder to clean up.
dir_name = train_path

for item in os.listdir(dir_name):
    item_path = os.path.join(dir_name, item)
    # Only top-level image files are removed; the class sub-folders and their contents stay untouched
    if os.path.isfile(item_path) and item.endswith((".png", ".jpeg")):
        os.remove(item_path)

### Check sizes and number of images

We count the number of files in each class folder and compute their total size.

In [None]:
# Report, for each class folder, how many files it holds and their combined size
for i in class_names:
  dir_path = os.path.join(train_path, i)
  # Full paths of the regular files in this class folder
  file_paths = [
      candidate
      for candidate in (os.path.join(dir_path, entry) for entry in os.listdir(dir_path))
      if os.path.isfile(candidate)
  ]
  count = len(file_paths)
  path_size = sum(os.path.getsize(candidate) for candidate in file_paths)
  print("Class ", i, " has ", count, " images", "and the total size is:", path_size*1e-9, "GB")

Class  N  has  9354  images and the total size is: 2.435105812 GB
Class  P  has  4250  images and the total size is: 1.183822803 GB
Class  T  has  1866  images and the total size is: 1.0162671840000002 GB


In [None]:
def get_size(start_path = None):
    """Return the total size in bytes of all files below ``start_path``.

    The directory tree is walked recursively and the size of every file is
    summed. Symbolic links are skipped.

    Parameters
    ----------
    start_path : str or path-like, optional
        Root directory to measure. When omitted, the current value of the
        notebook-level ``project_path`` is used; it is looked up at call time,
        so re-running the configuration cell is picked up without redefining
        this function.

    Returns
    -------
    int
        Sum of the sizes, in bytes, of all non-symlink files found.
    """
    if start_path is None:
        start_path = project_path

    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)

    return total_size

# Total size of everything under the project folder
total_bytes = get_size()
print(total_bytes, 'bytes')

8994285261 bytes
