<a href="https://colab.research.google.com/github/ian-byrne/MADSmilestone2/blob/main/ian_testingground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook used to test project ideas
Author: Ian Byrne

Email: ianbyrne@umich.edu

In [7]:
# Mount Google Drive at /content/gdrive; the image paths used later in this
# notebook all live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Reading in data stream

In [1]:
# Clone the entire repo.
# NOTE(review): -l / -s are git options for cloning from a local repository;
# confirm they are actually needed with this HTTPS URL.
!git clone -l -s https://github.com/ian-byrne/MADSmilestone2.git

# Change directory into cloned repo so that the relative paths used below
# (the `Loading` package import and the `Data/...` text files) resolve.
%cd MADSmilestone2

# List repo contents
!ls

Cloning into 'MADSmilestone2'...
remote: Enumerating objects: 432, done.[K
remote: Counting objects: 100% (432/432), done.[K
remote: Compressing objects: 100% (360/360), done.[K
remote: Total 432 (delta 240), reused 160 (delta 66), pack-reused 0[K
Receiving objects: 100% (432/432), 3.23 MiB | 4.23 MiB/s, done.
Resolving deltas: 100% (240/240), done.
/content/MADSmilestone2
Data			 Images.ipynb  Labeling  README.md
ian_testingground.ipynb  Images.py     Loading	 Supervised_CNN.ipynb


Load imports as necessary

In [3]:
from tqdm import tqdm

In [4]:
import Loading.load_data as loading

import io
import logging
import os
import ast
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import glob



In [None]:
# List the .tif clock drawings stored on the mounted Drive.
path = '/content/gdrive/MyDrive/Data/Nhats Dataset/NHATS_R11_ClockDrawings_V2'
# glob returns matches in a filesystem-dependent order; sorting keeps the
# preview below (and anything indexing into `files`) reproducible across runs.
files = sorted(glob.glob(os.path.join(path, '*.tif')))
files[:5]

['/content/gdrive/MyDrive/Data/Nhats Dataset/NHATS_R11_ClockDrawings_V2/10000008.tif',
 '/content/gdrive/MyDrive/Data/Nhats Dataset/NHATS_R11_ClockDrawings_V2/10000019.tif',
 '/content/gdrive/MyDrive/Data/Nhats Dataset/NHATS_R11_ClockDrawings_V2/10000021.tif',
 '/content/gdrive/MyDrive/Data/Nhats Dataset/NHATS_R11_ClockDrawings_V2/10000022.tif',
 '/content/gdrive/MyDrive/Data/Nhats Dataset/NHATS_R11_ClockDrawings_V2/10000036.tif']

~~Import dataframe containing spid, description of clock, diagnosis, round~~

Import the text file containing the dictionary of rounds and their spids and labels.
- Transform into a nested list to make loops easier. 

In [None]:
# Read the round dictionary, which is stored on disk as a Python literal.
# The context manager closes the handle even if reading fails; the original
# open() call was never paired with a close().
with open("Data/roundIdLabelDict.txt", "r") as file:
    contents = file.read()

# literal_eval only accepts Python literals, so the file content is parsed
# without being executed as code.
round_dict = ast.literal_eval(contents)

In [None]:
# Peek at the first entry stored under round 4 to check the structure
# (the dataset classes below read index 0 as the spid and index 1 as the label).
round_dict[4][0]

('10001875', 0)

### Custom data set, idea:
- take in the dataframe from the rounds_df
- utilize the spid number and the round number to load the correct .tif image from Google Drive
- tbd: any transforms to be done on the images to prepare for learning
    - This can likely be used to loop through the large images, convert them to smaller images, and then save them to a new directory (or locally) to make the data more manageable
- Should return the image in tensor format and the label (diagnosis)

In [None]:
class ClockImages(Dataset):
    """Original large-format clock drawing dataset.

    Each item is the drawing stored as ``<spid>.tif`` under ``base_path``,
    resized and returned together with an output name that is prefixed with
    the round number.
    """

    def __init__(self, round, round_labels,
                 base_path='/content/gdrive/MyDrive/Data/Nhats Dataset/NHATS_R11_ClockDrawings_V2'):
        """
        Args:
            round (int): Round to grab images from.
            round_labels (list of tuples): Entries for the round; index 0 of
                each entry is the spid and index 1 is the label.
            base_path (str): Directory that holds the ``<spid>.tif`` files.
                Defaults to the shared Google Drive folder.
        """
        self.round = round
        self.vals = round_labels
        self.base_path = base_path

    def __len__(self):
        """Return the number of entries available for this round."""
        return len(self.vals)

    def __getitem__(self, idx):
        """Load and resize the drawing at position ``idx``.

        Args:
            idx (int): Position in the list of entries for this round.

        Returns:
            dict: ``'image'`` holds the resized drawing as a NumPy array and
            ``'name'`` holds ``"<round>_<spid>.tif"``.

        Raises:
            FileNotFoundError: If there is no ``.tif`` file for the spid.
        """
        spid = self.vals[idx][0]
        filename = f"{spid}.tif"
        file_path = os.path.join(self.base_path, filename)

        # Open through a context manager so the file handle is released as
        # soon as the resized copy exists, rather than whenever the image
        # object happens to be garbage-collected.
        with Image.open(file_path) as im:
            # PIL sizes are (width, height).
            resized = im.resize((160, 207))
        im_arr = np.array(resized)

        return {'image': im_arr, 'name': str(self.round) + "_" + filename}

In [None]:
# Resize every round-10 drawing and save each one as its own TIFF.
output_dir = "resized_clocks"
os.makedirs(output_dir, exist_ok=True)

data = ClockImages(10, round_dict[10])
clock_dataloader = DataLoader(dataset=data)
for sample in tqdm(clock_dataloader):

    # The DataLoader batches samples (default batch size is 1), so take
    # element 0 and convert it from tensor back to array.
    im_arr = np.asarray(sample['image'][0])

    # convert back to image
    im = Image.fromarray(im_arr)

    # Save under the per-sample name ("<round>_<spid>.tif"). Writing every
    # image to one fixed file name would leave only the last image on disk.
    name = sample['name'][0]
    im.save(os.path.join(output_dir, name), "TIFF")
    print(f"Processed: {name}")

## Resizing images, converting any RGB images to grayscale, setting to binary, and saving into Train, Validate, and Test image and label NumPy array files

In [20]:
class ResizedClocks(Dataset):
    """Resized clock drawing dataset.

    Each item is the drawing stored as ``<spid>.tif`` under ``base_path``,
    converted to PIL mode ``'1'`` (1-bit black and white), resized, and
    returned as an integer array together with its label.
    """

    def __init__(self, round, round_labels,
                 base_path='/content/gdrive/MyDrive/Data/Nhats Dataset/NHATS_R11_ClockDrawings_V2'):
        """
        Args:
            round (int): Round to grab images from.
            round_labels (list of tuples): Entries for the round; index 0 of
                each entry is the spid and index 1 is the label.
            base_path (str): Directory that holds the ``<spid>.tif`` files.
                Defaults to the shared Google Drive folder.
        """
        self.round = round
        self.vals = round_labels
        self.base_path = base_path

    def __len__(self):
        """Return the number of entries available for this round."""
        return len(self.vals)

    def __getitem__(self, idx):
        """Load, binarize and resize the drawing at position ``idx``.

        Args:
            idx (int): Position in the list of entries for this round.

        Returns:
            dict or None: ``{'image': <int array>, 'label': <label>}``, or
            ``None`` when the ``.tif`` file for the spid does not exist, so
            callers must skip ``None`` items.
        """
        spid = self.vals[idx][0]
        label = self.vals[idx][1]
        filename = f"{spid}.tif"
        file_path = os.path.join(self.base_path, filename)

        try:
            # The context manager releases the file handle right away; this
            # matters when iterating over thousands of drawings.
            with Image.open(file_path) as im:
                # convert() returns a new, fully loaded image, so it remains
                # usable after the source file is closed.
                binary = im.convert('1')
        except FileNotFoundError:
            # Not every spid has a drawing on disk; signal "skip" to the caller.
            return None

        # PIL sizes are (width, height).
        resized = binary.resize((160, 207))
        im_arr = np.array(resized).astype(int)

        return {'image': im_arr, 'label': label}





"""Load the dictionary with IDs, rounds and labels
# train_dict.txt
# test_dict.txt
# val_dict.txt"""
# Read the split dictionary (stored as a Python literal) and parse it.
# The context manager closes the file even if read() raises; with a manual
# close() after the parse, an error would have left the handle open.
with open("Data/train_dict.txt", "r") as customDict_file:
    contents = customDict_file.read()

# The loop below indexes `dictionary` by round number.
dictionary = ast.literal_eval(contents)





"""Create arrays to store the data and labels
Create separate files for train, val, test
by passing in only the corresponding dictionary """
# Gather images and labels from every selected round into two flat lists,
# then write each list to disk once.
image_array, label_array = [], []
round_val = list(range(1, 9))  # rounds 1-8; rounds 9 and 10 are left out

for val in round_val:
    data = ResizedClocks(val, dictionary[val])
    for sample in tqdm(data):
        # The dataset yields None for entries whose image file is missing.
        if sample is None:
            continue
        try:
            image_array.append(sample['image'])
            label_array.append(sample['label'])
        except TypeError:
            # Skip anything that cannot be indexed like a sample dict.
            continue

# NOTE(review): `val` still holds the last round from the loop above, so the
# file names carry that number even though the arrays span all rounds.
np.save('train_images{}.npy'.format(val), image_array)  # save images
np.save('train_labels{}.npy'.format(val), label_array)  # save labels

100%|█████████████████| 1/1 [00:00<00:00,  2.24it/s]
