# Data Cleaning

Note: the cleaning in this notebook applies the reorientation/renumbering flags given in preprocessing.csv, to ensure data quality. See test_transformations.ipynb for the code that automates detecting when reorientation/renumbering is needed.

In [1]:
from PIL import Image
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import pandas as pd

# Move the working directory up one level before the project-local imports below
# (models, src.credentials) -- presumably so they resolve from the repository root; TODO confirm.
# NOTE(review): the path is relative to the current working directory, so re-running
# this cell in the same kernel moves up one more level each time.
os.chdir("..")

import torch
from torchvision import transforms
from models import U2NETP
import boto3
import src.credentials as credentials
import io

In [2]:
# Load the per-patient preprocessing flags.
# NOTE(review): this is a machine-specific absolute path; consider a path relative
# to the project root so the notebook runs elsewhere.
patients = pd.read_csv("/home/ra-ugrad/Documents/Haleigh/MedSegmentDeploy/preprocessing.csv")

# Convert the "Yes"/"No" strings of both flag columns into booleans.
# Any value other than exactly "Yes" or "No" is not in the mapping and becomes NaN.
map_bool = {"Yes": True, "No": False}
for flag_column in ("Reorientation", "Renumber"):
    patients[flag_column] = patients[flag_column].map(map_bool)

In [3]:
# S3 client authenticated with the keys stored in src/credentials.py.
s3 = boto3.client(
    's3',
    aws_access_key_id=credentials.ACCESS_KEY,
    aws_secret_access_key=credentials.SECRET_KEY,
)

# Download the segmentation list workbook from the raw-data bucket and parse it
# straight from memory (no temporary file on disk).
response = s3.get_object(Bucket='raw-data-mris-segs', Key='seg_list_test.xlsx')
data = response['Body'].read()
with io.BytesIO(data) as workbook:
    patient_data = pd.read_excel(workbook)

# One row per patient: slice count and the two brightness levels that were exported.
patient_data

Unnamed: 0,MRI/Patient ID,Number of Brightness Levels,Number of Slices,Brightness Level 1,Brightness Level 2
0,ACRIN 6698_207837,2,320,5,6
1,ACRIN 6698_277831,2,480,6,7
2,Duke_062,2,492,1,2
3,Duke_077,2,522,2,3


In [4]:
# Keep only the patients whose ID also appears in the S3 segmentation list.
in_seg_list = patients['Patient ID'].isin(patient_data['MRI/Patient ID'])
patients = patients.loc[in_seg_list]
patients

Unnamed: 0,Patient ID,Renumber,Reorientation
5,ACRIN 6698_207837,False,False
7,ACRIN 6698_277831,True,False
17,Duke_062,True,True
21,Duke_077,False,True


In [5]:
# Restrict this run to the 2nd and 3rd remaining patients (positional rows 1 and 2).
# NOTE(review): `patients` is overwritten, so re-running this cell slices the
# already-sliced frame again; restart from the load cell before re-running.
patients = patients.iloc[1:3]

In [6]:
# Display the rows left after the positional slice; these are the patients the
# cleaning loop iterates over.
patients

Unnamed: 0,Patient ID,Renumber,Reorientation
7,ACRIN 6698_277831,True,False
17,Duke_062,True,True


In [8]:
new_bucket = 'cleaned-mri-data'
old_bucket = 'raw-data-mris-segs'

# For every selected patient, copy each MRI slice (at both exported brightness
# levels) and its segmentation from the raw bucket into the cleaned bucket.
#   * Reorientation -> the MRI slice is rotated by 180 degrees.
#   * Renumber      -> the MRI slice order is reversed (slice i becomes slices - i + 1).
#   * The source brightness folders (e.g. 6 and 7) are renamed to 1 and 2.
# NOTE(review): segmentations are copied without rotation or renumbering --
# presumably the flags describe how the MRI must change to line up with the
# segmentation; confirm.
for patient in patients['Patient ID']:
    row = patients[patients['Patient ID'] == patient]
    renum = row.Renumber.item()
    reorient = row.Reorientation.item()

    # A flag that failed the "Yes"/"No" mapping is NaN, and NaN is truthy: it would
    # silently trigger the transformation. Fail loudly instead.
    for flag_name, flag_value in (("Renumber", renum), ("Reorientation", reorient)):
        if flag_value not in (True, False):
            raise ValueError(
                f"{flag_name} flag for patient {patient} is {flag_value!r}; expected True or False"
            )

    # Look the patient's metadata row up once instead of re-filtering per column.
    patient_meta = patient_data[patient_data['MRI/Patient ID'] == patient]
    slices = patient_meta['Number of Slices'].item()
    level_1 = patient_meta['Brightness Level 1'].item()
    level_2 = patient_meta['Brightness Level 2'].item()
    levels = [level_1, level_2]
    print(slices)

    for i in range(1, slices+1):
        img_num = str(i).zfill(5)
        # Destination slice number: reversed when renumbering, unchanged otherwise.
        new_img_num = str(slices - (i-1)).zfill(5) if renum else img_num

        for n, b in enumerate(levels):
            # get image
            old_key = f"MRIs/{patient}/MRI PNGs/Brightness level {b}/png_{img_num}.png"
            response = s3.get_object(Bucket=old_bucket, Key=old_key)
            image_data = response['Body'].read()
            img = Image.open(io.BytesIO(image_data))

            # perform operations as needed
            pixels = np.array(img)
            if reorient:
                pixels = cv2.rotate(pixels, cv2.ROTATE_180)
            success, modified = cv2.imencode('.png', pixels)
            if not success:
                # Never upload a buffer that OpenCV reported it could not encode.
                raise RuntimeError(f"PNG encoding failed for {old_key}")

            # Destination folders are always "Brightness level 1" / "Brightness level 2".
            new_key = f"MRIs/{patient}/MRI PNGs/Brightness level {n+1}/png_{new_img_num}.png"

            # save image to new location
            buffer = io.BytesIO(modified.tobytes())
            s3.upload_fileobj(buffer, new_bucket, new_key, ExtraArgs={'ContentType': 'image/png'})

        # save segmentation to new location -- once per slice; it does not depend on
        # the brightness level, so copying it inside the loop above only repeated the
        # same download and upload.
        seg_key = f"MRIs/{patient}/Seg PNGs/segpng_{img_num}.png"
        response = s3.get_object(Bucket=old_bucket, Key=seg_key)
        seg_data = response['Body'].read()
        seg = Image.open(io.BytesIO(seg_data))
        buffer = io.BytesIO()
        seg.save(buffer, format='PNG')
        buffer.seek(0)
        # Same ContentType as the MRI uploads so both object kinds are served as PNG.
        s3.upload_fileobj(buffer, new_bucket, seg_key, ExtraArgs={'ContentType': 'image/png'})

480
492
