## **EDA Notebook**

**IMPORTANT:** run `setup_data.py` before executing this notebook.

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import os


import cv2
import matplotlib.pyplot as plt
from PIL import Image


from utils.visualizations import plot_distribution_pie, plot_images_compare_magnification, check_image_resolutions
from utils.preproc import preproc_pipeline

In [None]:
# Load the per-image metadata table and show the number of missing values per column.
img_metadata_df = pd.read_csv('../image_metadata/image_data.csv')
img_metadata_df.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column (modifies the frame in place),
# then preview the first three remaining rows.
img_metadata_df.dropna(inplace=True)
img_metadata_df.head(3)

In [None]:
# Re-count missing values per column to confirm the in-place dropna removed them all.
img_metadata_df.isnull().sum()

In [None]:
def update_image_paths(metadata):
    """Point ``path_to_image`` at the ``../data/<Magnification>/<split>/`` layout.

    For every row, the file name is taken from the current ``path_to_image``
    value and looked up under ``../data/<Magnification>/`` in the ``train``,
    ``test`` and ``val`` folders, in that order. The first folder that
    contains the file wins. If the file is in none of them, a warning is
    printed and the folder component of the new path is the literal
    ``NOT FOUND`` so that unresolved rows are easy to spot.

    The lookup is done without adding temporary columns to ``metadata``, so
    pre-existing columns (including ones named ``image_name`` or
    ``image_location``) are left untouched, and an empty frame is handled.

    Args:
        metadata: DataFrame with at least the columns ``path_to_image`` and
            ``Magnification``.

    Returns:
        The same ``metadata`` object, with ``path_to_image`` rewritten in place.
    """
    split_folders = ('train', 'test', 'val')

    def resolve_path(original_path, magnification):
        # Only the file name survives from the old path; the folders are rebuilt.
        image_name = os.path.basename(original_path)

        for split in split_folders:
            candidate = os.path.join('..', 'data', magnification, split, image_name)
            if os.path.exists(candidate):
                return candidate

        print(f"WARNING: {image_name} not found in any folder.")
        return os.path.join('..', 'data', magnification, 'NOT FOUND', image_name)

    # A plain comprehension (rather than DataFrame.apply(axis=1)) always yields
    # exactly one value per row, which also covers the zero-row case.
    metadata['path_to_image'] = [
        resolve_path(original_path, magnification)
        for original_path, magnification in zip(
            metadata['path_to_image'], metadata['Magnification']
        )
    ]
    return metadata


# Rewrite the stored image paths (update_image_paths modifies the frame and returns it),
# then preview the first three rows.
img_metadata_df = update_image_paths(img_metadata_df)
img_metadata_df.head(3)

<i>The updated version of the image metadata CSV file contains the actual image paths.</i>

In [None]:
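# Export the updated metadata (uncomment to regenerate '../image_metadata/updated_image_data.csv', which the preprocessing cell below reads)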
#img_metadata_df.to_csv('../image_metadata/updated_image_data.csv', index=False)

In [None]:
# Build a summary table of image counts: one row per (class, cancer type),
# one column per magnification, plus per-class sub-totals and grand totals.
grouped = (
    img_metadata_df
    .groupby(['Benign or Malignant', 'Cancer Type', 'Magnification'])
    .size()
    .reset_index(name='Count')
)

pivot_table = grouped.pivot_table(
    index=['Benign or Malignant', 'Cancer Type'],
    columns='Magnification',
    values='Count',
    aggfunc='sum',
    fill_value=0,
)

# Order the columns by increasing magnification instead of alphabetically;
# a magnification missing from the data shows up as a column of zeros.
magnification_order = ['40X', '100X', '200X', '400X']
pivot_table = pivot_table.reindex(columns=magnification_order, fill_value=0)

pivot_table['Total'] = pivot_table.sum(axis=1)

# Sub-totals are grouped on the 'Benign or Malignant' index level rather than
# sliced by row position, so they stay correct however many cancer types each
# class has. Both aggregates are computed before any summary row is appended,
# so the summary rows never feed into each other.
class_sub_totals = pivot_table.groupby(level='Benign or Malignant').sum()
column_totals = pivot_table.sum(axis=0)

# Row labels come from the data itself, which also removes the misspelled
# hard-coded 'Maligant' label.
for class_label, sub_totals in class_sub_totals.iterrows():
    pivot_table.loc[('Sub Total', class_label), :] = sub_totals
pivot_table.loc[('Total', ''), :] = column_totals

# Appending rows through .loc upcasts the counts to float; restore integers.
pivot_table = pivot_table.astype(int)
pivot_table

In [None]:
# Plot how the images are distributed over the 'Benign or Malignant' column
# (plot_distribution_pie is a project helper from utils.visualizations).
plot_distribution_pie(img_metadata_df, 'Benign or Malignant')

In [None]:
# Plot how the images are distributed over the 'Cancer Type' column
# (plot_distribution_pie is a project helper from utils.visualizations).
plot_distribution_pie(img_metadata_df, 'Cancer Type')

In [None]:
# Collect the image resolutions with the project helper and load them into a
# two-column frame labelled width/height.
# NOTE(review): assumes the helper yields (width, height) pairs in that order — confirm.
resolutions = check_image_resolutions(img_metadata_df)
resolutions_df = pd.DataFrame(resolutions, columns=['width', 'height'])
# Print the DataFrame.describe() summary of both columns.
print("\nImage Resolutions:\n", resolutions_df.describe())

In [None]:
# Every distinct cancer type present in the metadata, in order of first appearance.
cancer_types = list(pd.unique(img_metadata_df['Cancer Type']))
# Magnification levels to compare, from lowest to highest.
magnifications = ['40X', '100X', '200X', '400X']

plot_images_compare_magnification(
    img_metadata_df,
    cancer_types,
    magnifications,
)

In [None]:
# Run the preprocessing pipeline on the 40X images for the binary task,
# reading the metadata CSV that holds the updated image paths.
(X_train, y_train, X_test, y_test, X_val, y_val) = preproc_pipeline(
    desired_magnification='40X',
    image_resolution=(224, 224),
    csv_path='../image_metadata/updated_image_data.csv',
    classification_type='binary',
)

In [None]:
# Show the shapes of the feature and label objects for the train, test and validation splits.
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

In [None]:
# TODO: why is there a difference of 1 between the '40X' total in pivot_table and y_train.shape[0]?