In [None]:
import numpy as np
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import HTML
from glob import glob
import matplotlib.pyplot as plt
from collections import defaultdict
from openslide import OpenSlide
from PIL import Image
import os
import cv2
import gc


In [None]:
# Embed the "Stroke Prevention & Transient Ischemic Attack (TIA)" YouTube video.
# The markup is assembled from adjacent literals purely for readability; the
# resulting string is identical to the single-line form.
intro_video_iframe = (
    '<iframe width="675" height="506" '
    'src="https://www.youtube.com/embed/3CInkjVReDA" '
    'title="Stroke Prevention & Transient Ischemic Attack (TIA)" '
    'frameborder="0" '
    'allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" '
    'allowfullscreen></iframe>'
)
HTML(intro_video_iframe)


In [None]:
# Load the competition's training table (one row per image; later cells use
# its image_id, patient_id, center_id and label columns).
train_csv_path = '../input/mayo-clinic-strip-ai/train.csv'
train_df = pd.read_csv(train_csv_path)

In [None]:
# Number of rows (i.e. images) recorded for each patient.
count_patients = train_df.groupby("patient_id")["patient_id"].count()

print("There are {} unique patients".format(train_df["patient_id"].nunique()))

# Patients that contributed more than one image.
num_images = count_patients.loc[count_patients.gt(1)]
print("There are {} unique patients who have images greater than 1".format(len(num_images)))

print("Lets Visulaize patients who have images > 2")

# Only patients with three or more images are drawn in the pie chart.
num_images = count_patients.loc[count_patients.gt(2)]

fig = go.Figure(
    data=[
        go.Pie(
            labels=num_images.index,
            values=num_images.values,
            hole=.25,
            sort=True,
            textinfo='value',
        )
    ]
)
fig.update_layout(title='Images Distribution', width=700)
fig.show()

In [None]:
# Store center ids as strings (presumably so the chart treats them as
# categories rather than numbers -- confirm against the rendered axis).
train_df["center_id"] = train_df["center_id"].astype("string")

# Images per center.
num_centers = train_df.groupby("center_id")["center_id"].count()

center_bars = go.Bar(
    x=num_centers.index,
    y=num_centers.values,
    text=num_centers.values,
    texttemplate='%{text:.0f}',
    textposition='inside',
    insidetextanchor="middle",
)

fig = go.Figure(data=[center_bars])
fig.update_layout(xaxis_title='center_id', yaxis_title='Number of Images')
# Order the bars from the busiest center to the least busy one.
fig.update_xaxes(categoryorder='total descending')

fig.show()

In [None]:
print("Lets check how many unique patients are in centers")

# Distinct patients per center. The built-in groupby reducer replaces the
# per-group Python lambda (`len(np.unique(x))`): same counts for non-missing
# ids, no Python-level call per group, and missing ids are simply not counted.
num_patients = train_df.groupby("center_id")["patient_id"].nunique()

fig=go.Figure()
fig.add_trace(go.Bar(x=num_patients.index, y=num_patients.values,
                     text=num_patients.values, texttemplate='%{text:.0f}', 
                     textposition='inside',insidetextanchor="middle"))

# Order the bars from the center with the most patients to the one with the fewest.
fig.update_layout(xaxis_title='center_id', yaxis_title='Number of Patients',
                  xaxis={'categoryorder':'total descending'})

fig.show()



In [None]:
print("Lets see how many patients are with CE and LAA")

# NOTE: the misspelled `*_pateints` names are kept unchanged because they are
# notebook-level variables that other cells may reference.

# Rows labelled CE, then distinct CE patients per center.
CE_pateints = train_df[train_df["label"]=="CE"]
CE_count = CE_pateints.groupby("center_id")["patient_id"].nunique()

# Rows labelled LAA, then distinct LAA patients per center.
LAA_pateints = train_df[train_df["label"]=="LAA"]
LAA_count = LAA_pateints.groupby("center_id")["patient_id"].nunique()

fig = go.Figure(data=[
    go.Bar(name='CE', x=CE_count.index, y=CE_count.values, hovertemplate = "%{value}"),
    go.Bar(name='LAA', x=LAA_count.index, y=LAA_count.values, hovertemplate = "%{value}")
])

# Axis titles added so the figure is self-explanatory, matching the other
# per-center bar charts in this notebook.
fig.update_layout(barmode='group',
                  xaxis_title='center_id', yaxis_title='Number of Patients',
                  xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
# Paths of every .tif file in the competition's train directory.
train_image_pattern = "/kaggle/input/mayo-clinic-strip-ai/train/*.tif"
train_images = glob(train_image_pattern)

In [None]:
# Collect per-slide metadata (pixel count, aspect ratio, file size) for every
# training image, then attach the labels from train_df.
img_dict = defaultdict(list)

for path in train_images:

    # Open each slide in a context manager so its handle is closed as soon as
    # the dimensions are read; previously one open handle leaked per file.
    with OpenSlide(path) as slide:
        width, height = slide.dimensions

    # Take the id from the file name (basename without extension) instead of a
    # fixed-width slice, so ids of any length are extracted correctly.
    image_id = os.path.splitext(os.path.basename(path))[0]

    img_dict['image_id'].append(image_id)
    img_dict['pixels'].append(width * height)
    img_dict['aspect_ratio'].append(width / height)
    img_dict['size_MB'].append(round(os.path.getsize(path) / 1e6, 2))
    img_dict['path'].append(path)

# Largest files first, then join with the training table on image_id.
# Built as one chain (no inplace=True) so re-running the cell is idempotent.
image_data = (
    pd.DataFrame(img_dict)
    .sort_values(by='size_MB', ascending=False)
    .reset_index(drop=True)
    .merge(train_df, on='image_id')
)
image_data.head()

In [None]:
# Distribution of on-disk file sizes (MB) across the training images.
size_histogram = go.Histogram(
    name="images",
    x=image_data['size_MB'],
    hovertemplate = "Number of images: %{y}, Range: %{x}",
    opacity=0.65,
)

fig = go.Figure(data=[size_histogram])
fig.update_layout(xaxis_title='size of images', yaxis_title='Number of images')

fig.show()

In [None]:
# Pixel-count ceiling used as the cv2 read limit in this notebook (2**30).
cv_max_image_pixels = 1 << 30

# Keep only the images whose pixel count does not exceed that ceiling.
within_cv2_limit = image_data["pixels"].le(cv_max_image_pixels)
min_pxl_imgs = image_data.loc[within_cv2_limit]
print("There are {} images which is less than cv2 threshold".format(len(min_pxl_imgs)))

In [None]:
# Split the paths of the cv2-readable images by their label.
slide_labels = min_pxl_imgs["label"]
CE_paths = min_pxl_imgs.loc[slide_labels.eq("CE"), "path"]
LAA_paths = min_pxl_imgs.loc[slide_labels.eq("LAA"), "path"]

In [None]:
# Display the first CE image without any downscaling.
img_path = CE_paths.iloc[0]
fig, axes = plt.subplots(figsize=(20, 20))

# cv2 returns BGR channel order; convert to RGB before handing it to matplotlib.
img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
axes.imshow(img)

# Drop our reference to the pixel array and trigger a collection pass.
del img
gc.collect()


In [None]:
from openslide.deepzoom import DeepZoomGenerator

# Tile the first CE slide with Deep Zoom and display one tile from a fixed level.
img_path = CE_paths.iloc[0]
slide = OpenSlide(img_path)

# Deep Zoom level to read from, and the tile position within that level.
# LEVEL must be smaller than deep_tile.level_count for the chosen slide.
LEVEL = 16
ROW = 2
COLUMN = 3

deep_tile = DeepZoomGenerator(slide, tile_size=2000, overlap=0, limit_bounds=False)

print("There are {} Levels".format(deep_tile.level_count))

print("Total number of dimensions in each level are\n{} ".format(deep_tile.level_dimensions))

# Each level_tiles entry is (tiles_x, tiles_y), i.e. (columns, rows); unpack it
# so the message reports rows and columns the right way round (they were
# previously swapped).
tiles_across, tiles_down = deep_tile.level_tiles[LEVEL]

print("There are {} rows and {} columns in tiled image at {}th level".format(tiles_down,
                                                                           tiles_across,
                                                                           LEVEL + 1))

print("Below image is placed at {} row and {} column".format(ROW, COLUMN))

# get_tile takes the address as (column, row); passing (ROW, COLUMN) fetched a
# different tile than the one announced above.
tile_image = deep_tile.get_tile(LEVEL, (COLUMN, ROW))
tile_image = tile_image.convert('RGB')

plt.figure(figsize=(20, 20))
plt.imshow(tile_image)
plt.show()

In [None]:
#https://www.kaggle.com/code/jirkaborovec/bloodclots-eda-load-wsi-prune-background

def prune_image_rows_cols(im, mask, thr=0.960):
    """Drop the columns and rows of ``im`` that the mask marks as mostly set.

    A column is removed when the fraction of set entries in that mask column
    exceeds ``thr``; a row is removed when the fraction of set entries in that
    mask row exceeds ``thr``. Both fractions are measured on the full,
    unpruned mask, so removing columns does not change which rows qualify.

    Parameters
    ----------
    im : numpy.ndarray
        Image array whose first two axes are (rows, columns).
    mask : numpy.ndarray
        2-D array with the same rows/columns as ``im``; truthy entries count
        towards the fraction.
    thr : float, optional
        Fraction strictly above which a row/column is dropped.

    Returns
    -------
    numpy.ndarray
        A new array holding only the kept rows and columns (``im`` itself is
        not modified).

    Notes
    -----
    Rows and columns are selected in one pass with boolean indexing. Calling
    ``np.delete`` once per pruned row/column, as before, copies the whole
    image each time.
    """
    n_rows, n_cols = mask.shape[0], mask.shape[1]

    # Fraction of set mask entries per column / per row.
    col_fraction = np.sum(mask, axis=0) / float(n_rows)
    row_fraction = np.sum(mask, axis=1) / float(n_cols)

    # Written as "not (fraction > thr)" so that NaN fractions (empty axis)
    # are kept, exactly as a plain `> thr` deletion test would keep them.
    keep_cols = ~(col_fraction > thr)
    keep_rows = ~(row_fraction > thr)

    return im[keep_rows][:, keep_cols]

def mask_median(im, val=255, tol=5):
    """Overwrite pixels that are near-or-above the median in all three channels.

    For each of the first three channels a pixel is flagged when its value is
    at least that channel's median minus ``tol``. Pixels flagged in *every*
    channel are overwritten with ``val``.

    Parameters
    ----------
    im : numpy.ndarray
        Image array with at least three channels on the last axis. It is
        modified in place.
    val : scalar, optional
        Value written to every channel of the selected pixels.
    tol : scalar, optional
        Margin subtracted from each channel median before comparing.

    Returns
    -------
    tuple
        ``(im, mask)`` where ``im`` is the same (now modified) array and
        ``mask`` is the boolean array of overwritten pixels.

    Notes
    -----
    The three channel masks are combined with ``np.logical_and.reduce``.
    ``np.logical_and(*masks)`` with three arrays treats the third one as the
    ``out`` buffer, so the third channel was ignored (and its mask clobbered)
    instead of taking part in the AND.
    """
    # All medians/masks are computed before `im` is modified below.
    channel_masks = [im[..., c] >= np.median(im[:, :, c]) - tol for c in range(3)]
    mask = np.logical_and.reduce(channel_masks)
    im[mask, :] = val
    return im, mask

### Let's look at CE images and compare them with their normalized versions

In [None]:
# Show the first five CE images: each downscaled original is followed, on the
# next row, by its version with median-masked pixels whitened and mostly
# masked rows/columns pruned.
fig, axes = plt.subplots(nrows=10, ncols=1, figsize=(20, 20))

for k in range(5):
    ax_original, ax_normalized = axes[2 * k], axes[2 * k + 1]

    slide_path = CE_paths.iloc[k]
    full_res = cv2.imread(slide_path)
    if full_res is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise IOError("cv2 could not read image: {}".format(slide_path))

    # Shrink first, then convert BGR -> RGB: resizing works per channel, so the
    # pixels are the same, but no second full-resolution copy is created.
    img = cv2.resize(full_res, (0,0), fx=0.05, fy=0.05)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    del full_res  # release the full-resolution buffer before the next image

    # Lay portrait images on their side so every panel is landscape.
    if img.shape[0] > img.shape[1]:
        img = np.transpose(img, (1, 0, 2))
    ax_original.imshow(img)
    ax_original.set_title("Original")

    # mask_median writes into its argument, so hand it a copy.
    img, mask = mask_median(np.array(img))
    img = prune_image_rows_cols(img, mask)

    ax_normalized.imshow(img)
    ax_normalized.set_title("Normalized")

# Lay out after drawing so the titles are taken into account, and use
# plt.show(): Figure.show() only warns on non-GUI (inline) backends.
fig.tight_layout()
plt.show()


### Let's look at LAA images and compare them with their normalized versions

In [None]:
# Show the first five LAA images: each downscaled original is followed, on the
# next row, by its version with median-masked pixels whitened and mostly
# masked rows/columns pruned.
fig, axes = plt.subplots(nrows=10, ncols=1, figsize=(20, 20))

for k in range(5):
    ax_original, ax_normalized = axes[2 * k], axes[2 * k + 1]

    slide_path = LAA_paths.iloc[k]
    full_res = cv2.imread(slide_path)
    if full_res is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise IOError("cv2 could not read image: {}".format(slide_path))

    # Shrink first, then convert BGR -> RGB: resizing works per channel, so the
    # pixels are the same, but no second full-resolution copy is created.
    img = cv2.resize(full_res, (0,0), fx=0.05, fy=0.05)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    del full_res  # release the full-resolution buffer before the next image

    # Lay portrait images on their side so every panel is landscape.
    if img.shape[0] > img.shape[1]:
        img = np.transpose(img, (1, 0, 2))
    ax_original.imshow(img)
    ax_original.set_title("Original")

    # mask_median writes into its argument, so hand it a copy.
    img, mask = mask_median(np.array(img))
    img = prune_image_rows_cols(img, mask)

    ax_normalized.imshow(img)
    ax_normalized.set_title("Normalized")

# Lay out after drawing so the titles are taken into account, and use
# plt.show(): Figure.show() only warns on non-GUI (inline) backends.
fig.tight_layout()
plt.show()