In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm
import os
import matplotlib.pyplot as plt
import skimage.io
from skimage import io
import cv2
import PIL
import plotly.express as px

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow.keras import backend as K

# CSV Files

## Train File

In [None]:
# Read the competition's training metadata table and preview the first rows.
train_csv_path = "/kaggle/input/prostate-cancer-grade-assessment/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Number of missing (NaN) entries in each column of the training table.
train_df.isnull().sum()

In [None]:
# Number of distinct values in each column (computed column-wise).
train_df.nunique(axis=0)

In [None]:
# List the distinct values of every column except the identifier column.
for col in train_df.columns.drop("image_id"):
    print(f"Unique values in the Column {col}: {train_df[col].unique()}")

In [None]:
# Distinct Gleason scores in sorted order; reused by both reports below.
gleason_levels = sorted(train_df["gleason_score"].unique())

# Report 1: which ISUP grades occur together with each Gleason score.
for gleason in gleason_levels:
    isup_values = train_df.loc[train_df["gleason_score"] == gleason, "isup_grade"].unique()
    print(f"Unique ISUP Grade of {gleason} Gleason Score: {isup_values}")

print("")

# Report 2: how many rows carry each Gleason score.
for gleason in gleason_levels:
    n_entries = train_df.loc[train_df["gleason_score"] == gleason, "gleason_score"].count()
    print(f"Number of {gleason} Entries: {n_entries}")



In [None]:
# Remove rows whose Gleason score is "4+3" but whose ISUP grade is 2
# (treated as mislabelled), then recode the "negative" Gleason score as "0+0".
is_mislabelled = train_df["gleason_score"].eq("4+3") & train_df["isup_grade"].eq(2)
train_df = train_df.drop(index=train_df.loc[is_mislabelled].index)
train_df["gleason_score"] = train_df["gleason_score"].replace("negative", "0+0")

In [None]:
# Bar chart: number of images per ISUP grade.
dp = train_df.groupby("isup_grade", as_index=False)["image_id"].count()
fig = px.bar(dp, x="isup_grade", y="image_id")
fig.update_layout(
    title="Distribution of ISUP Grade",
    xaxis_title="ISUP Grade",
    yaxis_title="Count",
)
fig.show()

In [None]:
# Bar chart: number of images contributed by each data provider.
dp = train_df.groupby("data_provider", as_index=False)["image_id"].count()

fig = px.bar(dp, x="data_provider", y="image_id")
fig.update_layout(
    title="Data per Data Provider",
    xaxis_title="Data Provider",
    yaxis_title="Count",
)
fig.show()

In [None]:
# Grouped bar chart: image count per ISUP grade, one bar colour per data provider.
dp = train_df.groupby(["data_provider", "isup_grade"], as_index=False)["image_id"].count()

fig = px.bar(
    dp,
    x="isup_grade",
    y="image_id",
    color="data_provider",
    barmode="group",
)
fig.update_layout(
    title="Distribution of ISUP Grade and Data Provider",
    xaxis_title="ISUP Grade",
    yaxis_title="Count",
    legend_title="Data Provider",
)
fig.show()

In [None]:
# Grouped bar chart: image count per Gleason score, one bar colour per data provider.
dp = train_df.groupby(["data_provider", "gleason_score"], as_index=False)["image_id"].count()

fig = px.bar(
    dp,
    x="gleason_score",
    y="image_id",
    color="data_provider",
    barmode="group",
)
fig.update_layout(
    title="Distribution of Gleason Score per Data Provider",
    xaxis_title="Gleason Score",
    yaxis_title="Count",
    legend_title="Data Provider",
)
fig.show()

# Image Data

In [None]:
# Directories holding the biopsy images and their label masks.
# Both end with a slash because file names are appended by plain string concatenation.
data_path = "/kaggle/input/prostate-cancer-grade-assessment/train_images/"
mask_path = "/kaggle/input/prostate-cancer-grade-assessment/train_label_masks/"

In [None]:
# Load one 512x512 thumbnail per biopsy file.
# train_data[i] and train_id[i] describe the same file; entries follow the
# order returned by os.listdir.
train_data, train_id = [], []

for biop in tqdm(os.listdir(data_path)):
    # File name without its last five characters
    # (presumably the ".tiff" extension; confirm against the dataset).
    train_id.append(biop[:-5])

    # Take the last image of the multi-image file and shrink it to 512x512.
    biopsy = skimage.io.MultiImage(data_path + biop)
    img = cv2.resize(biopsy[-1], (512, 512))
    train_data.append(img)

# Drop the reference to the last opened multi-image.
del biopsy

In [None]:
# Load one 512x512 label mask per mask file.
# mask_data[i] and mask_id[i] describe the same file; entries follow the
# order returned by os.listdir.
mask_data = []
mask_id = []

for pic in tqdm(os.listdir(mask_path)):

    # File name without its last ten characters
    # (presumably the "_mask.tiff" suffix; confirm against the dataset).
    mask_id.append(pic[:-10])
    mask_img = skimage.io.MultiImage(mask_path + pic)

    # Keep only the first (R) channel of the last image in the file; the other
    # channels are discarded. The mask holds categorical label values, so it is
    # resized with nearest-neighbour interpolation: cv2.resize defaults to
    # bilinear interpolation, which averages neighbouring labels and produces
    # values belonging to a different class (e.g. 1 and 3 blend into 2).
    img = cv2.resize(
        mask_img[-1][:, :, 0],
        (512, 512),
        interpolation=cv2.INTER_NEAREST,
    )
    mask_data.append(img)

# Drop the references to the last opened mask and its thumbnail.
del mask_img
del img

## Compare the following image IDs

- 0cc35bc0fe4dd912b20f72d66888fd49  Karolinska
- 0018ae58b01bdadc8e347995b69f99aa  Radboud
- 001c62abd11fa4b57bf7a6c603a11bb9	Karolinska

Radboud: Prostate glands are individually labelled. Valid values are:
- 0: background (non tissue) or unknown
- 1: stroma (connective tissue, non-epithelium tissue)
- 2: healthy (benign) epithelium
- 3: cancerous epithelium (Gleason 3)
- 4: cancerous epithelium (Gleason 4)
- 5: cancerous epithelium (Gleason 5)

Karolinska: Regions are labelled. Valid values are:
- 0: background (non tissue) or unknown
- 1: benign tissue (stroma and epithelium combined)
- 2: cancerous tissue (stroma and epithelium combined)

In [None]:
# Discrete colour map for the label masks: list position i is the colour used
# for mask value i when plotted with vmin=0 and vmax=5.
import matplotlib

cmap = matplotlib.colors.ListedColormap(
    [
        "black",   # value 0
        "gray",    # value 1
        "green",   # value 2
        "yellow",  # value 3
        "orange",  # value 4
        "red",     # value 5
    ]
)

In [None]:
# Pair every thumbnail with its image id, one row per file.
# Row order is the order in which the files were loaded.
train_img = pd.DataFrame(
    {
        "img": train_data,
        "image_id": train_id,
    }
)
mask_img = pd.DataFrame(
    {
        "img": mask_data,
        "image_id": mask_id,
    }
)

## Karolinska Picture

In [None]:
# Show the biopsy thumbnail stored at row position 0.
# NOTE(review): the position depends on the file listing order; selecting by
# image_id would be more robust.
plt.figure()
plt.imshow(train_img["img"].iloc[0])
plt.show()

In [None]:
# Healthy tissue: label mask stored at row position 8394, drawn with the
# discrete colour map (one colour per value 0-5, no smoothing between pixels).
# NOTE(review): the position depends on the file listing order; selecting by
# image_id would be more robust.
plt.figure()
plt.imshow(
    mask_img["img"].iloc[8394],
    cmap=cmap,
    interpolation='nearest',
    vmin=0,
    vmax=5,
)
plt.show()

In [None]:
# Show the biopsy thumbnail stored at row position 4690.
# NOTE(review): the position depends on the file listing order; selecting by
# image_id would be more robust.
plt.figure()
plt.imshow(train_img["img"].iloc[4690])
plt.show()

In [None]:
# Label mask stored at row position 5306, drawn with the discrete colour map
# (one colour per value 0-5, no smoothing between pixels).
# NOTE(review): the position depends on the file listing order; selecting by
# image_id would be more robust.
plt.figure()
plt.imshow(
    mask_img["img"].iloc[5306],
    cmap=cmap,
    interpolation="nearest",
    vmin=0,
    vmax=5,
)
plt.show()

## Radboud Picture

In [None]:
# Show the biopsy thumbnail stored at row position 7183.
# NOTE(review): the position depends on the file listing order; selecting by
# image_id would be more robust.
plt.figure()
plt.imshow(train_img["img"].iloc[7183])
plt.show()

In [None]:
# Label mask stored at row position 7829, drawn with the discrete colour map
# (one colour per value 0-5, no smoothing between pixels).
# NOTE(review): the position depends on the file listing order; selecting by
# image_id would be more robust.
plt.figure()
plt.imshow(
    mask_img["img"].iloc[7829],
    cmap=cmap,
    interpolation='nearest',
    vmin=0,
    vmax=5,
)
plt.show()