In [1]:
import os
import pandas as pd
import sys
import getpass
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append("../")

from connector import DremioDataframeConnector
# Render every column when a DataFrame is displayed (no "..." column elision).
pd.set_option('display.max_columns', None)

from matplotlib import rcParams
import matplotlib as mpl

# Font settings applied to every figure in this notebook.
# Optional overrides left disabled: "family": "sans-serif", "sans-serif": "helvetica".
FONT_BASE = dict(weight="normal", size=18)

plt.rc("font", **FONT_BASE)
plt.rc("axes", unicode_minus=False)

# plt.rcParams and mpl.rcParams are the same object, so one update covers both
# settings: automatic tight layout and a white figure background.
mpl.rcParams.update({'figure.autolayout': True, 'figure.facecolor': 'white'})


In [2]:
# Print the kernel's working directory: the relative paths used in this notebook
# ("../" appended to sys.path, "./figures/" for saved plots) resolve against it.
!pwd

/Users/kohlia/Documents/codebase/datasheets-for-datasets/impact-slide/hobbit


In [3]:
def create_summary_plot(df:pd.DataFrame, field:str, sort=True, annotate=False, truncate=False, title="", max_bars=10):
    """Draw a bar chart of the number of unique ``image_id`` values per value of ``field``.

    Parameters
    ----------
    df : pd.DataFrame
        Table that contains at least the columns ``field`` and ``image_id``.
    field : str
        Column whose distinct values become the bars (and the x tick labels).
    sort : bool, default True
        If True, bars are ordered by descending count; otherwise they keep the
        order produced by ``groupby``.
    annotate : bool, default False
        If True, each bar is labelled with its count.
    truncate : bool, default False
        If True, only the first ``max_bars`` bars are drawn (the largest ones
        when ``sort`` is True).
    title : str, default ""
        Title placed on the axes.
    max_bars : int, default 10
        Number of bars kept when ``truncate`` is True.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module. The newly created figure is the
        current figure, so callers can call ``.savefig(...)`` on the result.

    Raises
    ------
    KeyError
        If ``field`` or ``image_id`` is not a column of ``df``.
    """
    missing = [col for col in (field, "image_id") if col not in df.columns]
    if missing:
        raise KeyError(f"create_summary_plot: column(s) not found in df: {missing}")

    _, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))

    # One row per distinct value of `field`, with the number of unique image_ids.
    df_sample = df.groupby(by=[field])["image_id"].nunique().reset_index(name="count")
    if sort:
        # drop=True: renumber rows 0..n-1 without keeping the pre-sort positions
        # as a stray "index" column.
        df_sample = df_sample.sort_values(["count"], ascending=False).reset_index(drop=True)
    if truncate:
        df_sample = df_sample.head(max_bars)

    # Row positions double as bar x-coordinates and tick locations.
    index = df_sample.index
    bars = ax1.bar(index, df_sample["count"])
    ax1.set_xticks(index)
    ax1.set_xticklabels(df_sample[field], rotation=45, ha="right")
    ax1.set_ylabel("Number of Slides")
    if annotate:
        ax1.bar_label(bars)

    ax1.set_title(title)
    # Returning pyplot (not the figure) keeps existing `plt = create_summary_plot(...)`
    # call sites working.
    return plt


In [4]:
# Setup Dremio connector
# Credentials are read from the DREMIO_USER / DREMIO_PASSWORD environment
# variables when they are set (e.g. exported from a .env file), so the notebook
# can run top to bottom without blocking on a prompt. If a variable is unset
# or empty, fall back to asking interactively; the password is never echoed.

DREMIO_USER = os.environ.get("DREMIO_USER") or input("Username: ")
DREMIO_PASSWORD = os.environ.get("DREMIO_PASSWORD") or getpass.getpass(prompt="Password or PAT: ", stream=None)

# NOTE(review): host and port are hard-coded for one deployment; presumably an
# Arrow Flight endpoint given the "grpc+tcp" scheme - confirm against connector.py.
dremio_session = DremioDataframeConnector(
   scheme="grpc+tcp",
   hostname="tlvidreamcord1",
   flightport=32010,
   dremio_user=DREMIO_USER,
   dremio_password=DREMIO_PASSWORD,
   connection_args={},
)



Username: kohlia
Password or PAT: ········


## HoBBIT case breakdown

In [None]:
# Querying HoBBIT Casebreakdown table
# Fetches every column of "hobbit-poc"."case_breakdown" through the Dremio
# session; `df` is the frame that the following cells summarise and plot.
query = 'select * from "hobbit-poc"."case_breakdown"'
# presumably get_table returns a pandas DataFrame (later cells call DataFrame
# methods on the result) - confirm in connector.py
df = dremio_session.get_table(query)
display(df)

[INFO] Query:  select * from "hobbit-poc"."case_breakdown"


In [None]:
# Number of missing (null/NaN) entries in each column
df.isna().sum()

In [None]:
# Plotting Available Stains

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./figures", exist_ok=True)

# The call creates a new current figure, which plt.savefig then writes out.
# Its return value (the pyplot module itself) is deliberately not re-assigned
# to `plt`, to avoid shadowing the import.
create_summary_plot(df, 'stain_group', truncate=True, title='Stain Groups')
plt.savefig("./figures/available_stains.png", bbox_inches='tight', facecolor='white')

In [None]:
# Plotting available IHC stains

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./figures", exist_ok=True)

# Restrict to rows whose stain group is IHC, then count slides per stain name.
# The call creates a new current figure, which plt.savefig then writes out;
# its return value (the pyplot module itself) is not re-assigned to `plt`.
create_summary_plot(df[df['stain_group']=='IHC'], 'stain_name', truncate=True, title='Examples of Available IHC Stains')
plt.savefig("./figures/available_ihc.png", bbox_inches='tight', facecolor='white')

In [None]:
# Plotting available slide magnifications by scanner model

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./figures", exist_ok=True)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 6))

# Contingency table: one row per scanner model, one column per magnification,
# each cell the number of rows of df with that combination.
_df = pd.crosstab(df['model'], df['magnification'])
display(_df)

# Draw explicitly on the axes created above and keep `fig` bound to the Figure
# (sns.heatmap returns the Axes, not a figure).
sns.heatmap(_df, linewidths=0.5, cmap="coolwarm", annot=False, ax=ax)
# Rotate the tick labels seaborn actually placed instead of re-supplying one
# label per column: seaborn may thin out ticks when labels would overlap, and a
# label/tick count mismatch makes set_xticklabels fail.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.savefig("./figures/magnification.png", bbox_inches='tight', facecolor='white')