In [28]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly import graph_objs as go
from plotly.graph_objs import *
from datetime import date,timedelta
from be.controllers.roman import make_roman
from be.controllers.counters import count_by_result,count_cases,count_categories,count_result_by_category

Load the data and set the necessary filters

In [12]:
# Load the case export and parse both date columns into pandas Timestamps.
df=pd.read_csv("data/USCAP_Large.csv")
for col in ["ACCESS_DATE","SIGN_DATE"]:
    df[col]=df[col].apply(pd.Timestamp)

# Rename pathologists: numeric ids become "Pathologist <id>"; missing ids stay missing.
df["CYTOPATHOLOGIST"]=df["CYTOPATHOLOGIST"].apply(
    lambda z: "Pathologist " + str(int(z)) if pd.notna(z) else z
)

# List of distinct pathologist labels, ordered by their numeric id.
# The id is parsed with int() rather than eval(): no code execution on data,
# and no reliance on a hand-counted slice offset into the label.
pathologists=sorted(
    df["CYTOPATHOLOGIST"].dropna().unique().tolist(),
    key=lambda z: int(z.split()[-1]),
)

# Keep only rows that have a pathologist. The explicit copy makes `df` an
# independent frame, so columns added in later cells do not raise
# SettingWithCopyWarning.
df=df[df["CYTOPATHOLOGIST"].isin(pathologists)].copy()

# Series.min()/max() skip missing dates; the builtin min()/max() over a list
# gives order-dependent results when a NaT is present.
first_day=df["SIGN_DATE"].min()
last_day=df["SIGN_DATE"].max()
years=list(df["YEAR"].unique())

# Named date ranges: the whole history plus one entry per calendar year.
# NOTE(review): "Historical" mixes a pandas Timestamp with a datetime.date -
# confirm that consumers handle both types.
Default_time_ranges={"Historical":[first_day,date.today()]}
for year in range(2022,2017,-1):
    Default_time_ranges[str(year)]=[date(year,1,1),date(year,12,31)]

In [14]:
# Number of distinct pathologist labels left in the filtered frame.
df["CYTOPATHOLOGIST"].nunique(dropna=False)

12

### Here we set the names that should be used for the pathologists:



In [18]:
# Display name for each pathologist label used in tables and figures.
naming_dict={
    "Pathologist 2":"SP1",
    "Pathologist 5":"SP2",
    "Pathologist 1":"JP1",
    "Pathologist 3":"JP2",
    "Pathologist 4":"JP3",
    "Pathologist 6":"JP4",
    "Pathologist 7":"JP5",
    "Pathologist 8":"JP6",
    "Pathologist 9":"JP7",
    "Pathologist 10":"JP8",
    "Pathologist 11":"JP9",
    "Pathologist 12":"JP10",
}



In [19]:
# Translate every pathologist label to its display name (KeyError if a label has no entry).
df["PATHOLOGIST"]=[naming_dict[label] for label in df["CYTOPATHOLOGIST"]]

## Category distribution by pathologists

In [71]:
### Threshold filter: keep only pathologists with at least 5 rows in df
pathologists=[name for name in pathologists if (df["CYTOPATHOLOGIST"]==name).sum()>=5]
pathologists

['Pathologist 1',
 'Pathologist 2',
 'Pathologist 3',
 'Pathologist 4',
 'Pathologist 5',
 'Pathologist 7',
 'Pathologist 8',
 'Pathologist 9',
 'Pathologist 10',
 'Pathologist 12']

In [73]:
def make_stacked_bar(data_frame,labels_column,values_column,title,yaxislabel):
    """Build a Plotly Express bar chart with the notebook's standard layout.

    Parameters
    ----------
    data_frame : frame handed to ``px.bar``.
    labels_column : column name used for the x axis.
    values_column : column name, or list of column names, used for the y axis.
    title : chart title, centred near the top of the figure.
    yaxislabel : y-axis title.

    Returns
    -------
    The figure returned by ``px.bar`` after ``update_layout`` has been applied.
    """
    # Pass the real title instead of a placeholder string; the layout update
    # below only adds its positioning.
    fig = px.bar(
        data_frame,
        x=labels_column,
        y=values_column,
        title=title,
        color_discrete_sequence=px.colors.qualitative.T10,
    )
    fig.update_layout(
        autosize=True,
        # No outer margins except room for the title at the top.
        margin=dict(l=0, r=0, b=0, t=40, pad=0),
        title={
            "text": title,
            "y": 0.98,
            "x": 0.46,
            "xanchor": "center",
            "yanchor": "top",
        },
        legend_title="",
        xaxis_title=None,
        yaxis_title=yaxislabel,
    )
    return fig

In [74]:
############# CATEGORY RATIOS BY PATHOLOGISTS
def count_data(all_paths_df):
    """Summarise category and result counts per pathologist, plus an "Overall" row.

    Uses the notebook-level ``pathologists`` list and ``naming_dict`` mapping,
    and the ``make_roman`` / ``count_*`` helpers imported at the top of the notebook.

    Parameters
    ----------
    all_paths_df : frame passed unchanged to every ``count_*`` helper.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``pathologists`` followed by one "Overall" row
        whose counts are the column sums. Rates are computed the same way for
        every row, from that row's counts, and rounded to two decimals:

        * ``positive_rate``     = (Positives + Currently Negative) / Cases
        * ``TBS <roman>``       = <roman> / Cases
        * ``TBS III + Rate``    = TBS III Positives / III
        * ``TBS III Call Rate`` = copy of ``TBS III``
        * ``ratio category <roman>`` = unrounded <roman> / Cases, filled for the
          "Overall" row only (legacy columns, kept for backward compatibility).
    """
    romans=[make_roman(i) for i in range(1,7)]

    # Raw counts, one row per pathologist.
    counts=pd.DataFrame()
    counts["Pathologists"]=[naming_dict[pathologist] for pathologist in pathologists]
    for i,roman in enumerate(romans,start=1):
        counts[roman]=[count_categories(all_paths_df,pathologist,i) for pathologist in pathologists]
    counts["Cases"]=[count_cases(all_paths_df,pathologist) for pathologist in pathologists]
    counts["Positives"]=[count_by_result(all_paths_df,pathologist,"POSITIVE") for pathologist in pathologists]
    counts["Currently Negative"]=[count_by_result(all_paths_df,pathologist,"CURRENTLY NEGATIVE") for pathologist in pathologists]
    # Count from the frame that was passed in, not from the notebook-level `df`.
    counts["TBS III Positives"]=[count_result_by_category(all_paths_df,pathologist,3,"POSITIVE") for pathologist in pathologists]

    # "Overall" row: sum every count column, then derive the rates below for all
    # rows at once so the summary row cannot drift from the per-pathologist ones.
    count_columns=[column for column in counts.columns if column!="Pathologists"]
    overall=counts[count_columns].sum().to_dict()
    overall["Pathologists"]="Overall"
    # ignore_index gives the appended row its own label instead of a second 0.
    summary=pd.concat([counts,pd.DataFrame([overall])],ignore_index=True)
    is_overall=summary.index==len(counts)

    summary["Positives or CN"]=summary["Positives"]+summary["Currently Negative"]
    summary["positive_rate"]=(summary["Positives or CN"]/summary["Cases"]).round(2)
    for roman in romans:
        summary["TBS "+roman]=(summary[roman]/summary["Cases"]).round(2)
    summary["TBS III + Rate"]=(summary["TBS III Positives"]/summary["III"]).round(2)
    for roman in romans:
        # Legacy columns: NaN for every row except "Overall".
        summary["ratio category "+roman]=(summary[roman]/summary["Cases"]).where(is_overall)
    summary["TBS III Call Rate"]=summary["TBS III"]

    # Column order that downstream cells have always received.
    column_order=(
        ["Pathologists"]
        +romans
        +["Cases","Positives","Currently Negative","Positives or CN","positive_rate"]
        +["TBS "+roman for roman in romans]
        +["TBS III Positives","TBS III + Rate"]
        +["ratio category "+roman for roman in romans]
        +["TBS III Call Rate"]
    )
    return summary[column_order]

In [75]:
# Summary table for the full filtered data set.
data = count_data(all_paths_df=df)

In [76]:
# Preview the first five summary rows.
data.head(5)

Unnamed: 0,Pathologists,I,II,III,IV,V,VI,Cases,Positives,Currently Negative,...,TBS VI,TBS III Positives,TBS III + Rate,ratio category I,ratio category II,ratio category III,ratio category IV,ratio category V,ratio category VI,TBS III Call Rate
0,JP1,6,44,17,2,0,2,71,6,3.0,...,0.03,4,0.24,,,,,,,0.24
1,SP1,109,835,132,14,6,39,1135,42,21.0,...,0.03,34,0.26,,,,,,,0.12
2,JP2,11,90,23,3,0,6,133,3,1.0,...,0.05,2,0.09,,,,,,,0.17
3,JP3,6,51,8,0,1,2,68,0,0.0,...,0.03,0,0.0,,,,,,,0.12
4,SP2,101,707,329,21,7,59,1224,77,43.0,...,0.05,64,0.19,,,,,,,0.27


In [77]:
def compare_ratios(dataframe):
    """Chart the six "TBS <roman>" rate columns of ``dataframe`` per pathologist."""
    rate_columns = []
    for category in range(1, 7):
        rate_columns.append("TBS " + make_roman(category))
    chart = make_stacked_bar(
        dataframe,
        "Pathologists",
        rate_columns,
        "Category Distribution By Pathologist",
        "Rate",
    )
    return chart

In [None]:
### In this line we set a threshold for filtering out pathologists
# pathologists=[x for x in pathologists if df[df["CYTOPATHOLOGIST"]==x].shape[0]>=200]

In [78]:
# Summary table that feeds the category-distribution chart.
counting_data = count_data(all_paths_df=df)

In [79]:
# Render the category distribution chart.
compare_ratios(dataframe=counting_data)