In [2]:
import pandas as pd
import numpy as np
import os
import re
import plotly.graph_objects as go
import plotly.express as px
from urllib.request import urlopen
from scipy.stats import spearmanr, kruskal
import json

## Import Data

In [3]:
os.chdir('../')

In [4]:

data = pd.read_csv('data/processed/litreview_analysis.csv')

In [5]:
data.head()

Unnamed: 0,Title,URL,Year,Disease,Primary or Secondary Care,Uses additional data,Country,Time Type,N varaibles,N,...,External evaluation metric 4,demographics,lab tests,symptoms,medication,comorbidities,lifestyle factors,diagnoses,healthcare utilisation,procedures
0,"Derivation, Validation, and Potential Treatmen...",https://pubmed.ncbi.nlm.nih.gov/31104070/,2019.0,Sepsis,Secondary,no,USA,cohort,29.0,20189.0,...,Comparison to tradition cluster groups,1,1,0,0,1,0,0,0,0
1,Exploration of critical care data by using uns...,https://pubmed.ncbi.nlm.nih.gov/32403049/,2020.0,ICU patients,Secondary,no,USA,cohort,9.0,1503.0,...,,0,1,0,0,0,0,0,0,0
2,Multimorbidity and functional status in older ...,https://pubmed.ncbi.nlm.nih.gov/32297200/,2020.0,Multi Morbidity,Primary,yes,Spain,cohort,38.0,813.0,...,,1,0,0,1,1,1,0,1,0
3,Deep representation learning of electronic hea...,https://pubmed.ncbi.nlm.nih.gov/32699826/,2020.0,T2D,Both,no,USA,longitudinal,31659.0,100506.0,...,,1,1,1,1,0,0,0,1,1
4,Deep representation learning of electronic hea...,https://pubmed.ncbi.nlm.nih.gov/32699826/,2020.0,AD,Both,no,USA,longitudinal,31659.0,6748.0,...,,1,1,1,1,0,0,0,1,1


In [6]:
short = data.drop_duplicates('Title')
len(short)

64

## Date Barplot

In [65]:
date_df = short['Year'].value_counts().reset_index().rename(columns={'index':'Year','Year':'Number of Studies'})
date_df['colour'] = date_df['Year'].apply(lambda x: str(x%4))
date_plot = px.bar(date_df, x='Year',y='Number of Studies',color='colour', color_discrete_sequence=['#D90368','#197278','#541388','#F18805'])
date_plot.update_layout(showlegend=False,plot_bgcolor="#F0e7E2",font=dict(size=14)) 
date_plot.update_layout(
    width=600,
)

date_plot.show()

## Disease Barplot

In [66]:
disease_df = data.drop_duplicates(['Title','Disease'])['Disease'].value_counts().reset_index().rename(columns={
    'index':'Disease',
    'Disease':'Number of Studies'}).sort_values('Disease')
disease_df['colour'] = [str(i%4) for i in range(len(disease_df))]
disease_plot = px.bar(disease_df,x='Disease',y='Number of Studies',color='colour', color_discrete_sequence=['#D90368','#197278','#541388','#F18805'])
disease_plot.update_layout(
    showlegend=False,xaxis = go.layout.XAxis(
        tickangle = -60),
    plot_bgcolor="#F0e7E2",
    font=dict(size=14)  
)
disease_plot.update_xaxes(categoryorder='category ascending')
disease_plot.show()

In [9]:
len(disease_df)

32

## NXM Scatter

In [67]:
nxm_df = data.drop_duplicates(['Title','N'])
nxm_plot = px.scatter(nxm_df,x='N varaibles',y='N',log_x=True,log_y=True,color='Time Type',color_discrete_sequence=['#D90368','#197278'])
nxm_plot.update_layout(plot_bgcolor="#F0e7E2",font=dict(size=14)) 

nxm_plot.show()

In [11]:
print('N median ' + str(nxm_df['N'].median()))
print('N max ' + str(nxm_df['N'].max()))
print('N min ' + str(nxm_df['N'].min()))
print('N iqr ' + str(np.percentile(nxm_df.drop_duplicates('Title')['N'],[25,75])))

N median 8490.0
N max 1127114.0
N min 43.0
N iqr [ 1576.75 46575.75]


In [12]:
print('N var median ' + str(nxm_df['N varaibles'].median()))
print('N max ' + str(nxm_df['N varaibles'].max()))
print('N min ' + str(nxm_df['N varaibles'].min()))
print('N var iqr ' + str(np.percentile(nxm_df.drop_duplicates('Title')['N varaibles'],[25,75])))

N var median 34.0
N max 31659.0
N min 1.0
N var iqr [ 16.   137.75]


In [13]:
coef,p = spearmanr(nxm_df['N'],nxm_df['N varaibles'])
print('correlation = ' + str(coef))
print('significance = '+ str(p))

correlation = 0.24600118966454057
significance = 0.0415964374671848


In [14]:
print('cohort N median ' + str(nxm_df[nxm_df['Time Type']=='cohort']['N'].median()))
print('cohort N max ' + str(nxm_df[nxm_df['Time Type']=='cohort']['N'].max()))
print('cohort N min ' + str(nxm_df[nxm_df['Time Type']=='cohort']['N'].min()))
print('cohort N iqr ' + str(np.percentile(nxm_df[nxm_df['Time Type']=='cohort'].drop_duplicates('Title')['N'],[25,75])))
print('cohort N var median ' + str(nxm_df[nxm_df['Time Type']=='cohort']['N varaibles'].median()))
print('cohort N var max ' + str(nxm_df[nxm_df['Time Type']=='cohort']['N varaibles'].max()))
print('cohort N var min ' + str(nxm_df[nxm_df['Time Type']=='cohort']['N varaibles'].min()))
print('cohort N var iqr ' + str(np.percentile(nxm_df[nxm_df['Time Type']=='cohort'].drop_duplicates('Title')['N varaibles'],[25,75])))

cohort N median 10499.0
cohort N max 853085.0
cohort N min 159.0
cohort N iqr [ 2326. 52368.]
cohort N var median 34.0
cohort N var max 2521.0
cohort N var min 9.0
cohort N var iqr [ 23. 140.]


In [15]:
print('longitudinal N median ' + str(nxm_df[nxm_df['Time Type']=='longitudinal']['N'].median()))
print('longitudinal N max ' + str(nxm_df[nxm_df['Time Type']=='longitudinal']['N'].max()))
print('longitudinal N min ' + str(nxm_df[nxm_df['Time Type']=='longitudinal']['N'].min()))
print('longitudinal N iqr ' + str(np.percentile(nxm_df[nxm_df['Time Type']=='longitudinal'].drop_duplicates('Title')['N'],[25,75])))
print('longitudinal N var median ' + str(nxm_df[nxm_df['Time Type']=='longitudinal']['N varaibles'].median()))
print('longitudinal N var max ' + str(nxm_df[nxm_df['Time Type']=='longitudinal']['N varaibles'].max()))
print('longitudinal N var min ' + str(nxm_df[nxm_df['Time Type']=='longitudinal']['N varaibles'].min()))
print('longitudenal N var iqr ' + str(np.percentile(nxm_df[nxm_df['Time Type']=='longitudinal'].drop_duplicates('Title')['N varaibles'],[25,75])))

longitudinal N median 5893.5
longitudinal N max 1127114.0
longitudinal N min 43.0
longitudinal N iqr [ 1250. 31567.]
longitudinal N var median 39.5
longitudinal N var max 31659.0
longitudinal N var min 1.0
longitudenal N var iqr [  8.5 131.5]


In [16]:
n_stat, n_p = kruskal(nxm_df[nxm_df['Time Type']=='cohort']['N'],nxm_df[nxm_df['Time Type']=='longitudinal']['N'])
var_stat, var_p = kruskal(nxm_df[nxm_df['Time Type']=='cohort']['N varaibles'],nxm_df[nxm_df['Time Type']=='longitudinal']['N varaibles'])
print('n significant = ' + str(n_p))
print('var significant = ' + str(var_p))

n significant = 0.3104593146224299
var significant = 0.8162764301905032


In [17]:
print('cohort N mean ' + str(nxm_df[nxm_df['Time Type']=='cohort']['N varaibles'].mean()))
print('longitudinal N mean ' + str(nxm_df[nxm_df['Time Type']=='longitudinal']['N varaibles'].mean()))

cohort N mean 241.3658536585366
longitudinal N mean 6886.071428571428


## Primary Seconday Venn

In [18]:
care_df = short['Primary or Secondary Care'].value_counts().rename({
    'Primary or Secondary Care':'Count'
})
care_df['Secondary'] = care_df['Secondary'] + care_df['Tertiary']
care_df

Secondary    27
Both         21
Primary      16
Tertiary      1
Name: Primary or Secondary Care, dtype: int64

In [19]:
p_d = care_df.loc['Primary'] + care_df.loc['Both']
s_d = care_df.loc['Secondary'] + care_df.loc['Both']
b_d = care_df.loc['Both']
s_w = s_d/p_d
b_w = b_d/p_d

p_x0 = 0
p_x1 = 1
p_y0 = 0
p_y1 = 1

s_x0 = 1-b_w
s_x1 = s_x0 + s_w
s_y0 = 0.5-s_w/2
s_y1 = 0.5+s_w/2

In [20]:
import plotly.graph_objects as go

fig = go.Figure()

# Create scatter trace of text labels
fig.add_trace(go.Scatter(
    x=[p_x1/3-0.1,s_x1/2-0.1,s_x1-s_w/3],
    y=[0.5, 0.5, 0.5],
    text=[i + '<br>'+str(care_df.loc[i]) + ' papers' if i == 'Both' else i + ' care<br>'+str(care_df.loc[i]) + ' papers' for i in["Primary", "Both", "Secondary"]],
    mode="text",
    textfont=dict(
        color="black",
        size=13,
        family="Arial",
    )
))

# Update axes properties
fig.update_xaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)

fig.update_yaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)

# Add circles
fig.add_shape(type="circle",
    line_color='#541388', fillcolor='#541388',
    x0=p_x0, y0=p_y0, x1=p_x1, y1=p_y1
)
fig.add_shape(type="circle",
    line_color='#197278', fillcolor='#197278',
    x0=s_x0, y0=s_y0, x1=s_x1, y1=s_y1
)
fig.update_shapes(opacity=0.3, xref="x", yref="y")

fig.update_layout(
    margin=dict(l=20, r=20, b=100),
    height=390, width=300,
    plot_bgcolor="white"
)

fig.show()

## Location Map

In [21]:
country_df = short['Country'].value_counts().reset_index().rename(columns={'index':'Country','Country':'Count'})
country_df.loc[country_df['Country']=='USA','Country']='United States'
country_df.loc[country_df['Country']=='UK','Country']='United Kingdom'

df = px.data.gapminder().query("year==2007")
country_df = pd.merge(df[['country','iso_alpha']],country_df, how = 'right',left_on='country',right_on='Country')
country_df.loc[3,'country'] = 'Russia'
country_df.loc[3,'iso_alpha'] = 'RUS'
country_df.loc[5,'country'] = 'South Korea'
country_df.loc[5,'iso_alpha'] = 'KOR'
country_df

Unnamed: 0,country,iso_alpha,Country,Count
0,United States,USA,United States,43
1,Spain,ESP,Spain,10
2,United Kingdom,GBR,United Kingdom,5
3,Russia,RUS,Russia,1
4,Australia,AUS,Australia,1
5,South Korea,KOR,South Korea,1
6,Portugal,PRT,Portugal,1
7,South Africa,ZAF,South Africa,1
8,Croatia,HRV,Croatia,1


In [22]:
country_plot = px.choropleth(country_df, locations="iso_alpha",
                    color="Count", # lifeExp is a column of gapminder
                    hover_name="Country", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
country_plot.show()

## Variables upset plot

In [23]:
vars_df = short[['lifestyle factors', 'healthcare utilisation',
       'demographics', 'lab tests', 'procedures', 'symptoms', 'medication',
       'comorbidities']]
var_types = [i[1][i[1]==1].index.to_list() for i in vars_df.iterrows()]
var_set = sorted(list(set(tuple(sorted(i)) for i in var_types)))
var_unique = sorted(vars_df.columns.tolist())
var_code = {i:(idx*3+1)/10 for idx,i in enumerate(var_unique)}
coords = [[var_code[j] for j in i] for i in var_set]
var_types2 = [tuple(sorted(i)) for i in var_types]
var_counts = [var_types2.count(i)for i in var_set]

In [199]:
up = go.Figure()
up.update_xaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)

up.update_yaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)
up.update_layout(
    plot_bgcolor="white"
)
up.add_trace(go.Scatter(
    x=[0-(len(i)/24)-0.02 for i in var_code.keys()],
    y=[(i+0.1)*-1 for i in var_code.values()],
    text=list(var_code.keys()),
    mode="text",
    textfont=dict(
        color="black",
        size=14,
        family="Arial",
    )
))

up.add_trace(go.Scatter(
    x=[i*0.3 +0.18 for i in range(len(var_counts))],
    y=[0.02 for i in range(len(var_counts))],
    text=var_counts,
    mode="text",
    textfont=dict(
        color="black",
        size=16,
        family="Arial",
    )
))
    


for i in range(len(var_unique)):
    x0=0
    x1=len(var_set)*0.3+0.1
    y0= i*0.3 +0.05 
    y1=y0+0.3
    y0 = y0*(-1)
    y1 = y1*(-1)
    if i%2==0:
        col = '#DCDCDC'
    else:
        col = '#f2f2f2'
    up.add_shape(type='rect',x0=x0,x1=x1,y0=y0,y1=y1,line_color=col,fillcolor=col)

    
    
for idx,i in enumerate(coords):
    c_x0 = (idx*3+1)/10
    c_x1 = c_x0 + 0.2
    for j in range(len(var_unique)):
        c_y0 = (j*3+1)/10
        c_y1 = c_y0+ 0.2
        if c_y0 in i:
            c_col = '#000000'
        else:
            c_col = '#bfbfbf'
        c_y0 = c_y0*(-1)
        c_y1 = c_y1*(-1)
        up.add_shape(type="circle",line_width=0, fillcolor=c_col,x0=c_x0, y0=c_y0, x1=c_x1, y1=c_y1)
    if len(i)>1:
        l_x = c_x0+0.1
        l_y0 = (min(i)+ 0.1)*-1
        l_y1 = (max(i)+ 0.1)*-1
        up.add_shape(type='line',line=dict(width=2,color='black'),x0=l_x,x1=l_x,y0=l_y0,y1=l_y1)

up.update_layout(
    showlegend=False,height=400, width=300*4.5+12)        

up.show()

In [25]:
for i in ['lifestyle factors', 'healthcare utilisation',
       'demographics', 'lab tests', 'procedures', 'symptoms', 'medication',
       'comorbidities']:
    print(i)
    print(short[i].sum())

lifestyle factors
10
healthcare utilisation
17
demographics
37
lab tests
19
procedures
8
symptoms
15
medication
24
comorbidities
39


In [26]:
vars_df.sum(axis=1).value_counts()

1    18
2    17
3    14
5     7
6     4
4     4
dtype: int64

## Method Plots

In [68]:
def plot_methods(df,method):
    df_count = df[['Time Type',method]]
    df_count['Number of experiments'] = 1
    df_count=df_count.groupby(['Time Type',method])['Number of experiments'].count().reset_index()
    method_plot = px.bar(df_count,x=method,y='Number of experiments',color='Time Type',barmode="group", 
                            color_discrete_sequence=['#541388','#F18805'])
    method_plot.update_layout(plot_bgcolor="#F0e7E2",font=dict(size=14))

    method_plot.show()

In [72]:
def plot_evaluation(df_a,eval_name):
    df =df_a.copy()
    eval_cols = [i for i in df.columns if eval_name in i]
    eval_df = df.drop_duplicates(['Title']+eval_cols)
    change_cols = [i for i in eval_cols if '1' not in i]
    eval_num = eval_df.copy()
    eval_df[change_cols] = eval_df.loc[:,change_cols].replace({'None':''})

    eval_count = pd.melt(eval_df[['Title']+eval_cols],id_vars = 'Title', value_vars = eval_cols)
    eval_count = eval_count[eval_count['value']!='']['value'].value_counts().reset_index().rename(columns={'index':eval_name,'value':'Number of experiments'})
    eval_count = eval_count.sort_values(eval_name)
    eval_count['colour'] = [str(i%4) for i in range(len(eval_count))]



    eval_plot = px.bar(eval_count,x=eval_name,y='Number of experiments',color='colour', color_discrete_sequence=['#D90368','#197278','#541388','#F18805'])
    eval_plot.update_layout(
        showlegend=False,xaxis = go.layout.XAxis(
            tickangle = -60),
        plot_bgcolor="#F0e7E2",font=dict(size=14)

    )
    eval_plot.update_xaxes(categoryorder='category ascending')
    eval_plot.show()
    if eval_name == 'Deciding K':
        eval_num = eval_num[eval_num['Deciding K 1'] != 'doesnt say']
    eval_num['Number of methods used'] = [4-len(i[1][i[1]=='None']) for i in eval_num[eval_cols].iterrows()]
    eval_num = eval_num['Number of methods used'].value_counts().reset_index().rename(columns={'index':'Number of methods used','Number of methods used':'Number of experiments'})
    eval_num = eval_num.sort_values('Number of methods used')
    eval_num['Number of methods used'] = eval_num['Number of methods used'].astype(str)
    evnum_plot = px.bar(eval_num,x='Number of methods used',y='Number of experiments',color='Number of methods used', 
                        color_discrete_sequence=['#D90368','#197278','#541388','#F18805','#11151C'])
    evnum_plot.update_layout(
        showlegend=False,
        plot_bgcolor="#F0e7E2"

    )
    evnum_plot.update_xaxes(categoryorder='category ascending')
    evnum_plot.update_layout(
    width=600,font=dict(size=14)
)
    evnum_plot.show()

### Data Transformation

In [70]:
plot_methods(data,'Data Transformation')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [30]:
plot_methods(data,'Clustering algorithm ')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Deciding K

In [73]:
plot_evaluation(data,'Deciding K')

In [40]:
dec_cols = [i for i in data.columns if 'Decid' in i]
dec_list = set([j for i in dec_cols for j in data[i]])
dec_list

{'AIC',
 'Adjusted Rand',
 'Adjusted Wallace',
 'BIC',
 'Between cluster variance',
 'C index ',
 'CVNN',
 'Calinski Harabasz index',
 'Clest',
 'Cluster size',
 'Davies Bouldin',
 'Dunn',
 'Fawkes mallows',
 'Gap statistic',
 'Hierarchical Clustering',
 'Inspection of dendogram',
 'None',
 'PCA',
 'Partition coefficient',
 'R squared',
 'Silhouette Score',
 'Stability',
 'TSS',
 'Variance explained-elbow plot',
 'Xie-Beni',
 'clinical evaluation',
 'comparisons of outcomes',
 'consensus matrix heatmap seperation',
 'correlation coefficient',
 'cubic clustering criterion',
 'doesnt say',
 'eigan gap heuristic',
 'intrinsic in algorithm',
 'k-distance tuning',
 'magnitude of topic',
 'manually chosen',
 'pairwise concensus values',
 'partition entropy',
 'pseudo F',
 'pseudo T',
 'test train group(s)',
 'topic coherance',
 'within group dispersion'}

In [50]:
short['Missing data'].value_counts()

None                               43
remove incomplete records          12
incorperated                        5
Multiple imputation                 2
mean imputation                     1
fully conditional specification     1
Name: Missing data, dtype: int64

In [51]:
short['Feature Selection'].value_counts()

Manually chosen                33
None                           10
Prevelance threshold            9
Previous research               5
Remove correlated variables     2
Predictiction task              2
Remove unique variables         1
PheWAS                          1
Survey                          1
Name: Feature Selection, dtype: int64

In [52]:
short['Charactersing clusters'].value_counts()

most prevalent                          21
statistical test                        13
mean and SD                              9
observed expected ratio                  7
researcher/clinician                     5
cumulative trajectory analysis           2
ranking variables by mean difference     1
None                                     1
Elixhauser Comorbidity Index             1
rate of change of variable               1
SPADE                                    1
prediction model                         1
pearson rank correlation                 1
Name: Charactersing clusters, dtype: int64

## Eval plots

### Internal evaluation

In [74]:
plot_evaluation(data,'Internal evaluation')

### External Evaluation

In [75]:
plot_evaluation(data,'External evaluation')

## Method Alluvial

In [44]:
def melt_df(df,col_type):
    df2 = df.copy()
    val_vars =  [i for i in df2.columns if col_type in i]
    id_vars = [i for i in df2.columns if i not in val_vars]
    df3 = pd.melt(df2,id_vars = id_vars, value_vars = val_vars,value_name = col_type).drop(columns='variable')
    df3 = df3[df3[col_type] != '']
    return df3

In [45]:
def get_sets(df,col):
    df2 = df.copy()
    col_list = [i for i in df2.columns if col in i]
    if len(col_list) > 1: 
        df2 = melt_df(df2,col)
    return set(df2[col].to_list())

In [46]:
def rename_repeats(col_list,col_name,repeat_cols):
    if 'None' not in repeat_cols:
        repeat_cols = repeat_cols + ['None']
    rename_dict = {i: 'No '+col_name if i =='None' else i+' '+col_name for i in repeat_cols}
    new_col = [rename_dict[i] if i in rename_dict.keys() else i for i in col_list]
    return new_col

In [47]:
def get_counts(df,col_list,col_n,repeat_cols):
    col1 = col_list[col_n-1]
    col2 = col_list[col_n]
    col1_list = [i for i in df.columns if col1 in i]
    col2_list = [i for i in df.columns if col2 in i]
    df_short = df[col1_list+col2_list]
    if len(col1_list) > 1:
        df_short = melt_df(df_short,col1)
    if len(col2_list) > 1:
        df_short = melt_df(df_short,col2)
    df_short['Value'] = 1
    df_short = df_short.groupby([col1,col2]).count().reset_index().rename(columns={col1:'Source',col2:'Target'})
    df_short['Source'] = rename_repeats(df_short['Source'].tolist(),col1,repeat_cols)
    df_short['Target'] = rename_repeats(df_short['Target'].tolist(),col2,repeat_cols)
    return df_short

In [48]:
def get_node_df(method_df,col_list,col,repeat_cols): 

    none_cols = [i for i in method_df.columns if any(j in i for j in ['2','3','4'])]
    method_df[none_cols] = method_df[none_cols].replace({'None':''})

    node_df = pd.concat([get_counts(method_df,col_list,i,repeat_cols)for i in range(1,len(col_list))])
    node_df['colour']=col
    return node_df

In [186]:
def get_sanky(df, col_list,color_key,pad=10,height = 800):
    useful_cols = ['Feature Selection','Data Transformation',
       'Clustering algorithm ', 'Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4','Charactersing clusters',
       'Internal evaluation metric 1', 'Internal evaluation metric 2',
       'Internal evaluation metric 3', 'Internal evaluation metric 4',
       'External evaluation metric 1', 'External evaluation metric 2',
       'External evaluation metric 3', 'External evaluation metric 4',]
    df_list = [df[df[color_key]==i][useful_cols] for i in df[color_key].unique()]
    color_list = ['#D90368','#197278','#541388','#F18805','#150578','#57B8FF','#388697',
                 '#FF7B9C','#11151C','#1f0812','#394053','#BFC0C0','#241E4E','#CA054D',
                 '#1b998b','#B76d68','#D3c1d2','#9E2a2b','#BABFD1','#E2C2C6','#75D1B9']
    
    set_list = [j for i in col_list for j in get_sets(df,i)]
    repeat_cols = [i for i in set(set_list) if set_list.count(i) > 1]

    node_df = pd.concat([get_node_df(df_list[i],col_list,color_list[i],repeat_cols) for i in range(len(df_list))])
    node_list = list(set(node_df['Source'].tolist()+node_df['Target'].tolist()))
    node_keys = {i:idx for idx,i in enumerate(node_list)}
    labels = [i if (node_df[node_df['Source']==i]['Value'].sum()+node_df[node_df['Target']==i]['Value'].sum()) > 0 else '' for i in node_list]
    node_df[['Source','Target']] = node_df[['Source','Target']].replace(node_keys)
    
    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = pad,
            thickness = 1,
            line = dict(color = "black", width = 0.5),
            label=labels,  
            color = "blue"
    ),
        link = dict(
            source = node_df['Source'].tolist(), # indices correspond to labels, eg A1, A2, A1, B1, ...
            target = node_df['Target'].tolist(),
            value = node_df['Value'].tolist(),
            color = node_df['colour'].tolist()
        ))])
    fig.update_layout(height=height,
    font=dict(size = 14, color = 'black'),
)
    fig.show()


In [160]:
data_cohort = data[data['Time Type']=='cohort']
data_long = data[data['Time Type']=='longitudinal']
eval_list = ['Clustering algorithm ','Internal evaluation','External evaluation']
method_type = ['Data Transformation', 'Clustering algorithm ',
       'Charactersing clusters', 'Deciding K']


### cohort evals

In [161]:
get_sanky(data_cohort,eval_list,'Clustering algorithm ')

### Long eval

In [162]:
get_sanky(data_cohort,method_type,'Clustering algorithm ')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [178]:
get_sanky(data_cohort,['Data Transformation', 'Clustering algorithm '],'Clustering algorithm ')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [164]:
get_sanky(data_long,['Data Transformation', 'Clustering algorithm '],'Clustering algorithm ')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [165]:
data = data.sort_values(by =['Clustering algorithm '])
data['Clustering algorithm ']

0           Consensus K-means
37                     DBSCAN
35            Factor Analysis
50              Fuzzy c-means
42    Hierarchical Clustering
               ...           
57            kernel k-means 
30            sphere analysis
39      tensor factorisation 
41      tensor factorisation 
22      tensor factorisation 
Name: Clustering algorithm , Length: 74, dtype: object

In [187]:
get_sanky(data,['Deciding K', 'Clustering algorithm '],'Clustering algorithm ',pad=7,height=1400)

In [188]:
get_sanky(data_long,['Deciding K', 'Clustering algorithm '],'Clustering algorithm ')

### Data Transformation X cluster method

In [80]:
dimXcm = data_cohort[['Data Transformation', 'Clustering algorithm ']]
dimXcm['Clustering algorithm '] = dimXcm['Clustering algorithm '].apply(lambda x: x if x in ['Hierarchical Clustering','K-means'] else 'other method')
dimXcm['count'] = 1
dimXcm_tot = dimXcm.groupby(['Data Transformation', 'Clustering algorithm '])['count'].count()
dimXcm_tot



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Data Transformation          Clustering algorithm   
MCA                          Hierarchical Clustering    1
                             K-means                    4
None                         Hierarchical Clustering    8
                             K-means                    5
                             other method               7
PCA                          K-means                    2
                             other method               1
SOM                          Hierarchical Clustering    1
TFIDF                        Hierarchical Clustering    1
                             K-means                    1
                             other method               1
autoencoders                 K-means                    1
factor analysis              K-means                    1
manually combined variables  Hierarchical Clustering    1
new deep learning model      K-means                    1
new statistical model        K-means                    1
t-SNE              

In [81]:
dimXcml = data_long[['Data Transformation', 'Clustering algorithm ']]
dimXcml['count'] = 1
dimXcml_tot = dimXcml.groupby(['Data Transformation', 'Clustering algorithm '])['count'].count()
dimXcml_tot



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Data Transformation       Clustering algorithm            
Graph network             Spectral clustering                 1
LCS                       Hierarchical Clustering             2
MCA                       K-means                             1
None                      Hierarchical Clustering             1
                          Longitudinal k-means                2
                          Probabalistic subtyping model       1
                          tensor factorisation                2
TDA                       graph based clustering algorithm    1
TNW                       Hierarchical Clustering             1
autoencoders              Longitudinal k-means                1
b splines                 Longitudinal k-means                1
medAl                     K-means                             1
new deep learning model   Hierarchical Clustering             6
                          K-means                             1
                          Spectral clustering

### deciding k X cluster method 

In [82]:
dkXcm = data_cohort[['Clustering algorithm ', 'Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4']]
rep_cols = [i for i in dkXcm.columns if any(str(j) in i for j in [2,3,4])]
dkXcm[rep_cols] = dkXcm[rep_cols].replace({'None':''}) 
dkXcm =pd.melt(dkXcm,id_vars = 'Clustering algorithm ', value_vars = ['Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4']).drop(columns='variable')
dkXcm = dkXcm[dkXcm['value'] != '']
dkXcm['count'] = 1
dkXcm = dkXcm.groupby(['Clustering algorithm ','value'])['count'].count()
dkXcm

Clustering algorithm      value                              
Consensus K-means         Cluster size                           1
                          consensus matrix heatmap seperation    1
                          pairwise concensus values              1
Factor Analysis           Variance explained-elbow plot          1
Fuzzy c-means             Partition coefficient                  1
                          Xie-Beni                               1
                          partition entropy                      1
                          test train group(s)                    1
Hierarchical Clustering   Adjusted Rand                          2
                          C index                                1
                          CVNN                                   1
                          Gap statistic                          1
                          Inspection of dendogram                1
                          Silhouette Score                       2


In [83]:
dc2 = dkXcm.reset_index()
dc2['Clustering algorithm '] = dc2['Clustering algorithm '].apply(lambda x: x if x in ['Hierachical Clustering','K-means'] else 'other method')
dc_dict = {i:dc2[dc2['value'] == i]['Clustering algorithm '].unique() for i in dc2['value'].unique()}
dc_dict

{'Cluster size': array(['other method'], dtype=object),
 'consensus matrix heatmap seperation': array(['other method'], dtype=object),
 'pairwise concensus values': array(['other method'], dtype=object),
 'Variance explained-elbow plot': array(['other method'], dtype=object),
 'Partition coefficient': array(['other method'], dtype=object),
 'Xie-Beni': array(['other method'], dtype=object),
 'partition entropy': array(['other method'], dtype=object),
 'test train group(s)': array(['other method'], dtype=object),
 'Adjusted Rand': array(['other method'], dtype=object),
 'C index ': array(['other method'], dtype=object),
 'CVNN': array(['other method'], dtype=object),
 'Gap statistic': array(['other method', 'K-means'], dtype=object),
 'Inspection of dendogram': array(['other method', 'K-means'], dtype=object),
 'Silhouette Score': array(['other method', 'K-means'], dtype=object),
 'TSS': array(['other method', 'K-means'], dtype=object),
 'clinical evaluation': array(['other method', 'K-

### long dk 

In [84]:
dkXcml = data_long[['Clustering algorithm ', 'Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4']] 
rep_cols = [i for i in dkXcml.columns if any(str(j) in i for j in [2,3,4])]
dkXcml[rep_cols] = dkXcml[rep_cols].replace({'None':''}) 
dkXcml =pd.melt(dkXcml,id_vars = 'Clustering algorithm ', value_vars = ['Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4']).drop(columns='variable')
dkXcml = dkXcml[dkXcml['value'] != '']
dkXcml['count'] = 1
dkXcml = dkXcml.groupby(['Clustering algorithm ','value'])['count'].count()
dcl2 = dkXcml.reset_index()

dcl_dict = {i:dcl2[dcl2['value'] == i]['Clustering algorithm '].unique() for i in dcl2['value'].unique()}
dcl_dict

{'k-distance tuning': array(['DBSCAN'], dtype=object),
 'Adjusted Rand': array(['Hierarchical Clustering'], dtype=object),
 'Adjusted Wallace': array(['Hierarchical Clustering'], dtype=object),
 'Cluster size': array(['Hierarchical Clustering', 'Longitudinal k-means', 'k-medians'],
       dtype=object),
 'Fawkes mallows': array(['Hierarchical Clustering'], dtype=object),
 'Gap statistic': array(['Hierarchical Clustering', 'K-means'], dtype=object),
 'Inspection of dendogram': array(['Hierarchical Clustering'], dtype=object),
 'Silhouette Score': array(['Hierarchical Clustering', 'K-means', 'Longitudinal k-means',
        'Spectral clustering', 'kernel k-means '], dtype=object),
 'Stability': array(['Hierarchical Clustering'], dtype=object),
 'TSS': array(['Hierarchical Clustering', 'K-means', 'Longitudinal k-means',
        'Spectral clustering', 'kernel k-means '], dtype=object),
 'comparisons of outcomes': array(['Hierarchical Clustering'], dtype=object),
 'doesnt say': array(['Hiera

## K by method

In [88]:
kcm_plot = px.scatter(data,x='Clustering algorithm ',y='K',color = 'Time Type',color_discrete_sequence=['#D90368','#197278'])
kcm_plot.update_layout(
        plot_bgcolor="#F0e7E2",xaxis = go.layout.XAxis(
            tickangle = -60),font=dict(size=14)

    )
kcm_plot.show()

In [86]:
data['K'].median()

5.0

In [89]:
data['K'].min()

2.0

In [90]:
data['K'].max()

42.0

In [91]:
data[data['Time Type'] == 'cohort']['K'].value_counts()

4.0     9
5.0     9
3.0     6
6.0     4
9.0     2
13.0    2
12.0    2
2.0     2
8.0     1
15.0    1
7.0     1
42.0    1
30.0    1
20.0    1
Name: K, dtype: int64

In [92]:
dkXk = data[['Clustering algorithm ', 'Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4','K']] 
rep_cols = [i for i in dkXk.columns if any(str(j) in i for j in [2,3,4])]
dkXk[rep_cols] = dkXk[rep_cols].replace({'None':''}) 
dkXk =pd.melt(dkXk,id_vars = ['Clustering algorithm ','K'], value_vars = ['Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4']).drop(columns='variable')
dkXk = dkXk[dkXk['value']!='']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [93]:
dkXk['value'].unique()

array(['Cluster size', 'k-distance tuning',
       'Variance explained-elbow plot', 'test train group(s)', 'TSS',
       'Inspection of dendogram', 'within group dispersion',
       'Silhouette Score', 'doesnt say', 'Gap statistic', 'pseudo F',
       'Adjusted Rand', 'clinical evaluation', 'Stability',
       'manually chosen', 'Calinski Harabasz index',
       'comparisons of outcomes', 'cubic clustering criterion', 'BIC',
       'intrinsic in algorithm', 'eigan gap heuristic',
       'Hierarchical Clustering', 'Between cluster variance', 'Dunn',
       'magnitude of topic', 'consensus matrix heatmap seperation',
       'Partition coefficient', 'C index ', 'CVNN', 'pseudo T',
       'Fawkes mallows', 'Davies Bouldin', 'AIC',
       'correlation coefficient', 'topic coherance',
       'pairwise concensus values', 'partition entropy', 'PCA',
       'Xie-Beni', 'Adjusted Wallace', 'Clest', 'R squared'], dtype=object)

In [95]:
dkK_plot = px.scatter(dkXk,x='value',y='K',color='Clustering algorithm ',color_discrete_sequence= ['#D90368','#197278','#541388','#F18805',"#F0e7E2",'#B76d68','#D8FFDD','#BABFD1',
                 '#FF7B9C','#3a2d32','#75D1B9','#9FB84D','#462749','#11151C','#150578','#1f0812',
                 '#D3c1d2','#394053','#839073','#1b998b','#AA7BC3','#9E2a2b','#FFF3B0','#848FA5'])
dkK_plot.update_layout(font=dict(size=14),
        plot_bgcolor="#F0e7E2",xaxis = go.layout.XAxis(
            tickangle = -60),
    height = 700

    )
dkK_plot.show()

In [96]:
len(data['Clustering algorithm '].unique())

21

In [189]:
get_sanky(data,['Internal evaluation','Clustering algorithm '],'Clustering algorithm ')

In [190]:
get_sanky(data,['External evaluation','Clustering algorithm '],'Clustering algorithm ')

### DKxM 

In [102]:
def count_heatmat(df_a,var1,var2):
    df = df_a.copy()
    var1_cols = [i for i in df.columns if var1 in i]
    var2_cols = [i for i in df.columns if var2 in i]
    df[var1 + ' count'] = [4-len(i[1][i[1]=='None']) for i in df[var1_cols].iterrows()]
    df[var2 + ' count'] = [4-len(i[1][i[1]=='None']) for i in df[var2_cols].iterrows()]
    df['count'] = 1
    df_count = df.drop_duplicates(['Title',var1 + ' count',var2 + ' count']).groupby([var1 + ' count',var2 + ' count'])['count'].count().reset_index()
    df_count = df_count.pivot(index = var1 + ' count', columns = var2 + ' count', values = 'count').fillna(0)
    heat = px.imshow(df_count)
    heat.update_layout(font=dict(size=16))
    heat.update_yaxes(
        ticktext=[i for i in df[var1+ ' count'].unique()],
        tickvals=[i for i in df[var1+ ' count'].unique()],
        autorange = True
    )
    heat.show()

In [103]:
count_heatmat(data,'Deciding K','Internal evaluation')

In [104]:
count_heatmat(data,'Internal evaluation','External evaluation')