In [2]:
import pandas as pd
import numpy as np
import os
import re
import plotly.graph_objects as go
import plotly.express as px
from urllib.request import urlopen
import json

## Import Data

In [3]:
os.chdir('../')

In [4]:
data = pd.read_csv('data/processed/litreview_analysis.csv')

In [5]:
data.head()

Unnamed: 0,Title,URL,Year,Disease,Primary or Secondary Care,Uses additional data,Country,Time Type,N varaibles,N,...,External evaluation metric 4,lifestyle factors,healthcare utilisation,diagnoses,demographics,lab tests,procedures,symptoms,medication,comorbidities
0,"Derivation, Validation, and Potential Treatmen...",https://pubmed.ncbi.nlm.nih.gov/31104070/,2019.0,Sepsis,Secondary,no,USA,flat,29.0,20189.0,...,Comparison to tradition cluster groups,0,0,0,1,1,0,0,0,1
1,Exploration of critical care data by using uns...,https://pubmed.ncbi.nlm.nih.gov/32403049/,2020.0,ICU patients,Secondary,no,USA,flat,9.0,1503.0,...,,0,0,0,0,1,0,0,0,0
2,Multimorbidity and functional status in older ...,https://pubmed.ncbi.nlm.nih.gov/32297200/,2020.0,Multi Morbidity,Primary,yes,Spain,flat,38.0,813.0,...,,1,1,0,1,0,0,0,1,1
3,Deep representation learning of electronic hea...,https://pubmed.ncbi.nlm.nih.gov/32699826/,2020.0,T2D,Both,no,USA,temporal,31659.0,100506.0,...,,0,1,0,1,1,1,1,1,0
4,Deep representation learning of electronic hea...,https://pubmed.ncbi.nlm.nih.gov/32699826/,2020.0,AD,Both,no,USA,temporal,31659.0,6748.0,...,,0,1,0,1,1,1,1,1,0


In [6]:
short = data.drop_duplicates('Title')
len(short)

64

## Date Barplot

In [88]:
date_df = short['Year'].value_counts().reset_index().rename(columns={'index':'Year','Year':'Number of papers'})
date_df['colour'] = date_df['Year'].apply(lambda x: str(x%4))
date_plot = px.bar(date_df, x='Year',y='Number of papers',color='colour', color_discrete_sequence=['#D90368','#197278','#541388','#F18805'])
date_plot.update_layout(showlegend=False,plot_bgcolor="#F0e7E2") 


date_plot.show()

## Disease Barplot

In [89]:
disease_df = data.drop_duplicates(['Title','Disease'])['Disease'].value_counts().reset_index().rename(columns={
    'index':'Disease',
    'Disease':'Number of papers'}).sort_values('Disease')
disease_df['colour'] = [str(i%4) for i in range(len(disease_df))]
disease_plot = px.bar(disease_df,x='Disease',y='Number of papers',color='colour', color_discrete_sequence=['#D90368','#197278','#541388','#F18805'])
disease_plot.update_layout(
    showlegend=False,xaxis = go.layout.XAxis(
        tickangle = -60),
    plot_bgcolor="#F0e7E2"
    
)
disease_plot.update_xaxes(categoryorder='category ascending')
disease_plot.show()

## NXM Scatter

In [90]:
nxm_df = data.drop_duplicates(['Title','N'])
nxm_plot = px.scatter(nxm_df,x='N varaibles',y='N',log_x=True,log_y=True,color='Time Type',color_discrete_sequence=['#D90368','#197278'])
nxm_plot.update_layout(plot_bgcolor="#F0e7E2") 

nxm_plot.show()

## Primary Seconday Venn

In [12]:
care_df = short['Primary or Secondary Care'].value_counts().rename({
    'Primary or Secondary Care':'Count'
})
care_df

Secondary    26
Both         21
Primary      16
Tertiary      1
Name: Primary or Secondary Care, dtype: int64

In [13]:
p_d = care_df.loc['Primary'] + care_df.loc['Both']
s_d = care_df.loc['Secondary'] + care_df.loc['Both']
b_d = care_df.loc['Both']
s_w = s_d/p_d
b_w = b_d/p_d

p_x0 = 0
p_x1 = 1
p_y0 = 0
p_y1 = 1

s_x0 = 1-b_w
s_x1 = s_x0 + s_w
s_y0 = 0.5-s_w/2
s_y1 = 0.5+s_w/2

In [101]:
import plotly.graph_objects as go

fig = go.Figure()

# Create scatter trace of text labels
fig.add_trace(go.Scatter(
    x=[p_x1/3-0.1,s_x1/2-0.1,s_x1-s_w/3],
    y=[0.5, 0.5, 0.5],
    text=[i + '<br>'+str(care_df.loc[i]) + ' papers' for i in["Primary", "Both", "Secondary"]],
    mode="text",
    textfont=dict(
        color="black",
        size=14,
        family="Arial",
    )
))

# Update axes properties
fig.update_xaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)

fig.update_yaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)

# Add circles
fig.add_shape(type="circle",
    line_color='#541388', fillcolor='#541388',
    x0=p_x0, y0=p_y0, x1=p_x1, y1=p_y1
)
fig.add_shape(type="circle",
    line_color='#197278', fillcolor='#197278',
    x0=s_x0, y0=s_y0, x1=s_x1, y1=s_y1
)
fig.update_shapes(opacity=0.3, xref="x", yref="y")

fig.update_layout(
    margin=dict(l=20, r=20, b=100),
    height=600, width=600,
    plot_bgcolor="white"
)

fig.show()

## Location Map

In [103]:
country_df = short['Country'].value_counts().reset_index().rename(columns={'index':'Country','Country':'Count'})
country_df.loc[country_df['Country']=='USA','Country']='United States'
country_df.loc[country_df['Country']=='UK','Country']='United Kingdom'

df = px.data.gapminder().query("year==2007")
country_df = pd.merge(df[['country','iso_alpha']],country_df, how = 'right',left_on='country',right_on='Country')
country_df.loc[3,'country'] = 'Russia'
country_df.loc[3,'iso_alpha'] = 'RUS'
country_df.head()

Unnamed: 0,country,iso_alpha,Country,Count
0,United States,USA,United States,43
1,Spain,ESP,Spain,10
2,United Kingdom,GBR,United Kingdom,5
3,Russia,RUS,Russia,1
4,Australia,AUS,Australia,1


In [104]:
country_plot = px.choropleth(country_df, locations="iso_alpha",
                    color="Count", # lifeExp is a column of gapminder
                    hover_name="Country", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
country_plot.show()

## Variables upset plot

In [17]:
vars_df = short[['lifestyle factors', 'healthcare utilisation',
       'demographics', 'lab tests', 'procedures', 'symptoms', 'medication',
       'comorbidities']]
var_types = [i[1][i[1]==1].index.to_list() for i in vars_df.iterrows()]
var_set = sorted(list(set(tuple(sorted(i)) for i in var_types)))
var_unique = sorted(vars_df.columns.tolist())
var_code = {i:(idx*3+1)/10 for idx,i in enumerate(var_unique)}
coords = [[var_code[j] for j in i] for i in var_set]
var_types2 = [tuple(sorted(i)) for i in var_types]
var_counts = [var_types2.count(i)for i in var_set]

In [124]:
up = go.Figure()
up.update_xaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)

up.update_yaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)
up.update_layout(
    plot_bgcolor="white"
)
up.add_trace(go.Scatter(
    x=[0-(len(i)/24)-0. for i in var_code.keys()],
    y=[(i+0.1)*-1 for i in var_code.values()],
    text=list(var_code.keys()),
    mode="text",
    textfont=dict(
        color="black",
        size=12,
        family="Arial",
    )
))

up.add_trace(go.Scatter(
    x=[i*0.3 +0.18 for i in range(len(var_counts))],
    y=[0 for i in range(len(var_counts))],
    text=var_counts,
    mode="text",
    textfont=dict(
        color="black",
        size=12,
        family="Arial",
    )
))
    


for i in range(len(var_unique)):
    x0=0
    x1=len(var_set)*0.3+0.1
    y0= i*0.3 +0.05 
    y1=y0+0.3
    y0 = y0*(-1)
    y1 = y1*(-1)
    if i%2==0:
        col = '#DCDCDC'
    else:
        col = '#f2f2f2'
    up.add_shape(type='rect',x0=x0,x1=x1,y0=y0,y1=y1,line_color=col,fillcolor=col)

    
    
for idx,i in enumerate(coords):
    c_x0 = (idx*3+1)/10
    c_x1 = c_x0 + 0.2
    for j in range(len(var_unique)):
        c_y0 = (j*3+1)/10
        c_y1 = c_y0+ 0.2
        if c_y0 in i:
            c_col = '#000000'
        else:
            c_col = '#bfbfbf'
        c_y0 = c_y0*(-1)
        c_y1 = c_y1*(-1)
        up.add_shape(type="circle",line_width=0, fillcolor=c_col,x0=c_x0, y0=c_y0, x1=c_x1, y1=c_y1)
    if len(i)>1:
        l_x = c_x0+0.1
        l_y0 = (min(i)+ 0.1)*-1
        l_y1 = (max(i)+ 0.1)*-1
        up.add_shape(type='line',line=dict(width=2,color='black'),x0=l_x,x1=l_x,y0=l_y0,y1=l_y1)

up.update_layout(
    showlegend=False,height=400, width=300*4.5)        

up.show()

## Method Alluvial

In [23]:
def melt_df(df,col_type):
    df2 = df.copy()
    val_vars =  [i for i in df2.columns if col_type in i]
    id_vars = [i for i in df2.columns if i not in val_vars]
    df3 = pd.melt(df2,id_vars = id_vars, value_vars = val_vars,value_name = col_type).drop(columns='variable')
    df3 = df3[df3[col_type] != '']
    return df3

In [20]:
def get_sets(df,col):
    df2 = df.copy()
    col_list = [i for i in df2.columns if col in i]
    if len(col_list) > 1: 
        df2 = melt_df(df2,col)
    return set(df2[col].to_list())

In [158]:
def rename_repeats(col_list,col_name,repeat_cols):
    if 'None' not in repeat_cols:
        repeat_cols = repeat_cols + ['None']
    rename_dict = {i: 'No '+col_name if i =='None' else i+' '+col_name for i in repeat_cols}
    new_col = [rename_dict[i] if i in rename_dict.keys() else i for i in col_list]
    return new_col

In [159]:
def get_counts(df,col_list,col_n,repeat_cols):
    col1 = col_list[col_n-1]
    col2 = col_list[col_n]
    col1_list = [i for i in df.columns if col1 in i]
    col2_list = [i for i in df.columns if col2 in i]
    df_short = df[col1_list+col2_list]
    if len(col1_list) > 1:
        df_short = melt_df(df_short,col1)
    if len(col2_list) > 1:
        df_short = melt_df(df_short,col2)
    df_short['Value'] = 1
    df_short = df_short.groupby([col1,col2]).count().reset_index().rename(columns={col1:'Source',col2:'Target'})
    df_short['Source'] = rename_repeats(df_short['Source'].tolist(),col1,repeat_cols)
    df_short['Target'] = rename_repeats(df_short['Target'].tolist(),col2,repeat_cols)
    return df_short

In [160]:
def get_node_df(method_df,col_list,col): 

    none_cols = [i for i in method_df.columns if any(j in i for j in ['2','3','4'])]
    method_df[none_cols] = method_df[none_cols].replace({'None':''})

    set_list = [j for i in col_list for j in get_sets(method_df,i)]
    repeat_cols = [i for i in set(set_list) if set_list.count(i) > 1]


    node_df = pd.concat([get_counts(method_df,col_list,i,repeat_cols)for i in range(1,len(col_list))])
    node_df['colour']=col
    return node_df

In [161]:
def get_sanky(df, col_list,color_key):
    useful_cols = ['Feature Selection','Dimentionality Reduction',
       'Clustering algorithm ', 'Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4','Charactersing clusters',
       'Internal evaluation metric 1', 'Internal evaluation metric 2',
       'Internal evaluation metric 3', 'Internal evaluation metric 4',
       'External evaluation metric 1', 'External evaluation metric 2',
       'External evaluation metric 3', 'External evaluation metric 4',]
    df_list = [df[df[color_key]==i][useful_cols] for i in df[color_key].unique()]
    color_list = ['#D90368','#197278','#541388','#F18805',"#F0e7E2",'#B76d68','#D8FFDD','#BABFD1',
                 '#FF7B9C','#75D1B9','#9FB84D','#462749','#11151C']
   
    node_df = pd.concat([get_node_df(df_list[i],col_list,color_list[i]) for i in range(len(df_list))])
    node_list = list(set(node_df['Source'].tolist()+node_df['Target'].tolist()))
    node_keys = {i:idx for idx,i in enumerate(node_list)}
    labels = [i if (node_df[node_df['Source']==i]['Value'].sum()+node_df[node_df['Target']==i]['Value'].sum()) > 0 else '' for i in node_list]
    node_df[['Source','Target']] = node_df[['Source','Target']].replace(node_keys)
    
    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = 1,
            thickness = 1,
            line = dict(color = "black", width = 0.5),
            label=labels,  
            color = "blue"
    ),
        link = dict(
            source = node_df['Source'].tolist(), # indices correspond to labels, eg A1, A2, A1, B1, ...
            target = node_df['Target'].tolist(),
            value = node_df['Value'].tolist(),
            color = node_df['colour'].tolist()
        ))])
    fig.update_layout(height=800)
    fig.show()
    return fig

In [157]:

data_flat = data[data['Time Type']=='flat']
data_long = data[data['Time Type']=='temporal']
eval_list = ['Clustering algorithm ','Internal evaluation','External evaluation']
method_type = ['Feature Selection','Dimentionality Reduction', 'Clustering algorithm ',
       'Charactersing clusters', 'Deciding K']
get_sanky(data_long,method_type,'Clustering algorithm ')





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [60]:
method_df = data[['Dimentionality Reduction',
       'Clustering algorithm ', 'Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4','Charactersing clusters',
       'Internal evaluation metric 1', 'Internal evaluation metric 2',
       'Internal evaluation metric 3', 'Internal evaluation metric 4',
       'External evaluation metric 1', 'External evaluation metric 2',
       'External evaluation metric 3', 'External evaluation metric 4',]]
none_cols = [i for i in method_df.columns if any(j in i for j in ['2','3','4'])]
method_df[none_cols] = method_df[none_cols].replace({'None':''})


col_list = ['Dimentionality Reduction', 'Clustering algorithm ',
       'Charactersing clusters', 'Deciding K', 'Internal evaluation',
       'External evaluation']

set_list = [j for i in col_list for j in get_sets(method_df,i)]
repeat_cols = [i for i in set(set_list) if set_list.count(i) > 1]


node_df = pd.concat([get_counts(method_df,col_list,i,repeat_cols)for i in range(1,len(col_list))])
node_list = list(set(node_df['Source'].tolist()+node_df['Target'].tolist()))
node_keys = {i:idx for idx,i in enumerate(node_list)}
labels = [i if (node_df[node_df['Source']==i]['Value'].sum()+node_df[node_df['Target']==i]['Value'].sum()) > 0 else '' for i in node_list]
node_df[['Source','Target']] = node_df[['Source','Target']].replace(node_keys)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [125]:
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 1,
      thickness = 1,
      line = dict(color = "black", width = 0.5),
#      label=labels,  
      color = "blue"
    ),
    link = dict(
      source = node_df['Source'].tolist(), # indices correspond to labels, eg A1, A2, A1, B1, ...
      target = node_df['Target'].tolist(),
      value = node_df['Value'].tolist()
  ))])


fig.show()

In [150]:
data.columns

Index(['Title', 'URL', 'Year', 'Disease', 'Primary or Secondary Care',
       'Uses additional data', 'Country', 'Time Type', 'N varaibles', 'N',
       'Missing data', 'Feature Selection', 'Dimentionality Reduction',
       'Clustering algorithm ', 'Deciding K 1', 'Deciding K 2', 'Deciding K 3',
       'Deciding K 4', 'K', 'Charactersing clusters',
       'Internal evaluation metric 1', 'Internal evaluation metric 2',
       'Internal evaluation metric 3', 'Internal evaluation metric 4',
       'External evaluation metric 1', 'External evaluation metric 2',
       'External evaluation metric 3', 'External evaluation metric 4',
       'lifestyle factors', 'healthcare utilisation', 'diagnoses',
       'demographics', 'lab tests', 'procedures', 'symptoms', 'medication',
       'comorbidities'],
      dtype='object')

In [33]:
rename_dict

{'SPADE': 'SPADE test',
 'None': 'No test',
 'PCA': 'PCA test',
 'Hierarchical Clustering': 'Hierarchical Clustering test',
 'Stability': 'Stability test',
 'CVNN': 'CVNN test'}