In [1]:
import numpy as np
import pandas as pd

### Data import

In [2]:
# Read the source table: a semicolon-separated CSV decoded as ISO-8859-1.
# NOTE(review): the '...' directory looks like a redacted placeholder — point it
# at the real data folder before running.
df_APS = pd.read_csv(
    '.../final_table_20240110.csv',
    sep=';',
    encoding="ISO-8859-1",
)

### Collecting information for the Sankey graphs

In [21]:
# Lookup table that translates the integer disease codes used in the
# manifestation columns into short disease labels.
# The position of a label in the list below is its code (0-27);
# the sentinel code -1 is decoded as 'unknown'.
disease_decoder = {
    **dict(enumerate([
        'HT', 'GD', 'AD', 'CeD', 'AIG', 'T1D', 'CD',         # codes 0-6
        'POF', 'RA', 'PsO', 'AIH', 'Vit', 'PM', 'SLE',       # codes 7-13
        'AIHA', 'MG', 'UC', 'CMC', 'PSC', 'MS', 'SS',        # codes 14-20
        'Alo', 'PBC', 'SSc', 'hypoPT', 'LH', 'ITP', 'PAPS',  # codes 21-27
    ])),
    -1: 'unknown',
}

In [8]:
# Build the working table for the Sankey graphs: the number of associated
# diseases plus the 1st-4th manifestation of every row, with the integer
# disease codes translated to short labels via `disease_decoder`.
#
# Columns are selected by name instead of by position (previously
# iloc[:, [34, 44, 45, 46, 47]]) so the cell keeps working when columns are
# added to or reordered in the source table.
# NOTE(review): the names are taken from the later groupby/rename cells —
# confirm that positions 34 and 44-47 of the CSV are exactly these columns.
manifestation_cols = ['1st_manifestation', '2nd_manifestation',
                      '3rd_manifestation', '4th_manifestation']

# .loc with a list and .replace both return new objects, so df_APS is left
# untouched and the cell can be re-run safely (no inplace mutation).
manifestations = (
    df_APS
    .loc[:, ['assoc_disease_no', *manifestation_cols]]
    .replace({col: disease_decoder for col in manifestation_cols})
)

In [18]:
# List every ordered combination of manifestations together with the number
# of rows in which it occurs.
#
# size() counts the rows of each group. The previous count() counted only the
# non-missing values of the remaining column ('assoc_disease_no'), so a group
# would be under-reported whenever that column is NaN.
# groupby(sort=True) already returns the groups ordered by the four keys, so
# no additional sort is required.
group_keys = ['1st_manifestation', '2nd_manifestation',
              '3rd_manifestation', '4th_manifestation']
grouped_manifs = (
    manifestations
    .groupby(group_keys, sort=True)
    .size()
    .to_frame('count')   # single 'count' column, same layout as before
)

In [19]:
# Display the grouped manifestation combinations and their counts for inspection.
grouped_manifs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
1st_manifestation,2nd_manifestation,3rd_manifestation,4th_manifestation,Unnamed: 4_level_1
AD,AIG,GD,unknown,1
AD,CD,unknown,unknown,1
AD,GD,AIG,unknown,2
AD,GD,RA,unknown,1
AD,GD,unknown,unknown,1
...,...,...,...,...
Vit,HT,Alo,unknown,1
Vit,HT,SS,unknown,1
Vit,HT,unknown,unknown,13
Vit,MG,GD,unknown,1


Here, `unknown` means that it is not known whether a 3rd or a 4th manifestation will develop in the future.<br>
In other words, no 3rd or 4th manifestation had occurred by the time of this investigation.<br>
<br>
However, there are two exceptions: in two instances, all four manifestations are marked as `unknown`.<br>
This means that the order of the manifestations was not known in these cases.

In [20]:
# Write the grouped counts to a CSV file in the current working directory;
# the four manifestation index levels are written as the leading columns.
grouped_manifs.to_csv(path_or_buf='manifestations_grouped.csv')

### Sankey Graphs

The Sankey graphs were constructed with the online tool SankeyMATIC (<https://sankeymatic.com>) using the following input:

In [None]:
# SankeyMATIC input for the five Sankey graphs, kept as a bare string literal
# (this is diagram input, not Python code).
#   * `Source [amount] Target` lines define the flows,
#   * `:Node #rrggbb` lines assign the node colours,
#   * `//` lines are comments.
# Node names appear to combine a per-graph prefix, the disease label and the
# position of the manifestation (2nd/3rd/4th) — TODO confirm the naming scheme.
"""
// Graves' disease:
3GD1 [9] 3CeD2
3GD1 [7] 3RA2
3GD1 [4] 3AD2
3GD1 [4] 3CD2
3GD1 [3] 3SLE2
3GD1 [3] 3T1D2
3GD1 [3] 3SS2
3GD1 [2] 3AIG2
3GD1 [2] 3Vit2
3GD1 [1] 3UC2
3GD1 [1] 3MG2
3GD1 [1] 3PM2
3GD1 [1] 3PBC2
3GD1 [1] 3MS2

3CeD2 [1] 3PsO3
3CeD2 [1] 3AIG3
3RA2 [1] 3AIG3
3AD2 [1] 3Vit3
3SLE2 [1] 3MG3
3SS2 [1] 3RA3
3MS2 [1] 3UC3


// Addison's disease:
4AD [18] 4HT2
4AD [4] 4GD2
4AD [2] 4Vit2
4AD [1] 4AIG2
4AD [1] 4CD2
4AD [1] 4RA2
4AD [1] 4MS2

4HT2 [1] 4T1D3
4HT2 [1] 4POF3
4HT2 [1] 4RA3
4HT2 [2] 4AIG3
4GD2 [1] 4RA3
4GD2 [2] 4AIG3
4Vit2 [1] 4AIG3
4AIG2 [1] 4GD3


// Coeliac disease:
5CeD [15] 5HT2
5CeD [7] 5GD2
5CeD [2] 5UC2
5CeD [1] 5T1D2
5CeD [1] 5PsO2
5CeD [1] 5Vit2

5HT2 [1] 5T1D3
5HT2 [1] 5AIG3
5GD2 [1] 5AIG3
5UC2 [1] 5GD3
5UC2 [1] 5HT3
5T1D2 [1] 5HT3
5PsO2 [1] 5HT3
5Vit2 [1] 5HT3

5T1D3 [1] 5AIG4
5GD3 [1] 5PsO4
5HT3 [1] 5SLE4


// Hashimoto's thyroiditis:
HT [17] CeD2
HT [12] SS2
HT [11] RA2
HT [11] AIG2
HT [7] T1D2
HT [6] AD2
HT [6] Vit2
HT [5] Alo2
HT [4] SLE2
HT [4] POF2
HT [4] PsO2
HT [3] AIHA2
HT [2] AIH2
HT [2] PM2
HT [2] PBC2
HT [1] CD2
HT [1] UC2
HT [1] LH2
HT [1] PAPS2
HT [1] MS2

CeD2 [1] AIG3
CeD2 [1] SS3
SS2 [1] AD3
SS2 [1] CeD3
SS2 [1] POF3
SS2 [1] PBC3
AIG2 [1] SS3
AIG2 [1] CeD3
AIG2 [1] SLE3
AIG2 [1] Vit3
T1D2 [1] Vit3
AD2 [1] T1D3
POF2 [1] Vit3
POF2 [1] Alo3
AIHA2 [1] AIG3
AIHA2 [1] AIH3
UC2 [1] AIG3

AIG3 [1] SS4
CeD3 [1] PAPS4


// Diabetes mellitus type 1:
2T1D [34] 2HT2
2T1D [12] 2GD2
2T1D [11] 2CeD2
2T1D [2] 2AIH2
2T1D [1] 2POF2
2T1D [1] 2Alo2
2T1D [1] 2UC2
2T1D [1] 2PsO2
2T1D [1] 2RA2
2T1D [1] 2PSC2

2HT2 [1] 2Vit3
2HT2 [1] 2AD3
2HT2 [1] 2UC3
2HT2 [1] 2PsO3
2GD2 [2] 2AIG3
2GD2 [1] 2RA3
2CeD2 [3] 2GD3
2CeD2 [3] 2HT3
2AIH2 [1] 2HT3
2POF2 [1] 2HT3
2Alo2 [1] 2SS3
2PSC2 [1] 2AIH3

2Vit3 [1] 2AIG4


// Colors:
// Graves' disease:
:3GD1 #e7298a
:3CeD2 #66a61e
:3RA2 #e6ab02
:3AD2 #a6761d
:3CD2 #666666
:3SLE2 #1f77b4
:3T1D2 #1b9e77
:3SS2 #d95f02
:3AIG2 #7570b3
:3Vit2 #e7298a
:3UC2 #66a61e
:3MG2 #a6761d
:3PM2 #e6ab02
:3PBC2 #666666
:3MS2 #1b9e77

:3PsO3 #1f77b4
:3AIG3 #7570b3
:3Vit3 #e7298a
:3MG3 #a6761d
:3RA3 #e6ab02
:3UC3 #66a61e


// Addison's disease:
:4AD #e6ab02
:4HT2 #a6761d
:4GD2 #666666
:4Vit2 #1f77b4
:4AIG2 #1b9e77
:4CD2 #d95f02
:4RA2 #7570b3
:4MS2 #e7298a

:4T1D3 #66a61e
:4POF3 #e6ab02
:4RA3 #7570b3
:4AIG3 #1b9e77
:4GD3 #666666


// Coeliac disease:
:5CeD #66a61e
:5HT2 #e6ab02
:5GD2 #a6761d
:5UC2 #666666
:5T1D2 #1f77b4
:5PsO2 #1b9e77
:5Vit2 #d95f02

:5T1D3 #1f77b4
:5AIG3 #7570b3
:5GD3 #a6761d
:5HT3 #e6ab02

:5AIG4 #7570b3
:5PsO4 #1b9e77
:5SLE4 #e7298a


// Hashimoto's thyroiditis:
:HT #1b9e77
:CeD2 #d95f02
:SS2 #7570b3
:RA2 #e7298a
:AIG2 #66a61e
:T1D2 #e6ab02
:AD2 #a6761d
:Vit2 #d95f02
:Alo2 #1f77b4
:SLE2 #1b9e77
:POF2 #666666
:PsO2 #7570b3
:AIHA2 #e7298a
:AIH2 #66a61e
:PM2 #e6ab02
:PBC2 #a6761d
:CD2 #666666
:UC2 #1f77b4
:LH2 #d95f02
:PAPS2 #1b9e77
:MS2 #7570b3

:AIG3 #66a61e
:SS3 #7570b3
:AD3 #a6761d
:CeD3 #d95f02
:POF3 #666666
:PBC3 #a6761d
:SLE3 #1b9e77
:Vit3 #d95f02
:T1D3 #e6ab02
:Alo3 #1f77b4
:AIH3 #66a61e

:SS4 #7570b3
:PAPS4 #1b9e77


// Diabetes mellitus type 1:
:2T1D #7570b3
:2HT2 #e7298a
:2GD2 #66a61e
:2CeD2 #e6ab02
:2AIH2 #a6761d
:2POF2 #666666
:2Alo2 #1f77b4
:2UC2 #e6ab02
:2PsO2 #7570b3
:2RA2 #e7298a
:2PSC2 #d95f02

:2Vit3 #1b9e77
:2AD3 #d95f02
:2UC3 #e6ab02
:2PsO3 #7570b3
:2AIG3 #a6761d
:2RA3 #e7298a
:2GD3 #66a61e
:2HT3 #e7298a
:2SS3 #666666
:2AIH3 #a6761d

:2AIG4 #a6761d
"""

In addition to the default settings, the following options were specified to produce the graphs:

In [None]:
# Non-default SankeyMATIC settings used to produce the graphs, kept as a bare
# string literal (these are notes for the online tool, not Python code).
"""
1. Arrange the diagram: Using the exact input order
2. Labels:  a) Do not show values
            b) Placement: Automatic, After the Node
3. Nodes: Default Node Colors: Dark
4. Flows: Default Flow Colors: each flow's Source
5. When Total Inputs ≠ Total Outputs: Attach incomplete flow groups to: The trailing edge of the Node
6. Diagram Size & Background: Width: 700, Height: 1800
7. For saving the graphs as a .png image :  a) Labels: Do not show names
                                            b) Width: 600, Height: 1800
"""