In [42]:
#imports

import requests as rq
import bs4
import pandas as pd
import plotly.express as px
import io
import numpy as np
import plotly.graph_objects as go

In [43]:
#reading in the data

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
#Need to set user agent because the default option wasn't working for me
headers = {
    'User-Agent': 'nolandr1 (nolandr1@jh.edu)'
}
page = rq.get(url, headers=headers)
## print out the first 100 characters just to see what it looks like
page.text[0 : 99]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-l'

In [44]:
#reading into bs4

bs4page = bs4.BeautifulSoup(page.text, 'html.parser')
tables = bs4page.find('table',{'class':"wikitable"})

gdp = pd.read_html(io.StringIO(str(tables)))


In [45]:
#reading into pandas

gdp = pd.read_html(io.StringIO(str(tables)))[0]
gdp.head()

Unnamed: 0,Country/Territory,IMF (2026)[6],World Bank (2024)[7],United Nations (2024)[8]
0,World,123584494,111326370,100834796
1,United States,31821293,28750956,29298000
2,China[n 1],20650754,18743803,18743802
3,Germany,5328184,4685593,4659929
4,India,4505629,3909892,3952244


In [46]:
#Old Table with UN Region info

url = 'https://en.wikipedia.org/w/index.php?title=List_of_countries_by_GDP_(nominal)&oldid=1187446467'
headers = {
    'User-Agent': 'nolandr1 (nolandr1@jh.edu)'
}
page = rq.get(url, headers=headers)
## print out the first 100 characters just to see what it looks like
page.text[0 : 99]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-l'

In [47]:
#Finding the table in Old GDP

bs4page = bs4.BeautifulSoup(page.text, 'html.parser')
all_tables = pd.read_html(io.StringIO(page.text))
old_gdp = all_tables[2]

# Flatten the MultiIndex columns to a single level
old_gdp.columns = old_gdp.columns.get_level_values(1)

#dropping every column except UN region & Country/Territory
old_gdp = old_gdp[['Country/Territory', 'UN region']]
old_gdp.head()

Unnamed: 0,Country/Territory,UN region
0,World,—
1,United States,Americas
2,China,Asia
3,Germany,Europe
4,Japan,Asia


In [48]:
#Merging the two dataframes together

merge_gdp = pd.merge(gdp, old_gdp, on='Country/Territory')
merge_gdp.head()

Unnamed: 0,Country/Territory,IMF (2026)[6],World Bank (2024)[7],United Nations (2024)[8],UN region
0,World,123584494,111326370,100834796,—
1,United States,31821293,28750956,29298000,Americas
2,Germany,5328184,4685593,4659929,Europe
3,India,4505629,3909892,3952244,Asia
4,Japan,4463634,4027598,4026211,Asia


In [49]:
#Creating the plot

fig = px.bar(merge_gdp, x='UN region', y='IMF (2026)[6]', color='Country/Territory')
fig.show()
fig.write_html('stacked_bar.html')

In [50]:
dat = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv").drop(['Unnamed: 0'], axis = 1)
dat.head()

Unnamed: 0,rawid,roi,volume,min,max,mean,std,type,level,id,icv,tbv
0,kirby127a_3_1_ax.img,Telencephalon_L,531111,0.0,374.0,128.3013,51.8593,1,1,127,1378295,1268519
1,kirby127a_3_1_ax.img,Telencephalon_R,543404,0.0,300.0,135.0683,53.6471,1,1,127,1378295,1268519
2,kirby127a_3_1_ax.img,Diencephalon_L,9683,15.0,295.0,193.5488,32.2733,1,1,127,1378295,1268519
3,kirby127a_3_1_ax.img,Diencephalon_R,9678,10.0,335.0,193.7051,32.7869,1,1,127,1378295,1268519
4,kirby127a_3_1_ax.img,Mesencephalon,10268,55.0,307.0,230.8583,29.2249,1,1,127,1378295,1268519


In [51]:
dat = dat.assign(id_char = dat.id.astype(str))
fig = px.bar(dat, x = "id_char", y = "volume", color = "roi")
fig.show()

In [53]:
icv = dat.groupby(['id']).volume.sum().reset_index().rename(columns = {'volume' : 'icv'})
dat = pd.merge(dat, icv, on = 'id')
dat = dat.assign(comp = dat.volume / dat.icv)
dat.head()

Unnamed: 0,rawid,roi,volume,min,max,mean,std,type,level,id,icv_x,tbv,id_char,icv_y,icv,comp
0,kirby127a_3_1_ax.img,Telencephalon_L,531111,0.0,374.0,128.3013,51.8593,1,1,127,1378295,1268519,127,13783255,13783255,0.038533
1,kirby127a_3_1_ax.img,Telencephalon_R,543404,0.0,300.0,135.0683,53.6471,1,1,127,1378295,1268519,127,13783255,13783255,0.039425
2,kirby127a_3_1_ax.img,Diencephalon_L,9683,15.0,295.0,193.5488,32.2733,1,1,127,1378295,1268519,127,13783255,13783255,0.000703
3,kirby127a_3_1_ax.img,Diencephalon_R,9678,10.0,335.0,193.7051,32.7869,1,1,127,1378295,1268519,127,13783255,13783255,0.000702
4,kirby127a_3_1_ax.img,Mesencephalon,10268,55.0,307.0,230.8583,29.2249,1,1,127,1378295,1268519,127,13783255,13783255,0.000745


In [54]:
roi_mean = dat.drop(["id", "id_char", "icv", "rawid"], axis = 1).groupby(["roi"]).mean().reset_index()
fig = px.bar(roi_mean, x = "roi", y = "comp")
fig.show()

In [55]:
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
multilevel_lookup = pd.read_csv(url, sep = "\t").drop(['Level5'], axis = 1)
multilevel_lookup = multilevel_lookup.rename(columns = {
    "modify"   : "roi",
    "modify.1" : "level4",
    "modify.2" : "level3",
    "modify.3" : "level2",
    "modify.4" : "level1"})
multilevel_lookup = multilevel_lookup[['roi', 'level4', 'level3', 'level2', 'level1']]
multilevel_lookup.head()

Unnamed: 0,roi,level4,level3,level2,level1
0,SFG_L,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L
1,SFG_R,SFG_R,Frontal_R,CerebralCortex_R,Telencephalon_R
2,SFG_PFC_L,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L
3,SFG_PFC_R,SFG_R,Frontal_R,CerebralCortex_R,Telencephalon_R
4,SFG_pole_L,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L


In [56]:
#Displaying levels for Type = 1 data

id = 127
subjectData = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
subjectData = subjectData.loc[(subjectData.type == 1) & (subjectData.id == id)]
subjectData = subjectData[['roi', 'volume']]

subjectData = pd.merge(subjectData, multilevel_lookup, on = "roi")
subjectData = subjectData.assign(icv = "ICV")
subjectData = subjectData.assign(comp = subjectData.volume / np.sum(subjectData.volume))
subjectData.head()

Unnamed: 0,roi,volume,level4,level3,level2,level1,icv,comp
0,CSF,109776,xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx,,,,ICV,0.047225
1,Thalamus_L,6342,Thalamus_L,Thalamus_L,Thalamus_L,Diencephalon_L,ICV,0.002728
2,Thalamus_R,6372,Thalamus_R,Thalamus_R,Thalamus_R,Diencephalon_R,ICV,0.002741
3,BasalForebrain_L,3341,BasalForebrain_L,BasalForebrain_L,BasalForebrain_L,Diencephalon_L,ICV,0.001437
4,BasalForebrain_L,3341,BasalForebrain_L,BasalForebrain_L,BasalForebrain_L,Diencephalon_L,ICV,0.001437


In [57]:
data = ['icv', 'level1', 'level2', 'level3', 'level4', 'roi']

all_nodes = []
for level in data:
    all_nodes.extend(subjectData_cleaned[level].unique())

all_nodes = [node for node in all_nodes if node != '']
all_nodes = sorted(list(set(all_nodes)))
node_to_index = {node: i for i, node in enumerate(all_nodes)}


source_indices = []
target_indices = []
link_values = []


for index, row in subjectData_cleaned.iterrows():
    current_value = row['comp']


    for i in range(len(data) - 1):
        source_level = data[i]
        target_level = data[i+1]

        source_node = row[source_level]
        target_node = row[target_level]


        if source_node != '' and target_node != '':
            source_indices.append(node_to_index[source_node])
            target_indices.append(node_to_index[target_node])
            link_values.append(current_value)


fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=all_nodes,
        color="blue"
    ),
    link=dict(
        source=source_indices,
        target=target_indices,
        value=link_values,
    )
)])


fig.update_layout(title_text=f"Sankey Diagram", font_size=10)
fig.show()
fig.write_html('sankey.html')