In [48]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [49]:
#loading the datasets
df_num_citations = pd.read_csv('../data/processed/citation_count.csv', index_col=0)
df_collaboration_network = pd.read_csv('../data/processed/collaborations.csv', index_col=0)
df_doc_types = pd.read_csv('../data/processed/doc_type.csv', index_col=0)
df_funding_ents = pd.read_csv('../data/processed/funding_data.csv', index_col=0)
df_journals = pd.read_csv('../data/processed/journals_df.csv', index_col=0)
df_languages = pd.read_csv('../data/processed/lang_data.csv', index_col=0)
df_country_map = pd.read_csv('../data/processed/map_data.csv', index_col=0)
df_num_collaborators = pd.read_csv('../data/processed/number_of_countries.csv', index_col=0)
df_num_publications = pd.read_csv('../data/processed/publication_count.csv', index_col=0)


# Number of publications from each country

## Top 10 Countries

In [95]:
temp_df = df_num_publications.groupby(['countries'])['counts'].sum().reset_index(name='counts')
temp_df

Unnamed: 0,countries,counts
0,Canada,8
1,Denmark,1
2,France,6
3,Germany,2
4,Ghana,4
...,...,...
94,Turkey,8
95,Uganda,1
96,United Arab Emirates,2
97,United Kingdom,61


In [97]:
temp_df = temp_df.replace(r"^ +| +$", r"", regex=True)
temp_df

Unnamed: 0,countries,counts
0,Canada,8
1,Denmark,1
2,France,6
3,Germany,2
4,Ghana,4
...,...,...
94,Turkey,8
95,Uganda,1
96,United Arab Emirates,2
97,United Kingdom,61


In [99]:
temp_df = temp_df.groupby(['countries'])['counts'].sum().reset_index(name='counts')


In [103]:
top_20_countries_df = temp_df.sort_values(by='counts', ascending=False).head(20)

In [105]:
fig = px.bar(top_20_countries_df, x='countries', y='counts')
fig.show()

In [106]:
top_20_countries_df.to_csv('../data/processed/top_20_countries.csv')

## By Year

In [50]:
df_num_publications['counts']=1
df_num_publications_yr = df_num_publications.groupby(['countries','Year','Document Type'])['countries'].count().reset_index(name='counts')
df_num_publications_yr


Unnamed: 0,countries,Year,Document Type,counts
0,Canada,2003,Article,1
1,Canada,2008,Article,1
2,Canada,2015,Article,2
3,Canada,2015,Review,1
4,Canada,2018,Letter,1
...,...,...,...,...
600,United States,2022,Article,14
601,United States,2022,Book Chapter,1
602,United States,2022,Note,2
603,United States,2022,Review,2


In [51]:
comp_list=['United Kingdom','United States']
temp_df = df_num_publications_yr[df_num_publications_yr.countries.isin(comp_list)]


In [52]:
temp_df

Unnamed: 0,countries,Year,Document Type,counts
447,United Kingdom,1972,Article,3
448,United Kingdom,1974,Article,1
449,United Kingdom,1976,Article,1
450,United Kingdom,1977,Article,1
451,United Kingdom,1979,Article,1
...,...,...,...,...
600,United States,2022,Article,14
601,United States,2022,Book Chapter,1
602,United States,2022,Note,2
603,United States,2022,Review,2


In [53]:
fig = go.Figure()

for y in temp_df.countries.unique():
    dfy = temp_df[temp_df.countries == y]
    fig.add_bar(x = dfy.Year, y = dfy.counts, name = str(y))

fig.show()

In [54]:
def comp_country_publications_yr(country1, country2):
    comp_list=[country1, country2]
    #create dataframe with values of chosen countries
    temp_df = df_num_publications_yr[df_num_publications_yr.countries.isin(comp_list)]
    #create grouped bar graph
    fig = go.Figure()
    
    for y in temp_df.countries.unique():
        dfy = temp_df[temp_df.countries == y]
        fig.add_bar(x = dfy.Year, y = dfy.counts, name = str(y))
    
    return(fig.show())
    

In [55]:
#testing
comp_country_publications_yr('Germany','United States')

In [65]:
comp_list=['United Kingdom','United States']
temp_df = df_num_publications_yr[df_num_publications_yr.countries.isin(comp_list)]
temp_df


Unnamed: 0,countries,Year,Document Type,counts
447,United Kingdom,1972,Article,3
448,United Kingdom,1974,Article,1
449,United Kingdom,1976,Article,1
450,United Kingdom,1977,Article,1
451,United Kingdom,1979,Article,1
...,...,...,...,...
600,United States,2022,Article,14
601,United States,2022,Book Chapter,1
602,United States,2022,Note,2
603,United States,2022,Review,2


In [71]:
temp_df = temp_df.groupby(['countries'])['counts'].sum().reset_index(name='counts')
fig = px.pie(temp_df, values = 'counts', names = 'countries', title='Share of publications', hole=.3)
fig.show()

In [73]:
def publication_comp_pie (country1, country2):
    comp_list=[country1, country2]
    temp_df = df_num_publications_yr[df_num_publications_yr.countries.isin(comp_list)]
    temp_df = temp_df.groupby(['countries'])['counts'].sum().reset_index(name='counts')
    fig = px.pie(temp_df, values = 'counts', names = 'countries', title='Share of publications', hole=.3)
    
    return (fig.show())

In [74]:
publication_comp_pie('Germany','United States')

## By Decade

In [56]:
df_num_publications['counts']=1
df_num_publications_dc = df_num_publications.groupby(['countries','Period','Document Type'])['countries'].count().reset_index(name='counts')
df_num_publications_dc


Unnamed: 0,countries,Period,Document Type,counts
0,Canada,1994 - 2003,Article,1
1,Canada,2004 - 2013,Article,1
2,Canada,2014 - 2023,Article,4
3,Canada,2014 - 2023,Letter,1
4,Canada,2014 - 2023,Review,1
...,...,...,...,...
293,United States,2014 - 2023,Book Chapter,2
294,United States,2014 - 2023,Conference Paper,5
295,United States,2014 - 2023,Letter,4
296,United States,2014 - 2023,Note,6


In [57]:
comp_list=['United Kingdom','United States']
temp_df = df_num_publications_dc[df_num_publications_dc.countries.isin(comp_list)]
temp_df

Unnamed: 0,countries,Period,Document Type,counts
255,United Kingdom,1964 - 1973,Article,3
256,United Kingdom,1974 - 1983,Article,5
257,United Kingdom,1984 - 1993,Article,7
258,United Kingdom,1994 - 2003,Article,7
259,United Kingdom,1994 - 2003,Editorial,1
260,United Kingdom,1994 - 2003,Letter,2
261,United Kingdom,1994 - 2003,Review,1
262,United Kingdom,2004 - 2013,Article,5
263,United Kingdom,2004 - 2013,Book Chapter,2
264,United Kingdom,2004 - 2013,Conference Paper,1


In [58]:
fig = go.Figure()

for y in temp_df.countries.unique():
    dfy = temp_df[temp_df.countries == y]
    fig.add_bar(x = dfy.Period, y = dfy.counts, name = str(y))

fig.show()

In [61]:
def comp_country_publications_dc(country1, country2):
    comp_list=[country1, country2]
    #create dataframe with values of chosen countries
    temp_df = df_num_publications_dc[df_num_publications_dc.countries.isin(comp_list)]
    #create grouped bar graph
    fig = go.Figure()
    
    for y in temp_df.countries.unique():
        dfy = temp_df[temp_df.countries == y]
        fig.add_bar(x = dfy.Period, y = dfy.counts, name = str(y))
    
    return(fig.show())

In [62]:
comp_country_publications_dc('Germany','United States')

## Top 10 publishing countries

# Chloropleth map

In [42]:
fig = go.Figure(data=go.Choropleth(
    locations = df_country_map['code'],
    z = df_country_map['count'],
    text = df_country_map['countries'],
    colorscale = 'pinkyl',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title = 'Number of SCR Publications',
))

fig.update_layout(
    title_text='Sickle cell Retinopathy Publications',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
    annotations = [dict(
        x=0.55,
        y=0.1,
        xref='paper',
        yref='paper',
        # text='Source: <a href="https://www.cia.gov/library/publications/the-world-factbook/fields/2195.html">\
        #     CIA World Factbook</a>',
        showarrow = False
    )]
)

fig.show()

# citation counts comparison

In [108]:
temp_df = df_num_citations.sort_values(by='Cited by', ascending=False)
fig = px.bar(temp_df, x='countries', y='Cited by')
fig.show()

In [110]:
comp_list=['United Kingdom','United States']
temp_df = df_num_citations[df_num_citations.countries.isin(comp_list)]
fig = px.bar(temp_df, x='countries', y='Cited by')
fig.show()

In [113]:
def comp_country_citations(country1, country2):
    comp_list=[country1, country2]
    temp_df = df_num_citations[df_num_citations.countries.isin(comp_list)]
    fig = px.bar(temp_df, x='countries', y='Cited by')
    
    return(fig.show())

In [115]:
comp_country_citations('Germany','Ghana')

# Document Type

In [116]:
df_doc_types

Unnamed: 0,Document Type,counts
0,Article,574
1,Book,1
2,Book Chapter,16
3,Conference Paper,12
4,Data Paper,1
5,Editorial,5
6,Erratum,2
7,Letter,39
8,Note,19
9,Review,125


In [119]:
fig = px.treemap(df_doc_types, path=[px.Constant('all'), 'Document Type'], values='counts')
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.update_traces(marker=dict(cornerradius=5))
fig.show()

# Publishing Journal


In [123]:
df_journals

Unnamed: 0,Year,Source title,Period,counts
0,2023,American Journal of Ophthalmology Case Reports,2014 - 2023,1
1,2023,European Journal of Haematology,2014 - 2023,1
2,2023,International Journal of Environmental Researc...,2014 - 2023,1
3,2023,Genes,2014 - 2023,1
4,2023,Eye (Basingstoke),2014 - 2023,1
...,...,...,...,...
794,1964,Transactions - American Academy of Ophthalmolo...,1964 - 1973,1
795,1963,Belgisch tijdschrift voor geneeskunde,1954 - 1963,1
796,1960,Klinische Monatsblätter für Augenheilkunde und...,1954 - 1963,1
797,1958,The Mississippi doctor,1954 - 1963,1


In [130]:
temp_df = df_journals.groupby('Source title')['counts'].sum().reset_index(name='counts').sort_values(by='counts', ascending=False).head(30)
temp_df

Unnamed: 0,Source title,counts
16,American Journal of Ophthalmology,40
69,British Journal of Ophthalmology,40
41,Archives of Ophthalmology,39
171,Journal Francais d'Ophtalmologie,19
14,American Journal of Hematology,19
119,Eye (Basingstoke),14
245,Ophthalmology,13
68,British Journal of Haematology,13
270,Retina,12
131,Haematologica,9


## Top 20 most prolific journals

In [131]:
fig = px.bar(temp_df, x='Source title', y='counts')
fig.show()

In [152]:
data = pd.read_csv('../data/interim/data_02.csv')

In [153]:
cols = ['Year', 'Source title','Period','countries']
journals = data[cols]
journals

Unnamed: 0,Year,Source title,Period,countries
0,2023,American Journal of Ophthalmology Case Reports,2014 - 2023,['United States']
1,2023,European Journal of Haematology,2014 - 2023,"['France', ' Guadeloupe']"
2,2023,International Journal of Environmental Researc...,2014 - 2023,['United States']
3,2023,Genes,2014 - 2023,['South Korea']
4,2023,Eye (Basingstoke),2014 - 2023,['United Kingdom']
...,...,...,...,...
794,1964,Transactions - American Academy of Ophthalmolo...,1964 - 1973,['']
795,1963,Belgisch tijdschrift voor geneeskunde,1954 - 1963,['']
796,1960,Klinische Monatsblätter für Augenheilkunde und...,1954 - 1963,['']
797,1958,The Mississippi doctor,1954 - 1963,['']


In [154]:
from ast import literal_eval
journals['countries'] = journals['countries'].apply(lambda row: literal_eval(row))
journals



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Year,Source title,Period,countries
0,2023,American Journal of Ophthalmology Case Reports,2014 - 2023,[United States]
1,2023,European Journal of Haematology,2014 - 2023,"[France, Guadeloupe]"
2,2023,International Journal of Environmental Researc...,2014 - 2023,[United States]
3,2023,Genes,2014 - 2023,[South Korea]
4,2023,Eye (Basingstoke),2014 - 2023,[United Kingdom]
...,...,...,...,...
794,1964,Transactions - American Academy of Ophthalmolo...,1964 - 1973,[]
795,1963,Belgisch tijdschrift voor geneeskunde,1954 - 1963,[]
796,1960,Klinische Monatsblätter für Augenheilkunde und...,1954 - 1963,[]
797,1958,The Mississippi doctor,1954 - 1963,[]


In [155]:
journals = journals.explode('countries')
journals

Unnamed: 0,Year,Source title,Period,countries
0,2023,American Journal of Ophthalmology Case Reports,2014 - 2023,United States
1,2023,European Journal of Haematology,2014 - 2023,France
1,2023,European Journal of Haematology,2014 - 2023,Guadeloupe
2,2023,International Journal of Environmental Researc...,2014 - 2023,United States
3,2023,Genes,2014 - 2023,South Korea
...,...,...,...,...
794,1964,Transactions - American Academy of Ophthalmolo...,1964 - 1973,
795,1963,Belgisch tijdschrift voor geneeskunde,1954 - 1963,
796,1960,Klinische Monatsblätter für Augenheilkunde und...,1954 - 1963,
797,1958,The Mississippi doctor,1954 - 1963,


In [156]:
journals.countries = journals.countries.replace(r"^ +| +$", r"", regex=True)

In [157]:
journals.countries.value_counts()

countries
United States         398
United Kingdom         72
France                 67
                       48
Brazil                 46
                     ... 
Monaco                  1
Zimbabwe                1
Russian Federation      1
Uganda                  1
Hong Kong               1
Name: count, Length: 73, dtype: int64

In [158]:
journals = journals[journals['countries'] !='']

In [159]:
journals

Unnamed: 0,Year,Source title,Period,countries
0,2023,American Journal of Ophthalmology Case Reports,2014 - 2023,United States
1,2023,European Journal of Haematology,2014 - 2023,France
1,2023,European Journal of Haematology,2014 - 2023,Guadeloupe
2,2023,International Journal of Environmental Researc...,2014 - 2023,United States
3,2023,Genes,2014 - 2023,South Korea
...,...,...,...,...
785,1971,Archives of Ophthalmology,1964 - 1973,United States
786,1971,American Journal of Ophthalmology,1964 - 1973,United States
789,1969,Documenta Ophthalmologica,1964 - 1973,United States
791,1967,American Journal of Ophthalmology,1964 - 1973,United States


In [160]:
journals['counts'] = 1
journals = journals.groupby(['countries','Source title'])['counts'].sum().reset_index(name='counts')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [161]:
journals

Unnamed: 0,countries,Source title,counts
0,"""Cote dIvoire""",Journal Francais d'Ophtalmologie,3
1,"""Cote dIvoire""",Medecine d'Afrique Noire,1
2,Argentina,ASN Neuro,1
3,Argentina,Experimental Eye Research,1
4,Australia,Pharmaceuticals,1
...,...,...,...
577,United States,Undersea and Hyperbaric Medicine,1
578,United States,Vision Research,1
579,United States,eLife,1
580,Zambia,Neurology,1


## Journals by Countries

In [166]:
fig = px.treemap(journals, path=[px.Constant('World'),'countries','Source title'], values = 'counts', color='counts', color_continuous_scale='Blues')
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.update_traces(marker=dict(cornerradius=5))
fig.show()

# Funding

In [167]:
df_funding_ents

Unnamed: 0,Funding Entites,counts
0,ACC,1
1,AHA,1
2,ANR,1
3,ANR‐11‐IDEX‐0005‐02,1
4,ASH,1
...,...,...
161,Yale University,1
162,Yeshiva University,1
163,Z01DK075008,1
164,de Pessoal de N&#x00ED;vel Superior,1


In [185]:
funding_df = data[['Funding Entities','countries']]
funding_df

Unnamed: 0,Funding Entities,countries
0,[],['United States']
1,[],"['France', ' Guadeloupe']"
2,"[Albert Einstein College of Medicine, Yeshiva ...",['United States']
3,[National Research Foundation],['South Korea']
4,[],['United Kingdom']
...,...,...
794,[],['']
795,[],['']
796,[],['']
797,[],['']


In [186]:
funding_df = funding_df[funding_df['Funding Entities']!='[]']

In [187]:
funding_df

Unnamed: 0,Funding Entities,countries
2,"[Albert Einstein College of Medicine, Yeshiva ...",['United States']
3,[National Research Foundation],['South Korea']
5,"[Ministry of Science, ICT, Future Planning, Na...","['Pakistan', ' South Korea']"
6,"[National Heart, Blood Institute]","['United Kingdom', ' Ghana', ' Niger', ' Niger..."
9,"[Fundação de Amparo à Pesquisa, Conselho Nacio...",['Brazil']
...,...,...
766,"[U.S. Public Health Service, USPHS, National H...",['United States']
777,"[Wellcome Trust, WT]","['United Kingdom', ' Jamaica']"
778,"[Wellcome Trust, WT]","['United Kingdom', ' Jamaica']"
781,"[Wellcome Trust, WT]","['United Kingdom', ' Jamaica']"


In [180]:
funding_df['countries']=funding_df['countries'].apply(lambda row: literal_eval(row))
funding_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Funding Entities,countries
2,"[Albert Einstein College of Medicine, Yeshiva ...",[United States]
3,[National Research Foundation],[South Korea]
5,"[Ministry of Science, ICT, Future Planning, Na...","[Pakistan, South Korea]"
6,"[National Heart, Blood Institute]","[United Kingdom, Ghana, Niger, Nigeria, So..."
9,"[Fundação de Amparo à Pesquisa, Conselho Nacio...",[Brazil]
...,...,...
766,"[U.S. Public Health Service, USPHS, National H...",[United States]
777,"[Wellcome Trust, WT]","[United Kingdom, Jamaica]"
778,"[Wellcome Trust, WT]","[United Kingdom, Jamaica]"
781,"[Wellcome Trust, WT]","[United Kingdom, Jamaica]"


In [191]:
funding_df ['Funding Entities'] = funding_df ['Funding Entities'].str.replace('[','').str.replace("'",'').str.replace(']','')
funding_df ['Funding Entities']=funding_df ['Funding Entities'].str.split(',')
funding_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Funding Entities,countries
2,"[Albert Einstein College of Medicine, Yeshiva...",['United States']
3,[National Research Foundation],['South Korea']
5,"[Ministry of Science, ICT, Future Planning, ...","['Pakistan', ' South Korea']"
6,"[National Heart, Blood Institute]","['United Kingdom', ' Ghana', ' Niger', ' Niger..."
9,"[Fundação de Amparo à Pesquisa, Conselho Naci...",['Brazil']
...,...,...
766,"[U.S. Public Health Service, USPHS, National...",['United States']
777,"[Wellcome Trust, WT]","['United Kingdom', ' Jamaica']"
778,"[Wellcome Trust, WT]","['United Kingdom', ' Jamaica']"
781,"[Wellcome Trust, WT]","['United Kingdom', ' Jamaica']"


In [192]:
for index, row in df.iterrows():
    print (type(row[0]), type(row[1]))

<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> <class 'list'>
<class 'str'> 

In [207]:
df= funding_df.explode('Funding Entities')
df

Unnamed: 0,Funding Entities,countries
2,Albert Einstein College of Medicine,['United States']
2,Yeshiva University,['United States']
3,National Research Foundation,['South Korea']
5,Ministry of Science,"['Pakistan', ' South Korea']"
5,ICT,"['Pakistan', ' South Korea']"
...,...,...
778,WT,"['United Kingdom', ' Jamaica']"
781,Wellcome Trust,"['United Kingdom', ' Jamaica']"
781,WT,"['United Kingdom', ' Jamaica']"
786,U.S. Public Health Service,['United States']


In [208]:
df['Funding Entities'] = df['Funding Entities'].replace(r"^ +| +$", r"", regex=True)




In [209]:
df

Unnamed: 0,Funding Entities,countries
2,Albert Einstein College of Medicine,['United States']
2,Yeshiva University,['United States']
3,National Research Foundation,['South Korea']
5,Ministry of Science,"['Pakistan', ' South Korea']"
5,ICT,"['Pakistan', ' South Korea']"
...,...,...
778,WT,"['United Kingdom', ' Jamaica']"
781,Wellcome Trust,"['United Kingdom', ' Jamaica']"
781,WT,"['United Kingdom', ' Jamaica']"
786,U.S. Public Health Service,['United States']


In [210]:
to_replace = ['Blood Institute','National Heart','Lung Institute']
df['Funding Entities'].loc[df['Funding Entities'].isin(to_replace)] = 'National Heart,Lung and Blood Institute'
df

Unnamed: 0,Funding Entities,countries
2,Albert Einstein College of Medicine,['United States']
2,Yeshiva University,['United States']
3,National Research Foundation,['South Korea']
5,Ministry of Science,"['Pakistan', ' South Korea']"
5,ICT,"['Pakistan', ' South Korea']"
...,...,...
778,WT,"['United Kingdom', ' Jamaica']"
781,Wellcome Trust,"['United Kingdom', ' Jamaica']"
781,WT,"['United Kingdom', ' Jamaica']"
786,U.S. Public Health Service,['United States']


In [211]:
df.countries = df.countries.apply(lambda row: literal_eval(row))
df

Unnamed: 0,Funding Entities,countries
2,Albert Einstein College of Medicine,[United States]
2,Yeshiva University,[United States]
3,National Research Foundation,[South Korea]
5,Ministry of Science,"[Pakistan, South Korea]"
5,ICT,"[Pakistan, South Korea]"
...,...,...
778,WT,"[United Kingdom, Jamaica]"
781,Wellcome Trust,"[United Kingdom, Jamaica]"
781,WT,"[United Kingdom, Jamaica]"
786,U.S. Public Health Service,[United States]


In [212]:
df = df.explode('countries')
df

Unnamed: 0,Funding Entities,countries
2,Albert Einstein College of Medicine,United States
2,Yeshiva University,United States
3,National Research Foundation,South Korea
5,Ministry of Science,Pakistan
5,Ministry of Science,South Korea
...,...,...
781,Wellcome Trust,Jamaica
781,WT,United Kingdom
781,WT,Jamaica
786,U.S. Public Health Service,United States


In [213]:
df['countries'] = df['countries'].replace(r"^ +| +$", r"", regex=True)
df

Unnamed: 0,Funding Entities,countries
2,Albert Einstein College of Medicine,United States
2,Yeshiva University,United States
3,National Research Foundation,South Korea
5,Ministry of Science,Pakistan
5,Ministry of Science,South Korea
...,...,...
781,Wellcome Trust,Jamaica
781,WT,United Kingdom
781,WT,Jamaica
786,U.S. Public Health Service,United States


In [214]:
df['Times Funded']=1
df = df.groupby(['countries', 'Funding Entities'])['Times Funded'].sum().reset_index(name='Times Funded')
df

Unnamed: 0,countries,Funding Entities,Times Funded
0,,DDCF,1
1,,Doris Duke Charitable Foundation,1
2,,NIH,2
3,,National Center for Advancing Translational Sc...,1
4,,National Eye Institute,1
...,...,...,...
341,Zambia,Doris Duke Charitable Foundation,1
342,Zambia,NIH,1
343,Zambia,National Institutes of Health,1
344,Zambia,RBF,1


In [221]:
df.to_csv('../data/processed/funding_treemap_df.csv')

In [217]:
fig = px.treemap(df, path= ['countries', 'Funding Entities'], values = 'Times Funded',color='Times Funded', color_continuous_scale='Blues')
fig.show()

# fig = px.treemap(journals, path=[px.Constant('World'),'countries','Source title'], values = 'counts', color='counts', color_continuous_scale='Blues')

In [220]:
fig = px.sunburst(df, path=['countries', 'Funding Entities'], values='Times Funded',
                color='Times Funded', 
                color_continuous_scale='Blues',
                )
fig.show()