In [258]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import country_converter as coco
import pycountry_convert as pc

frame = pd.read_pickle('/Users/giuliavenditti/Desktop/data.pkl')
frame.head()

Unnamed: 0,issn,doi,doi-num,on-crossref,reference,asserted-by-cr,asserted-by-pub,ref-undefined,ref-num,year,country,subject
0,2372-0352,10.3934/environsci.2020019,1,1,1,0,52,24,76,2020.0,US,G
1,2372-0352,10.3934/environsci.2020024,1,1,1,0,11,8,19,2020.0,US,G
2,2372-0352,10.3934/environsci.2020026,1,1,1,0,26,23,49,2020.0,US,G
3,2372-0352,10.3934/environsci.2015.3.565,1,1,1,0,37,0,37,2015.0,US,G
4,2372-0352,10.3934/environsci.2021009,1,1,1,21,0,19,40,2021.0,US,G


# 1) Percentage of DOIs registered on Crossref

In [161]:
totdoi = len(frame)
oncross = frame['on-crossref'].sum()
new = pd.DataFrame()
noton = totdoi - oncross
new['val'] = [noton, oncross]
new['ref'] = ['Percentage articles not on cross', 'Percentage articles on cross']
fig = px.pie(new, values='val', names='ref', color='ref', color_discrete_map={
                                'Percentage articles not on cross':'#34B67A',
                                'Percentage articles on cross':'#471D6C'}, width=800,
title ="Percentage of DOIs registered on Crossref")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)

fig.show()

In [256]:
frame2 = frame.copy()
frame2 = frame2.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame2 = frame2[(frame2.year >= 1950)&(frame2.year < 2022)]
frame2 = frame2.groupby('year').sum()
frame2['perc_cr'] = (frame2['on-crossref']/frame2['doi-num'])*100
fig = px.box(frame2, y="perc_cr", width=800,
labels={ "perc_cr": "% DOIs on Crossref" },
 title="Percentage of DOIs registered on Crossref - by research field.")
fig.update_traces(marker_color='#471D6C') 
fig.update_yaxes(range=[50, 100])
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [157]:
frame1 = frame.copy()
frame1 = frame1.groupby('subject').sum()
frame1 =frame1.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame1['perc_cr'] = (frame1['on-crossref']/frame1['doi-num'])*100
fig = px.bar(frame1,x=frame1.index,y='perc_cr', log_y=True, color="doi-num",color_continuous_scale=px.colors.sequential.Viridis , width=800,
 labels={ "subject": "Research field", "perc_cr": "% DOIs on Crossref", "doi-num": "Number of DOIs" },
 title="Percentage of DOIs registered on Crossref - by research field.")

for idx in range(len(fig.data)):
    fig.data[idx].x = ['General works','Philosophy, Psychology, Religion','Auxiliary science of History', 'World History', 'History of the Americs', 'Local History of the Americas', 'Geography, Anthropology, Recreation', 'Social Sciences', 'Political Science', 'Law', 'Education', 'Music', 'Fine Arts', 'Language and Literature', 'Science', 'Medicine', 'Agriculture', 'Technology', 'Military Science', 'Naval Science', 'Bibliography, Library Science']

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()
# fig.write_html('perc_cr_subj.html')

In [268]:
frame11a = frame.copy()
frame11a = frame11a.groupby('subject').sum()
frame11a =frame11a.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame11a['perc_cr'] =  (frame11a['on-crossref']/frame11a['doi-num'])*100
fig = px.scatter(frame11a,x='perc_cr',y=frame11a.index,size='ref-num', width=800, color="doi-num", color_continuous_scale=px.colors.sequential.Viridis,
    size_max=60,
    labels={ "subject": "Research field", "perc_cr": "% DOIs on Crossref", "doi-num": "Number of DOIs" },
    title="Percentage of DOIs registered on Crossref - by research field.")

for idx in range(len(fig.data)):
    fig.data[idx].y = ['General works','Philosophy, Psychology, Religion','Auxiliary science of History', 'World History', 'History of the Americs', 'Local History of the Americas', 'Geography, Anthropology, Recreation', 'Social Sciences', 'Political Science', 'Law', 'Education', 'Music', 'Fine Arts', 'Language and Literature', 'Science', 'Medicine', 'Agriculture', 'Technology', 'Military Science', 'Naval Science', 'Bibliography, Library Science']
fig.update_xaxes(range=[50, 110])
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [188]:
frame3 = frame.copy()
frame3 = frame3.groupby('country').sum()
frame3 =frame3.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame3['perc_cr'] = (frame3['on-crossref']/frame3['doi-num'])*100
frame3['country-name'] = coco.convert(names=frame3.index, to="name")
continent_name = []
for x in frame3.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
frame3['continent'] = continent_name

fig = px.scatter(frame3,x="perc_cr",y="country-name", width=800, color="continent", color_discrete_map={
    'Asia' : '#34B67A',
    'Europe' : '#471D6C',
    'Africa' : '#D4E129',
    'South America' : '#25848E',
    'Oceania' : '#A6DA35',
    'North America' : '#FBE625'
},
 labels={ "country": "Country", "perc_cr": "% DOIs on Crossref", "continent": "Continent" },
 title="Percentage of DOIs registered on Crossref - by country.")


fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [189]:
frame4 = frame.copy()
frame4 = frame4.groupby('country').sum()
frame4 =frame4.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame4['perc_cr'] = (frame4['on-crossref']/frame4['doi-num'])*100
frame4['country-name'] = coco.convert(names=frame4.index, to="name")
frame4 = frame4[frame4.perc_cr < 80]
continent_name = []
for x in frame4.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
frame4['continent'] = continent_name

fig = px.bar(frame4,x="country-name",y='perc_cr', width=800, color="continent", color_discrete_map={
    'Asia' : '#34B67A',
    'Europe' : '#471D6C',
    'Africa' : '#D4E129',
    'South America' : '#25848E',
    'Oceania' : '#A6DA35',
    'North America' : '#FBE625'
},
 labels={ "country": "Country", "perc_cr": "% DOIs on Crossref", "continent": "Continent" },
 title="Percentage of DOIs registered on Crossref - by country.")


fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [195]:
frame5 = frame.copy()
frame5 = frame5.groupby('country').sum()
frame5 =frame5.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame5['perc_cr'] = (frame5['on-crossref']/frame5['doi-num'])*100
frame5['iso_alpha'] = coco.convert(names=frame5.index, to='ISO3')
continent_name = []
for x in frame5.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
frame5['continent'] = continent_name

fig = px.scatter_geo(frame5, locations="iso_alpha", width=800, color="continent", color_discrete_map={
                        'Asia' : '#34B67A',
                        'Europe' : '#471D6C',
                        'Africa' : '#D4E129',
                        'South America' : '#25848E',
                        'Oceania' : '#A6DA35',
                        'North America' : '#FBE625'
                    },
                     hover_name="doi-num", size="perc_cr", size_max=15,
                     projection="natural earth",  labels={ "iso_alpha": "Country code", "perc_cr": "% DOIs on Crossref", "doi-num" : "Number of dois", "continent": "Continent" },
 title="Percentage of DOIs registered on Crossref - by country.")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)

fig.show()


In [197]:
frame6 = frame.copy()
frame6 = frame6.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame6 = frame6[(frame6.year >= 1950)&(frame6.year < 2022)]
frame6 = frame6.groupby('year').sum()
frame6['perc_cr'] = (frame6['on-crossref']/frame6['doi-num'])*100
# fig = px.bar(frame,x=frame.index,y='perc_cr')
fig = px.scatter(frame6,x=frame6.index,y='perc_cr', width=800,
 labels={ "year": "Year", "perc_cr": "% DOIs on Crossref" },
 title="Percentage of DOIs registered on Crossref - by year")
fig.update_yaxes(range=[50, 100])
fig.update_traces(marker_color='#471D6C') 
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [201]:
frame7 = frame.copy()
frame7 = frame7.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame7 = frame7[(frame7.year >= 1950)&(frame7.year < 2022)]
frame7 = frame7.groupby('year').sum()
frame7['perc_cr'] = (frame7['on-crossref']/frame7['doi-num'])*100
# fig = px.bar(frame,x=frame.index,y='perc_cr')
fig = px.line(frame7,x=frame7.index,y='perc_cr', width=800,
 labels={ "year": "Year", "perc_cr": "% DOIs on Crossref" },
 title="Percentage of DOIs registered on Crossref - by year")
fig.update_yaxes(range=[50, 100])
fig.update_traces( line_color='#471D6C', line_width=2)
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [253]:
frame8 = frame.copy()
frame8 = frame8.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame8 = frame8[(frame8.year >= 2000)&(frame8.year < 2022)]
frame8 = frame8.groupby('year').sum()
frame8['perc_cr'] = (frame8['on-crossref']/frame8['doi-num'])*100
# fig = px.bar(frame,x=frame.index,y='perc_cr')
fig = px.scatter(frame8,x=frame8.index, y='perc_cr', size='doi-num',size_max=60,width=800, color="doi-num", color_continuous_scale=px.colors.sequential.Viridis,
 labels={ "year": "Year", "perc_cr": "% DOIs on Crossref", "doi-num": "Number of DOIs" },
 title="Percentage of DOIs registered on Crossref - by year")
fig.update_yaxes(range=[50, 110])
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

Dal grafico possiamo dedurre che gli articoli pubblicati dopo il 2000, anno di fondazione di Crossref, hanno più probabilità di essere presenti sulla piattaforma. Interessante notare il calo avito tra il 2010 e il 2015.

# 2) reference on crosref doi 

In [162]:
totdoi = len(frame)
oncross = frame['on-crossref'].sum()

new = pd.DataFrame()
noton =  frame['on-crossref'].sum() - frame['reference'].sum() 
new['val'] = [noton, frame['reference'].sum() ]
new['ref'] = ['Articles without references DOI', 'Articles with references DOI']


fig = px.pie(new, values='val', names='ref', color='ref', color_discrete_map={
                                'Articles without references DOI':'#34B67A',
                                'Articles with references DOI':'#471D6C'}, width=800,
title ="Percentage of DOIs'references registered on Crossref")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)

fig.show()

In [261]:
frame2a = frame.copy()
frame2a = frame2a.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame2a = frame2a[(frame2a.year >= 1950)&(frame2a.year < 2022)]
frame2a = frame2a.groupby('year').sum()
frame2a['perc_ref'] = (frame2a['reference']/frame2a['on-crossref'])*100
fig = px.box(frame2a, y="perc_ref", width=800,
labels={ "perc_ref": "% of DOIs' references on Crossref" },
 title="Percentage of DOIs'references registered on Crossref - by research field.")
fig.update_traces(marker_color='#471D6C') 
fig.update_yaxes(range=[0, 100])
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [212]:
frame9 = frame.copy()
frame9 = frame.groupby('subject').sum()
frame9 =frame9.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame9['perc_ref'] = (frame9['reference']/frame9['on-crossref'])*100
fig = px.bar(frame9,x=frame9.index,y='perc_ref', width=800,
    color="ref-num",color_continuous_scale=px.colors.sequential.Viridis,
    labels={ "subject": "Research field", "perc_ref": "% of DOIs' references on Crossref" },
    title="Percentage of DOIs'references registered on Crossref - by research field.")


for idx in range(len(fig.data)):
    fig.data[idx].x = ['General works','Philosophy, Psychology, Religion','Auxiliary science of History', 'World History', 'History of the Americs', 'Local History of the Americas', 'Geography, Anthropology, Recreation', 'Social Sciences', 'Political Science', 'Law', 'Education', 'Music', 'Fine Arts', 'Language and Literature', 'Science', 'Medicine', 'Agriculture', 'Technology', 'Military Science', 'Naval Science', 'Bibliography, Library Science']

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [213]:
frame10 = frame.copy()
frame10 = frame.groupby('subject').sum()
frame10 =frame10.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame10['perc_ref'] = (frame10['reference']/frame10['on-crossref'])*100
fig = px.scatter(frame10,x='perc_ref',y=frame10.index, width=800,
    labels={ "subject": "Research field", "perc_ref": "% of DOIs' references on Crossref" },
    title="Percentage of DOIs'references registered on Crossref - by research field.")


for idx in range(len(fig.data)):
    fig.data[idx].y = ['General works','Philosophy, Psychology, Religion','Auxiliary science of History', 'World History', 'History of the Americs', 'Local History of the Americas', 'Geography, Anthropology, Recreation', 'Social Sciences', 'Political Science', 'Law', 'Education', 'Music', 'Fine Arts', 'Language and Literature', 'Science', 'Medicine', 'Agriculture', 'Technology', 'Military Science', 'Naval Science', 'Bibliography, Library Science']
fig.update_xaxes(range=[0, 100])
fig.update_traces(marker_color='#471D6C') 
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [215]:
frame11 = frame.copy()
frame11 = frame11.groupby('subject').sum()
frame11 =frame11.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame11['perc_ref'] = (frame11['reference']/frame11['on-crossref'])*100
fig = px.scatter(frame11,x='perc_ref',y=frame11.index,size='doi-num', width=800, color="ref-num", color_continuous_scale=px.colors.sequential.Viridis,
    size_max=60,
    labels={ "subject": "Research field", "perc_ref": "% of DOIs' references on Crossref", "ref-num":"Number of references" },
    title="Percentage of DOIs'references registered on Crossref - by research field.")


for idx in range(len(fig.data)):
    fig.data[idx].y = ['General works','Philosophy, Psychology, Religion','Auxiliary science of History', 'World History', 'History of the Americs', 'Local History of the Americas', 'Geography, Anthropology, Recreation', 'Social Sciences', 'Political Science', 'Law', 'Education', 'Music', 'Fine Arts', 'Language and Literature', 'Science', 'Medicine', 'Agriculture', 'Technology', 'Military Science', 'Naval Science', 'Bibliography, Library Science']
fig.update_xaxes(range=[0, 100])
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [244]:
frame12 = frame.copy()
frame12 = frame12.groupby('country').sum()
frame12 =frame12.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame12['perc_ref'] = (frame12['reference']/frame12['on-crossref'])*100
frame12['country-name'] = coco.convert(names=frame12.index, to="name")
continent_name = []
for x in frame12.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)  
frame12['continent'] = continent_name
fig = px.scatter(frame12,y='country-name',x='perc_ref', width=800, color="continent", color_discrete_map={
    'Asia' : '#34B67A',
    'Europe' : '#471D6C',
    'Africa' : '#D4E129',
    'South America' : '#25848E',
    'Oceania' : '#A6DA35',
    'North America' : '#FBE625'
},
    labels={"country-name": "Country", "perc_ref": "% of DOIs' references on Crossref", "continent":"Continent" },
    title="Percentage of DOIs'references registered on Crossref - by country.")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [245]:
frame13 = frame.copy()
frame13 = frame13.groupby('country').sum()
frame13 = frame13.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame13['perc_ref'] = (frame13['reference']/frame13['on-crossref'])*100
frame13['country-name'] = coco.convert(names=frame13.index, to="name")
frame13 =  frame13[frame13.perc_ref > 60]
continent_name = []
for x in frame13.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
frame13['continent'] = continent_name

fig = px.bar(frame13,x='country-name',y='perc_ref', width=800, color="continent", color_discrete_map={
    'Asia' : '#34B67A',
    'Europe' : '#471D6C',
    'Africa' : '#D4E129',
    'South America' : '#25848E',
    'Oceania' : '#A6DA35',
    'North America' : '#FBE625'
},
    labels={"country-name": "Country", "perc_ref": "% of DOIs' references on Crossref", "continent":"Contintent" },
    title="Percentage of DOIs'references registered on Crossref - by country.")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()


In [235]:
frame14 = frame.copy()
frame14 = frame14.groupby('country').sum()
frame14 =frame14.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame14['perc_ref'] = (frame14['reference']/frame14['on-crossref'])*100
frame14['iso_alpha'] = coco.convert(names=frame14.index, to='ISO3')
continent_name = []
for x in frame14.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
frame14['continent'] = continent_name
frame14 = frame14[frame14['perc_ref'] > 0]

fig = px.scatter_geo(frame14, locations="iso_alpha", width=800, color="continent", color_discrete_map={
                        'Asia' : '#34B67A',
                        'Europe' : '#471D6C',
                        'Africa' : '#D4E129',
                        'South America' : '#25848E',
                        'Oceania' : '#A6DA35',
                        'North America' : '#FBE625'
                    },
                     hover_name="ref-num", size="perc_ref", size_max=15,
                     projection="natural earth",  labels={ "iso_alpha": "Country code", "perc_ref": "% of DOIs' references on Crossref", "doi-num" : "Number of dois", "continent": "Continent" },
 title="Percentage of DOIs'references registered on Crossref - by country.")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)

fig.show()

In [240]:
frame15 = frame.copy()
frame15 = frame15.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame15 = frame15[(frame15.year >= 1950)&(frame15.year < 2022)]
frame15 = frame15.groupby('year').sum()
frame15['perc_ref'] = (frame15['reference']/frame15['on-crossref'])*100
fig = px.scatter(frame15,x=frame15.index,y='perc_ref', width=800,
    labels={ "subject": "Research field", "perc_ref": "% of DOIs' references on Crossref" },
    title="Percentage of DOIs'references registered on Crossref - by year.")

 
fig.update_traces(marker_color='#471D6C') 
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [243]:
frame16 = frame.copy()
frame16 = frame16.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame16 = frame16[(frame16.year >= 1950)&(frame16.year < 2022)]
frame16 = frame16.groupby('year').sum()
frame16['perc_ref'] = (frame15['reference']/frame15['on-crossref'])*100
fig = px.line(frame16,x=frame16.index,y='perc_ref', width=800,
 labels={ "year": "Year", "perc_ref":  "% of DOIs' references on Crossref" },
 title="Percentage of DOIs'references registered on Crossref - by year.")
fig.update_traces( line_color='#471D6C', line_width=2)
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [251]:
frame17 = frame.copy()
frame17 = frame17.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame17 = frame17[(frame17.year >= 2000)&(frame17.year < 2022)]
frame17 = frame17.groupby('year').sum()
frame17['perc_ref'] = (frame17['reference']/frame17['on-crossref'])*100
fig = px.scatter(frame17,x=frame17.index, y='perc_ref', size='ref-num',size_max=60,width=800, color="ref-num", color_continuous_scale=px.colors.sequential.Viridis,
 labels={ "year": "Year", "perc_ref":  "% of DOIs' references on Crossref", "ref-num":"Number of references" },
 title="Percentage of DOIs'references registered on Crossref - by year.")
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [16]:
frame = frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame = frame[(frame.year >= 1950)&(frame.year < 2022)]
frame = frame.groupby('year').sum()
frame['perc_ref'] = (frame['reference']/frame['on-crossref'])*100
frame['perc_cr'] = (frame['on-crossref']/frame['doi-num'])*100
fig = px.line(frame,x=frame.index,y=['perc_ref', 'perc_cr'], 
labels={ "year": "Year", "perc_ref":  "% of DOIs' references on Crossref" },
 title="Comparison between DOI and references registered on Crossref - by year.")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

# 3) Reference that have doi specified

In [163]:
totdoi = len(frame)
oncross = frame['on-crossref'].sum()
frame[frame['on-crossref']==0].describe()

ref_defined = frame['ref-num']-frame['ref-undefined']
ref_defined = ref_defined.sum()
ref_undefined = frame['ref-undefined'].sum()

new = pd.DataFrame()
new['val'] = [ref_defined, ref_undefined]
new['ref'] = ['References with a defined doi', 'References without a defined doi']

fig = px.pie(new, values='val', names='ref', color='ref', color_discrete_map={
                                'References with a defined doi':'#34B67A',
                                'References without a defined doi':'#471D6C'}, width=800,
title ="Percentage of DOIs registered on Crossref")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)

fig.show()

In [44]:
frame = frame.groupby('subject').sum()
frame =frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
frame['perc_ref_doi'] = (frame['ref-num']/frame['ref-undefined'])*100
fig = px.histogram(frame, x=frame.index, y= ['perc_ref_doi', 'perc_ref_nodoi'], barmode='group', histfunc='avg',
 labels={ "subject": "Research field", "perc_ref_nodoi": "% reference without DOI", "perc_ref_doi": "% reference with DOI"},
 title="References DOI overview - by research field")

for idx in range(len(fig.data)):
    fig.data[idx].x = ['General works','Philosophy, Psychology, Religion','Auxiliary science of History', 'World History', 'History of the Americs', 'Local History of the Americas', 'Geography, Anthropology, Recreation', 'Social Sciences', 'Political Science', 'Law', 'Education', 'Music', 'Fine Arts', 'Language and Literature', 'Science', 'Medicine', 'Agriculture', 'Technology', 'Military Science', 'Naval Science', 'Bibliography, Library Science']

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [46]:
frame = frame.groupby('country').sum()
frame =frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame['country-name'] = coco.convert(names=frame.index, to='name')
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
frame['perc_ref_doi'] = (frame['ref-num']/frame['ref-undefined'])*100
fig = px.scatter(frame,  x= ['perc_ref_doi', 'perc_ref_nodoi'], y="country-name",
labels={"country-name": "Country", 'perc_ref_doi':'% References with DOI', 'perc_ref_nodoi': '% References without DOI'},
     title="References DOI overview - by country")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)

fig.show()

In [48]:
frame = frame.groupby('country').sum()
frame =frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame['iso_alpha'] = coco.convert(names=frame.index, to='ISO3')
frame['country-name'] = coco.convert(names=frame.index, to='name')
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
frame['perc_ref_doi'] = (frame['ref-num']/frame['ref-undefined'])*100
continent_name = []
for x in frame.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
frame['continent'] = continent_name

fig = px.scatter_geo(frame, locations="iso_alpha", color="continent",
                     hover_name='perc_ref_doi', size='ref-num',
                     projection="natural earth",  labels={ "iso_alpha": "Country code", "ref-num" : "Number of references with dois", "continent": "Continent" },
title="References DOI overview - by country")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)

fig.show()

In [50]:
frame = frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame = frame[(frame.year >= 1950)&(frame.year < 2022)]
frame = frame.groupby('year').sum()
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
frame['perc_ref_doi'] = 100 - frame['perc_ref_nodoi']
fig = px.histogram(frame, x=frame.index, y= ['perc_ref_doi', 'perc_ref_nodoi'], histfunc='avg', 
title="References DOI overview - by country")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)

fig.show()

In [52]:
frame = frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame = frame[(frame.year >= 1950)&(frame.year < 2022)]
frame = frame.groupby('year').sum()
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
frame['perc_ref_doi'] = (frame['ref-num']/frame['ref-undefined'])*100
fig = px.line(frame,x=frame.index,y=['perc_ref_nodoi', 'perc_ref_doi'], 
labels={ "year": "Year" },
 title="References DOI overview - by country")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

# 4) asserted-by crossref / asserted-by publisher

In [165]:
totdoi = len(frame)
oncross = frame['on-crossref'].sum()
ass_cross = frame['asserted-by-cr'].sum()
ass_pub = frame['asserted-by-pub'].sum()
und = frame['ref-undefined'].sum()
ass_cross, ass_pub


new = pd.DataFrame()
new['val'] = [ass_cross, ass_pub, und]
new['ref'] = ['Asserted by Crossref', 'Asserted by citing publisher', 'Missing/undefined doi']
fig = px.pie(new, values='val', names='ref', color='ref', color_discrete_map={
                                'Asserted by Crossref':'#34B67A',
                                'Asserted by citing publisher':'#471D6C',
                                'Missing/undefined doi':'#D4E129'}, width=800,
title ="Percentage of DOIs registered on Crossref")
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)

fig.show()

In [56]:
frame = frame.groupby('subject').sum()
frame =frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame['perc_asserted_cr'] = (frame['asserted-by-cr']/frame['ref-num'])*100
frame['perc_asserted_pub'] = (frame['asserted-by-pub']/frame['ref-num'])*100
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
fig = px.histogram(frame, x=frame.index, y= ['perc_asserted_cr','perc_asserted_pub','perc_ref_nodoi'], histfunc='avg',
labels={ "subject": "Research field"},
title = "Reference assertion overviwe - by research field.")
for idx in range(len(fig.data)):
    fig.data[idx].x = ['General works','Philosophy, Psychology, Religion','Auxiliary science of History', 'World History', 'History of the Americs', 'Local History of the Americas', 'Geography, Anthropology, Recreation', 'Social Sciences', 'Political Science', 'Law', 'Education', 'Music', 'Fine Arts', 'Language and Literature', 'Science', 'Medicine', 'Agriculture', 'Technology', 'Military Science', 'Naval Science', 'Bibliography, Library Science']
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [182]:
frame = frame.groupby('subject').sum()
frame =frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame['perc_asserted_cr'] = (frame['asserted-by-cr']/frame['ref-num'])*100
frame['perc_asserted_pub'] = (frame['asserted-by-pub']/frame['ref-num'])*100
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
fig = px.scatter(frame, x= ['perc_asserted_cr','perc_asserted_pub','perc_ref_nodoi'], y=frame.index,
labels={ "subject": "Research field"},
title = "Reference assertion overviwe - by research field.")

for idx in range(len(fig.data)):
    fig.data[idx].y = ['General works','Philosophy, Psychology, Religion','Auxiliary science of History', 'World History', 'History of the Americs', 'Local History of the Americas', 'Geography, Anthropology, Recreation', 'Social Sciences', 'Political Science', 'Law', 'Education', 'Music', 'Fine Arts', 'Language and Literature', 'Science', 'Medicine', 'Agriculture', 'Technology', 'Military Science', 'Naval Science', 'Bibliography, Library Science']

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [227]:
frame = frame.groupby('country').sum()
frame =frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame['perc_asserted_cr'] = (frame['asserted-by-cr']/frame['ref-num'])*100
frame['perc_asserted_pub'] = (frame['asserted-by-pub']/frame['ref-num'])*100
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
frame['country-name'] = coco.convert(names=frame.index, to='name')
continent_name = []

for x in frame.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
frame['continent'] = continent_name

fig = px.scatter(frame, x= ['perc_asserted_cr','perc_asserted_pub','perc_ref_nodoi'], y="country-name", 
 labels={ "country-name": "Country", "perc_cr": "% DOIs on Crossref" },
 title="Percentage of DOIs registered on Crossref - by country.")

# fig.update_traces(marker_color='continent', marker_line_color='rgb(0, 224, 198)',
#                   marker_line_width=1.5, opacity=0.8)
fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [288]:
frame = frame.groupby('country').sum()
frame =frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame['perc_asserted_cr'] = (frame['asserted-by-cr']/frame['ref-num'])*100
frame['perc_asserted_pub'] = (frame['asserted-by-pub']/frame['ref-num'])*100
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
frame['country-name'] = coco.convert(names=frame.index, to='name')
continent_name = []

for x in frame.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
frame['continent'] = continent_name

fig = px.histogram(frame, x='continent', y= ['perc_asserted_cr','perc_asserted_pub','perc_ref_nodoi'], histfunc='avg',
labels={ "subject": "Research field"},
title = "Reference assertion overviwe - by research field.")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

In [195]:
frame = frame.apply(lambda x: pd.to_numeric(x, errors='coerce'))
frame = frame[(frame.year >= 1950)&(frame.year < 2022)]
frame = frame.groupby('year').sum()
frame['perc_asserted_cr'] = (frame['asserted-by-cr']/frame['ref-num'])*100
frame['perc_asserted_pub'] = (frame['asserted-by-pub']/frame['ref-num'])*100
frame['perc_ref_nodoi'] = (frame['ref-undefined']/frame['ref-num'])*100
fig = px.histogram(frame, x=frame.index, y= ['perc_asserted_cr','perc_asserted_pub','perc_ref_nodoi'], histfunc='avg',
labels={ "year": "Year"},
title = "Reference assertion overviwe - by year.")

fig.update_layout(
    font_family="sans-serif",
    title_font_size=20,
    font_size=10,
    title = {'xanchor': 'center', 'y':0.9,'x':0.5},
)
fig.show()

# Country proposal plus

In [275]:
import pycountry_convert as pc
newframe = frame.copy()
newframe = newframe.groupby('country').sum()
newframe['perc_asserted_cr'] = (newframe['asserted-by-cr']/newframe['ref-num'])*100
newframe['perc_asserted_pub'] = (newframe['asserted-by-pub']/newframe['ref-num'])*100
newframe['iso_alpha'] = coco.convert(names=newframe.index, to='ISO3')
newframe['country-name'] = coco.convert(names=newframe.index, to='name')
continent_name = []
for x in newframe.index:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
newframe['continent'] = continent_name
newframe = newframe[newframe['perc_asserted_cr'] > 0]
newframe = newframe[newframe['perc_asserted_pub'] > 0]
newframe

Unnamed: 0_level_0,doi-num,on-crossref,reference,asserted-by-cr,asserted-by-pub,ref-undefined,ref-num,perc_asserted_cr,perc_asserted_pub,iso_alpha,country-name,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AL,581,579,333,1574,275,5377,7226,21.782452,3.805702,ALB,Albania,Europe
AM,1117,1116,24,31,146,364,541,5.730129,26.987061,ARM,Armenia,Asia
AO,62,54,43,27,277,810,1114,2.423698,24.865350,AGO,Angola,Africa
AR,16824,15743,437,1595,3944,5747,11286,14.132554,34.945951,ARG,Argentina,South America
AT,7171,3776,1011,1417,41195,7999,50611,2.799787,81.395349,AUT,Austria,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...
UA,85960,77090,17320,43226,138976,223193,405395,10.662687,34.281627,UKR,Ukraine,Europe
UG,78,77,36,1,555,520,1076,0.092937,51.579926,UGA,Uganda,Africa
US,641228,634568,465002,12373243,5192490,3561812,21127545,58.564509,24.576873,USA,United States,North America
VE,1937,1635,736,2784,3525,9914,16223,17.160821,21.728410,VEN,Venezuela,South America


In [284]:
fig1 = px.scatter_geo(newframe, locations="iso_alpha",
                     hover_name="ref-num", size="perc_asserted_cr", size_max=10,
                     projection="natural earth",  labels={ "iso_alpha": "Country code" }, mode='markers',
    marker=dict(color=['rgb(93, 164, 214)', 'rgb(255, 144, 14)','rgb(44, 160, 101)', 'rgb(255, 65, 54)']),
 title="Reference assertion overviwe - by country.")

# fig.update_traces(marker_color='rgb(7, 247, 159)', marker_line_color='rgb(0, 224, 198)',
#                   marker_line_width=1.5, opacity=0.8)

fig1.show()

TypeError: scatter_geo() got an unexpected keyword argument 'mode'

In [271]:
fig2 = px.scatter_geo(newframe, locations="iso_alpha",
                     hover_name="ref-num", size="perc_asserted_pub", size_max=10,
                     projection="natural earth",  labels={ "iso_alpha": "Country code" },
 title="Reference assertion overviwe - by country.")

fig2.show()

In [272]:
fig=px.scatter_geo()
fig.add_traces(fig1._data)
fig.add_traces(fig2._data)

fig.show()

In [125]:
import country_converter as coco
import pycountry_convert as pc
from pycountry_convert import country_alpha2_to_continent_code

iso2_codes = ['AE', 'US', 'LV', 'ML' ]
iso3_codes = coco.convert(names=iso2_codes, to='ISO3')
print(iso3_codes)

continent_code = []
continent_name = []
for x in iso2_codes:
    country_continent_code = pc.country_alpha2_to_continent_code(x)
    continent_code.append(country_continent_code)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    continent_name.append(country_continent_name)
print(continent_name, continent_code)

['ARE', 'USA', 'LVA', 'MLI']
['Asia', 'North America', 'Europe', 'Africa'] ['AS', 'NA', 'EU', 'AF']
