In [10]:

import pandas as pd
import numpy as np
from bokeh.models import ColumnDataSource, LabelSet, HoverTool
from bokeh.plotting import figure, show, output_file
from bokeh.sampledata.periodic_table import elements
from bokeh.io import output_notebook
from bokeh.models import NumeralTickFormatter

inputfile='distribution_sum_access'


import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")


A few useful functions

In [112]:

def get_spaced_colors(n):
    '''
    function to load n equally spaced colors
    '''
    max_value = 16581375 #255**3
    interval = int(max_value / (n-1))
    colors = [hex(I)[2:].zfill(6) for I in range(0, max_value, interval)]
    return ['#'+i for i in colors]

def generate_scatter_plot(dataframe,colors,topic):
    '''
    draws plot
    '''
    df2=dataframe.loc[df['topic'] == topic]
    df2['melting_colors'] = colors
    TITLE = "Percentage of open publications for topic "+topic
    tools = "pan,wheel_zoom,box_zoom,reset,save".split(',')
    hover = HoverTool(tooltips=[
        ("language", "@wiki"),
        ("total scholarly publications:", "@{total citations}"),
        ("% open access publications", "@{percentage open citations}")])
    tools.append(hover)
    if max(df2['total citations']>500):
        print('here')
        p = figure(tools=tools, toolbar_location="above", logo="grey", plot_width=800, plot_height=600, title=TITLE,y_axis_type="log")
    else:
        p = figure(tools=tools, toolbar_location="above", logo="grey", plot_width=800, plot_height=600, title=TITLE)
    p.background_fill_color = "#ffffff"
    p.xaxis.axis_label = "percentage of open access publications"
    p.yaxis.axis_label = "number of scholarly publications"
    p.yaxis[0].formatter = NumeralTickFormatter(format="0.00")
    p.grid.grid_line_color = "gray"
    source = ColumnDataSource(df2)
    p.circle("percentage open citations", "total citations", size=10, source=source, line_color="#005693", line_width=1,line_alpha=0.7, fill_alpha=0.5, fill_color="#23a3ff")
    labels = LabelSet(x="percentage open citations", y="total citations", text="wiki",y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)
    output_notebook()
    show(p)




Read data and load colors

In [113]:
df = pd.DataFrame.from_csv(inputfile,sep='\t',  index_col=None)

wikis = list(np.sort(list(set(df['wiki'].tolist()))))
palette=get_spaced_colors(len(wikis))

List all the which topics available

In [114]:
topics = np.sort(list(set(df['topic'].tolist())))
topics

array(['Africa', 'Americas', 'Article improvement and grading', 'Arts',
       'Biology', 'Bodies of water', 'Broadcasting',
       'Business and economics', 'Chemistry', 'Cities', 'Contents systems',
       'Countries', 'Crafts and hobbies', 'Economics', 'Education',
       'Entertainment', 'Europe', 'Files', 'Food and drink', 'Geosciences',
       'History and society', 'Information science', 'Internet culture',
       'Landforms', 'Language and literature', 'Maintenance', 'Maps',
       'Mathematics', 'Media', 'Medicine', 'Meteorology',
       'Military and warfare', 'Music', 'Performing arts',
       'Philosophy and religion', 'Physics', 'Plastic arts',
       'Politics and government', 'Science', 'Space', 'Sports',
       'Technology', 'Time', 'Transportation', 'all_topics'], 
      dtype='|S31')

Then generate graph

In [115]:
generate_scatter_plot(df,palette,'all_topics')



here


In [116]:
generate_scatter_plot(df,palette,'History and society')


here


In [117]:
generate_scatter_plot(df,palette,'Medicine')


here


In [118]:
generate_scatter_plot(df,palette,'Biology')


here


In [119]:
generate_scatter_plot(df,palette,'Language and literature')


here
