In [61]:

import pandas as pd
import numpy as np
from bokeh.models import ColumnDataSource, LabelSet, HoverTool
from bokeh.plotting import figure, show, output_file
from bokeh.sampledata.periodic_table import elements
from bokeh.io import output_notebook

import sys
import warnings

# Path of the tab-separated input table that is loaded further down.
inputfile = 'distribution_sum_access'

# Silence every warning, but only when the interpreter was started without
# explicit -W options, so a user-supplied warning policy still wins.
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")


In [143]:

def get_spaced_colors(n):
    """Return exactly ``n`` hex colour strings evenly spaced over 0..255**3.

    Each colour is the integer ``k * interval`` (``k = 0 .. n-1``) rendered as
    a zero-padded, six-digit hexadecimal string prefixed with ``#``.

    Parameters
    ----------
    n : int
        Number of colours requested.

    Returns
    -------
    list of str
        ``n`` strings of the form ``'#rrggbb'``. An empty list when
        ``n <= 0``; ``['#000000']`` when ``n == 1``.
    """
    max_value = 16581375 #255**3
    if n <= 0:
        return []
    if n == 1:
        # A single colour has no spacing to compute; this also avoids the
        # division by zero in the interval computation below.
        return ['#000000']
    interval = max_value // (n - 1)
    # Generate by index rather than range(0, max_value, interval): the range
    # form excludes the endpoint and therefore returned only n-1 colours
    # whenever (n-1) divides max_value exactly (n = 2, 4, 6, 10, ...).
    return ['#' + hex(k * interval)[2:].zfill(6) for k in range(n)]

def generate_scatter_plot(dataframe,colors,topic,log_threshold=500):
    """Show a Bokeh scatter plot of open-citation share for one topic.

    The rows of ``dataframe`` whose ``'topic'`` column equals ``topic`` are
    plotted with ``'percentage open citations'`` on the x axis and
    ``'total citations'`` on the y axis. Each point is labelled with its
    ``'wiki'`` value and has a hover tooltip showing the same three columns.
    The plot is rendered inline via ``output_notebook()`` and ``show()``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``'topic'``, ``'wiki'``,
        ``'total citations'`` and ``'percentage open citations'``.
    colors : sequence of str
        One colour per selected row, assigned positionally; pandas raises
        ``ValueError`` if the length does not match the number of rows.
    topic : str
        Value of the ``'topic'`` column to plot; also used in the title.
    log_threshold : number, optional
        The y axis switches to a log scale when any ``'total citations'``
        value exceeds this threshold (default 500).

    Returns
    -------
    None
    """
    # Filter the frame that was passed in (not a notebook-level global) and
    # take a copy so adding the colour column never writes into a slice of
    # the caller's data.
    topic_rows = dataframe.loc[dataframe['topic'] == topic].copy()
    topic_rows['melting_colors'] = colors

    title = "Percentage of open citations for topic "+topic
    tools = "pan,wheel_zoom,box_zoom,reset,save".split(',')
    hover = HoverTool(tooltips=[
        ("edition", "@wiki"),
        ("total citations:", "@{total citations}"),
        ("percentage open citations", "@{percentage open citations}")])
    tools.append(hover)

    figure_options = dict(tools=tools, toolbar_location="above", logo="grey",
                          plot_width=800, plot_height=600, title=title)
    # Large citation counts would squash the small ones on a linear axis.
    # .any() also copes with an empty selection, where max() would raise.
    if (topic_rows['total citations'] > log_threshold).any():
        figure_options['y_axis_type'] = "log"
    p = figure(**figure_options)

    p.background_fill_color = "#dddddd"
    p.xaxis.axis_label = "percentage of open citations"
    p.yaxis.axis_label = "number of citations"
    p.grid.grid_line_color = "white"

    source = ColumnDataSource(topic_rows)
    p.circle("percentage open citations", "total citations", size=8, source=source,
             color='melting_colors', line_color="black", fill_alpha=0.8)
    labels = LabelSet(x="percentage open citations", y="total citations", text="wiki",y_offset=8,
                      text_font_size="3pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)
    output_notebook()
    show(p)




In [144]:
# Load the tab-separated input table. pd.read_csv replaces
# DataFrame.from_csv, which was deprecated in pandas 0.21 and removed in 1.0;
# index_col=None keeps the default integer index, as before.
df = pd.read_csv(inputfile, sep='\t', index_col=None)

# One palette entry per distinct value of the 'wiki' column.
wikis = list(np.sort(list(set(df['wiki'].tolist()))))
palette=get_spaced_colors(len(wikis))

List the topics available in the data, so we can select which ones to plot

In [145]:
# Distinct values of the 'topic' column, sorted; the bare name on the last
# line displays the resulting array as the cell output.
topics = np.sort(list({name for name in df['topic'].tolist()}))
topics

array(['Africa', 'Americas', 'Article improvement and grading', 'Arts',
       'Biology', 'Bodies of water', 'Broadcasting',
       'Business and economics', 'Chemistry', 'Cities', 'Contents systems',
       'Countries', 'Crafts and hobbies', 'Economics', 'Education',
       'Entertainment', 'Europe', 'Files', 'Food and drink', 'Geosciences',
       'History and society', 'Information science', 'Internet culture',
       'Landforms', 'Language and literature', 'Maintenance', 'Maps',
       'Mathematics', 'Media', 'Medicine', 'Meteorology',
       'Military and warfare', 'Music', 'Performing arts',
       'Philosophy and religion', 'Physics', 'Plastic arts',
       'Politics and government', 'Science', 'Space', 'Sports',
       'Technology', 'Time', 'Transportation', 'all_topics'], 
      dtype='|S31')

Then generate the scatter plot for a chosen topic

In [147]:
# Scatter plot for the rows whose 'topic' value is 'all_topics'.
generate_scatter_plot(df, palette, topic='all_topics')



here


In [None]:
# Scatter plot for the rows whose 'topic' value is 'Space'.
generate_scatter_plot(df, palette, topic='Space')
