In [472]:

import pandas as pd
import numpy as np
from bokeh.models import ColumnDataSource, LabelSet, HoverTool
from bokeh.plotting import figure, show, output_file
from bokeh.sampledata.periodic_table import elements
from bokeh.io import output_notebook
from bokeh.layouts import gridplot,row
from bokeh.models import NumeralTickFormatter
from random import random

import sys
import warnings

# Table of per-wiki topic distributions; it is read as tab-separated text below.
inputfile = 'distribution_typetopics_forplot_publications.csv'
# Publication identifier kinds; the plotting function loops over this list.
types = ['isbn', 'doi']

# Silence every warning unless the interpreter was started with explicit
# warning options (sys.warnoptions is empty when none were given).
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")


A few useful functions

In [481]:

def get_spaced_colors(n, seed=None):
    '''
    Return ``n`` randomly drawn colours as '#rrggbb' hex strings.

    Despite the name, the colours are NOT equally spaced: each one is an
    independent uniform draw from the 24-bit RGB range, so two colours can
    end up close to each other.

    Parameters
    ----------
    n : int
        Number of colours to generate. Zero or a negative value yields an
        empty list.
    seed : hashable, optional
        When given, the colours are drawn from a private generator seeded
        with this value, so the same seed always yields the same palette.
        When omitted, the module-level ``random()`` is used, as before.

    Returns
    -------
    list of str
        ``n`` strings of the form '#rrggbb' (lower-case hex digits).
    '''
    if seed is None:
        draw = random
    else:
        # Local import: the notebook only imports the random() function.
        from random import Random
        draw = Random(seed).random
    # Exclusive upper bound of the 24-bit colour space. The previous bound,
    # 255**3, left the top of the range unreachable.
    color_space = 0x1000000
    vals = [int(color_space * draw()) for _ in range(n)]
    # Echo the raw integers. Written with parentheses so the line is valid
    # on both Python 2 and Python 3.
    print(vals)
    return ['#%06x' % v for v in vals]

def generate_scatter_plot(dataframe,colors,wiki,averages):
    '''
    Draw one log-log scatter plot per publication type for a single wiki.

    For every entry of the module-level ``types`` list, the rows of
    ``dataframe`` matching ``wiki`` and that type are plotted: the wiki's own
    'percentage_of_publications' on the x axis against the value looked up
    in ``averages`` on the y axis. Each point is labelled with its topic.
    All figures are placed side by side in one row and shown in the notebook.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'wiki', 'type', 'topic' and
        'percentage_of_publications'. The values are presumably fractions
        (they are formatted with "0.00%") -- TODO confirm.
    colors : sequence of str
        One colour per entry of ``types``, at the same position.
    wiki : str
        Value of the 'wiki' column to select; also used in titles and labels.
    averages : dict
        Nested mapping read as ``averages[topic][type]``.

    Raises
    ------
    KeyError
        If a selected topic/type pair is missing from ``averages``.
    '''
    plots=[]
    for ty in types:
        # Build the selection from one combined mask. The previous chained
        # .loc applied a mask computed on the unfiltered frame to the
        # already-filtered one.
        selected=dataframe.loc[(dataframe['wiki'] == wiki) & (dataframe['type'] == ty)].reset_index()
        # Every selected row has type ``ty``, so colour and average lookup
        # only vary by topic; no per-row type lookup is needed.
        type_color=colors[types.index(ty)]
        selected['melting_colors'] = [type_color]*len(selected)
        selected['average'] = [averages[topic][ty] for topic in selected['topic']]
        # Shared upper bound for both axes: the largest plotted value, but at
        # least 0.2, plus a margin of 0.1. Putting 0.2 first keeps max() valid
        # when the selection is empty and stops a NaN from becoming the bound.
        m=max([0.2]+list(selected['average'])+list(selected['percentage_of_publications']))+0.1
        TITLE = ty.upper() +" publications in "+wiki
        tools = "pan,wheel_zoom,box_zoom,reset,save".split(',')
        hover = HoverTool(tooltips=[
            ("topic", "@topic"),
            ("average % of publications for topic:", "@{average}{0.00%}"),
            ("% of publications for topic in "+wiki, "@{percentage_of_publications}{0.00%}")])
        tools.append(hover)
        p = figure(tools=tools, toolbar_location="above", logo="grey", plot_width=450, plot_height=450, title=TITLE,x_axis_type="log",y_axis_type="log", x_range=(0.001, m), y_range=(0.001, m))
        p.background_fill_color = "#ffffff"
        p.yaxis.axis_label = "average percentage of " +ty+" publications"
        p.xaxis.axis_label = "percentage of  " +ty+" publications in " + wiki
        p.yaxis[0].formatter = NumeralTickFormatter(format="0.00%")
        p.xaxis[0].formatter = NumeralTickFormatter(format="0.00%")
        p.grid.grid_line_color = "gray"
        source = ColumnDataSource(selected)
        p.circle("percentage_of_publications", "average", size=10,color='melting_colors', source=source, line_color="#555555", line_width=1,line_alpha=0.7, fill_alpha=0.5)
        labels = LabelSet(x="percentage_of_publications", y="average", text="topic",y_offset=8,
                      text_font_size="7pt", text_color="#555555",
                      source=source, text_align='center')
        p.add_layout(labels)
        plots.append(p)
    # Lay out however many figures were built. Indexing exactly two of them
    # failed with a single type and silently dropped any beyond the second.
    pr = row(*plots)
    output_notebook()
    show(pr)




Read data and load colors

In [492]:
# DataFrame.from_csv is deprecated and absent from current pandas;
# read_csv with the same separator and no index column is the replacement.
df = pd.read_csv(inputfile, sep='\t', index_col=None)

topics = list(np.sort(list(set(df['topic'].tolist()))))
# Rows holding the cross-wiki averages are marked with the pseudo-wiki '_AVERAGE'.
averages=df.loc[df['wiki'] == '_AVERAGE'].reset_index()
# Nested lookup: dicaverage[topic][type] -> average percentage_of_publications.
dicaverage={}
# Topic and type are read from the same '_AVERAGE' row as the value.
# Reading them from ``df`` by position was only correct when the '_AVERAGE'
# rows happened to be the first rows of the file.
for topic,ty,value in zip(averages['topic'],averages['type'],averages['percentage_of_publications']):
    # setdefault replaces the bare ``except:`` that created the inner dict
    # and would also have hidden unrelated errors.
    dicaverage.setdefault(topic,{})[ty]=value
palette=get_spaced_colors(len(types))

[3395667, 11165988]


Then generate the graphs (one cell per wiki)

In [493]:
# Plot the rows of df whose 'wiki' is 'arwiki' against the averages in dicaverage.
generate_scatter_plot(df,palette,'arwiki',dicaverage)


In [494]:
# Plot the rows of df whose 'wiki' is 'enwiki' against the averages in dicaverage.
generate_scatter_plot(df,palette,'enwiki',dicaverage)


In [495]:
# Plot the rows of df whose 'wiki' is 'itwiki' against the averages in dicaverage.
generate_scatter_plot(df,palette,'itwiki',dicaverage)


In [497]:
# Plot the rows of df whose 'wiki' is 'rmwiki' against the averages in dicaverage.
generate_scatter_plot(df,palette,'rmwiki',dicaverage)


In [491]:
# Plot the rows of df whose 'wiki' is 'fawiki' against the averages in dicaverage.
generate_scatter_plot(df,palette,'fawiki',dicaverage)
