# The topics and accessibility of citations in Wikipedia

In [74]:
'''
import useful libraries
'''
import pandas as pd
import numpy as np
from bokeh.models import ColumnDataSource, LabelSet, HoverTool,Range1d, NumeralTickFormatter
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
import math
import operator
import sys
import warnings
# Silence every warning, unless warning options were passed explicitly
# to the interpreter (sys.warnoptions is then non-empty).
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")

# Path of the tab-separated input data: use the file provided,
# or substitute with your own path.
inputfile = 'data/distribution_sum_access_publications'


### We define two functions to explore the relations between publication topic, language and accessibility
* Breakdown of total number of publications and percentage of open publications by language, for a specific topic or for all topics
* Breakdown of total number of publications and percentage of open publications by topic, for a specific language or for all languages

In [101]:
def generate_open_language_plot(dataframe, topic, log_threshold=500):
    '''
    Draw one circle per language edition for a given topic: the x axis is the
    percentage of open access publications, the y axis is the total number of
    scholarly publications. The figure is shown inline in the notebook;
    nothing is returned.

    takes as input:
    dataframe - the pandas data frame containing the data; the columns read here are
                'topic', 'wiki', 'total citations' and 'percentage open citations'
    topic - a string corresponding to one of the topics, or 'all' if you want to have 
            a complete overview across topics. Choose between:
            'Africa', 'Americas', 'Article improvement and grading', 'Arts', 'Biology', 'Bodies of water', 
            'Broadcasting', 'Business and economics', 'Chemistry', 'Cities', 'Contents systems','Countries', 
            'Crafts and hobbies', 'Economics', 'Education','Entertainment', 'Europe', 'Files', 
            'Food and drink', 'Geosciences','History and society', 'Information science', 'Internet culture',
            'Landforms', 'Language and literature', 'Maintenance', 'Maps', 'Mathematics', 'Media', 'Medicine', 
            'Meteorology', 'Military and warfare', 'Music', 'Performing arts','Philosophy and religion', 
            'Physics', 'Plastic arts','Politics and government', 'Science', 'Space', 'Sports','Technology', 
            'Time', 'Transportation', 'all'
    log_threshold - the y axis switches to a log scale as soon as one language has more
                    than this many publications (default 500)

    raises:
    ValueError - if the data frame holds no row for the requested topic
    '''
    if topic == 'all':
        # the aggregate over all topics is stored under its own topic label
        topic = 'all_topics'
        TITLE = "Percentage of open publications for all topics"
    else:
        TITLE = "Percentage of open publications for topic "+topic

    # select the rows of the requested topic once and fail early with a clear message:
    # an empty selection would otherwise surface later as an obscure error
    topic_rows = dataframe.loc[dataframe['topic'] == topic]
    if topic_rows.empty:
        raise ValueError("no data available for topic '%s'" % topic)
    source = ColumnDataSource(topic_rows)

    #prepare interaction tools
    tools = ["pan", "wheel_zoom", "box_zoom", "reset", "save"]
    hover = HoverTool(tooltips=[
        ("language", "@wiki"),
        ("total scholarly publications:", "@{total citations}{0}"),
        ("% open access publications", "@{percentage open citations}{0.00%}")])
    tools.append(hover)

    #prepare the plot figure, depending on the quantity of data, go for log scale or linear scale
    figure_options = dict(tools=tools, toolbar_location="above", logo="grey",
                          plot_width=800, plot_height=600, title=TITLE)
    if (topic_rows['total citations'] > log_threshold).any():
        figure_options['y_axis_type'] = "log"
    p = figure(**figure_options)

    #prepare plot background, axes labels and line colors
    p.background_fill_color = "#ffffff" #change if you don't want white background
    p.xaxis.axis_label = "percentage of open access publications"
    p.yaxis.axis_label = "number of scholarly publications"
    p.grid.grid_line_color = "gray"

    #choose format for axes
    p.yaxis[0].formatter = NumeralTickFormatter(format="0")
    p.xaxis[0].formatter = NumeralTickFormatter(format="0.00%")

    #draw the circles; change colors here
    p.circle("percentage open citations", "total citations", size=10,
             source=source, line_color="#005693", line_width=1,
             line_alpha=0.7, fill_alpha=0.5, fill_color="#23a3ff")
    # label each circle with its language code, slightly above the marker
    labels = LabelSet(x="percentage open citations", y="total citations", text="wiki", y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)

    #draws the plot
    output_notebook()
    show(p)


In [116]:
def _summarize_topics(df, topics, lan):
    '''Build a frame with one row per topic: name, summed citations, mean open percentage.'''
    # the language filter does not depend on the topic, so apply it once
    rows = df if lan == 'all' else df.loc[df['wiki'] == lan]
    perc_by_topic = {}
    count_by_topic = {}
    for topic in topics:
        if topic == 'all_topics':
            # the pre-aggregated pseudo topic is not a topic of its own
            continue
        topic_rows = rows.loc[rows['topic'] == topic]
        perc_by_topic[topic] = np.mean(topic_rows['percentage open citations'])
        count_by_topic[topic] = np.sum(topic_rows['total citations'])
    return pd.DataFrame(data={'topics': list(perc_by_topic.keys()),
                              'counts': list(count_by_topic.values()),
                              'perc': list(perc_by_topic.values())})


def generate_open_topic_plot(df, topics, lan, log_threshold=200):
    '''
    Draw one circle per topic: the x axis is the percentage of open access
    publications, the y axis is the number of scholarly publications, either for
    one language or aggregated over all languages (mean of the percentages, sum of
    the counts). The figure is shown inline in the notebook; nothing is returned.

    takes as input:
    df - the pandas data frame containing the data; the columns read here are
         'topic', 'wiki', 'total citations' and 'percentage open citations'
    topics - an iterable of topic names to plot; the entry 'all_topics' is skipped
    lan - a string corresponding to one of the languages for which we have data, or 'all' if you want to have 
            a complete overview across all languages. Choose between:
            'ace', 'af', 'ak', 'als', 'am', 'an', 'ang', 'ar', 'arz', 'as', 'ast', 'av', 'az', 'azb', 'ba', 
            'bar', 'bat_smg', 'bcl', 'be', 'be_x_old', 'bg', 'bh', 'bjn', 'bn', 'bo', 'bpy', 'br', 'bs', 'bxr',
            'cbk_zam', 'cdo', 'ce', 'ceb', 'chr', 'ckb', 'co', 'cs', 'csb', 'cu', 'cv', 'cy', 'da', 'de', 'diq',
            'dsb', 'dty', 'dv', 'ee', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'fi', 'fo', 'fr',
            'frr', 'fy', 'ga', 'gag', 'gan', 'gd', 'gl', 'gn', 'gom', 'gu', 'gv', 'ha', 'hak', 'hi', 'hif', 'hr',
            'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ie', 'ig', 'ilo', 'io', 'is', 'it', 'ja', 'jam', 'jbo', 'jv',
            'ka', 'kab', 'kk', 'km', 'kn', 'ko', 'koi', 'krc', 'ku', 'kv', 'ky', 'la', 'lad', 'lb', 'lez', 'lg', 
            'li', 'lij', 'lmo', 'ln', 'lo', 'lrc', 'lt', 'lv', 'mai', 'map_bms', 'mdf', 'mg', 'mhr', 'min', 'mk',
            'ml', 'mn', 'mr', 'mrj', 'ms', 'mt', 'mwl', 'my', 'myv', 'mzn', 'na', 'nah', 'nds', 'nds_nl', 'ne', 
            'new', 'nl', 'nn', 'no', 'nov', 'nso', 'nv', 'ny', 'oc', 'olo', 'om', 'or', 'os', 'pa', 'pam', 'pap', 
            'pcd', 'pfl', 'pi', 'pih', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'rm', 'ro', 'roa_tara', 'ru', 'rue',
            'rw', 'sa', 'sah', 'sc', 'scn', 'sco', 'sd', 'se', 'sh', 'si', 'simple', 'sk', 'sl', 'sm', 'sn', 'so', 
            'sq', 'sr', 'ss', 'st', 'stq', 'su', 'sv', 'sw', 'szl', 'ta', 'tcy', 'te', 'test', 'test2', 'tet', 'tg',
            'th', 'ti', 'tl', 'tn', 'tr', 'ts', 'tt', 'ug', 'uk', 'ur', 'uz', 'vec', 'vep', 'vi', 'vls', 'vo', 'war',
            'wo', 'wuu', 'xh', 'xmf', 'yi', 'yo', 'za', 'zh', 'zh_classical', 'zh_min_nan', 'zh_yue','all'
    log_threshold - the y axis switches to a log scale as soon as one topic has more
                    than this many publications (default 200)

    raises:
    ValueError - if topics holds no topic to plot (empty, or only 'all_topics')
    '''
    #the title only depends on the language, so it is set before (not inside) the topic loop
    if lan == 'all':
        TITLE = "Percentage of open publications by topic for all languages"
    else:
        TITLE = "Percentage of open publications for "+lan+".Wikipedia"

    #gather for each topic either the values for one language,
    #or the average/sum of values across languages
    summary = _summarize_topics(df, topics, lan)
    if summary.empty:
        # fail early: without any topic there is nothing to draw
        raise ValueError("no topic to plot: 'topics' is empty or only contains 'all_topics'")

    #prepare interaction tools
    tools = ["pan", "wheel_zoom", "box_zoom", "reset", "save"]
    hover = HoverTool(tooltips=[
        ("topic", "@topics"),
        ("total scholarly publications:", "@{counts}{0}"),
        ("% open access publications", "@{perc}{0.00%}")])
    tools.append(hover)

    #prepare the plot figure, depending on the quantity of data, go for log scale or linear scale
    figure_options = dict(tools=tools, toolbar_location="above", logo="grey",
                          plot_width=800, plot_height=600, title=TITLE)
    if (summary['counts'] > log_threshold).any():
        figure_options['y_axis_type'] = "log"
    p = figure(**figure_options)
    p.background_fill_color = "#ffffff"

    #change axes labels according to whether we analyze one language or all languages, prepare axes
    if lan == 'all':
        p.xaxis.axis_label = "average percentage of open access publications across languages"
        p.yaxis.axis_label = "sum of all scholarly publications across languages"
    else:
        p.xaxis.axis_label = "percentage of open access publications"
        p.yaxis.axis_label = "number of scholarly publications"
        # fixed x range, applied to the single-language view only
        p.x_range = Range1d(0, 1.1)
    p.yaxis[0].formatter = NumeralTickFormatter(format="0")
    p.xaxis[0].formatter = NumeralTickFormatter(format="0.00%")
    p.grid.grid_line_color = "gray"

    #draw the circles; change colors here
    plot_source = ColumnDataSource(summary)
    p.circle("perc", "counts", size=10, source=plot_source, line_color="#8B0A50", line_width=1,
             line_alpha=0.7, fill_alpha=0.5, fill_color="#cd1076")
    # label each circle with its topic name, slightly above the marker
    labels = LabelSet(x="perc", y="counts", text="topics", y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=plot_source, text_align='center')
    p.add_layout(labels)

    #draw plot
    output_notebook()
    show(p)



We now read the input data and store the sets of available languages and topics (for later use)

In [117]:
# DataFrame.from_csv was deprecated and later removed from pandas;
# read_csv with the same separator and no index column reads the file the same way.
df = pd.read_csv(inputfile, sep='\t', index_col=None)
# np.unique returns the distinct values already sorted:
# wikis is kept as a list, topics as a numpy array, as before.
wikis = list(np.unique(df['wiki'].tolist()))
topics = np.unique(df['topic'].tolist())

We now generate the distribution of languages over the accessibility of their publications, for all topics

In [118]:
# Language overview over the aggregate of all topics.
generate_open_language_plot(dataframe=df, topic='all')

We now generate the distribution of topics over the accessibility of their publications, for all languages

In [119]:
 generate_open_topic_plot(df,topics,'all')

We see from the plot above that "Biology" is the most open topic, while, for example, "Chemistry" has fewer open publications; let's see their breakdown by language:

In [120]:
# One language breakdown per topic: the most open topic, then a less open one.
for topic_name in ('Biology', 'Chemistry'):
    generate_open_language_plot(df, topic_name)

We see from the language distribution plot that, among the languages with the highest number of publications, the "Ukrainian" Wikipedia is very open; by contrast, the "Vietnamese" Wikipedia is less open; let's see their breakdown by topic:

In [125]:
# One topic breakdown per language: Ukrainian ('uk'), then Vietnamese ('vi').
for wiki_code in ('uk', 'vi'):
    generate_open_topic_plot(df, topics, wiki_code)