In [1]:
# look at running two sparql queries, matching using fuzzy strings,
# and graphing country of voters vs population of said country
# first up, wikibase side, voters by country

import requests, pydash, pandas

url = 'http://164.90.222.155:8989/bigdata/sparql'

query = """
SELECT DISTINCT ?voter ?voterLabel  ?votercountry ?votercountryLabel WHERE {
    ?film wdt:P1 wd:Q1 .
   ?film wdt:P3 ?voter.
  ?voter wdt:P2 ?votercountry .

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  }
  """

r = requests.get(url, params = {'format': 'json', 'query': query})
data = r.json()

data = pydash.get(data, 'results.bindings')
data = [pydash.get(x, 'votercountryLabel.value') for x in data]

index = pandas.Index(data)
thing = pandas.DataFrame(index.value_counts()).reset_index()
thing.columns = ['country', 'voters']

print(len(thing)) # okay 95 unique countries
print(sum(thing.voters))

replacer = {'UK':'United Kingdom', 'US':'United States of America', 'China':"People's Republic of China"}
thing = thing.replace({'country':replacer})
# United Kingdom


thing.head()

95
1274


Unnamed: 0,country,voters
0,United Kingdom,311
1,United States of America,221
2,Spain,65
3,Germany,46
4,France,44


In [2]:
# https://query.wikidata.org/sparql
    
    
url = 'https://query.wikidata.org/sparql'

query = """
SELECT DISTINCT ?country ?countryLabel ?population ?populationLabel  WHERE {
    ?country wdt:P31/wdt:P279* wd:Q6256 .
    ?country wdt:P1082 ?population .



  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  }
  """

r = requests.get(url, params = {'format': 'json', 'query': query})
data = r.json()

data = pydash.get(data, 'results.bindings') 
data = [(pydash.get(x, 'countryLabel.value'), pydash.get(x, 'population.value')) for x in data]


population = pandas.DataFrame(data, columns=['country', 'population'])
population = population.sort_values(by='population', ascending=False)
population = population.drop_duplicates(subset='country', keep='first')



print(len(population.country.unique()))
print(len(population))
population.head()

431
431


Unnamed: 0,country,population
194,Rashidun Caliphates,t2009096880
407,Hungary,9937628
82,Empire of Japan,97770000
23,Seychelles,95843
117,Djibouti,956985


In [7]:
import numpy

combination = pandas.merge(thing, population, on='country', how='left')
combination = combination.loc[~combination.population.isin([numpy.nan])]
# prob = combination.loc[combination.population.isin([numpy.nan])]
combination['voters'] = combination['voters'].astype('int64')
combination['population'] = combination['population'].astype('int64')
# UK, voters percentage of the population
combination['perc'] = (combination['voters']/combination['population'])*100
# searcher = population.loc[population.country.str.contains('Netherlands', na=False)]

print(len(combination))
combination.head(20)

# combination.sample(20)

80


Unnamed: 0,country,voters,population,perc
0,United Kingdom,311,66022273,0.000471
1,United States of America,221,325145963,6.8e-05
2,Spain,65,46733038,0.000139
3,Germany,46,83149300,5.5e-05
4,France,44,66628000,6.6e-05
5,Australia,36,24511800,0.000147
6,Argentina,32,44938712,7.1e-05
7,Italy,24,60317000,4e-05
8,Canada,24,37894799,6.3e-05
9,Russia,20,146804372,1.4e-05


In [14]:
import altair

line = altair.Chart(combination).mark_line(interpolate='linear').encode(
        x='country',y='perc')

display(altair.layer(line).properties(width=1200, height=300))
               