# Introducing the gosduma7 Dataset

In [21]:
# Some initialization magic
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import collections

# Let's load up the data
import json

# results.json holds one JSON document (one scraped table) per line.
# Decode it explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and skip blank lines (e.g. a trailing newline) instead of
# letting json.loads() fail on an empty string.
with open("scrapyproject/results.json", encoding="utf-8") as fin:
    tables = [json.loads(line) for line in fin if line.strip()]

In [22]:
# Here's what it looks like.
# Each line corresponds to a table scraped from the Central Electoral Committee website.
# Tables are dictionaries.
tables[0].keys()

dict_keys(['area_ik_long', 'timestamp', 'column_headers', 'url', 'region', 'md5', 'data_type', 'area_ik', 'data', 'row_headers'])

In [3]:
# region contains the name of the region.
# The first federal table serves as the running example in the cells below.
a_table = [
    candidate
    for candidate in tables
    if candidate["data_type"] == "federal"
][0]
a_table["region"]

'Республика Калмыкия'

In [4]:
# area_ik contains the number of the regional election committee (окружная избирательная комиссия).
a_table["area_ik"]

'ОИК №15'

In [5]:
# area_ik_long is the full name of the election committee;
# for turnout data, this is the same as the committee number.
# The key is "area_ik_long" (see the output of tables[0].keys()); the tables have
# no "area_ik_name" key, so looking that up raises KeyError on a fresh run.
a_table["area_ik_long"]

'Республика Калмыкия - Калмыцкий'

In [6]:
# url is the URL from which the data was scraped
a_table["url"]

'http://www.vybory.izbirkom.ru/region/region/izbirkom?action=show&root=1000023&tvd=100100067795877&vrn=100100067795849&region=0&global=true&sub_region=0&prver=0&pronetvd=0&vibid=100100067795877&type=233'

In [7]:
# the timestamp shows the date and time that the data was scraped,
# stored as an ISO 8601 string with a UTC offset
a_table["timestamp"]

'2016-09-29T06:36:28.944704+00:00'

In [8]:
# md5 corresponds to the MD5 hash of the HTML that was fetched from the URL at the time of scraping.
# This cell re-downloads the page, so it needs network access; if the page has
# changed since the scrape, the assertion below fails.
import requests
import hashlib

# Without a timeout, requests waits indefinitely on an unresponsive server.
r = requests.get(a_table["url"], timeout=30)
# Fail loudly on HTTP errors so that an error page is never hashed and
# misreported as an MD5 mismatch.
r.raise_for_status()
assert hashlib.md5(r.content).hexdigest() == a_table["md5"], "MD5 mismatch, perhaps data is out of date?"

In [9]:
# data_type refers to the type of scraped data. There are five types:
#  * federal - data for the federal election
#  * single - data for the single-mandate elections
#  * turnout - turnout data at different points in time
#  * federal_uik - data for the federal election, with individual polling station data
#  * turnout_uik - turnout data at different points in time, with individual polling station data
{entry["data_type"] for entry in tables}

{'federal', 'federal_uik', 'single', 'turnout', 'turnout_uik'}

In [10]:
# Whatever its data_type, every table holds its data in rows and columns.
# "row_headers" and "column_headers" carry the row and column labels, respectively.
# These are the labels of a turnout table:
turnout_table = [
    candidate
    for candidate in tables
    if candidate["data_type"] == "turnout"
][0]
print(
    turnout_table["row_headers"],
    turnout_table["column_headers"],
    sep="\n",
)

['ВСЕГО, в том числе', 'Городовиковская', 'Ики-Бурульская', 'Лаганская', 'Кетченеровская', 'Малодербетовская', 'Октябрьская', 'Приютненская', 'Сарпинская', 'Целинная', 'Черноземельская', 'Элистинская городская', 'Юстинская', 'Яшалтинская', 'Яшкульская']
['10:00', '12:00', '15:00', '18:00']


In [11]:
# The row headers in this case are the names of the local electoral committees (участковая избирательная комиссия).
# The column headers are the times at which turnout was reported.

In [12]:
# These are the labels of a federal table (first five of each, for brevity):
federal_table = [
    candidate
    for candidate in tables
    if candidate["data_type"] == "federal"
][0]
print(
    federal_table["row_headers"][:5],
    federal_table["column_headers"][:5],
    sep="\n",
)

['Число избирателей, внесенных в список избирателей на момент окончания голосования', 'Число избирательных бюллетеней, полученных участковой избирательной комиссией', 'Число избирательных бюллетеней, выданных избирателям, проголосовавшим досрочно', 'Число избирательных бюллетеней, выданных в помещении для голосования в день голосования', 'Число избирательных бюллетеней, выданных вне помещения для голосования в день голосования']
['Сумма', 'Городовиковская', 'Ики-Бурульская', 'Лаганская', 'Кетченеровская']


In [13]:
# Rows correspond to a certain measurement, the columns correspond to the local electoral committees.
# Federal tables record votes for each of the 14 registered parties.
# The list of parties is the same across the entire country.
# Rows 18-23 of this table are the first six parties.
print(federal_table["row_headers"][18:24])  # snipped for brevity

['1. ВСЕРОССИЙСКАЯ ПОЛИТИЧЕСКАЯ ПАРТИЯ "РОДИНА"', '2. Политическая партия КОММУНИСТИЧЕСКАЯ ПАРТИЯ КОММУНИСТЫ РОССИИ', '3. Политическая партия "Российская партия пенсионеров за справедливость"', '4. Всероссийская политическая партия "ЕДИНАЯ РОССИЯ"', '5. Политическая партия "Российская экологическая партия "Зеленые"', '6. Политическая партия "Гражданская Платформа"']


In [14]:
# In single tables the row headers include the candidates' names.
# Candidates differ from one regional electoral committee to the next.
single_table = [
    candidate
    for candidate in tables
    if candidate["data_type"] == "single"
][0]
single_table["row_headers"][18:24]  # a short sample, for brevity

['Атеев Семен Николаевич',
 'Балаклеец Людмила Ивановна',
 'Бессарабов Андрей Анатольевич',
 'Болдырев Игорь Владимирович',
 'Габунщин Сергей Валериевич',
 'Захарченко Анатолий Александрович']

In [15]:
#
# The *_uik tables contain the same row headers as their regular counterparts.
# They just show finer-grained information.
#
# Take the first federal_uik table and confirm that its row headers match
# those of the regular federal table; only the columns differ.
federal_uik_table = [
    candidate
    for candidate in tables
    if candidate["data_type"] == "federal_uik"
][0]
assert (
    federal_uik_table["row_headers"] == federal_table["row_headers"]
)
federal_uik_table["column_headers"][:5]  # first five only, for brevity

['Сумма', 'УИК №137', 'УИК №138', 'УИК №139', 'УИК №140']

In [16]:
# Take the first turnout_uik table and confirm that its column headers match
# those of the regular turnout table; only the rows differ.
turnout_uik_table = [
    candidate
    for candidate in tables
    if candidate["data_type"] == "turnout_uik"
][0]
assert (
    turnout_uik_table["column_headers"] == turnout_table["column_headers"]
)
turnout_uik_table["row_headers"][:5]  # first five only, for brevity

['ВСЕГО, в том числе', 'УИК №820', 'УИК №821', 'УИК №822', 'УИК №823']

In [17]:
# The data itself can be obtained by indexing into the data matrix: data[row][column].
# Here: the value for the first row header in the first column ('Сумма').
federal_table["data"][0][0]

211637.0

# Checking the Dataset

In [18]:
# There are 225 electorates. Tally the tables per data_type so that we can
# verify the data is complete for each of them.
counter = collections.Counter([entry["data_type"] for entry in tables])
counter

Counter({'federal': 225,
         'federal_uik': 225,
         'single': 225,
         'turnout': 225,
         'turnout_uik': 225})

In [19]:
# Every data type must have exactly one table per electorate (225 in total).
for data_type in ("federal", "single", "turnout", "federal_uik", "turnout_uik"):
    assert counter[data_type] == 225, (
        "incorrect count for {!r}: expected 225, actual: {:d}"
        .format(data_type, counter[data_type])
    )

In [20]:
# Check that the zeroth column contains the row total for federal and single tables.
# A failing assertion names the table (url, area_ik) and the row, so the offending
# figure can be looked up at the source. The loop does not bind `_`, which IPython
# uses for the last cell output.
for table in [t for t in tables if t["data_type"] in ("federal", "single")]:
    for i, row_header in enumerate(table["row_headers"]):
        row = table["data"][i]
        row_total, column_sum = row[0], sum(row[1:])
        assert row_total == column_sum, (
            "row total mismatch in {} ({}), row {!r}: total column is {!r}, columns sum to {!r}"
            .format(table["url"], table["area_ik"], row_header, row_total, column_sum)
        )