# Introducing the gosduma7 Dataset

In [1]:
# Some initialization magic
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import collections

# Let's load up the data
import json

# The results file is in JSON Lines format: one JSON document (one table) per line.
# Decode it explicitly as UTF-8 so that any non-ASCII text loads the same way on
# every platform instead of depending on the default locale encoding, and skip
# blank lines (e.g. a trailing newline), which are not valid JSON documents.
with open("scrapyproject/results.json", encoding="utf-8") as fin:
    tables = [json.loads(line) for line in fin if line.strip()]

In [2]:
# Here's what it looks like.
# Each line of the results file corresponds to a table scraped from the
# Central Electoral Committee website.
# Tables are dictionaries; these are the keys of the first one.
tables[0].keys()

dict_keys(['area_ik_long', 'url', 'column_headers', 'area_ik', 'data_type', 'data', 'region', 'timestamp', 'row_headers', 'md5'])

In [3]:
# Pick the first table of type "federal" as a running example for the cells below.
# region contains the name of the region.
a_table = [t for t in tables if t["data_type"] == "federal"][0]
a_table["region"]

'Кабардино-Балкарская Республика'

In [4]:
# area_ik contains the number of the regional election committee
# (OIK, окружная избирательная комиссия).
a_table["area_ik"]

'ОИК №14'

In [5]:
# area_ik_long is the full name of the local election committee.
# For turnout data, this is the same as the committee number.
a_table["area_ik_long"]

'Кабардино-Балкарская Республика - Кабардино-Балкарский'

In [6]:
# url is the URL from which the data was scraped.
a_table["url"]

'http://www.vybory.izbirkom.ru/region/region/izbirkom?action=show&root=1000021&tvd=100100067795875&vrn=100100067795849&region=0&global=true&sub_region=0&prver=0&pronetvd=0&vibid=100100067795875&type=233'

In [7]:
# timestamp shows the date and time at which the data was scraped.
a_table["timestamp"]

'2016-09-30T05:20:26.933261+00:00'

In [8]:
# md5 corresponds to the MD5 hash of the HTML that was fetched from the URL at the time of scraping.
# Re-fetch the page and compare digests to check whether the published data has changed since then.
import requests
import hashlib
# Bound the request so an unresponsive server cannot hang the notebook, and fail
# loudly on HTTP errors so that an error page is not misreported as an MD5 mismatch.
r = requests.get(a_table["url"], timeout=30)
r.raise_for_status()
assert hashlib.md5(r.content).hexdigest() == a_table["md5"], "MD5 mismatch, perhaps data is out of date?"

In [9]:
# data_type refers to the type of scraped data. There are five types:
#  * federal - data for the federal election
#  * single - data for the single-mandate elections
#  * turnout - turnout data at different points in time
#  * federal_uik - data for the federal election, with individual polling station data
#  * turnout_uik - turnout data at different points in time, with individual polling station data
# In case you're wondering, UIK refers to участковая избирательная комиссия, or polling station.
set(table["data_type"] for table in tables)

{'federal', 'federal_uik', 'single', 'turnout', 'turnout_uik'}

In [10]:
# Each table, regardless of data_type, contains data in rows and columns.
# The labels for the rows and columns are contained in "row_headers" and
# "column_headers", respectively.
# For turnout tables (the first one is used as the example), the headers look like this:
turnout_table = [t for t in tables if t["data_type"] == "turnout"][0]
print(turnout_table["row_headers"])
print(turnout_table["column_headers"])

['ВСЕГО, в том числе', 'Баксанская', 'Баксанская городская', 'Зольская', 'Лескенская', 'Майская', 'Нальчикская городская', 'Прохладненская', 'Прохладненская городская', 'Терская', 'Урванская', 'Чегемская', 'Черекская', 'Эльбрусская']
['10:00', '12:00', '15:00', '18:00']


In [11]:
# The row headers in this case are the names of the territory electoral committees (TIK, территориальная избирательная комиссия).
# The column headers are the times at which turnout was reported.

In [12]:
# For federal tables (the first one is used as the example), the headers look like this:
federal_table = [t for t in tables if t["data_type"] == "federal"][0]
print(federal_table["row_headers"][:5])  # first five only, snipped for brevity
print(federal_table["column_headers"][:5]) # first five only, snipped for brevity

['Число избирателей, внесенных в список избирателей на момент окончания голосования', 'Число избирательных бюллетеней, полученных участковой избирательной комиссией', 'Число избирательных бюллетеней, выданных избирателям, проголосовавшим досрочно', 'Число избирательных бюллетеней, выданных в помещении для голосования в день голосования', 'Число избирательных бюллетеней, выданных вне помещения для голосования в день голосования']
['Сумма', 'Баксанская', 'Баксанская городская', 'Зольская', 'Лескенская']


In [13]:
# Each row corresponds to a certain measurement; each column corresponds to a local electoral committee.
# Federal tables record votes for each of the 14 registered parties.
# The list of parties is the same across the entire country.
print(federal_table["row_headers"][18:24])  # six of the party rows, snipped for brevity

['1. ВСЕРОССИЙСКАЯ ПОЛИТИЧЕСКАЯ ПАРТИЯ "РОДИНА"', '2. Политическая партия КОММУНИСТИЧЕСКАЯ ПАРТИЯ КОММУНИСТЫ РОССИИ', '3. Политическая партия "Российская партия пенсионеров за справедливость"', '4. Всероссийская политическая партия "ЕДИНАЯ РОССИЯ"', '5. Политическая партия "Российская экологическая партия "Зеленые"', '6. Политическая партия "Гражданская Платформа"']


In [14]:
# For single tables, the row headers include the names of the candidates.
# The names of the candidates differ between regional electoral committees.
single_table = [t for t in tables if t["data_type"] == "single"][0]
single_table["row_headers"][18:24]  # six of the candidate rows, snipped for brevity

['Жилов Хасан Русланович',
 'Паштов Борис Султанович',
 'Султанова Айшат Токболатовна',
 'Токов Руслан Мухарбиевич',
 'Цумаев Муса Мутушович',
 'Шаваев Камал Хасанович']

In [15]:
#
# The *_uik tables contain the same row headers as their regular counterparts.
# They just show finer-grained information: one column per polling station (UIK).
#
federal_uik_table = [t for t in tables if t["data_type"] == "federal_uik"][0]
# The measurements (rows) must match those of the regular federal table exactly.
assert federal_uik_table["row_headers"] == federal_table["row_headers"]
federal_uik_table["column_headers"][:5]  # snipped for brevity

['Сумма', 'УИК №226', 'УИК №227', 'УИК №228', 'УИК №229']

In [16]:
# The *_uik tables also include the territory electoral committee
# (TIK, территориальная избирательная комиссия) in the "territory_ik" key.
federal_uik_table["territory_ik"]

'Теучежская'

In [17]:
# Likewise, turnout_uik tables share their column headers (the reporting times)
# with the regular turnout tables; their rows are individual polling stations (UIK).
turnout_uik_table = [t for t in tables if t["data_type"] == "turnout_uik"][0]
assert turnout_uik_table["column_headers"] == turnout_table["column_headers"]
turnout_uik_table["row_headers"][:5]  # snipped for brevity

['ВСЕГО, в том числе', 'УИК №5501', 'УИК №5502', 'УИК №5503', 'УИК №5504']

In [18]:
# The data itself can be obtained by indexing into the data matrix
# as data[row][column], in the same order as row_headers and column_headers.
federal_table["data"][0][0]

536867.0

# Checking the Dataset

In [19]:
# There are 225 electorates; make sure we have complete data for each.
# Start by counting how many tables of each data_type were scraped.
counter = collections.Counter(table["data_type"] for table in tables)
counter

Counter({'federal': 225,
         'federal_uik': 2820,
         'single': 225,
         'turnout': 225,
         'turnout_uik': 2820})

In [20]:
# Each per-electorate table type must be present exactly once per electorate (225 in total).
for data_type in ("federal", "single", "turnout"):
    assert counter[data_type] == 225, (
        "incorrect count for {!r}: expected 225, actual: {:d}"
        .format(data_type, counter[data_type])
    )

In [21]:
# I was able to fetch data from 2820 TIK (territory electoral committees).
# Both per-polling-station table types must cover the same number of them.
assert counter["federal_uik"] == counter["turnout_uik"] == 2820

In [22]:
# Check that the zeroth column contains the row total for federal and single tables.
# The failure message identifies the offending table and row, because a bare assert
# would give no clue as to which table is inconsistent.
# The header is bound to a real name rather than `_`, which at the top level of a
# cell would shadow IPython's "last output" variable.
for table in [t for t in tables if t["data_type"] in ("federal", "single")]:
    for i, row_header in enumerate(table["row_headers"]):
        row = table["data"][i]
        assert row[0] == sum(row[1:]), (
            "row total mismatch in {} (row {:d}, {!r}): total {!r} != sum of columns {!r}"
            .format(table["url"], i, row_header, row[0], sum(row[1:]))
        )

# Summary

In [23]:
# The number of OIK (area electoral committees), TIK (territory electoral committees),
# and UIK (polling stations) per region, written as a pipe-delimited table.
import csv
import sys

writer = csv.writer(sys.stdout, delimiter="|")
writer.writerow(["region", "oik", "tik", "uik"])

federal = [t for t in tables if t["data_type"] == "federal"]
federal_uik = [t for t in tables if t["data_type"] == "federal_uik"]

# One federal table per OIK, so counting federal tables per region counts the OIKs.
oik_counter = collections.Counter(t["region"] for t in federal)

# Accumulate the per-region totals in a single pass over each list instead of
# re-filtering both lists for every region.
# The first column of every table is the total ("Сумма"), hence the "- 1".
tik_counter = collections.Counter()
for t in federal:
    tik_counter[t["region"]] += len(t["column_headers"]) - 1

uik_counter = collections.Counter()
for t in federal_uik:
    uik_counter[t["region"]] += len(t["column_headers"]) - 1

for region, num_oik in sorted(oik_counter.items()):
    # A Counter yields 0 for a region that has no federal_uik tables at all,
    # which matches summing over an empty selection.
    num_tik = tik_counter[region]
    num_uik = uik_counter[region]
    writer.writerow([region, num_oik, num_tik, num_uik])

region|oik|tik|uik
Алтайский край|4|74|1882
Амурская область|1|29|765
Архангельская область|2|32|942
Астраханская область|1|16|618
Белгородская область|2|22|1251
Брянская область|2|35|1125
Владимирская область|2|23|982
Волгоградская область|4|47|1542
Вологодская область|2|28|1006
Воронежская область|4|39|1719
Еврейская автономная область|1|6|169
Забайкальский край|2|39|959
Ивановская область|2|31|800
Иркутская область|4|47|1956
Кабардино-Балкарская Республика|1|13|354
Калининградская область|2|25|0
Калужская область|2|28|732
Камчатский край|1|14|301
Карачаево-Черкесская Республика|1|12|249
Кемеровская область|4|49|1761
Кировская область|2|48|1183
Костромская область|1|30|600
Краснодарский край|8|60|2750
Красноярский край|4|72|2196
Курганская область|1|27|992
Курская область|2|36|1161
Ленинградская область|3|19|976
Липецкая область|2|25|915
Магаданская область|1|10|107
Московская область|11|70|3625
Мурманская область|1|17|581
Ненецкий автономный округ|1|2|51
Нижегородская область|5|62|2