In [3]:
import pandas as pd
import json
import random
%matplotlib inline

In [7]:
# Location of the JSON dump of declarations, relative to the working directory.
dump_fpath = 'data/declarations.json'

# Decode explicitly as UTF-8: the dump holds Cyrillic text, and open() would
# otherwise fall back to the platform's default encoding, which is not
# guaranteed to be UTF-8 (e.g. on Windows).
with open(dump_fpath, encoding='utf-8') as f:
    # Parse straight from the file object; `declarations` is the decoded
    # top-level JSON value (iterated as a list of dicts further down).
    declarations = json.load(f)

In [8]:
# Peek at one randomly picked declaration to inspect the record structure.
# No seed is set in the visible cells, so the sample differs between runs.
random.choice(declarations)

{'bonds': [],
 'incomes': [{'comment': '', 'relative': None, 'size': 1322690.38}],
 'main': {'document_type': {'id': 1, 'name': 'Антикоррупционная декларация'},
  'office': {'id': 873,
   'name': 'ГУФСИН Красноярский край',
   'post': 'заместитель начальника КП-13 ФКУ ОИУ-1 ОУХД',
   'region': {'id': 28, 'name': 'Красноярский край'},
   'type': {'id': 7, 'name': 'Федеральный, без региональной структуры'},
   'url': 'http://www.24.fsin.su/'},
  'party': None,
  'person': {'family_name': 'Слипченко',
   'given_name': 'Т.',
   'id': 32105,
   'name': 'Слипченко Т. А.',
   'patronymic_name': 'А.'},
  'year': 2014},
 'real_estates': [{'comment': '',
   'country': 'Россия',
   'name': '',
   'own_type': {'id': 21, 'name': 'Долевая собственность'},
   'region': None,
   'relative': None,
   'share': 0.5,
   'square': 48.7,
   'type': {'id': 4, 'name': 'Квартира'}},
  {'comment': '',
   'country': 'Россия',
   'name': '',
   'own_type': {'id': 9, 'name': 'В пользовании'},
   'region': None,
  

In [9]:
# Column order must match the positional layout of each row built below.
columns = [
    'person_id', 'person_name', 'year',
    'office_id', 'office_name',
    'income', 'savings',
    'real_estate_amount', 'real_estate_squares',
]

# Flatten every nested declaration into one flat row of scalars.
rows = []
for declaration in declarations:
    main = declaration['main']
    person, office = main['person'], main['office']
    identity = [
        person['id'],
        person['name'],
        main['year'],
        office['id'],
        office['name'],
    ]

    # Total of all income entries of the declaration.
    total_income = sum(item['size'] for item in declaration['incomes'])

    # Savings entries are strings: keep the text before 'руб.', turn the
    # decimal comma into a dot and drop spaces before converting to float.
    total_savings = sum(
        float(entry.split('руб.')[0].replace(',', '.').replace(' ', ''))
        for entry in declaration['savings']
    )

    estates = declaration['real_estates']
    # A missing (falsy) square contributes 0 to the total area.
    total_square = sum((item['square'] or 0) for item in estates)

    rows.append(identity + [total_income, total_savings, len(estates), total_square])

In [10]:
# Assemble the flat rows into a frame and order it by person, then by year,
# in a single chained expression.
df = (
    pd.DataFrame(rows, columns=columns)
    .sort_values(by=['person_id', 'year'])
)
df.head()


Unnamed: 0,person_id,person_name,year,office_id,office_name,income,savings,real_estate_amount,real_estate_squares
30357,8,Зюганов Геннадий Андреевич,1998,14,Государственная Дума,124154.0,0.0,1,150.0
31969,8,Зюганов Геннадий Андреевич,1999,449,Президент Российской Федерации,506247.0,0.0,3,252.7
127,8,Зюганов Геннадий Андреевич,2006,14,Государственная Дума,1257784.0,141740.91,1,167.4
1109,8,Зюганов Геннадий Андреевич,2006,449,Президент Российской Федерации,3590668.48,400490.32,2,281.3
5520,8,Зюганов Геннадий Андреевич,2009,14,Государственная Дума,2177821.52,0.0,3,448.7


In [11]:
# Sanity check: (number of rows, number of columns) of the assembled frame.
df.shape

(200545, 9)

In [13]:
# Persist the flattened declarations table; index=False leaves the row index
# out of the file so only the data columns are written.
df.to_csv('data/declarations.csv', index=False)

In [57]:
# Collapse to one row per (person_id, person_name, year); GroupBy.first()
# keeps the first non-null value of every remaining column within each group.
# NOTE(review): a person can have several rows for the same year (e.g. two
# different offices), so the office/income values kept here come from whichever
# row is first in the group — confirm that this is the intended behaviour.
grouped_view = df.groupby(['person_id', 'person_name', 'year']).first()

In [86]:
# For every (year, office_id) pair, gather the person_id values of all rows
# that fall into that pair into a single Python list.
office_members = df.groupby(['year', 'office_id'])['person_id']
common_offices = office_members.apply(list).reset_index()
common_offices.head()

Unnamed: 0,year,office_id,person_id
0,1998,14,"[8, 68, 89, 132, 140, 144, 147, 150, 152, 177,..."
1,1999,449,"[8, 89, 177, 580, 582, 960, 9214, 19333, 19337..."
2,2006,14,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,..."
3,2006,449,"[8, 89, 542, 584]"
4,2008,12,[1383]


In [54]:
# Persist the per-(year, office) membership table without the row index.
# NOTE(review): the person_id column holds Python lists, which presumably end
# up in the CSV as their string form (e.g. "[8, 89, 542, 584]") — readers of
# this file would need to parse that back into a list; confirm.
common_offices.to_csv('data/common_offices.csv', index=False)

