In [16]:
!pwd

/Users/nedgar/src/github.com/nedgar/parleh/notebooks


In [1]:
import json
import os
import pandas as pd
import re
import requests

In [12]:
parl_api_url = 'https://lop.parl.ca/ParlinfoWebAPI'
accept_json = {'Accept': 'application/json'}

In [4]:
def drop_time(df):
    """Truncate every text column whose name ends in 'Date' to 10 characters, in place.

    Keeping the first 10 characters reduces a timestamp string such as
    '2019-10-21T00:00:00' to its 'YYYY-MM-DD' part.

    Columns that do not hold text (e.g. already-parsed datetimes or numeric
    columns) are left untouched, because the ``.str`` accessor raises
    AttributeError on them. Non-string column labels are skipped as well.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None.
    """
    date_cols = [col for col in df.columns
                 if isinstance(col, str) and col.endswith('Date')]
    for col in date_cols:
        values = df[col]
        is_text = (pd.api.types.is_object_dtype(values)
                   or pd.api.types.is_string_dtype(values))
        if not is_text:
            continue
        df[col] = values.str[:10]

def drop_french(df):
    """Remove, in place, every column whose name ends with 'Fr'."""
    french_cols = []
    for name in df.columns:
        if name.endswith('Fr'):
            french_cols.append(name)
    df.drop(columns=french_cols, inplace=True)
    
def drop_empty_cols(df):
    """Remove, in place, every column that holds no non-missing value."""
    empty_cols = []
    for name in df.columns:
        # count() is the number of non-missing entries; zero means the column is empty
        if not df[name].count():
            empty_cols.append(name)
    df.drop(columns=empty_cols, inplace=True)

def drop_unsupported_cols(df):
    """Remove the 'Documents' and 'Senator' columns in place, if they exist."""
    unsupported = ['Documents', 'Senator']
    # errors='ignore' makes this a no-op for frames that lack these columns
    df.drop(columns=unsupported, errors='ignore', inplace=True)
    
def cleanup(df):
    """Run all in-place cleaning steps on ``df``.

    Order: trim date columns, drop French columns, drop empty columns,
    drop unsupported columns.
    """
    steps = (drop_time, drop_french, drop_empty_cols, drop_unsupported_cols)
    for step in steps:
        step(df)

In [5]:
# The timeout stops the cell hanging forever on an unresponsive server.
r = requests.get(parl_api_url + '/Person/SearchAndRefine?refiners=4-1,', headers=accept_json, timeout=60)
# Fail on an HTTP error status instead of trying to parse an error page as JSON.
r.raise_for_status()
d = r.json()
len(d)

431

In [None]:
# The timeout stops the cell hanging forever on an unresponsive server.
r = requests.get(parl_api_url + '/Person/SearchAndRefine?refiners=28-1,28-2,28-3,', headers=accept_json, timeout=60)
# Fail on an HTTP error status instead of trying to parse an error page as JSON.
r.raise_for_status()
d = r.json()
len(d)

In [5]:
len(d[0]['Roles'])

2

In [6]:
# One dict per person, restricted to the listed keys; a key the person lacks maps to None.
# NOTE(review): 'ProvincialExperienceEN' ends in upper-case 'EN' while
# 'MunicipalExperienceEn' ends in 'En' - confirm both key names against the API response.
rows = [
    {
        k: p.get(k)
        for k in ['PersonId', 'LastName', 'UsedFirstName', 'ProvincialExperienceEN', 'MunicipalExperienceEn']
    }
    for p in d
]
len(rows)

430

In [7]:
df = pd.DataFrame(rows)
df

Unnamed: 0,PersonId,LastName,UsedFirstName,ProvincialExperienceEN,MunicipalExperienceEn
0,18421,Aboultaif,Ziad,,
1,20137,Aitchison,Scott,,
2,17852,Albas,Dan,,
3,8947,Alghabra,Omar,,
4,18479,Alleslev,Leona,,
...,...,...,...,...,...
425,18279,Yurdiga,David,,
426,18535,Zahid,Salma,,
427,20099,Zann,Lenore,,
428,17951,Zimmer,Bob,,


In [8]:
df['ProvincialExperienceEN'].value_counts()

Series([], Name: ProvincialExperienceEN, dtype: int64)

In [None]:
df[df['LastName'] == 'Aglukkaq']

In [None]:
# The timeout stops the cell hanging forever on an unresponsive server.
r = requests.get(parl_api_url + '/Person/GetPersonWebProfile/4487', headers=accept_json, timeout=60)
# Fail on an HTTP error status instead of trying to parse an error page as JSON.
r.raise_for_status()
d = r.json()
len(d)

In [None]:
sorted(d.keys())

In [None]:
for k in list(d.keys()):
    if k.endswith('Fr'):
        del d[k]

In [None]:
person = d['Person']
# DisplayName is the value actually used; the previous f-string built from
# UsedFirstName/LastName was a dead store, overwritten on the very next line.
name = person['DisplayName']
name

In [None]:
# The underscore-substituted variant was a dead store (immediately overwritten), so only
# the effective assignment is kept. The filename may therefore contain spaces.
filename = name + '.json'
filename

In [None]:
with open(filename, 'w') as f:
    json.dump(d, f, indent=2)

In [None]:
!code $filename

In [None]:
sorted(d.keys())

In [None]:
sorted(d['Person'].keys())

In [None]:
[key for key in sorted(d['Person']['Roles'][0].keys()) if not key.endswith('Fr')]

In [8]:
def num_prefix(str):
    """Return, as an int, the text before the first '-' (or the whole string if there is none).

    Note: the parameter name shadows the built-in ``str`` inside this function.
    """
    head, _, _ = str.partition('-')
    return int(head)

def name_suffix(str):
    """Return everything after the first '-' (or the whole string if there is none).

    Splitting only once keeps names that themselves contain hyphens intact,
    e.g. '123-Ames-Sinclair.json' -> 'Ames-Sinclair.json'. Taking the last
    hyphen-separated piece would return just 'Sinclair.json'.

    Note: the parameter name shadows the built-in ``str`` inside this function.
    """
    return str.split('-', 1)[-1]

In [20]:
people_dir = '../data/people'

# Match people .json files
def person_files():
    """Return the person JSON file names found in ``people_dir``.

    A name qualifies when it starts with one or more digits, then a hyphen,
    then at least one more character, and ends in '.json'. The result is
    sorted using ``name_suffix`` as the key.
    """
    # Raw string: in a plain literal '\.' is an invalid escape sequence, which newer
    # Python versions warn about. The compiled pattern itself is unchanged.
    pattern = re.compile(r'[0-9]+-.+\.json$')
    matching_files = [name for name in os.listdir(people_dir) if pattern.match(name)]
    return sorted(matching_files, key=name_suffix)

len(person_files())

5188

In [21]:
def person_recs(n = None):
    """Yield the parsed JSON content of the person files, one record per file.

    Args:
        n: Maximum number of files to read, in ``person_files()`` order.
           ``None`` (the default) reads all of them; ``0`` reads none.

    Yields:
        The object produced by ``json.load`` for each file.
    """
    # Slicing with None returns the whole list, so files[:n] covers the default too.
    # It also makes n=0 yield nothing rather than silently meaning "everything".
    for file in person_files()[:n]:
        # Explicit UTF-8: the locale default is not UTF-8 on every platform.
        with open(os.path.join(people_dir, file), encoding='utf-8') as f:
            yield json.load(f)

In [41]:
# person_cols = ['PersonId', 'LastName', 'UsedFirstName']
# role_cols = ['PersonRoleId', 'ParliamentNumber', 'PartyEn', 'ToBeStyledAsEn',
#              'OrganizationTypeEn', 'OrganizationLongEn', 'GroupingTitleEn', 'PortFolioEn', 'NameEn', 
#              'IsMP', 'IsSenator', 'IsActing',
#              'StartDate', 'StartDateIsApproximate', 
#              'EndDate', 'EndDateIsApproximate', 'EndReasonEn', 'EndReasonTypeEn', 
#              'NotesEn']

# rows = []
# for rec in person_recs():
#     person = rec['Person']
#     person_tuple = tuple(person[col] for col in person_cols)
#     for role in rec['FederalExperience']:
#         role_tuple = tuple(role[col] for col in role_cols)
#         row = person_tuple + role_tuple
#         rows.append(row)
# len(rows)

In [42]:
# df = pd.DataFrame(rows, columns = person_cols + role_cols) \
#   .sort_values(['LastName', 'UsedFirstName', 'PersonId', 'StartDate', 'PersonRoleId'])
# df = df.drop_duplicates()
# len(df)

In [43]:
# cleanup(df)
# df.sample(5)

In [44]:
# df.loc[39471]

In [40]:
# df['ToBeStyledAsEn'].value_counts()[:30]

In [27]:
# filename = os.path.join(people_dir, 'fed_roles.csv')
# df.to_csv(filename, index=False, encoding='utf8')

In [None]:
!open $filename

In [28]:
# person_cols = ['PersonId', 'LastName', 'UsedFirstName']
# education_cols = ['SchoolNameLongEn', 'FieldOfStudyEn', 'DiplomaLongEn', 'GraduationYear']

# rows = []
# for rec in person_recs():
#     person = rec['Person']
#     person_tuple = tuple(person[col] for col in person_cols)
#     if person['Education']:
#         for edu in person['Education']:
#             edu_tuple = tuple(edu[col] for col in education_cols)
#             row = person_tuple + edu_tuple
#             rows.append(row)
# len(rows)

5756

In [29]:
# df = pd.DataFrame(rows, columns = person_cols + education_cols) \
#   .sort_values(['LastName', 'UsedFirstName', 'PersonId', 'GraduationYear'])
# df = df.drop_duplicates()
# len(df)

5753

In [30]:
# df.sample(25)

Unnamed: 0,PersonId,LastName,UsedFirstName,SchoolNameLongEn,FieldOfStudyEn,DiplomaLongEn,GraduationYear
2883,6574,Lapierre,Edmond Antoine,Saint Mary's University,Unknown,Unknown,
2137,9000,Gurbin,Gary Michael,University of Western Ontario,Medicine,Doctor of medicine,
2215,18437,Hardie,Ken,University of British Columbia,Economics,Bachelor,
556,6172,Bouffard,Paul Henri,Laval University,Laws,Licentiate,
4245,6907,Pelletier,Irénée,Université de Toulouse,Unknown,Unknown,
1845,4053,Fréchette,Louis Honoré,Laval University,Laws,Unknown,
2572,6942,Jones,Herbert Ladd,Collegiate studies,Unknown,Degree,1875.0
4823,17254,Saxton,Andrew,University of Western Ontario,Finance,Degree,1986.0
2293,6302,Hearn,Loyola,Memorial Univerisity of Newfoundland,Education,Bachelor,1969.0
1166,9670,Cowan,James S.,Dalhousie University,Laws,Bachelor,1965.0


In [31]:
# filename = os.path.join(people_dir, 'education.csv')
# df.to_csv(filename, index=False, encoding='utf8')

In [None]:
education_cols

In [None]:
parl_num = 42
parl = df[df['ParliamentNumber'] == parl_num]
parl

In [None]:
filename = os.path.join(people_dir, f'parl_{parl_num}_roles.csv')
# The filtered frame is named `parl`; the previous `parl42` is not defined anywhere in the notebook.
parl.to_csv(filename, index=False, encoding='utf8')

In [None]:
!open $filename

In [None]:
mp_roles = df[df['IsMP']].drop_duplicates()
len(mp_roles)

In [None]:
mp_roles.nunique().sort_values(ascending=False)

In [None]:
mp_roles.fillna('').groupby(['GroupingTitleEn', 'OrganizationTypeEn', 'PortFolioEn', 'NameEn'])[['PersonRoleId']].count()

In [None]:
mp_roles.fillna('').groupby(['GroupingTitleEn', 'OrganizationTypeEn']).nunique()

In [None]:
mp_roles.nunique()

In [None]:
mp_roles = mp_roles.drop(['GroupingTitleEn', 'PortFolioEn', 'NameEn', 'IsMP', 'IsSenator', 'IsActing'], axis=1)
mp_roles

In [None]:
mp_roles['PersonRoleId'].value_counts()[:25]

In [None]:
cols = ['PersonId', 'LastName', 'UsedFirstName', 'PersonRoleId', 'StartDate', 'EndDate',
        'PartyEn', 'OrganizationTypeEn', 'OrganizationLongEn']
mp_roles = mp_roles[cols]
mp_roles

In [None]:
mp_roles.to_csv(os.path.join(people_dir, 'mp_roles.csv'), index=False, encoding='utf8')

In [None]:
df.columns

In [None]:
df['GroupingTitleEn'].fillna('').value_counts()

In [None]:
df['OrganizationTypeEn'].fillna('').value_counts()

In [None]:
df['IsMP'].fillna('').value_counts()

In [None]:
parl_roles = df[df['GroupingTitleEn'] == 'Parliamentarian']
parl_roles.count()

In [None]:
mps_df = df[df['IsMP']]
# 'GroupingTitleEn' was listed twice, which produced a duplicated column in the result.
mps_df[['GroupingTitleEn', 'OrganizationTypeEn', 'OrganizationLongEn', 'PortFolioEn', 'NameEn', 'IsMP']].nunique()

In [None]:
mps_df['GroupingTitleEn'].value_counts()

In [None]:
mps_df['OrganizationTypeEn'].value_counts()

In [None]:
mps_df['OrganizationLongEn'].value_counts()

In [None]:
mps_df['PortFolioEn'].value_counts()

In [None]:
mps_df['NameEn'].value_counts()

In [None]:
parl_roles.fillna('') \
  .groupby(['OrganizationTypeEn', 'OrganizationLongEn', 'GroupingTitleEn', 'PortFolioEn', 'NameEn', 'IsMP'])[['PersonRoleId']].count() \
  [:25]

In [None]:
df['PartyEn'].value_counts()

In [None]:
df['IsActing'].value_counts()

In [None]:
df.groupby(['EndReasonTypeEn', 'EndReasonEn'])[['PersonRoleId']].count()

In [None]:
df[df['EndReasonTypeEn'] == 'Resignation']

In [None]:
df2 = df[~df['EndReasonTypeEn'].isna() | ~df['EndReasonEn'].isna()]
df2

In [None]:
df[['EndReasonEn', 'EndReasonTypeEn']].drop_duplicates()

In [None]:
df.fillna('na').pivot_table(
    index=['EndReasonTypeEn', 'EndReasonEn'],
    values=['PartyEn', 'PersonId', 'PersonRoleId'],
    aggfunc='nunique'
)[['PartyEn', 'PersonId', 'PersonRoleId']]

In [None]:
df.fillna('na').groupby(['EndReasonTypeEn', 'EndReasonEn'])[['PartyEn', 'PersonId', 'PersonRoleId']].nunique() \
  .sort_values('PersonRoleId', ascending=False)

In [None]:
df[df['EndReasonEn'] == 'Resigned between May 30, 1930 and September 9,1930.']

In [None]:
row = df.loc[107319]
row

In [None]:
row = 107319
df.loc[row, 'NotesEn'] = df.loc[row, 'EndReasonEn']
df.loc[row, 'EndReasonEn'] = None

In [None]:
df['PersonRoleId'].count(), df['PersonRoleId'].nunique()

In [None]:
df[:50]

In [None]:
df[:50].sort_values(['LastName', 'UsedFirstName', 'StartDate', 'PersonRoleId'])

In [None]:
len(df)

In [None]:
path = os.path.join(people_dir, 'person_roles.csv')
df.to_csv(path, index=False, encoding='utf8')

In [99]:
def extract_roles(role_type):
    """Flatten one kind of per-person record list into a DataFrame.

    For every record yielded by ``person_recs()``, each entry of
    ``rec[role_type]`` becomes one row: the person's ``PersonId``,
    ``LastName`` and ``UsedFirstName`` merged with the entry's own fields.
    Two nested fields are flattened:

    * ``Classes`` becomes a '|'-joined string of its ``RoleClassNameEn``
      values. Entries whose name is ``None`` are reported with ``print``
      and left out of the joined string.
    * ``MemberOfParliament`` is replaced by its ``OccupationTypeEn`` value.

    The frame is then sorted, indexed by whichever of ``PersonRoleId`` /
    ``PersonId`` are present, passed through ``cleanup`` and de-duplicated.

    Args:
        role_type: Key of the per-person record that holds the list of
            entries (the notebook calls this with e.g. 'FederalExperience'
            or 'Education').

    Returns:
        The resulting DataFrame. It is empty, with no index set, when no
        person has any entry of this kind.

    Side effects:
        Rebinds the notebook globals ``rows`` (list of row dicts) and ``df``.
        Later cells inspect ``rows``, so both are kept.
    """
    global rows
    global df

    person_cols = ['PersonId', 'LastName', 'UsedFirstName']

    rows = []
    for rec in person_recs():
        person = {col: rec['Person'][col] for col in person_cols}
        # A missing (None) or empty list contributes no rows.
        for role in rec[role_type] or []:
            # Flatten on the merged copy so the loaded record is not mutated.
            row = {**person, **role}

            classes = role.get('Classes')
            if classes is not None:
                class_names = [c['RoleClassNameEn'] for c in classes]
                if None in class_names:
                    print(person, "class names:", class_names)
                row['Classes'] = '|'.join(filter(None, class_names))

            mp_info = role.get('MemberOfParliament')
            if mp_info is not None:
                row['MemberOfParliament'] = mp_info['OccupationTypeEn']

            rows.append(row)

    df = pd.DataFrame(rows)
    if not rows:
        # With no rows there are no columns either, and set_index([]) below would raise.
        return df

    # since run on 2021-09-08, RoleId, PersonRoleId, and StartDate are not available for Education roles
    sort_cols = ['LastName', 'UsedFirstName', 'PersonId', 'StartDate', 'GraduationYear', 'RoleId']
    df = df.sort_values([col for col in sort_cols if col in df.columns])
    df = df.set_index([col for col in ['PersonRoleId', 'PersonId'] if col in df.columns])
    cleanup(df)
    # NOTE(review): drop_duplicates() compares column values only and ignores the index,
    # so rows that differ solely in PersonRoleId/PersonId are collapsed here - confirm
    # that this is intended.
    df = df.drop_duplicates()
    return df

In [100]:
df = extract_roles('ProvincialExperience')
path = os.path.join(people_dir, 'provincial_experience.csv')
df.to_csv(path, encoding='utf8')
df.count()

LastName                  5427
UsedFirstName             5427
StartDate                 5403
StartDateIsApproximate    5427
EndDate                   5318
EndDateIsApproximate      5427
EndReasonTypeEn              3
SourceOfInformationEn        8
IsActing                  5427
HasCrossedTheFloor        5427
OrganizationId            5427
OrganizationLongEn        5427
OrganizationShortEn       5394
OrganizationAcronymEn     5318
OrganizationTypeId        5427
OrganizationTypeEn        5427
OrganizationProvinceEn       2
OrganizationHasProfile    5427
IsSenatorialDivision      5427
ToBeStyledAsEn            5313
GroupId                   5427
GroupingTitleEn           5427
GroupingOrder             5427
ParliamentStart            265
ParliamentEnd              265
ParliamentNumber          5427
PartyOrganizationId       5427
PartyEn                   5153
PartyStartDate            5153
PartyEndDate              5036
PersonPersonId            5427
MemberOfParliament           1
IsMP    

In [102]:
df['Classes'].value_counts()

Prov                                                                                                                                                                                      2284
Minister|Provincial and Territorial Responsabilities                                                                                                                                      1203
Parliamentarian                                                                                                                                                                           1144
Minister|Parliamentary function                                                                                                                                                            231
Parliamentary function|Provincial and Territorial Responsabilities                                                                                                                         114
Prov|Minister|Parliamentary function         

In [103]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,LastName,UsedFirstName,StartDate,StartDateIsApproximate,EndDate,EndDateIsApproximate,EndReasonTypeEn,SourceOfInformationEn,IsActing,HasCrossedTheFloor,...,PersonPersonId,MemberOfParliament,IsMP,IsSenator,NotesEn,IsCurrent,RoleId,NameEn,Ordinal,Classes
PersonRoleId,PersonId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10292,8200,Abbott,John Joseph Caldwell,1857-01-01,True,1867-01-01,True,,,False,False,...,0,,False,False,,False,2997,Provincial Constituency Member,,Prov
99321,14709,Achim,Honoré,1917-12-15,False,1921-10-13,False,,,False,False,...,0,,False,False,,False,720,Elected Representative,0.0,Minister|Provincial and Territorial Responsabi...
62707,14709,Achim,Honoré,1917-12-15,False,1921-10-13,False,,,False,False,...,0,,False,False,,False,722,Party Member,,Parliamentarian
88656,14709,Achim,Honoré,1917-12-15,False,1921-10-13,False,,,False,False,...,0,,False,False,,False,2997,Provincial Constituency Member,,Prov
69657,1331,Adams,Michael,1870-06-01,True,1874-06-01,True,,,False,False,...,0,,False,False,,False,720,Elected Representative,0.0,Minister|Provincial and Territorial Responsabi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6845,2773,Évanturel,Gustave,1911-12-11,False,1923-06-24,False,,,False,False,...,0,,False,False,,False,2997,Provincial Constituency Member,,Prov
6845,2773,Évanturel,Gustave,1911-12-11,False,1923-06-24,False,,,False,False,...,0,,False,False,,False,2997,Provincial Constituency Member,,Prov
6845,2773,Évanturel,Gustave,1911-12-11,False,1923-06-24,False,,,False,False,...,0,,False,False,,False,2997,Provincial Constituency Member,,Prov
13697,2773,Évanturel,Gustave,1914-06-29,False,1919-09-23,False,,,False,False,...,0,,False,False,,False,722,Party Member,,Parliamentarian


In [104]:
df = extract_roles('FederalExperience')
path = os.path.join(people_dir, 'federal_experience.csv')
df.to_csv(path, encoding='utf8')
df.count()

{'PersonId': 20049, 'LastName': 'Francis', 'UsedFirstName': 'Brian'} class names: ['Caucus Chair', 'Officers and Officials of Parliament', 'Parliamentarian Federal Role Senate', 'Political Officers', 'Political Party', None]
{'PersonId': 12245, 'LastName': 'Hubley', 'UsedFirstName': 'Elizabeth'} class names: ['Whip', 'Parliamentarian Federal Role Senate', None, 'Senate Roles']
{'PersonId': 14247, 'LastName': 'Merchant', 'UsedFirstName': 'Pana'} class names: ['Whip', 'Parliamentarian Federal Role Senate', None, 'Senate Roles']


LastName                  54247
UsedFirstName             54247
StartDate                 54245
StartDateIsApproximate    54247
EndDate                   51604
EndDateIsApproximate      54247
EndReasonEn                1148
EndReasonTypeEn            6606
SourceOfInformationEn      2693
IsActing                  54247
ActingTextEn                 20
HasCrossedTheFloor        54247
OrganizationId            54247
OrganizationLongEn        54247
OrganizationShortEn       46918
OrganizationAcronymEn     47085
OrganizationTypeId        54247
OrganizationTypeEn        54247
OrganizationProvinceEn    17485
OrganizationHasProfile    54247
IsSenatorialDivision      54247
ToBeStyledAsEn            48440
GroupId                   54247
PortFolioEn                8913
GroupingTitleEn           54247
GroupingOrder             54247
ParliamentStart           53986
ParliamentEnd             53986
ParliamentNumber          54247
PartyOrganizationId       54247
PartyEn                   54122
PartySta

In [105]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,LastName,UsedFirstName,StartDate,StartDateIsApproximate,EndDate,EndDateIsApproximate,EndReasonEn,EndReasonTypeEn,SourceOfInformationEn,IsActing,...,PersonPersonId,MemberOfParliament,IsMP,IsSenator,NotesEn,IsCurrent,RoleId,NameEn,Ordinal,Classes
PersonRoleId,PersonId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
94181,13134,Abbott,Anthony Chisholm,1974-07-08,False,1979-05-21,False,,,,False,...,0,Continuous,True,False,,False,719,Constituency Member,240.0,Salaries in the House of Commons|Parliamentarian
84888,13134,Abbott,Anthony Chisholm,1974-07-08,False,1979-05-21,False,,,,False,...,0,,False,False,,False,722,Party Member,,Parliamentarian
139134,13134,Abbott,Anthony Chisholm,1976-09-14,False,1977-09-15,False,,,,False,...,0,,False,False,,False,854,Minister of Consumer and Corporate Affairs,210.0,Salaries in the House of Commons|Minister|Parl...
22779,13134,Abbott,Anthony Chisholm,1977-09-16,False,1978-11-23,False,,,,False,...,0,,False,False,Title previously Small Business,False,127,Minister of State,200.0,Salaries in the House of Commons|Critic|Senato...
139322,13134,Abbott,Anthony Chisholm,1978-11-24,False,1979-06-03,False,,,,False,...,0,,False,False,,False,854,Minister,210.0,Salaries in the House of Commons|Minister|Parl...


In [106]:
df = extract_roles('MunicipalExperience')
path = os.path.join(people_dir, 'municipal_experience.csv')
df.to_csv(path, encoding='utf8')
df.count()

LastName                  2896
UsedFirstName             2896
StartDate                 2051
StartDateIsApproximate    2896
EndDate                   2054
EndDateIsApproximate      2896
IsActing                  2896
HasCrossedTheFloor        2896
OrganizationId            2896
OrganizationLongEn        2896
OrganizationShortEn       2785
OrganizationAcronymEn     2785
OrganizationTypeId        2896
OrganizationTypeEn        2896
OrganizationHasProfile    2896
IsSenatorialDivision      2896
ToBeStyledAsEn            2811
GroupId                   2896
GroupingTitleEn             33
GroupingOrder             2896
ParliamentStart              5
ParliamentEnd                5
ParliamentNumber          2896
PartyOrganizationId       2896
PartyEn                      5
PartyStartDate               5
PartyEndDate                 5
PersonPersonId            2896
IsMP                      2896
IsSenator                 2896
NotesEn                    316
IsCurrent                 2896
RoleId  

In [107]:
df = extract_roles('MilitaryExperience')
path = os.path.join(people_dir, 'military_experience.csv')
df.to_csv(path, encoding='utf8')
df.count()

LastName                  1933
UsedFirstName             1933
StartDate                 1015
StartDateIsApproximate    1933
EndDate                    934
EndDateIsApproximate      1933
IsActing                  1933
HasCrossedTheFloor        1933
OrganizationId            1933
OrganizationLongEn        1933
OrganizationShortEn       1923
OrganizationAcronymEn     1924
OrganizationTypeId        1933
OrganizationTypeEn        1933
OrganizationHasProfile    1933
IsSenatorialDivision      1933
ToBeStyledAsEn            1923
GroupId                   1933
GroupingTitleEn              2
GroupingOrder             1933
ParliamentNumber          1933
PartyOrganizationId       1933
PersonPersonId            1933
IsMP                      1933
IsSenator                 1933
NotesEn                    513
IsCurrent                 1933
RoleId                    1933
NameEn                    1933
Ordinal                    139
Classes                   1932
dtype: int64

In [108]:
df = extract_roles('Education')
path = os.path.join(people_dir, 'education.csv')
df.to_csv(path, encoding='utf8')
df.count()

LastName            5753
UsedFirstName       5753
SchoolNameLongEn    5750
SchoolCityEn        5726
FieldOfStudyEn      5726
DiplomaLongEn       5735
DiplomaShortEn      5681
GraduationYear      2486
dtype: int64

In [76]:
df

Unnamed: 0_level_0,LastName,UsedFirstName,SchoolNameLongEn,SchoolCityEn,FieldOfStudyEn,DiplomaLongEn,DiplomaShortEn,GraduationYear
PersonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
13134,Abbott,Anthony Chisholm,Bishop's University,Lennoxville,Arts,Bachelor,B.A,
13134,Abbott,Anthony Chisholm,Osgoode Hall Law School,,Laws,Bachelor (Juris),B.A. (Juris),
3761,Abbott,Douglas Charles,McGill University,Montreal,Civil Law,Bachelor,B.A,
3761,Abbott,Douglas Charles,Bishop's University,Lennoxville,Arts,Bachelor,B.A,
3761,Abbott,Douglas Charles,Université de Dijon,,Laws,Diploma,,
...,...,...,...,...,...,...,...,...
5305,de Savoye,Pierre,University of Quebec at Trois-Rivières,Trois-Rivières,Education,Bachelor,B.A,1974.0
5305,de Savoye,Pierre,Unknown,,Teaching,Certificate,,1975.0
5305,de Savoye,Pierre,Laval University,,Business Administration,Master,M.A.,1993.0
20129,van Koeverden,Adam,McMaster University,Hamilton,Kinesiology,Bachelor,B.A,2007.0


In [77]:
{col:list(df[col].map(type).drop_duplicates()) for col in df.columns}

{'LastName': [str],
 'UsedFirstName': [str],
 'SchoolNameLongEn': [str, NoneType],
 'SchoolCityEn': [str, NoneType],
 'FieldOfStudyEn': [str, NoneType],
 'DiplomaLongEn': [str, NoneType],
 'DiplomaShortEn': [str, NoneType],
 'GraduationYear': [float]}

In [78]:
len(rows)

5756

In [79]:
rows[-1]

{'PersonId': 2773,
 'LastName': 'Évanturel',
 'UsedFirstName': 'Gustave',
 'SchoolNameLongEn': 'Laval University',
 'SchoolNameLongFr': 'Université Laval',
 'SchoolCityEn': '',
 'SchoolCityFr': '',
 'FieldOfStudyEn': 'Notarial Law',
 'FieldOfStudyFr': 'Droit notarial',
 'DiplomaLongEn': 'Diploma',
 'DiplomaLongFr': 'Diplôme',
 'DiplomaShortEn': '',
 'DiplomaShortFr': '',
 'GraduationYear': None}

In [None]:
df.count()

# Candidates
From https://lop.parl.ca/sites/ParlInfo/default/en_CA/ElectionsRidings/Elections

In [109]:
# The timeout stops the cell hanging forever on an unresponsive server.
r = requests.get(parl_api_url + '/Parliament/GetCandidates', headers=accept_json, timeout=60)
# Fail on an HTTP error status instead of trying to parse an error page as JSON.
r.raise_for_status()
d = r.json()
len(d)

44443

In [110]:
candidates = pd.DataFrame(d)
candidates.count()

ParliamentNumber              44443
ElectionId                    44443
IsGeneral                     44443
ElectionDate                  44443
ConstituencyId                44443
ConstituencyEn                44443
ConstituencyFr                44443
ProvinceEn                    44443
ProvinceFr                    44443
DisplayName                   44443
ElectionCandidateId           44443
ElectionProcessCandidateId    44443
PersonId                      44031
PersonLastFirstName           16536
PersonRoleId                      0
ElectionCanadaFirstName       44232
ElectionCanadaLastName        44396
ElectionCanadaMiddleName        305
Gender                        44427
OccupationEn                  42100
OccupationFr                  42098
PartyOrganizationId           44443
PartyNameEn                   44443
PartyNameFr                   44443
Votes                         44443
HasProfile                    44443
ResultLongEn                  44443
ResultLongFr                

In [111]:
cleanup(candidates)
candidates.count()

ParliamentNumber              44443
ElectionId                    44443
IsGeneral                     44443
ElectionDate                  44443
ConstituencyId                44443
ConstituencyEn                44443
ProvinceEn                    44443
DisplayName                   44443
ElectionCandidateId           44443
ElectionProcessCandidateId    44443
PersonId                      44031
PersonLastFirstName           16536
ElectionCanadaFirstName       44232
ElectionCanadaLastName        44396
ElectionCanadaMiddleName        305
Gender                        44427
OccupationEn                  42100
PartyOrganizationId           44443
PartyNameEn                   44443
Votes                         44443
HasProfile                    44443
ResultLongEn                  44443
OtherResultLongEn               613
dtype: int64

In [112]:
candidates = candidates.sort_values(['ParliamentNumber', 'ProvinceEn', 'ConstituencyEn', 'Votes', 'DisplayName'])
candidates[-20:]

Unnamed: 0,ParliamentNumber,ElectionId,IsGeneral,ElectionDate,ConstituencyId,ConstituencyEn,ProvinceEn,DisplayName,ElectionCandidateId,ElectionProcessCandidateId,...,ElectionCanadaLastName,ElectionCanadaMiddleName,Gender,OccupationEn,PartyOrganizationId,PartyNameEn,Votes,HasProfile,ResultLongEn,OtherResultLongEn
44408,43,802,True,2019-10-21,11632,Saskatoon--University,Saskatchewan,"Norris, Jan",45689,14393,...,Norris,,M,Artist,3388,Green Party of Canada,1401,True,Defeated,
44407,43,802,True,2019-10-21,11632,Saskatoon--University,Saskatchewan,"Hayton, Susan",45688,14392,...,Hayton,,F,Physician,4831,Liberal Party of Canada,6146,True,Defeated,
44406,43,802,True,2019-10-21,11632,Saskatoon--University,Saskatchewan,"Card, Claire",45687,14391,...,Card,,F,Professor,5774,New Democratic Party,13994,True,Defeated,
44410,43,802,True,2019-10-21,11632,Saskatoon--University,Saskatchewan,"Tochor, Corey",45691,14395,...,Tochor,,M,Entrepreneur,2159,Conservative Party of Canada,24514,True,Elected,
44416,43,802,True,2019-10-21,8024,Souris--Moose Mountain,Saskatchewan,"Patron, Travis",45697,14491,...,Patron,,M,CEO,15296,Canadian Nationalist Party,168,True,Defeated,
44415,43,802,True,2019-10-21,8024,Souris--Moose Mountain,Saskatchewan,"Mergel, Judy",45696,14490,...,Mergel,,F,Tai Chi Instructor,3388,Green Party of Canada,681,True,Defeated,
44417,43,802,True,2019-10-21,8024,Souris--Moose Mountain,Saskatchewan,"Zajac, Phillip",45698,14492,...,Zajac,,M,Mortgage Specialist,15161,People's Party of Canada,702,True,Defeated,
44412,43,802,True,2019-10-21,8024,Souris--Moose Mountain,Saskatchewan,"Ames-Sinclair, Javin",45693,14487,...,Ames-Sinclair,,M,Student,4831,Liberal Party of Canada,1718,True,Defeated,
44413,43,802,True,2019-10-21,8024,Souris--Moose Mountain,Saskatchewan,"Hicks, Ashlee",45694,14488,...,Hicks,,F,Retail Worker,5774,New Democratic Party,3214,True,Defeated,
44414,43,802,True,2019-10-21,8024,Souris--Moose Mountain,Saskatchewan,"Kitchen, Robert Gordon",45695,14489,...,Kitchen,,M,Chiropractor,2159,Conservative Party of Canada,35067,True,Elected,


In [113]:
# Constituencies with ties
wins = candidates[candidates['ResultLongEn'] == 'Elected']
cols = ['ParliamentNumber', 'ElectionId', 'IsGeneral', 'ElectionDate', 'ProvinceEn', 'ConstituencyId', 'ConstituencyEn', 'Votes', 'ResultLongEn']
grp = wins.groupby(cols, as_index=False)[['DisplayName']].count()
ties = grp[grp['DisplayName'] > 1]
ties.sort_values('Votes', ascending=False)

Unnamed: 0,ParliamentNumber,ElectionId,IsGeneral,ElectionDate,ProvinceEn,ConstituencyId,ConstituencyEn,Votes,ResultLongEn,DisplayName
5692,22,527,True,1953-08-10,Prince Edward Island,6971,Queen's,10086,Elected,2
3,1,54,False,1871-03-02,Manitoba,5160,Marquette,282,Elected,2
333,2,226,True,1872-07-20,Ontario,6253,Ottawa (City of),0,Elected,2
442,2,480,False,1873-09-29,Prince Edward Island,4214,King's County,0,Elected,2
443,2,480,False,1873-09-29,Prince Edward Island,6782,Prince County,0,Elected,2
444,2,480,False,1873-09-29,Prince Edward Island,6978,Queen's County,0,Elected,2
641,3,390,True,1874-01-22,Prince Edward Island,6978,Queen's County,0,Elected,2
1772,7,288,False,1892-02-11,Nova Scotia,3479,Halifax,0,Elected,2
3146,13,383,True,1917-12-17,Nova Scotia,3479,Halifax,0,Elected,2


In [114]:
wins[(wins['ElectionId'] == 527) & (wins['ConstituencyId'] == 6971)]

Unnamed: 0,ParliamentNumber,ElectionId,IsGeneral,ElectionDate,ConstituencyId,ConstituencyEn,ProvinceEn,DisplayName,ElectionCandidateId,ElectionProcessCandidateId,...,ElectionCanadaLastName,ElectionCanadaMiddleName,Gender,OccupationEn,PartyOrganizationId,PartyNameEn,Votes,HasProfile,ResultLongEn,OtherResultLongEn
25334,22,527,True,1953-08-10,6971,Queen's,Prince Edward Island,"MacLean, John Angus",16837,0,...,MACLEAN,,M,Farmer,6876,Progressive Conservative Party,10086,True,Elected,
25504,22,527,True,1953-08-10,6971,Queen's,Prince Edward Island,"MacLean, John Angus",43267,0,...,Maclean,,M,Farmer,6876,Progressive Conservative Party,10086,True,Elected,
25337,22,527,True,1953-08-10,6971,Queen's,Prince Edward Island,"Matheson, Neil Alexander",17175,0,...,MATHESON,,M,Editor,4831,Liberal Party of Canada,10351,True,Elected,


In [115]:
candidates.to_csv('../data/parliaments/candidates.csv', index=False, encoding='utf8')

# Roles v2

In [None]:
# Explicit UTF-8: the locale default encoding is not UTF-8 on every platform.
with open(os.path.join(people_dir, 'parlinfo_28-1_28-2_28-3.json'), encoding='utf-8') as f:
    people = json.load(f)

In [100]:
person = people[0]
sorted(person.keys())

['Age',
 'AssistantCriticOfEn',
 'AssistantCriticOfFr',
 'AssistantDeputySpeakerChairEn',
 'AssistantDeputySpeakerChairFr',
 'AssociateMinisterOfEn',
 'AssociateMinisterOfFr',
 'Bibliography',
 'CabinetChairEn',
 'CabinetChairFr',
 'CabinetCommitteeMemberEn',
 'CabinetCommitteeMemberFr',
 'CityOfBirthEn',
 'CityOfBirthFr',
 'ConstituencyAddresses',
 'ConstituencyEn',
 'ConstituencyFr',
 'CountryOfBirthEn',
 'CountryOfBirthFr',
 'CriticOfEn',
 'CriticOfFr',
 'CurrentConstituencyEn',
 'CurrentConstituencyFr',
 'CurrentPartyEn',
 'CurrentPartyFr',
 'DateOfBirth',
 'DateOfBirthIsApproximate',
 'Death',
 'DeputyHouseLeaderEn',
 'DeputyHouseLeaderFr',
 'DeputyPrimeMinister',
 'DeputySpeaker',
 'DeputyWhipEn',
 'DeputyWhipFr',
 'DiedInOffice',
 'DisplayName',
 'DoNotDiscloseBirthLocation',
 'DoNotDiscloseDateOfBirth',
 'Education',
 'ElectionCandidates',
 'EthnicityLongEn',
 'EthnicityLongFr',
 'ExternalNotesEn',
 'ExternalNotesFr',
 'FamilyRelations',
 'FormalFirstName',
 'Gender',
 'HOCPers

In [101]:
[k for k, v in person.items() if type(v) in [list, dict]]

['Professions',
 'FamilyRelations',
 'Pictures',
 'Roles',
 'Death',
 'YearsOfServiceSegments']

In [None]:
# Build one flat row per (person, role): the identifying person columns plus every
# scalar, non-French field of the role.
person_cols = ['PersonId', 'LastName', 'UsedFirstName']
rows = []
for rec in person_recs():
    person = rec['Person']
    person_props = {col: person[col] for col in person_cols}
    for role in person['Roles']:
        role_props = {}
        for k, v in role.items():
            # keep only non-French fields whose value is not a nested list/dict
            if not k.endswith('Fr') and type(v) not in [list, dict]:
                role_props[k] = v
        row = dict(person_props)
        row.update(role_props)
        rows.append(row)
len(rows)

In [None]:
sorted(rec.keys())

In [None]:
sorted(rec['Person'].keys())

In [None]:
rows[0]

In [None]:
sorted(rows[0].keys())

In [103]:
df = pd.DataFrame(rows)
cleanup(df)
df.count()

PersonId                  202575
LastName                  202575
UsedFirstName             202575
PersonRoleId              202575
StartDate                 200744
StartDateIsApproximate    202575
EndDate                   198138
EndDateIsApproximate      202575
EndReasonEn                  230
EndReasonTypeEn             1685
SourceOfInformationEn       1139
IsActing                  202575
ActingTextEn                  11
HasCrossedTheFloor        202575
OrganizationId            202575
OrganizationLongEn        202575
OrganizationShortEn       196184
OrganizationAcronymEn     190713
OrganizationTypeId        202575
OrganizationTypeEn        202547
OrganizationProvinceEn      6958
OrganizationHasProfile    202575
IsSenatorialDivision      202575
ToBeStyledAsEn            180918
GroupId                   202575
Source                      8535
PortFolioEn                 7426
GroupingTitleEn            31193
GroupingOrder             202575
ParliamentStart            25827
Parliament

In [128]:
df.sample(25).sort_values(['LastName', 'UsedFirstName', 'StartDate'])

Unnamed: 0,PersonId,LastName,UsedFirstName,PersonRoleId,StartDate,StartDateIsApproximate,EndDate,EndDateIsApproximate,EndReasonEn,EndReasonTypeEn,...,PartyEn,PartyStartDate,PartyEndDate,IsMP,IsSenator,NotesEn,IsCurrent,RoleId,NameEn,Ordinal
1219,958,Aikins,James Albert Manning,30400,1911-11-15,False,1915-04-15,False,,,...,,,,False,False,,False,2998,Caucus Member,
9011,15678,Baker,George,267854,1989-04-03,False,1991-05-12,False,,,...,Liberal Party of Canada,1972-10-30,2017-09-03,False,False,,False,226,Member,300.0
17458,12709,Blain,Richard,226686,1903-03-12,False,1903-10-24,False,,,...,Conservative (1867-1942),1900-11-07,1926-11-27,False,False,,False,226,Member,300.0
19360,14748,Borden,Frederick William,231209,1896-08-19,False,1896-10-05,False,,,...,Liberal Party of Canada,1896-07-30,1911-09-20,False,False,,False,226,Member,300.0
26307,13311,Buchanan,William Ashbury,186153,1932-10-06,False,1933-05-27,False,,,...,Liberal Party of Canada,1925-09-05,1954-07-11,False,False,,False,226,Member,300.0
33078,6895,Carter,Chesley William,140835,1966-01-18,False,1967-05-08,False,,,...,Liberal Party of Canada,1949-06-27,1977-07-28,False,False,,False,226,Member,300.0
42605,4154,Copp,Arthur Bliss,211731,1915-02-04,False,1915-04-15,False,,,...,Liberal Party of Canada,1915-02-01,1917-12-16,False,False,,False,226,Member,300.0
50679,6281,Denis,Azellus,281462,1976-10-12,False,1977-10-17,False,,,...,Liberal Party of Canada,1935-10-14,1991-09-04,False,False,,False,226,Member,300.0
61317,12772,Fairweather,Robert Gordon Lee,19061,1974-02-27,False,1974-05-09,False,,,...,Progressive Conservative Party,1962-06-18,1977-08-31,False,False,,False,226,Member,300.0
78364,11908,Griesbach,William Antrobus,183562,1943-01-28,False,1944-01-26,False,,,...,Conservative (1867-1942),1921-09-15,1945-01-21,False,False,,False,226,Member,300.0


In [120]:
filename = os.path.join(people_dir, 'roles_with_provincial.csv')
df.to_csv(filename, index=False, encoding='utf8')

In [121]:
!wc $filename

  202594 2361918 58958014 ../data/people/roles_with_provincial.csv


In [123]:
zip_filename = filename + '.zip'
!rm $zip_filename
!zip $zip_filename $filename

rm: ../data/people/roles_with_provincial.csv.zip: No such file or directory
  adding: ../data/people/roles_with_provincial.csv (deflated 92%)


In [124]:
!wc $zipfilename

^C


In [126]:
!ls -al $zip_filename

-rw-r--r--  1 nedgar  staff  4930803  3 May 14:17 ../data/people/roles_with_provincial.csv.zip
