In [None]:
# Print the kernel's working directory; the relative data paths used later in this notebook resolve against it.
!pwd

In [None]:
import json
import os
import pandas as pd
import re
import requests

In [None]:
# Base URL that every ParlInfo Web API request in this notebook is built from.
parl_api_url = 'https://lop.parl.ca/ParlinfoWebAPI'
# Request headers passed to requests.get so the API is asked for a JSON response.
accept_json = {'Accept': 'application/json'}

In [None]:
def drop_time(df):
    """Cut every column whose name ends in 'Date' down to its first 10 characters.

    The frame is modified in place and nothing is returned. The `.str`
    accessor is used, so the affected columns are assumed to hold strings
    (or missing values) -- TODO confirm for new data sources.
    """
    for name in tuple(df.columns):
        if not name.endswith('Date'):
            continue
        df[name] = df[name].str.slice(stop=10)

def drop_french(df):
    """Remove, in place, every column whose name ends in 'Fr'. Returns None."""
    french = list(filter(lambda name: name.endswith('Fr'), df.columns))
    df.drop(columns=french, inplace=True)
    
def drop_empty_cols(df):
    """Remove, in place, every column that has no non-missing value. Returns None."""
    blank = []
    for name in df.columns:
        # A column is empty when not a single cell is non-null.
        if not df[name].notna().any():
            blank.append(name)
    df.drop(columns=blank, inplace=True)

def drop_unsupported_cols(df):
    """Remove the 'Documents' and 'Senator' columns in place, if they are present.

    Columns that do not exist are silently skipped. Returns None.
    """
    unsupported = ['Documents', 'Senator']
    df.drop(columns=unsupported, errors='ignore', inplace=True)
    
def cleanup(df):
    """Run all column clean-up steps on `df`, in place.

    Order: truncate '*Date' columns, drop '*Fr' columns, drop columns with
    no values, then drop the 'Documents'/'Senator' columns. Returns None.
    """
    for step in (drop_time, drop_french, drop_empty_cols, drop_unsupported_cols):
        step(df)

In [None]:
# Query the person search endpoint with refiner 4-1 and count the returned records.
# A timeout keeps a stalled connection from hanging the kernel, and raise_for_status()
# surfaces HTTP errors here instead of as a confusing JSON decoding failure.
r = requests.get(parl_api_url + '/Person/SearchAndRefine?refiners=4-1,', headers=accept_json, timeout=60)
r.raise_for_status()
d = r.json()
len(d)

In [None]:
# Query the person search endpoint with refiners 28-1, 28-2 and 28-3; `d` is replaced by this result.
# A timeout keeps a stalled connection from hanging the kernel, and raise_for_status()
# surfaces HTTP errors here instead of as a confusing JSON decoding failure.
r = requests.get(parl_api_url + '/Person/SearchAndRefine?refiners=28-1,28-2,28-3,', headers=accept_json, timeout=60)
r.raise_for_status()
d = r.json()
len(d)

In [None]:
len(d[0]['Roles'])

In [None]:
rows = [{k: (p[k] if k in p else None) for k in ['PersonId', 'LastName', 'UsedFirstName', 'ProvincialExperienceEN', 'MunicipalExperienceEn']} for p in d]
len(rows)

In [None]:
df = pd.DataFrame(rows)
df

In [None]:
df['ProvincialExperienceEN'].value_counts()

In [None]:
df[df['LastName'] == 'Aglukkaq']

In [None]:
# Fetch the full web profile of person 4487; `d` becomes that profile.
# A timeout keeps a stalled connection from hanging the kernel, and raise_for_status()
# surfaces HTTP errors here instead of as a confusing JSON decoding failure.
r = requests.get(parl_api_url + '/Person/GetPersonWebProfile/4487', headers=accept_json, timeout=60)
r.raise_for_status()
d = r.json()
len(d)

In [None]:
sorted(d.keys())

In [None]:
for k in list(d.keys()):
    if k.endswith('Fr'):
        del d[k]

In [None]:
person = d['Person']
name = f"{person['UsedFirstName']} {person['LastName']}"
name = person['DisplayName']
name

In [None]:
filename = name.replace(' ', '_') + '.json'
filename = name + '.json'
filename

In [None]:
with open(filename, 'w') as f:
    json.dump(d, f, indent=2)

In [None]:
!code $filename

In [None]:
sorted(d.keys())

In [None]:
sorted(d['Person'].keys())

In [None]:
[key for key in sorted(d['Person']['Roles'][0].keys()) if not key.endswith('Fr')]

In [None]:
def num_prefix(str):
    """Return, as an int, the text before the first '-' of `str` (the whole string if there is none).

    Note: the parameter name shadows the built-in `str` inside this function.
    """
    head, _, _ = str.partition('-')
    return int(head)

def name_suffix(str):
    """Return the text after the last '-' of `str` (the whole string if there is none).

    Note: the parameter name shadows the built-in `str` inside this function.
    """
    _, _, tail = str.rpartition('-')
    return tail

In [None]:
# Directory holding one JSON file per person, relative to the notebook's working directory.
people_dir = '../ca/data/people'

# Match people .json files
def person_files():
    """Return the person JSON file names found in `people_dir`.

    A name qualifies when it starts with digits followed by '-' and ends in
    '.json'. The result is a list sorted by the part of the name after the
    last '-' (see `name_suffix`).
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    pattern = re.compile(r'[0-9]+-.+\.json$')
    matching_files = [name for name in os.listdir(people_dir) if pattern.match(name)]
    return sorted(matching_files, key=name_suffix)

len(person_files())

In [None]:
def person_recs(n = None):
    """Yield the parsed JSON content of each person file, in `person_files()` order.

    n: optional limit on the number of files read. None (the default) reads
       every file. The limit is applied as a slice, so 0 yields nothing
       (previously 0 was treated like None and yielded everything) and a
       negative value skips that many files at the end.
    """
    # files[:None] is the whole list, so no special case is needed for "no limit".
    for file in person_files()[:n]:
        with open(os.path.join(people_dir, file)) as f:
            yield json.load(f)

In [None]:
# Person-level fields repeated on every output row.
person_cols = ['PersonId', 'LastName', 'UsedFirstName']
# Role-level fields copied from each 'FederalExperience' entry.
role_cols = ['PersonRoleId', 'ParliamentNumber', 'PartyEn', 'ToBeStyledAsEn',
             'OrganizationTypeEn', 'OrganizationLongEn', 'GroupingTitleEn', 'PortFolioEn', 'NameEn',
             'IsMP', 'IsSenator', 'IsActing',
             'StartDate', 'StartDateIsApproximate',
             'EndDate', 'EndDateIsApproximate', 'EndReasonEn', 'EndReasonTypeEn',
             'NotesEn']

# One tuple per (person, federal role), in person_cols + role_cols order.
rows = []
for rec in person_recs():
    person = rec['Person']
    person_tuple = tuple(person[col] for col in person_cols)
    # A record whose role list is None or empty contributes no rows instead of raising TypeError.
    for role in rec['FederalExperience'] or []:
        role_tuple = tuple(role[col] for col in role_cols)
        row = person_tuple + role_tuple
        rows.append(row)
len(rows)

In [None]:
df = pd.DataFrame(rows, columns = person_cols + role_cols) \
  .sort_values(['LastName', 'UsedFirstName', 'PersonId', 'StartDate', 'PersonRoleId'])
df = df.drop_duplicates()
len(df)

In [None]:
df.sample(5)

In [None]:
cleanup(df)
df.sample(5)

In [None]:
# df.loc[39471]

In [None]:
# df['ToBeStyledAsEn'].value_counts()[:30]

In [None]:
# Write the cleaned federal-roles table next to the person JSON files (row index not written).
filename = os.path.join(people_dir, 'fed_roles.csv')
df.to_csv(filename, index=False, encoding='utf8')

In [None]:
# NOTE(review): `open` is presumably the macOS file launcher here -- confirm before running on another OS.
!open $filename

In [None]:
# person_cols = ['PersonId', 'LastName', 'UsedFirstName']
# education_cols = ['SchoolNameLongEn', 'FieldOfStudyEn', 'DiplomaLongEn', 'GraduationYear']

# rows = []
# for rec in person_recs():
#     person = rec['Person']
#     person_tuple = tuple(person[col] for col in person_cols)
#     if person['Education']:
#         for edu in person['Education']:
#             edu_tuple = tuple(edu[col] for col in education_cols)
#             row = person_tuple + edu_tuple
#             rows.append(row)
# len(rows)

In [None]:
# df = pd.DataFrame(rows, columns = person_cols + education_cols) \
#   .sort_values(['LastName', 'UsedFirstName', 'PersonId', 'GraduationYear'])
# df = df.drop_duplicates()
# len(df)

In [None]:
# df.sample(25)

In [None]:
# filename = os.path.join(people_dir, 'education.csv')
# df.to_csv(filename, index=False, encoding='utf8')

In [None]:
# The only other definition of education_cols sits in a commented-out cell, so displaying it
# raised NameError on a fresh kernel. Define it here with the values from that cell.
education_cols = ['SchoolNameLongEn', 'FieldOfStudyEn', 'DiplomaLongEn', 'GraduationYear']
education_cols

In [None]:
parl_num = 44
parl = df[df['ParliamentNumber'] == parl_num]
parl

In [None]:
filename = os.path.join(people_dir, f'parl_{parl_num}_roles.csv')
parl42.to_csv(filename, index=False, encoding='utf8')

In [None]:
!open $filename

In [None]:
# Keep only the rows selected by the IsMP column, then drop exact duplicate rows.
# NOTE(review): boolean-mask indexing assumes IsMP holds booleans with no missing values -- confirm.
mp_roles = df[df['IsMP']].drop_duplicates()
len(mp_roles)

In [None]:
# Distinct-value count per column, most varied columns first.
mp_roles.nunique().sort_values(ascending=False)

In [None]:
# Missing values are replaced by '' first so rows with a null grouping key are still counted.
mp_roles.fillna('').groupby(['GroupingTitleEn', 'OrganizationTypeEn', 'PortFolioEn', 'NameEn'])[['PersonRoleId']].count()

In [None]:
mp_roles.fillna('').groupby(['GroupingTitleEn', 'OrganizationTypeEn']).nunique()

In [None]:
mp_roles.nunique()

In [None]:
# Drop the role-description and flag columns from the MP table.
mp_roles = mp_roles.drop(['GroupingTitleEn', 'PortFolioEn', 'NameEn', 'IsMP', 'IsSenator', 'IsActing'], axis=1)
mp_roles

In [None]:
# The 25 most frequent PersonRoleId values; a count above 1 means the id appears on several rows.
mp_roles['PersonRoleId'].value_counts()[:25]

In [None]:
# Narrow the table to the columns written to mp_roles.csv, in output order.
cols = ['PersonId', 'LastName', 'UsedFirstName', 'PersonRoleId', 'StartDate', 'EndDate',
        'PartyEn', 'OrganizationTypeEn', 'OrganizationLongEn']
mp_roles = mp_roles[cols]
mp_roles

In [None]:
mp_roles.to_csv(os.path.join(people_dir, 'mp_roles.csv'), index=False, encoding='utf8')

In [None]:
df.columns

In [None]:
# fillna('') makes missing values show up as their own '' bucket in the counts below.
df['GroupingTitleEn'].fillna('').value_counts()

In [None]:
df['OrganizationTypeEn'].fillna('').value_counts()

In [None]:
df['IsMP'].fillna('').value_counts()

In [None]:
# Rows whose grouping title is exactly 'Parliamentarian', with the non-null count of each column.
parl_roles = df[df['GroupingTitleEn'] == 'Parliamentarian']
parl_roles.count()

In [None]:
# Rows selected by the IsMP column, and how many distinct values each descriptive column takes.
# 'GroupingTitleEn' was listed twice in the selection; each column is now listed once.
mps_df = df[df['IsMP']]
mps_df[['GroupingTitleEn', 'OrganizationTypeEn', 'OrganizationLongEn', 'PortFolioEn', 'NameEn', 'IsMP']].nunique()

In [None]:
# Frequency of each value, one descriptive column of mps_df at a time.
mps_df['GroupingTitleEn'].value_counts()

In [None]:
mps_df['OrganizationTypeEn'].value_counts()

In [None]:
mps_df['OrganizationLongEn'].value_counts()

In [None]:
mps_df['PortFolioEn'].value_counts()

In [None]:
mps_df['NameEn'].value_counts()

In [None]:
# First 25 groups of parl_roles by descriptive columns; '' stands in for missing values
# so that rows with a null key are kept in the grouping.
parl_roles.fillna('') \
  .groupby(['OrganizationTypeEn', 'OrganizationLongEn', 'GroupingTitleEn', 'PortFolioEn', 'NameEn', 'IsMP'])[['PersonRoleId']].count() \
  [:25]

In [None]:
df['PartyEn'].value_counts()

In [None]:
df['IsActing'].value_counts()

In [None]:
# Number of roles per (end reason type, end reason) pair.
df.groupby(['EndReasonTypeEn', 'EndReasonEn'])[['PersonRoleId']].count()

In [None]:
# Roles whose end reason type is exactly 'Resignation'.
df[df['EndReasonTypeEn'] == 'Resignation']

In [None]:
# Rows where at least one of the two end-reason columns is filled in.
df2 = df[~df['EndReasonTypeEn'].isna() | ~df['EndReasonEn'].isna()]
df2

In [None]:
# Every distinct (end reason, end reason type) combination present in the data.
df[['EndReasonEn', 'EndReasonTypeEn']].drop_duplicates()

In [None]:
# Distinct parties, people and roles per end-reason pair, most roles first.
# Missing values are replaced by the string 'na' so they form their own group.
df.fillna('na').groupby(['EndReasonTypeEn', 'EndReasonEn'])[['PartyEn', 'PersonId', 'PersonRoleId']].nunique() \
  .sort_values('PersonRoleId', ascending=False)

In [None]:
# Look up the rows carrying this specific free-text end reason.
df[df['EndReasonEn'] == 'Resigned between May 30, 1930 and September 9,1930.']

In [None]:
# row = df.loc[107319]
# row

In [None]:
# row = 107319
# df.loc[row, 'NotesEn'] = df.loc[row, 'EndReasonEn']
# df.loc[row, 'EndReasonEn'] = None

In [None]:
# Non-null count vs distinct count of PersonRoleId; equal numbers mean no id is repeated.
df['PersonRoleId'].count(), df['PersonRoleId'].nunique()

In [None]:
df[:50]

In [None]:
# The same first 50 rows, re-sorted by name, start date and role id.
df[:50].sort_values(['LastName', 'UsedFirstName', 'StartDate', 'PersonRoleId'])

In [None]:
len(df)

In [None]:
# Write the current df to person_roles.csv (row index not written).
path = os.path.join(people_dir, 'person_roles.csv')
df.to_csv(path, index=False, encoding='utf8')

In [None]:
def extract_roles(role_type):
    """Flatten one role list of every person record into a cleaned DataFrame.

    role_type: key of the person record holding the list of roles to extract
        (the notebook calls this with 'ProvincialExperience',
        'FederalExperience', 'MunicipalExperience', 'MilitaryExperience'
        and 'Education').

    Each role dict becomes one row, prefixed with the person's PersonId,
    LastName and UsedFirstName. The rows are sorted, indexed by whichever of
    PersonRoleId / PersonId are present, passed through `cleanup`, and
    de-duplicated.

    Side effects: the module-level names `rows` (list of row dicts) and `df`
    (the returned frame) are overwritten; later notebook cells read them.

    Returns the resulting DataFrame.
    """
    global rows
    global df

    person_cols = ['PersonId', 'LastName', 'UsedFirstName']

    rows = []
    for rec in person_recs():
        person = {col: rec['Person'][col] for col in person_cols}
        roles = rec[role_type]
        if roles:
            for role in roles:
                # Collapse the list of class dicts into a single '|'-separated string of names.
                classes = role.get('Classes')
                if classes is not None:
                    class_names = [c['RoleClassNameEn'] for c in classes]
                    if None in class_names:
                        # Report records with an unnamed class; unnamed entries are left out of the join.
                        print(person, "class names:", class_names)
                    role['Classes'] = '|'.join(filter(None, class_names))

                # MP info is a dict with keys OccupationTypeEn, OccupationTypeFr. Use the former.
                mp_info = role.get('MemberOfParliament')
                if mp_info is not None:
                    role['MemberOfParliament'] = mp_info['OccupationTypeEn']

                # Role keys win over person keys if the two ever collide.
                row = {**person, **role}
                rows.append(row)

    df = pd.DataFrame(rows)
    # since run on 2021-09-08, RoleId, PersonRoleId, and StartDate are not available for Education roles
    df = df.sort_values([col for col in ['LastName', 'UsedFirstName', 'PersonId', 'StartDate', 'GraduationYear', 'RoleId'] if col in df.columns])
    df = df.set_index([col for col in ['PersonRoleId', 'PersonId'] if col in df.columns])
    cleanup(df)
    # drop_duplicates() compares columns only and ignores the index, so two records with
    # different ids but equal column values were collapsed into one. Compare the id index
    # levels together with the columns so only true duplicates are removed (first one kept).
    df = df[~df.reset_index().duplicated().to_numpy()]
    return df

In [None]:
# Provincial roles of every person, indexed by the id columns chosen in extract_roles.
df = extract_roles('ProvincialExperience')
df.head()

In [None]:
# Non-missing values of the flattened MemberOfParliament column.
df['MemberOfParliament'].dropna().values

In [None]:
# The index is written too (no index=False) because it carries the id columns.
path = os.path.join(people_dir, 'provincial_experience.csv')
df.to_csv(path, encoding='utf8')
df.count()

In [None]:
df['Classes'].value_counts()

In [None]:
df

In [None]:
# Same extract-and-export step for each remaining role list; every cell overwrites df and path.
df = extract_roles('FederalExperience')
path = os.path.join(people_dir, 'federal_experience.csv')
df.to_csv(path, encoding='utf8')
df.count()

In [None]:
df.head()

In [None]:
df = extract_roles('MunicipalExperience')
path = os.path.join(people_dir, 'municipal_experience.csv')
df.to_csv(path, encoding='utf8')
df.count()

In [None]:
df = extract_roles('MilitaryExperience')
path = os.path.join(people_dir, 'military_experience.csv')
df.to_csv(path, encoding='utf8')
df.count()

In [None]:
df = extract_roles('Education')
path = os.path.join(people_dir, 'education.csv')
df.to_csv(path, encoding='utf8')
df.count()

In [None]:
# Inspect the result of the most recent extract_roles call.
df

In [None]:
# Python types that occur in each column (useful for spotting mixed-type columns).
{col:list(df[col].map(type).drop_duplicates()) for col in df.columns}

In [None]:
# `rows` is the module-level list that extract_roles fills on every call.
len(rows)

In [None]:
rows[-1]

In [None]:
df.count()

# Candidates
From https://lop.parl.ca/sites/ParlInfo/default/en_CA/ElectionsRidings/Elections

In [None]:
# Download the candidates list from the API.
# A timeout keeps a stalled connection from hanging the kernel, and raise_for_status()
# surfaces HTTP errors here instead of as a confusing JSON decoding failure.
r = requests.get(parl_api_url + '/Parliament/GetCandidates', headers=accept_json, timeout=60)
r.raise_for_status()
d = r.json()
len(d)

In [None]:
# One row per entry of the downloaded candidates list; count() shows non-null cells per column.
candidates = pd.DataFrame(d)
candidates.count()

In [None]:
# cleanup() edits the frame in place, so the counts below reflect the dropped columns.
cleanup(candidates)
candidates.count()

In [None]:
# Order by parliament, province, constituency, votes and name; show the last 20 rows.
candidates = candidates.sort_values(['ParliamentNumber', 'ProvinceEn', 'ConstituencyEn', 'Votes', 'DisplayName'])
candidates[-20:]

In [None]:
# Constituencies with ties
# Group the 'Elected' rows by election, constituency and vote count; a group with more than
# one name means several elected candidates share the same vote count there.
wins = candidates[candidates['ResultLongEn'] == 'Elected']
cols = ['ParliamentNumber', 'ElectionId', 'IsGeneral', 'ElectionDate', 'ProvinceEn', 'ConstituencyId', 'ConstituencyEn', 'Votes', 'ResultLongEn']
grp = wins.groupby(cols, as_index=False)[['DisplayName']].count()
ties = grp[grp['DisplayName'] > 1]
ties.sort_values('Votes', ascending=False)

In [None]:
# Spot-check one election/constituency pair.
wins[(wins['ElectionId'] == 527) & (wins['ConstituencyId'] == 6971)]

In [None]:
# NOTE(review): this writes under '../data/parliaments', while people_dir is '../ca/data/people' -- confirm the path.
candidates.to_csv('../data/parliaments/candidates.csv', index=False, encoding='utf8')

# Roles v2

In [None]:
# Load a JSON file stored in people_dir.
# NOTE(review): the file name suggests a saved search result for refiners 28-1, 28-2, 28-3 -- confirm.
with open(os.path.join(people_dir, 'parlinfo_28-1_28-2_28-3.json')) as f:
    people = json.load(f)

In [None]:
# Keys of the first loaded record.
person = people[0]
sorted(person.keys())

In [None]:
# Keys whose value is a nested list or dict.
[k for k, v in person.items() if type(v) in [list, dict]]

In [None]:
# Build one flat dict per (person, role) from the per-person files returned by person_recs().
person_cols = ['PersonId', 'LastName', 'UsedFirstName']
rows = []
for rec in person_recs():
    person = rec['Person']
    person_props = {col: person[col] for col in person_cols}
    for role in person['Roles']:
        # Keep scalar, non-French role fields only: '...Fr' keys and nested lists/dicts are skipped.
        role_props = {k: v for k, v in role.items() if not (k.endswith('Fr') or type(v) in [list, dict])}
        row = {**person_props, **role_props}
        rows.append(row)
len(rows)

In [None]:
# `rec` is the last record left over from the loop above.
sorted(rec.keys())

In [None]:
sorted(rec['Person'].keys())

In [None]:
rows[0]

In [None]:
sorted(rows[0].keys())

In [None]:
# cleanup() edits df in place before the per-column non-null counts are shown.
df = pd.DataFrame(rows)
cleanup(df)
df.count()

In [None]:
df.sample(25).sort_values(['LastName', 'UsedFirstName', 'StartDate'])

In [None]:
filename = os.path.join(people_dir, 'roles_with_provincial.csv')
df.to_csv(filename, index=False, encoding='utf8')

In [None]:
!wc $filename

In [None]:
zip_filename = filename + '.zip'
!rm $zip_filename
!zip $zip_filename $filename

In [None]:
!wc $zipfilename

In [None]:
!ls -al $zip_filename