In [240]:
from utils.parliament_utils import *
import numpy as np

## Collection

In [188]:
# Remote FTP location of the session listing and the local file it is stored in.
url = 'ftp://oda.ft.dk/ODAXML/Referat/samling'
out_path = 'data/raw/parliament/legislative_sessions_dir.xml'
# Project helper (not defined in this notebook) - presumably fetches `url` and
# writes the response to `out_path`; confirm in utils.parliament_utils.
save_xml_from_ftp(url, out_path)

In [189]:
# Load the saved listing as a list of raw lines (newline characters included).
with open(out_path, 'r') as listing_file:
    sessions_dir = list(listing_file)

In [190]:
# The session id is the final field of each listing line. Take the last
# whitespace-delimited token rather than splitting on an exact run of ten spaces,
# so the parse survives different column padding and CRLF line endings, and skip
# blank lines so they cannot produce empty ids.
parliamentary_sessions = [line.split()[-1] for line in sessions_dir if line.strip()]

In [270]:
# Inspect the parsed session identifiers.
parliamentary_sessions

['20091',
 '20101',
 '20102',
 '20111',
 '20121',
 '20131',
 '20141',
 '20142',
 '20151',
 '20161',
 '20171',
 '20181',
 '20182',
 '20191',
 '20201',
 '20211']

In [200]:
# Project helper (not defined in this notebook) - presumably stores one meeting
# index per session and returns the local paths; confirm in utils.parliament_utils.
filenames = list_meetings_in_sessions(parliamentary_sessions)

  0%|          | 0/16 [00:00<?, ?it/s]

In [240]:
# Peek at the first three entries of `filenames`.
filenames[:3]

['data/raw/parliament/session_docs/session_20091.xml',
 'data/raw/parliament/session_docs/session_20101.xml',
 'data/raw/parliament/session_docs/session_20102.xml']

In [None]:
from tqdm.notebook import tqdm

In [255]:
# Project helper (not defined in this notebook). The result is indexed by session
# id in later cells - presumably a dict of session id -> transcript filenames.
all_transcript_filenames = extract_transcript_filenames(filenames)

  0%|          | 0/16 [00:00<?, ?it/s]

In [256]:
# Spot-check the transcript filenames recorded for session 20102.
all_transcript_filenames['20102']

['20102_M1_helemoedet.xml', '20102_M2_helemoedet.xml']

In [267]:
# Sessions whose transcripts are downloaded in the cells that follow.
subset_list = ['20181', '20182', '20191', '20201', '20211']

In [268]:
# Restrict the filename lookup to the selected sessions, keeping their order.
subset_dict = {
    session: all_transcript_filenames[session]
    for session in subset_list
}

In [269]:
# Project helper (not defined in this notebook) - presumably downloads every
# transcript listed in `subset_dict`; the recorded output shows one progress bar
# per session.
save_transcripts_from_ftp(subset_dict)


Collecting transcripts from session 20181...


  0%|          | 0/93 [00:00<?, ?it/s]


Collecting transcripts from session 20182...


  0%|          | 0/8 [00:00<?, ?it/s]


Collecting transcripts from session 20191...


  0%|          | 0/151 [00:00<?, ?it/s]


Collecting transcripts from session 20201...


  0%|          | 0/138 [00:00<?, ?it/s]


Collecting transcripts from session 20211...


  0%|          | 0/129 [00:00<?, ?it/s]

## Parsing XML

In [28]:
# NOTE(review): `xml_file` is not assigned in any visible cell, so this fails on a
# fresh kernel - define the transcript path explicitly before this call.
xml_str = xml_to_str(xml_file)

In [30]:
# Project helper; the result is fed to pd.DataFrame in the next cell, so it is
# presumably a list of per-speech records - confirm in utils.parliament_utils.
xml_dicts = xml_str_to_rows(xml_str)

In [43]:
# Show the `text` field of the record labelled 4 as a parsing sanity check.
pd.DataFrame(xml_dicts).loc[4, 'text']

'Jeg tager ordet, også, fordi vi, som der står i vores betænkningsbidrag, faktisk var lidt overraskede over, at man blander tingene sammen, for det synes vi man gør her. Vi er meget indstillet på at hjælpe, hvor vi kan, i forhold til ukrainerne. Vi er gået så vidt, at vi også stemmer for en særlov. Men vi var meget undrende, da det gik op for os – og i forhandlingen bagefter virkede det, som om også integrationsministeren var lidt overrasket over, at afghanere også kommer ind under den her ordning. Det er jo placeret i to ministerier, altså også i Beskæftigelsesministeriet, så måske har der ikke været en ordentlig koordinering imellem ministerierne. Derfor vil vi bede om, at forslaget bliver delt op, sådan at man kan stemme individuelt, og så vi kan give vores tilslutning til Ukraine, men ikke til de afghanske tolke.'

In [59]:
# Project helper - presumably parses every transcript in the first directory and
# writes the combined table to the CSV path given second.
# NOTE(review): the recorded run reports an IsADirectoryError for the
# `.ipynb_checkpoints` folder inside the input directory; consider excluding it.
all_xml_to_df('data/raw/parliament/xml_transcripts', 'data/clean/parliament/transcripts_v2.csv')

  0%|          | 0/1496 [00:00<?, ?it/s]

Whoops, IsADirectoryError encountered in data/raw/parliament/xml_transcripts/.ipynb_checkpoints!


In [246]:
# Load the cleaned transcript table written by the all_xml_to_df(...) call.
df_politicians = pd.read_csv('data/clean/parliament/transcripts_v2.csv')

In [247]:
# Display the loaded transcript table.
df_politicians

Unnamed: 0,first_name,last_name,group_name,role,start_time,end_time,text
0,Niels Helveg,Petersen,RV,aldersformanden,2009-10-06T12:00:32,2009-10-06T12:02:08,Mødet er åbnet. I henhold til grundloven er Fo...
1,Thor,Pedersen,V,formand,2009-10-06T12:02:08,2009-10-06T12:04:01,"Jeg vil gerne takke for den tillid, som Tinget..."
2,Lars Løkke,Rasmussen,,minister,2009-10-06T12:04:01,2009-10-06T12:50:39,Danmark er et godt land at leve i. Vi har bygg...
3,Thor,Pedersen,V,formand,2009-10-06T12:50:39,2009-10-06T12:51:10,Tak til statsministeren. Jeg kan oplyse Folket...
4,MødeSlut,MødeSlut,MødeSlut,MødeSlut,2009-10-06T12:51:10,,Mødet er hævet. (Kl. 12:51).
...,...,...,...,...,...,...,...
633639,Rasmus Helveg,Petersen,RV,formand,2022-09-15T12:23:24,2022-09-15T12:23:25,Værsgo.
633640,Christian Rabjerg,Madsen,,minister,2022-09-15T12:23:25,2022-09-15T12:24:14,"Jeg har en klar forventning om, at vi kan genn..."
633641,Rasmus Helveg,Petersen,RV,formand,2022-09-15T12:24:14,2022-09-15T12:24:28,Vi siger mange tak til indenrigs- og boligmini...
633642,Rasmus Helveg,Petersen,RV,formand,2022-09-15T12:24:28,2022-09-15T12:24:46,Der er ikke mere at foretage i dette møde. Fol...


In [248]:
# Keep only speeches by regular speakers: drop the end-of-meeting marker rows,
# pauses, and contributions by the chair ('formand' / 'aldersformanden').
# The mask is built from `df_politicians`, so it must also be applied to
# `df_politicians`; the previous `test` name is undefined on a fresh kernel.
pol_filtered = df_politicians.loc[
    (df_politicians['first_name'] != 'MødeSlut') &
    (df_politicians['role'] != 'Pause') &
    (df_politicians['role'] != 'formand') &
    (df_politicians['role'] != 'aldersformanden')
].dropna(
    subset=['first_name', 'last_name', 'group_name', 'role'], # these are questions that have been rescinded
    how='all'
).copy()

In [249]:
# Build "<first> <last>"; the result is missing whenever either part is missing.
pol_filtered['full_name'] = pol_filtered['first_name'].str.cat(
    pol_filtered['last_name'], sep=' '
)

In [250]:
# Number of rows per speaker; dropna=False keeps missing names as their own bucket.
pol_filtered['full_name'].value_counts(dropna = False)

Ole Birk Olesen          4055
Finn Sørensen            3359
Mette Frederiksen        3285
Christian Juhl           3258
Inger Støjberg           3035
                         ... 
Jens Arne Hedegaard         1
Niels Høiby                 1
Peder Christensen           1
Sisse Marie Welling         1
Anders G. Christensen       1
Name: full_name, Length: 484, dtype: int64

In [220]:
# Count, per speaker, the rows that have at least one missing field.
# Uses `pol_filtered` (the filtered frame built above); `test_filtered` is not
# defined anywhere in the notebook and fails on a fresh kernel.
pol_filtered.loc[pol_filtered.isna().any(axis=1), 'full_name'].value_counts()

Mette Frederiksen           2696
Inger Støjberg              2348
Morten Bødskov              1900
Nick Hækkerup               1771
Lars Løkke Rasmussen        1726
                            ... 
Christian Rabjerg Madsen      37
Jonas Dahl                    35
Søren Gade                    33
Kaare Dybvad                  32
Connie Hedegaard              23
Name: full_name, Length: 98, dtype: int64

In [221]:
# List the distinct (speaker, party) pairs to see who appears under several groups.
# Uses `pol_filtered` (the filtered frame built above); `test_filtered` is not
# defined anywhere in the notebook and fails on a fresh kernel.
pol_filtered.loc[:, ['full_name', 'group_name']].drop_duplicates().sort_values('full_name').head(50)

Unnamed: 0,full_name,group_name
632532,Aaja Chemnitz,IA
269614,Aaja Chemnitz Larsen,IA
605577,Abbas Razvi,RV
463095,Aki-Matilda Høegh-Dam,SIU
269632,Aleqa Hammond,SIU
327209,Aleqa Hammond,UFG
415590,Aleqa Hammond,NQ
106442,Alex Ahrendtsen,DF
461870,Alex Vanopslagh,LA
587530,Alexander Grandt,S


In [222]:
# Party affiliation has to be imputed for ministers and other special cases: when the
# speaker appears as, for instance, Minister of X, the record carries that title and
# the party field is left empty.
#
# Since a given cabinet may consist of more than one party, the most fail-safe procedure
# would be to impute each person's party from their most recent record with a known party
# prior to the appointment as minister.
#
# NOTE: impute_party_membership does not yet apply that time-based rule; it fills in
# every party the speaker has been recorded under, comma-separated when there are several.

In [225]:
def impute_party_membership(df, politician_list):
    '''
    Fill in missing `group_name` values for the given politicians.

    For every name in `politician_list`, the distinct non-missing `group_name`
    values found on that person's rows are collected (in order of first
    appearance). If exactly one party is found, it is written to the person's
    rows where `group_name` is missing. Otherwise (no party, or several) a
    notice is printed and the comma-joined list of parties is written instead,
    which is the empty string when no party is known.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `full_name` and `group_name`. Not modified.
    politician_list : iterable of str
        Values of `full_name` to process.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the imputed `group_name` values.
    '''

    imputed = df.copy()

    for name in tqdm(politician_list):

        is_speaker = imputed['full_name'] == name
        has_party = imputed['group_name'].notna()

        # Every party this person has been recorded under, without repeats
        known_parties = (
            imputed.loc[is_speaker & has_party, 'group_name']
            .drop_duplicates()
            .tolist()
        )

        if len(known_parties) == 1:
            party_imputation = known_parties[0]
        else:
            # Zero or several parties: store the whole list as one string
            party_imputation = ','.join(known_parties)
            print(f'{name} has been a member of multiple parties,\nor party affiliation could not be determined: {party_imputation}.\n')

        # Only rows without a party are overwritten
        imputed.loc[is_speaker & ~has_party, ['group_name']] = party_imputation

    return imputed

In [226]:
# Distinct speakers that have at least one row without a party.
# Uses `pol_filtered` (the filtered frame built above); `test_filtered` is not
# defined anywhere in the notebook and fails on a fresh kernel.
unknown_party_politicians = (
    pol_filtered
        .loc[pol_filtered['group_name'].isna(), 'full_name']
        .drop_duplicates()
        .tolist()
)

In [227]:
# Peek at the first five speakers with a missing party.
unknown_party_politicians[:5]

['Lars Løkke Rasmussen',
 'Eva Kjer Hansen',
 'Claus Hjort Frederiksen',
 'Troels Lund Poulsen',
 'Karen Ellemann']

In [228]:
# Impute parties on the filtered frame. `pol_filtered` is the frame built above;
# `test_filtered` is not defined anywhere in the notebook and fails on a fresh kernel.
test_imp = impute_party_membership(pol_filtered, unknown_party_politicians)

  0%|          | 0/98 [00:00<?, ?it/s]

Lars Løkke Rasmussen has been a member of multiple parties,
or party affiliation could not be determined: V,UFG,M.

Inger Støjberg has been a member of multiple parties,
or party affiliation could not be determined: V,UFG.

Connie Hedegaard has been a member of multiple parties,
or party affiliation could not be determined: .

Charlotte Sahl-Madsen has been a member of multiple parties,
or party affiliation could not be determined: .

Thor Möger Pedersen has been a member of multiple parties,
or party affiliation could not be determined: .

Astrid Krag has been a member of multiple parties,
or party affiliation could not be determined: SF,S.

Uffe Elbæk has been a member of multiple parties,
or party affiliation could not be determined: RV,UFG,ALT,FG.

Ida Auken has been a member of multiple parties,
or party affiliation could not be determined: SF,RV,S.

Jørn Neergaard Larsen has been a member of multiple parties,
or party affiliation could not be determined: .

Simon Emil Ammitzbøll-

In [230]:
# Speakers whose party is still the empty-string placeholder after imputation.
test_imp.loc[test_imp['group_name'].eq(''), 'full_name'].unique().tolist()

['Connie Hedegaard',
 'Charlotte Sahl-Madsen',
 'Thor Möger Pedersen',
 'Jørn Neergaard Larsen',
 'Peter Hummelgaard',
 'Kaare Dybvad Bek',
 'Joy Mogensen']

In [233]:
# Hand-curated party for each speaker that never appears with a party in the data,
# written party-first and flattened to a name -> party lookup.
manual_party_map = {
    name: party
    for party, names in (
        ('KF', ('Connie Hedegaard', 'Charlotte Sahl-Madsen')),
        ('SF', ('Thor Möger Pedersen',)),
        ('V', ('Jørn Neergaard Larsen',)),
        ('S', ('Peter Hummelgaard', 'Kaare Dybvad Bek', 'Joy Mogensen')),
    )
    for name in names
}

In [241]:
# Turn the empty-string placeholder back into a missing value.
test_imp['group_name'] = test_imp['group_name'].mask(test_imp['group_name'].eq(''))

In [242]:
# Keep existing parties; where the party is missing, look the speaker up in the
# manual map (speakers absent from the map stay missing).
test_imp['group_name'] = test_imp['group_name'].where(
    test_imp['group_name'].notna(),
    test_imp['full_name'].map(manual_party_map),
)

In [245]:
# Inspect the rows recorded under party 'M'.
test_imp[test_imp['group_name'].eq('M')]

Unnamed: 0,first_name,last_name,group_name,role,start_time,end_time,text,full_name
629667,Lars Løkke,Rasmussen,M,medlem,2022-05-23T19:26:02,2022-05-23T19:36:32,"Tak for det. Ja, det her er jo så Folketingets...",Lars Løkke Rasmussen
629719,Lars Løkke,Rasmussen,M,medlem,2022-05-23T20:05:47,2022-05-23T20:06:52,"Tak for det. Tak for en som altid god tale, og...",Lars Løkke Rasmussen
629723,Lars Løkke,Rasmussen,M,medlem,2022-05-23T20:08:06,2022-05-23T20:08:56,Det er jo sådan set præcis den samme bekymring...,Lars Løkke Rasmussen
629818,Lars Løkke,Rasmussen,M,medlem,2022-05-23T20:57:38,2022-05-23T20:58:43,Tak for det. Jeg vil også gerne benytte lejlig...,Lars Løkke Rasmussen
629957,Lars Løkke,Rasmussen,M,medlem,2022-05-23T22:35:45,2022-05-23T22:37:03,Tak for det. Statsministeren bragte FE-sagen o...,Lars Løkke Rasmussen
629961,Lars Løkke,Rasmussen,M,medlem,2022-05-23T22:38:05,2022-05-23T22:39:10,"Jamen det er jeg sådan set enig i, og det er j...",Lars Løkke Rasmussen
633211,Lars Løkke,Rasmussen,M,medlem,2022-09-08T16:43:11,2022-09-08T16:44:23,"Tak for den meget, meget effektive debat. Jeg ...",Lars Løkke Rasmussen
633215,Lars Løkke,Rasmussen,M,medlem,2022-09-08T16:45:30,2022-09-08T16:46:35,"Jamen jeg har godt set, at der er en pulje – e...",Lars Løkke Rasmussen
