# Preprocessing transcripts

In [84]:
#%pip install pandarallel

In [17]:
# Imports
import pickle
import pandas as pd
import numpy as np
import re
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [18]:
# Load transcripts from pickle
raw_transcripts = (
    pd.read_pickle('data/clean/subset_party_imputed_v3_2015_version.p')
        .query('source == "parliament"')
        .drop(columns = ['index'])
    )  # TODO: change dataset after retrieval pipeline is cleaned up

In [19]:
print(raw_transcripts.shape)
raw_transcripts.head()

(159723, 6)


Unnamed: 0,doc,source,start_time,full_name,level,party
12,Tak. Danmarks kolonihistorie handler om militæ...,parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU
13,"Jeg mener, at det er personen, der kan snakke ...",parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU
14,"Det handler jo ikke om, hvad jeg synes. Det ha...",parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU
15,"Jeg ved ikke, om det – når man snakker om at b...",parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU
16,"Det er ikke noget, jeg bare mener; det er noge...",parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU


In [20]:
raw_transcripts['start_time'].sort_values()

42484     2015-07-02
42483     2015-07-02
42482     2015-07-02
146643    2015-07-03
235398    2015-07-03
             ...    
234429    2022-10-06
234428    2022-10-06
234427    2022-10-06
234443    2022-10-06
475590    2022-10-06
Name: start_time, Length: 159723, dtype: object

In [21]:
query_string = 'Konservative Folkeparti'

len([doc for doc in raw_transcripts['doc'] if query_string in doc])#[:30]

3151

In [22]:
def remove_phrases(doc, harshness = 'low'):
    """
    Remove phrases from a lowercased document.
    """
    
    if harshness == 'high':
        # Remove names – requires uppercase and is thus done separately!
        name_pattern = r'(?:[Hh]r\. |[Ff]ru |[Ff]røken )[A-ZÆØÅ][a-zæøå]+(?:-[A-ZÆØÅ][a-zæøå]+)?(?:\s[A-ZÆØÅ][a-zæøå]+)*(?:-[A-ZÆØÅ][a-zæøå]+)?(?:\s[A-ZÆØÅ][a-zæøå]+)*(?:-[A-ZÆØÅ][a-zæøå]+)? '
        doc = re.sub(name_pattern, '', doc)

        # Remove party names
        party_pattern = r'Socialdemokrat[ietsrnes]*|Venstre[s]*|Dansk Folkeparti[s]*|Enhedslisten[s]*|SF[s]*|Socialistiske|Konservative Folkeparti[s]*|Konservative[s]*|Radikale Venstre[s]*|De Radikale[s]*|Radikale[s]*|Nye Borgerlige[s]*|Liberal Alliance[s]*|Alternativet[s]*|Frie Grønne[s]*'
        doc = re.sub(party_pattern, '', doc)

    doc = doc.lower()

    # Remove procedural thank yous of different kinds
    politeness_pattern = r'^[\w\s,]*tak[\w\s,]*. |^tak|tak for ordet'
    doc = re.sub(politeness_pattern, '', doc)

    # remove superfluous whitespace
    doc = re.sub(r'\s+', ' ', doc)
    
    return doc


In [23]:
test_doc = 'Tak, fru formand. Spørgsmålet fra SFs formand er ret enkelt. Liberal Alliance er her også. Er fru Pernille Vermund enig i at indføre brugerbetaling? Det er vel De Radikales politik?'
test_doc = 'SF, de Radikale, Konservatives ordfører. Liberal Alliance, Liberal Alliance. Liberal Alliance. Tak for ordet.'

In [24]:
raw_transcripts

Unnamed: 0,doc,source,start_time,full_name,level,party
12,Tak. Danmarks kolonihistorie handler om militæ...,parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU
13,"Jeg mener, at det er personen, der kan snakke ...",parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU
14,"Det handler jo ikke om, hvad jeg synes. Det ha...",parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU
15,"Jeg ved ikke, om det – når man snakker om at b...",parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU
16,"Det er ikke noget, jeg bare mener; det er noge...",parliament,2019-10-03,Aki-Matilda Høegh-Dam,,SIU
...,...,...,...,...,...,...
475586,"Det burde jo så være kulturministeren, jeg sti...",parliament,2022-06-09,Zenia Stampe,,RV
475587,"Jeg er meget glad for, at Det Konservative Fol...",parliament,2022-06-09,Zenia Stampe,,RV
475588,"Nej, man kan ikke en til en sende en refunderi...",parliament,2022-06-09,Zenia Stampe,,RV
475589,Jeg vil gerne starte med at kvittere for det g...,parliament,2022-10-06,Zenia Stampe,,RV


In [25]:
# use remove_phrases on all documents in raw_transcripts
test = raw_transcripts['doc'].parallel_apply(lambda x: remove_phrases(x, harshness='high'))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=79862), Label(value='0 / 79862')))…

In [26]:
len([d for d in test if 'radikale venstre' in d])

0

In [27]:
remove_phrases(test_doc, harshness='high')

', de , ordfører. , . . .'

In [28]:
import re
import string
import nltk
for dependency in ['punkt', 'wordnet', 'omw-1.4', 'stopwords', 'averaged_perceptron_tagger']:
    nltk.download(dependency)

def preproc_docs(text, harshness = 'low'):
    #Lowercasing words
    text = text.lower()
    
    #Removing HTML tag
    text = re.sub(r'&amp', '', text)

    #Replace "&" with "and"
    text = re.sub(r'&','and', text)
    
    #Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation.replace('-',''))) #Taking hyphens out of punctuation to remove
    text = re.sub(r' - ','', text) #removing dash lines bounded by whitespace (and therefore not part of a word)
    text = re.sub(r'…', '', text)
    text = re.sub(r'[â€˜â€™â€œâ€â€”]','',text) #removing punctuation that is not captured by string.punctuation

    #Removing numbers
    text = re.sub(r'[0-9.]','', text)

    # Removing idiosynchratic characters in our data
    text = re.sub(r'-\n|\n-|\na-|\nb-|â€“|Â«|--|’', '', text)
    text = re.sub(r'- ', ' ', text)

    #Removing separators and superfluous whitespace
    text = text.strip()
    text = re.sub(r' +',' ',text)

    return text

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [29]:
# function to get number of words in a string
def get_word_count(text):
    return len(text.split())

def preproc_pipeline(raw_transcripts, harshness = 'low'):
    transcripts = raw_transcripts.copy()

    # Minimal preproc: lowercase, remove punctuation, remove numbers, remove separators, remove superfluous whitespace
    if harshness == 'low':
        transcripts['doc'] = transcripts['doc'].parallel_apply(preproc_docs)

    # Moderate preproc: minimal preproc + remove thank yous/procedural fluff + remove very short docs
    if harshness == 'moderate':
        transcripts['doc'] = transcripts['doc'].parallel_apply(lambda x: remove_phrases(x, harshness='low'))
        transcripts['doc'] = transcripts['doc'].parallel_apply(lambda x: preproc_docs(x))

        transcripts['word_count'] = transcripts['doc'].parallel_apply(lambda x: get_word_count(x))
        transcripts = transcripts.loc[transcripts.word_count > 10].reset_index(drop=True)

    # Harsh preproc: moderate preproc + remove names, remove party names
    if harshness == 'high':
        transcripts['doc'] = transcripts['doc'].parallel_apply(lambda x: remove_phrases(x, harshness='high'))
        transcripts['doc'] = transcripts['doc'].parallel_apply(lambda x: preproc_docs(x))

        transcripts['word_count'] = transcripts['doc'].parallel_apply(lambda x: get_word_count(x))
        transcripts = transcripts.loc[transcripts.word_count > 10].reset_index(drop=True)

    return transcripts

In [30]:
# # Minimal preproc
# transcripts_low = preproc_pipeline(raw_transcripts, harshness = 'low')

# # Moderate preproc
# transcripts_moderate = preproc_pipeline(raw_transcripts, harshness = 'moderate')

# Harsh preproc
transcripts_high = preproc_pipeline(raw_transcripts, harshness = 'high')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=79862), Label(value='0 / 79862')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=79862), Label(value='0 / 79862')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=79862), Label(value='0 / 79862')))…

In [32]:
print(len(transcripts_low))
print(len(transcripts_moderate))
print(len(transcripts_high))

158386


In [119]:
# increase pandas print length 
pd.set_option('display.max_colwidth', 100)

In [34]:
# transcripts_low.to_pickle('data/clean/preprocessed_docs_2015_low.p')
# transcripts_moderate.to_pickle('data/clean/preprocessed_docs_2015_moderate.p')
# transcripts_high.to_pickle('data/clean/preprocessed_docs_2015_high.p')

In [36]:
test = pd.read_pickle('data/clean/preprocessed_docs_2015_high_KF.p')

test.word_count.describe()

count    158386.000000
mean        197.487695
std         224.246109
min          11.000000
25%          95.000000
50%         136.000000
75%         199.000000
max        6288.000000
Name: word_count, dtype: float64