# Importing data and packages

In [1]:
import pandas as pd

In [2]:
#reading the data and resetting index
df = pd.read_csv('../data/DPC_speeches.csv')
df = df.reset_index().iloc[:,2:]

In [3]:
#reading Oli's list of FT members
ft_mem = pd.read_csv('../data/fm.csv')

In [4]:
#looking at the data
df.head()

Unnamed: 0,Tale,Starttid,Sluttid,Navn,Rolle,Tekst
0,,2011-09-30T13:00:16.860,2011-09-30T13:00:28.140,,midlertidig formand,Mødet er åbnet.
1,,2011-09-30T13:55:01.640,2011-09-30T13:55:12.063,,midlertidig formand,Så er mødet genoptaget.
2,,2011-09-30T13:59:37.437,2011-09-30T13:59:56.640,,formand,Der er ikke mere at foretage i dette møde.Folk...
3,,2011-09-30T13:00:28.140,2011-09-30T13:00:52.110,,midlertidig formand,Forhandlingen er åbnet. Fru Karen Klint.
4,,2011-09-30T13:00:52.110,2011-09-30T13:05:42.060,Karen J. Klint,medlem,Tak for det. Det er jo dejligt at se så fyldt ...


In [5]:
#checking whether "Tale" is always None
df.Tale.unique()

array(['None'], dtype=object)

In [6]:
#excluding "Tale"
df = df[['Starttid', 'Sluttid', 'Navn', 'Rolle', 'Tekst']]

In [7]:
df.head(1)

Unnamed: 0,Starttid,Sluttid,Navn,Rolle,Tekst
0,2011-09-30T13:00:16.860,2011-09-30T13:00:28.140,,midlertidig formand,Mødet er åbnet.


In [8]:
#checking type of the start and end-time
type(df.iloc[0,1])

str

In [9]:
#describing some of the data in the Parliament Corpus
df.describe()

Unnamed: 0,Starttid,Sluttid,Navn,Rolle,Tekst
count,236966,236966,236966.0,236966,236966
unique,236966,236958,318.0,6,152763
top,2010-03-16T12:34:22.123,2010-04-22T17:17:55.517,,formand,Ordføreren.
freq,1,2,43200.0,122597,22926


# Adding parties to the politicians

In [10]:
#checking how many unique names in our dataset
xml = list(df.Navn.unique())
len(xml)

318

In [11]:
#Checking how many names in the ft dataset
ft = list(ft_mem.Spørger.unique())
ft = [x[0:-1] for x in ft] #removing whitespace in the end of every name
len(ft)

437

In [12]:
#making a list of the set values
set_ = list(set(xml) & set(ft))

In [13]:
#removing whitespace in the end of every name
ft_mem['Spørger'] = ft_mem['Spørger'].apply(lambda x: x[0:-1])

In [14]:
#filtering out the names that we are looking for which are matching in the list of FT members
match = ft_mem[ft_mem['Spørger'].isin(set_)]
match.head()

Unnamed: 0.1,Unnamed: 0,Parti,Spørger,Dato
2,2,ALT,Christian Juhl,1
5,5,ALT,Josephine Fock,26
10,10,ALT,Rasmus Prehn,1
15,15,ALT,Uffe Elbæk,47
17,17,DF,Alex Ahrendtsen,125


In [15]:
df.shape

(236966, 5)

In [16]:
#merging the two dataframes
df_m = df.merge(match, left_on = "Navn", right_on = "Spørger", how='left')

In [17]:
#looking only at specific columns
df_m = df_m[['Starttid', 'Sluttid', 'Navn', 'Rolle', 'Tekst', 'Parti']]

In [18]:
#only looking at relevant names
df_m = df_m[df_m.Navn!='None']

In [19]:
#checking which names does not have a party
df_m[df_m.isnull().Parti].Navn.unique()

array(['Henriette Kjær', 'Simon Emil Ammitzbøll', 'Peter Madsen',
       'Flemming Møller', 'Niels Høiby', 'Tage Leegaard', 'Helle Sjelle',
       'DF', 'Charlotte Sahl-Madsen', 'Per Bisgaard', 'Kurt Scheelsbeck',
       'Anna Kirsten Olesen', 'Erika Lorentsen', 'Peter Juel Jensen',
       'Jens Arne Hedegaard', 'Irene Simonsen', 'Malou Aamund',
       'Nadeem Farooq', 'Ida Damborg', 'Jacob Bjerregaard',
       'Jeppe Mikkelsen', 'Thor Möger Pedersen', 'Hans Vestager',
       'Bjarne Corydon', ', fg.', 'Ane Halsboe-Larsen', 'Mai Henriksen',
       'Marlene B. Lorentzen', 'Camilla Hersom', 'Christian Friis Bach',
       'Lone Loklindt', 'Anne Sina', 'Ulla Sandbæk',
       'Jørn Neergaard Larsen', 'Mie Bergmann', 'Mette Boye',
       'Karen Touborg', 'Linda Kristiansen', 'Kisser Franciska Lehnert',
       'Pernille Boye Koch', 'fg.', 'Annika Smith', 'Sanne Bjørn',
       'Peder Christensen', 'Rasmus Lynghøj', 'Knud Kristensen',
       'Henrik Rasmussen', 'Steen Konradsen', 'Rasmus Jarlov

In [20]:
ft

['Anders Stjernholm',
 'Carolina Magdalene Maier',
 'Christian Juhl',
 'Christian Poll',
 'Erik Christensen',
 'Josephine Fock',
 'Julius Graakjær Grantzau',
 'Mira Issa Bloch',
 'Pernille Schnoor',
 'Rasmus Nordqvist',
 'Rasmus Prehn',
 'René Gade',
 'Sikandar Siddique',
 'Susanne Zimmer',
 'Torsten Gejl',
 'Uffe Elbæk',
 'Aase D. Madsen',
 'Alex Ahrendtsen',
 'Anita Christensen',
 'Anita Knakkergaard',
 'Bent Bøgsted',
 'Birthe Skaarup',
 'Carsten Kudsk',
 'Christian Langballe',
 'Claus Kvist Hansen',
 'Colette L. Brix',
 'Dennis Flydtkjær',
 'Dorthe Ullemose',
 'Hans Kristian Skibby',
 'Henrik Brodersen',
 'Ib Poulsen',
 'Jacob Jensen',
 'Jan Erik Messmann',
 'Jan Rytkjær Callesen',
 'Jens Henrik Thulesen Dahl',
 'Jeppe Jakobsen',
 'Jesper Langballe',
 'Jørn Dohrmann',
 'Karin Nødgaard',
 'Karina Adsbøl',
 'Karina Due',
 'Karsten Lauritzen',
 'Kenneth Kristensen Berth',
 'Kim Christiansen',
 'Kristian Thulesen Dahl',
 'Liselott Blixt',
 'Marie Krarup',
 'Marlene Harpsøe',
 'Martin H

In [21]:
#We can always sort these away later on - for now we have some with and some without
df_m.head()

Unnamed: 0,Starttid,Sluttid,Navn,Rolle,Tekst,Parti
4,2011-09-30T13:00:52.110,2011-09-30T13:05:42.060,Karen J. Klint,medlem,Tak for det. Det er jo dejligt at se så fyldt ...,S
5,2011-09-30T13:05:42.060,2011-09-30T13:06:52.970,Karen J. Klint,medlem,Vedrørende valget i Grønland indstiller et eni...,S
11,2011-03-02T15:32:15.660,2011-03-02T15:32:46.380,Holger K. Nielsen,formand,Der er ikke mere at foretage i dette møde.Folk...,SF
12,2011-03-02T13:01:23.263,2011-03-02T13:03:10.263,John Dyrby Paulsen,medlem,Tak for det. Jeg står her med en rapport fra F...,S
14,2011-03-02T13:03:11.407,2011-03-02T13:04:19.743,Gitte Lillelund Bech,minister,Jeg takker for spørgsmålet. Hr. John Dyrby Pau...,V


# Inserting dates

In [22]:
#making dates into datetime
df_m.Starttid = pd.to_datetime(df_m.Starttid)
df_m.Sluttid = pd.to_datetime(df_m.Sluttid)

In [24]:
#creating a date columns
df_m['Dato'] = df_m.Starttid.dt.date

In [25]:
#and keeping start and endtime to a time variable
df_m['Starttid'] = df_m['Starttid'].dt.time
df_m['Sluttid'] = df_m['Sluttid'].dt.time

Ideas for features (apart from the ones we have) 
- weekday
- day of month
- month
- length of speech
- year

In [26]:
df_m.iloc[0,-3]

'Tak for det. Det er jo dejligt at se så fyldt en sal og publikumsbænkene. Det er også en højtidelig dag.Som formand for Udvalget til Prøvelse af Valgene kan jeg oplyse følgende om udvalgets arbejde, idet jeg samtidig skal henvise til den offentliggjorte betænkning og indstillingerne:Arbejdet har som sædvanlig vedrørt tre spørgsmål.For det første har det vedrørt spørgsmålet om gyldighed af valghandlingen, herunder valgets forberedelse, stemmeafgivelsen, optællingen af selve stemmesedlerne, vurderingen af dem samt valgbøgerne og disses førelse m.m.For det andet har det vedrørt spørgsmålet om godkendelse af ændringer af de af Indenrigs- og Sundhedsministeriets foretagne opgørelser og beregninger. For det tredje har det vedrørt spørgsmålet om godkendelse af de kandidater, der har opnået valg.Om gyldigheden af valghandlingen kan jeg sige, at vi i udvalget har gennemgået Indenrigs- og Sundhedsministeriets redegørelse, der indeholder de bemærkninger, som gennemgangen af valgbogsudskrifterne 

# Starting text-processing

### Removing simple things using gensim

In [27]:
import nltk

In [28]:
#nltk.download('stopwords')

In [31]:
#dk_sw

In [35]:
from gensim import utils
import gensim.parsing.preprocessing as gsp

unable to import 'smart_open.gcs', disabling that module


In [37]:
filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric
          ]

In [38]:
def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [39]:
#apply clean-text function to all the text
df_m['Tekst'] = df_m['Tekst'].apply(lambda x: clean_text(x))

### Tokenizing text

In [53]:
#looking at the same line of text as above
#df_m.iloc[0,-3] #something definitily changed - good 

In [46]:
from nltk.tokenize import word_tokenize

In [47]:
#tokenizing all text
df_m['t_text'] = df_m['Tekst'].apply(lambda x: word_tokenize(x))

### Removing stopwords

In [55]:
dk_sw = nltk.corpus.stopwords.words('danish')

In [57]:
ttt = df_m.iloc[0,-1]

In [61]:
df_m['t_tekst_clean'] = df_m['t_text'].apply(lambda x: [word for word in x if not word in dk_sw])

### Counting most frequent words

In [49]:
from collections import Counter

In [52]:
#counting words from the tokenized words
aggregate_counter = Counter()
for row_index,row in df_m.iterrows():
    if row_index%1000==0:
        print(row_index)
    c = Counter(row['t_text'])
    aggregate_counter += c

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
14000


KeyboardInterrupt: 

In [None]:
#defining common words
common_words = [word[0] for word in aggregate_counter.most_common(50)]
common_words_counts = [word[1] for word in aggregate_counter.most_common(50)]

In [None]:
# Defining function to count it
import matplotlib.pyplot as plt
import seaborn as sns

def barplot(words, words_counts, title):
    fig = plt.figure(figsize=(18,6))
    bar_plot = sns.barplot(x=words, y=words_counts)
    for item in bar_plot.get_xticklabels():
        item.set_rotation(90)
    plt.title(title)
    plt.show()

In [None]:
#plotting the most used words in folketinget
barplot(words=common_words, words_counts=common_words_counts, title='Most Frequent Words used in Folketinget')

# Classifying the text

In [62]:
df_m

Unnamed: 0,Starttid,Sluttid,Navn,Rolle,Tekst,Parti,Dato,t_text,t_tekst_clean
4,13:00:52.110000,13:05:42.060000,Karen J. Klint,medlem,tak for det det er jo dejligt at se så fyldt e...,S,2011-09-30,"[tak, for, det, det, er, jo, dejligt, at, se, ...","[tak, dejligt, se, så, fyldt, sal, publikumsbæ..."
5,13:05:42.060000,13:06:52.970000,Karen J. Klint,medlem,vedrørende valget i grønland indstiller et eni...,S,2011-09-30,"[vedrørende, valget, i, grønland, indstiller, ...","[vedrørende, valget, grønland, indstiller, eni..."
11,15:32:15.660000,15:32:46.380000,Holger K. Nielsen,formand,der er ikke mere at foretage i dette møde folk...,SF,2011-03-02,"[der, er, ikke, mere, at, foretage, i, dette, ...","[mere, foretage, møde, folketingets, næste, mø..."
12,13:01:23.263000,13:03:10.263000,John Dyrby Paulsen,medlem,tak for det jeg står her med en rapport fra fo...,S,2011-03-02,"[tak, for, det, jeg, står, her, med, en, rappo...","[tak, står, rapport, forsvarets, færdselssikke..."
14,13:03:11.407000,13:04:19.743000,Gitte Lillelund Bech,minister,jeg takker for spørgsmålet hr john dyrby pauls...,V,2011-03-02,"[jeg, takker, for, spørgsmålet, hr, john, dyrb...","[takker, spørgsmålet, hr, john, dyrby, paulsen..."
16,13:04:20.913000,13:05:18.180000,John Dyrby Paulsen,medlem,tak for svaret det er jeg jo enig med minister...,S,2011-03-02,"[tak, for, svaret, det, er, jeg, jo, enig, med...","[tak, svaret, enig, ministeren, svært, uenig, ..."
18,13:05:19.837000,13:06:05.883000,Gitte Lillelund Bech,minister,nu har jeg altså ikke haft mulighed for at læs...,V,2011-03-02,"[nu, har, jeg, altså, ikke, haft, mulighed, fo...","[altså, haft, mulighed, læse, rapporten, sige,..."
20,13:06:07.727000,13:07:07.463000,John Dyrby Paulsen,medlem,det skal være ministeren undskyldt der er sikk...,S,2011-03-02,"[det, skal, være, ministeren, undskyldt, der, ...","[ministeren, undskyldt, sikkert, masse, andet,..."
22,13:07:08.807000,13:07:51.697000,Gitte Lillelund Bech,minister,jeg betvivler sådan set ikke den konklusion so...,V,2011-03-02,"[jeg, betvivler, sådan, set, ikke, den, konklu...","[betvivler, set, konklusion, hr, john, dyrby, ..."
25,13:07:59.663000,13:08:55.447000,Per Dalgaard,medlem,jeg vil spørge om pirateri og jeg skal nok lad...,DF,2011-03-02,"[jeg, vil, spørge, om, pirateri, og, jeg, skal...","[spørge, pirateri, nok, lade, komme, utrolig, ..."


In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

In [81]:
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None) 

In [82]:
docs = list(df_m.t_tekst_clean)

In [84]:
tfidf.fit(docs)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2',
                preprocessor=<function dummy_fun at 0x1a8e801d90>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern=None,
                tokenizer=<function dummy_fun at 0x1a8e801d90>, use_idf=True,
                vocabulary=None)

In [85]:
#fitting the tdfif vectorizer to our documents
vector = tfidf.transform(docs)

In [88]:
#printing the number of different words
len(tfidf.get_feature_names())

152168

In [99]:
#checking the shape
vector.shape

(209596, 152168)

It is as supposed. It has the exact amount of lines as we have speeches and cols as we have words. 

In [104]:
X = vector

In [127]:
def kmeans_clusters(k_, document_tfidf_matrix):
    true_k = k_
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(document_tfidf_matrix)
    
    #
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = tfidf.get_feature_names()
    
    #creating list of the clusters
    list_ = []
    for i in range(true_k):
        list_.append("Cluster %d" % i)
        
    #creating empty df
    df_ = pd.DataFrame(columns = list_)
    
    for i in range(true_k):
        #cerating empty list
        lst = []
        for ind in order_centroids[i, :10]:
            #append top 5 words to list
            lst.append(terms[ind])
        #insert lst into pandas on the correct column
        df_[list_[i]] = pd.Series(lst)
        
    #printing to csv
    df_.to_csv('../data/clusters/clusters_'+str(true_k)+'.csv')

#### Running function in iterations

In [132]:
n_clusters = list(range(5,26,5))

In [133]:
n_clusters

[5, 10, 15, 20, 25]

In [140]:
for k in n_clusters:
    print(k)
    kmeans_clusters(k,X)

5
10
15
20
25


#### Running everything manually

In [105]:
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [114]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names()

In [118]:
list_ = []
for i in range(true_k):
    list_.append("Cluster %d" % i)

In [120]:
df = pd.DataFrame(columns = list_)

In [121]:
for i in range(true_k):
    #print("Cluster %d:" % i),
    lst = []
    for ind in order_centroids[i, :5]:
        #print(' %s' % terms[ind])
        lst.append(terms[ind])
    df[list_[i]] = pd.Series(lst)

In [125]:
#df.to_csv('../data/clusters/test_'+str(true_k)+'.csv')

In [124]:
#print to check
p = 'no'

if p=='yes':
    for i in range(true_k):
        print("Cluster %d:" % i),
        for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind])