In [0]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download("punkt") # Download if not already downloaded (needed for sentence tockenizeing).
import pickle

# Load all necesary files and such.
#reading the data and resetting index.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/DPC_speeches.csv', delimiter=",")
df = df.reset_index().iloc[:,2:]

#reading Oli's list of FT members
ft_mem = pd.read_csv('/content/drive/My Drive/Colab Notebooks/fm3.csv')
#df = df[~((df['Navn'] == ', fg.'))] # Delete observation that doesnt make sense.
df = df[~df.Rolle.isin([', fg.'])] # Delete observation that doesnt make sense.
# read in stop words for danish as a list.
stopord_ = pd.read_csv("/content/drive/My Drive/Colab Notebooks/stopord.txt")    
stopord = []
for i in stopord_["ad"]:
    stopord.append(i)

df = df[['Starttid', 'Sluttid', 'Navn', 'Rolle', 'Tekst']]
df.head(1)

# Add parties to speakers
#checking how many unique names in our dataset
xml = list(df.Navn.unique())
len(xml)
#Checking how many names in the ft dataset
ft = list(ft_mem.Spørger.unique())
ft = [x[0:-1] for x in ft] #removing whitespace in the end of every name
len(ft)
set_ = list(set(xml) & set(ft)) # making a list of the set values
#removing whitespace in the end of every name
ft_mem['Spørger'] = ft_mem['Spørger'].apply(lambda x: x[0:-1])
#filtering out the names that we are looking for which are matching in the list of FT members
match = ft_mem[ft_mem['Spørger'].isin(set_)]
match.head()
#merging the two dataframes
df_m = df.merge(match, left_on = "Navn", right_on = "Spørger", how='left')
#looking only at specific columns
df_m = df_m[['Starttid', 'Sluttid', 'Navn', 'Rolle', 'Tekst', 'Parti']]
#only looking at relevant names
df_m = df_m[df_m.Navn!='None']
#checking which names does not have a party
df_m[df_m.isnull().Parti].Navn.unique()

df_m.iloc[0,-2]

# Starting text-processing
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric
          ]
def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

# apply clean-text function to all the text
df_m['Tekst'] = df_m['Tekst'].apply(lambda x: clean_text(x))
# tokenizing all text
df_m['t_text'] = df_m['Tekst'].apply(lambda x: word_tokenize(x))
# Take out stop words
df_m['t_tekst_clean'] = df_m['t_text'].apply(lambda x: [word for word in x if not word in stopord])
# Removing the 'formand' of folketinget - it is usually not important the things he is saying
df_m = df_m[~df_m.Rolle.isin(['formand'])]
#choosing the stemmer
stemmer = SnowballStemmer("danish")
#testing 
stemmer.stem("spiser")
# Apply to all text.
df_m['stemmed'] = df_m['t_tekst_clean'].apply(lambda x: [stemmer.stem(item) for item in x])

# Classifying the text
# def dummy_fun(doc):
#     return doc
# tfidf = TfidfVectorizer(analyzer='word',tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)
# docs = list(df_m.stemmed)
# tfidf.fit(docs)
# # fitting the tdfif vectorizer to our documents
# vector = tfidf.transform(docs)
# vector.shape
# true_k = 20
# model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1)
# model.fit(vector)
# model_indices = model.fit_predict(vector)



  import pandas.util.testing as tm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
words = []
for i in range(len(df_m)):
   w = df_m["stemmed"].iloc[i] # Get words for the i'th text.
   words = words + w # Add the words from the i'th text to all previous words. 

with open("/content/drive/My Drive/Colab Notebooks/words.txt", "wb") as fp:   #Pickling
  pickle.dump(words, fp)

In [0]:
with open("/content/drive/My Drive/Colab Notebooks/words.txt", "rb") as fp:   # Unpickling
...   words = pickle.load(fp)

In [0]:
# Classifying the text
def dummy_fun(doc):
   return doc
tfidf = TfidfVectorizer(analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)
docs = list(df_m.stemmed)
tfidf.fit(docs)
# # fitting the tdfif vectorizer to our documents
vector = tfidf.transform(docs)
vector.shape
true_k = 25
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1)
model.fit(vector)
model_indices = model.fit_predict(vector)


In [0]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names()

# print the centroids into which clusters they belongs
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :50]:
         print(' %s' % terms[ind], end='')
    print()
     

Cluster 0: børn barn forældr famili ung rigt – år dansk mul hjælp kommun danmark minist men forslag altså dag ordfør regering ret forhold faktisk gern giv sikr vigt udsat tid problem tal arbejd tror mennesk tag fattigdom sted voksn brug sid samfund skol sig ting handl sag bedr ansvar går mor
Cluster 1: hr – jens spørgsmål aaen frank svar lausts bjarn sig dansk regering altså gern spørg tror ole rigt dag mort tal enig forslag men ting år står forhold hækkerup sagt set faktisk christian sør giv sagd rasmus gang dahl danmark jesp hør karst forstå stil kristian debat venstr folketing socialdemokrati
Cluster 2: arbejdsplads virksom danmark arbejd skab dansk job regering – skat vækst betal rigt sikr land virksomhed mennesk arbejdsmarked arbejdsløs år afgift beskæftig altså peng samfund mul brug giv folk dag tror nye gang gern forslag set faktisk økonomisk erhvervsliv økonomi kris led forhold bedr vigt måd politik hel lang selvfølg
Cluster 3: lovforslag forslag støt mul – lov dansk dag del ve

In [0]:
# Preview a centroid.
for ind in order_centroids[4]:
        print(' %s' % terms[ind], end='')



In [0]:
#df_m["stemmed"].shape == model_indices.shape
#df_m["Tekst"].shape == model_indices.shape
# Make sure both "Parti" shape and model_indices shape are equal.
df_m["Parti"].shape == model_indices.shape
#len(df_m["stemmed"])
#len(model_indices)
#model_indices

True

In [0]:
cluster_party = np.c_[model_indices, df_m["Parti"]] # Combine the two columns into one numpy array.

In [0]:
cluster_party.shape[0] == len(model_indices) # Make sure these are both the same length.

True

In [0]:
cluster_party = pd.DataFrame(data=cluster_party, columns=["cluster", "party"]) # Convert to pandas dataframe for easier data wrangling. 

In [0]:
party_counts = cluster_party.groupby(["cluster", "party"]).size() # Show counts of parties in every cluster.
#with open("/content/drive/My Drive/Colab Notebooks/party_counts.txt", "wb") as fp:   #Pickling
 # pickle.dump(party_counts, fp)


In [0]:
pd.set_option("display.max_rows", 1000)
print(party_counts)

cluster  party        
0        DF                278
         EL                 72
         KD                  1
         KF                212
         LA                 23
         RV                 62
         S                 104
         SF                 37
         SIU                 3
         V                 173
         udpeget af SF      10
1        ALT                32
         DF                392
         EL                171
         KD                  7
         KF                201
         LA                157
         RV                478
         S                 101
         SF                 50
         UFG                 8
         V                 587
2        ALT                90
         DF                221
         EL                408
         IA                  2
         JF                  2
         KD                  6
         KF                 44
         LA                122
         RV                353
         S      