In [1]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import time
import sys
import os

import seaborn as sns
import gensim

import django
import platform

# Put the BasicBrowser Django project on the import path. The checkout
# location depends on the machine: the "srv-mcc-apsis" host uses the server
# path, every other machine falls back to the local path.
# NOTE(review): both locations are hard-coded absolute paths - confirm they
# match your checkout before running.
on_server = platform.node() == "srv-mcc-apsis"
project_root = '/home/leey/tmv/BasicBrowser/' if on_server else '/Documents/Data/tmv/BasicBrowser/'
sys.path.append(project_root)

# Point Django at the project settings (unless the variable is already set)
# and initialise Django before any project module is imported.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

# import from appended path
import parliament.models as pm
from parliament.tasks import do_search, run_tm
import cities.models as cmodels
from django.contrib.auth.models import User
from tmv_app.models import *
from utils.tm_mgmt import update_topic_scores
from utils.text import *
from django.db.models import Q, Count, Func, F, Sum, Value, Case, When, IntegerField

In [2]:
def clean_text(text):
    """Normalise line breaks, spaces and dashes in ``text`` and drop parentheses.

    Carriage returns become newlines, non-breaking spaces (U+00A0) become
    plain spaces, U+0096 and U+00AD become '-', U+2014 becomes '–', and
    '(' / ')' are removed. All other characters are returned unchanged.
    """
    # Single-pass character table; a value of None deletes the character.
    substitutions = str.maketrans({
        '\r': '\n',
        u'\xa0': ' ',
        u'\x96': '-',
        u'\xad': '-',
        u'\u2014': '–',
        '(': None,
        ')': None,
    })
    return text.translate(substitutions)

### Finding agenda items

In [3]:
# Utterances matched by the stored search with primary key 89; their agenda
# items are looked up in the next cell.
search_pk = 89
utterances = pm.Utterance.objects.filter(search_matches__pk=search_pk)

In [4]:
# Collect the agenda-item title of every matched utterance (duplicates kept).
# An utterance whose agenda item is missing is skipped and counted explicitly.
# The previous blanket `except AttributeError: pass` also hid unrelated
# attribute errors, e.g. an agenda item without a `title`.
agenda_items = []
n_utterances_without_agenda_item = 0
for ut in utterances:
    # getattr with a default covers both "attribute missing/unresolvable"
    # and an agenda item that is simply None.
    agenda_item = getattr(ut, 'agenda_item', None)
    if agenda_item is None:
        n_utterances_without_agenda_item += 1
        continue
    agenda_items.append(agenda_item.title)

In [5]:
# One row per matched utterance (titles may repeat). The column is named in
# the constructor so this also works when `agenda_items` is empty; assigning
# `.columns` afterwards raises a length-mismatch ValueError on a frame that
# has no columns.
df_agenda_items = pd.DataFrame({'Agenda Item': agenda_items})

In [6]:
# Keep each agenda-item title once, in order of first appearance.
# `dict.fromkeys` replaces `list(set(...))`: string hashing is randomised per
# interpreter session, so a set yields a different order after every kernel
# restart. That order determines document order and token ids downstream,
# which would make the topic model irreproducible despite a fixed
# `random_state`. (Dicts keep insertion order in CPython 3.6+.)
agenda_items_unique = list(dict.fromkeys(agenda_items))

# Column named in the constructor so an empty list still gives a valid frame.
df_agenda_items_unique = pd.DataFrame({'Agenda Item': agenda_items_unique})

---

### Preparing corpus and dictionary

In [9]:
# Lower-case every unique agenda-item title
agenda_items_unique_lower = [title.lower() for title in agenda_items_unique]

In [10]:
# Run every lower-cased title through clean_text
agenda_items_unique_clean = list(map(clean_text, agenda_items_unique_lower))

In [11]:
from gensim import corpora
from nltk.corpus import stopwords
# German stop-word list from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('german'))

# Split each cleaned title on whitespace into a list of tokens
texts = [doc.split() for doc in agenda_items_unique_clean]

# Drop the stop words from every token list
texts_no_sw = [
    [token for token in tokens if token not in stop_words]
    for tokens in texts
]
    

In [12]:
# Build the gensim dictionary (token <-> integer id mapping) from the
# stop-word-free token lists
dictionary = corpora.Dictionary(texts_no_sw)

# Print the dictionary summary: number of unique tokens plus a short sample
print(dictionary)

Dictionary(400 unique tokens: ['-preis', '2', 'bundesregierung', 'co', 'haltung']...)


In [19]:
# Convert each token list into a bag-of-words vector of (token_id, count) pairs.
# `allow_update` stays at its default (False): the dictionary was already
# built from these same token lists, so updating it here adds no new tokens.
# It would only count every document a second time in the dictionary's
# statistics (e.g. num_docs, dfs), and again on each re-run of this cell.
agenda_corpus = [dictionary.doc2bow(text) for text in texts_no_sw]

### Running a simple topic model

In [14]:
# Step 0: Import packages and stopwords
from gensim.models import LdaModel, LdaMulticore
from gensim.utils import simple_preprocess, lemmatize
import logging
# Emit timestamped log lines and let everything from INFO level upwards through
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=log_format)
logging.getLogger().setLevel(logging.INFO)

In [15]:
# Train a 10-topic LDA model on the agenda-item corpus.
# Hyperparameters are gathered in one place so they are easy to tune.
lda_settings = dict(
    num_topics=10,
    random_state=100,       # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',           # the training log reports "using autotuned alpha"
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=agenda_corpus,
    id2word=dictionary,
    **lda_settings
)

2019-08-07 12:09:39,114 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2019-08-07 12:09:39,117 : INFO : using symmetric eta at 0.1
2019-08-07 12:09:39,118 : INFO : using serial LDA version on this node
2019-08-07 12:09:39,210 : INFO : running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 99 documents, updating model once every 99 documents, evaluating perplexity every 99 documents, iterating 50x with a convergence threshold of 0.001000
2019-08-07 12:09:39,813 : INFO : -10.530 per-word bound, 1478.1 perplexity estimate based on a held-out corpus of 99 documents with 1214 words
2019-08-07 12:09:39,815 : INFO : PROGRESS: pass 0, at document #99/99
2019-08-07 12:09:40,233 : INFO : optimized alpha [0.09175396, 0.083210476, 0.085939124, 0.08090174, 0.08895925, 0.08173998, 0.075357035, 0.07777944, 0.077606685, 0.086443864]
2019-08-07 12:09:40,262 : INFO : topic #6 (0.075): 0.043*"beratung" + 0.034*"abgeordn

2019-08-07 12:09:41,869 : INFO : topic #0 (0.077): 0.049*"fraktion" + 0.043*"beratung" + 0.036*"abgeordneter" + 0.036*"weiterer" + 0.035*"abgeordneten" + 0.028*"ausschusses" + 0.028*"beschlussempfehlung" + 0.028*"ausschuss" + 0.025*"antrags" + 0.021*"berichts"
2019-08-07 12:09:41,870 : INFO : topic #9 (0.080): 0.066*"90/die" + 0.066*"grünen" + 0.066*"bündnis" + 0.059*"beratung" + 0.052*"antrags" + 0.042*"fraktion" + 0.029*"abgeordneten" + 0.028*"weiterer" + 0.028*"abgeordneter" + 0.025*"dr."
2019-08-07 12:09:41,871 : INFO : topic diff=0.068424, rho=0.408248
2019-08-07 12:09:42,082 : INFO : -5.710 per-word bound, 52.3 perplexity estimate based on a held-out corpus of 99 documents with 1214 words
2019-08-07 12:09:42,083 : INFO : PROGRESS: pass 5, at document #99/99
2019-08-07 12:09:42,197 : INFO : optimized alpha [0.074617215, 0.06386206, 0.06329118, 0.0534401, 0.06640918, 0.059296377, 0.050380312, 0.053161733, 0.05040785, 0.07958908]
2019-08-07 12:09:42,223 : INFO : topic #6 (0.050): 0.

2019-08-07 12:09:43,821 : INFO : topic #0 (0.068): 0.050*"fraktion" + 0.042*"beratung" + 0.036*"abgeordneter" + 0.036*"weiterer" + 0.036*"abgeordneten" + 0.029*"ausschusses" + 0.029*"beschlussempfehlung" + 0.029*"ausschuss" + 0.025*"antrags" + 0.023*"berichts"
2019-08-07 12:09:43,824 : INFO : topic #9 (0.079): 0.074*"90/die" + 0.074*"grünen" + 0.074*"bündnis" + 0.059*"beratung" + 0.051*"antrags" + 0.046*"fraktion" + 0.031*"abgeordneten" + 0.030*"weiterer" + 0.030*"abgeordneter" + 0.028*"dr."
2019-08-07 12:09:43,826 : INFO : topic diff=0.023772, rho=0.301511


In [27]:
# Apply the trained model to the whole agenda-item corpus.
# NOTE(review): the model was built with per_word_topics=True, so each
# per-document result presumably carries word-level topic information in
# addition to the document's topic mix - confirm against the gensim docs
# before unpacking.
doc_lda = lda_model[agenda_corpus]

In [26]:
import pprint
# Pretty-print one (topic id, weighted top-terms string) pair per topic
pp = pprint.PrettyPrinter(indent=0)
topic_summaries = lda_model.print_topics()
pp.pprint(topic_summaries)

2019-08-07 12:24:38,451 : INFO : topic #0 (0.068): 0.050*"fraktion" + 0.042*"beratung" + 0.036*"abgeordneter" + 0.036*"weiterer" + 0.036*"abgeordneten" + 0.029*"ausschusses" + 0.029*"beschlussempfehlung" + 0.029*"ausschuss" + 0.025*"antrags" + 0.023*"berichts"
2019-08-07 12:24:38,452 : INFO : topic #1 (0.057): 0.061*"bundesregierung" + 0.059*"beratung" + 0.045*"entwurfs" + 0.045*"eingebrachten" + 0.045*"gesetzes" + 0.034*"erste" + 0.017*"2019" + 0.012*"abgeordneten" + 0.012*"–" + 0.012*"2018"
2019-08-07 12:24:38,453 : INFO : topic #2 (0.056): 0.064*"dr." + 0.054*"abgeordneter" + 0.053*"weiterer" + 0.053*"abgeordneten" + 0.052*"fraktion" + 0.049*"beratung" + 0.043*"antrags" + 0.019*"afd" + 0.015*"berichts" + 0.015*"antrag"
2019-08-07 12:24:38,454 : INFO : topic #3 (0.045): 0.043*"fraktion" + 0.041*"linke" + 0.032*"antrags" + 0.032*"beratung" + 0.031*"abgeordneten" + 0.031*"abgeordneter" + 0.031*"weiterer" + 0.021*"dr." + 0.021*"c" + 0.021*"matthias"
2019-08-07 12:24:38,456 : INFO : topi

[(0,
'0.050*"fraktion" + 0.042*"beratung" + 0.036*"abgeordneter" + 0.036*"weiterer" '
'+ 0.036*"abgeordneten" + 0.029*"ausschusses" + 0.029*"beschlussempfehlung" + '
'0.029*"ausschuss" + 0.025*"antrags" + 0.023*"berichts"'),
(1,
'0.061*"bundesregierung" + 0.059*"beratung" + 0.045*"entwurfs" + '
'0.045*"eingebrachten" + 0.045*"gesetzes" + 0.034*"erste" + 0.017*"2019" + '
'0.012*"abgeordneten" + 0.012*"–" + 0.012*"2018"'),
(2,
'0.064*"dr." + 0.054*"abgeordneter" + 0.053*"weiterer" + 0.053*"abgeordneten" '
'+ 0.052*"fraktion" + 0.049*"beratung" + 0.043*"antrags" + 0.019*"afd" + '
'0.015*"berichts" + 0.015*"antrag"'),
(3,
'0.043*"fraktion" + 0.041*"linke" + 0.032*"antrags" + 0.032*"beratung" + '
'0.031*"abgeordneten" + 0.031*"abgeordneter" + 0.031*"weiterer" + 0.021*"dr." '
'+ 0.021*"c" + 0.021*"matthias"'),
(4,
'0.072*"dr." + 0.046*"weiterer" + 0.046*"abgeordneter" + 0.046*"abgeordneten" '
'+ 0.044*"fraktion" + 0.043*"beratung" + 0.041*"antrags" + 0.023*"b" + '
'0.022*"afd" + 0.017*"linke

### Visualising results of topic model

In [18]:
import pyLDAvis 
import pyLDAvis.gensim 
# Switch pyLDAvis to notebook mode
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the trained model, the bag-of-words
# corpus and the dictionary
vis = pyLDAvis.gensim.prepare(lda_model, agenda_corpus, dictionary)
# Last expression of the cell, so the prepared visualisation is displayed
vis

2019-08-07 12:10:09,871 : INFO : Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
2019-08-07 12:10:09,919 : INFO : Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
