In [2]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import time
import sys
import os

import seaborn as sns
import matplotlib.dates as mdates
from datetime import datetime, date

# preamble for jupyter notebook and django
import django
import platform

if platform.node() == "srv-mcc-apsis":
    sys.path.append("/home/muef/tmv/BasicBrowser/")
    #sys.path.append('/home/leey/tmv/BasicBrowser/')
    dtm_path = "/home/galm/software/dtm/dtm/main"
else:
    # local paths
    sys.path.append('/media/Data/MCC/tmv/BasicBrowser/')
    dtm_path = "/home/finn/dtm/dtm/main"

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

# import from appended path
import parliament.models as pm
from parliament.tasks import do_search, run_tm, combine_searches
import cities.models as cmodels
from django.contrib.auth.models import User
import tmv_app.models as tm
import scoping.models as sm
from utils.tm_mgmt import update_topic_scores
from django.db.models import Q, Count, Func, F, Sum, Avg, Value as V
from django.db.models.functions import TruncDate, TruncMonth, TruncYear

In [3]:
user1 =  User.objects.get(username='muef')
coal_project = sm.Project.objects.get(id=134)

search_regex = '(?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)'
search_object_type = 2
search_object_text = 'speech'

In [4]:
# simple search for utterances
par_search, created = pm.Search.objects.get_or_create(
                title="Kohle tei {}".format(search_object_text),
                text=search_regex,
                creator=user1,
                document_source="GermaParlTEI",
                search_object_type=search_object_type,
                project=coal_project)
par_search.save()

if created:
    print("doing search")
    do_search(par_search.id)
print(par_search.par_count)
print(par_search.utterance_count)
print(par_search.runstats_set.all())
print(par_search.id)

doing search
2951 utterances with search (?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
32726
2951
None
None
<QuerySet []>
72


In [5]:
# simple search for utterances
par_search2, created = pm.Search.objects.get_or_create(
                title="Kohle pdf 18/211 - 18/245 {}".format(search_object_text),
                text=search_regex,
                creator=user1,
                start_date=date(2017,1,18),
                stop_date=date(2017,5,9),
                document_source="from https.*scans of pdfs with xml metadata",
                search_object_type=search_object_type,
                project=coal_project)
par_search2.save()

if created:
    print("doing search")
    do_search(par_search2.id)
print(par_search2.par_count)
print(par_search2.utterance_count)
print(par_search2.runstats_set.all())
print(par_search2.id)

doing search
24 utterances with search (?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
185
24
None
None
<QuerySet []>
73


In [6]:
# search for current parliament
par_search_xml, created = pm.Search.objects.get_or_create(
                title="Kohle XML {}".format(search_object_text),
                text=search_regex,
                creator=user1,
                document_source="from www.bundestag.de/service/opendata",
                search_object_type=search_object_type,
                project=coal_project)
par_search_xml.save()

if created:
    print("doing search")
    do_search(par_search_xml.id)
print(par_search_xml.par_count)
print(par_search_xml.utterance_count)
print(par_search_xml.runstats_set.all())
print(par_search_xml.id)

doing search
510 utterances with search (?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
8870
510
None
None
<QuerySet []>
74


In [7]:
# search in older protocols
par_search_old, created = pm.Search.objects.get_or_create(
                title="Kohle pdf 01/01 - 13/85 {}".format(search_object_text),
                text=search_regex,
                creator=user1,
                stop_date=date(1996,2,7),
                document_source="from https://www.bundestag.de/service/opendata",
                search_object_type=search_object_type,
                project=coal_project)
par_search_old.save()

if created:
    print("doing search")
    do_search(par_search_old.id)
print(par_search_old.par_count)
print(par_search_old.utterance_count)
print(par_search_old.runstats_set.all())
print(par_search_old.id)

doing search
5559 utterances with search (?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
61444
5559
None
None
<QuerySet []>
75


In [8]:
# combine function
combine_searches([par_search.id,par_search2.id,par_search_xml.id, par_search_old.id])
#combine_searches([64, 65, 66, 67])

all search object types identical
Created combined search: id = 76


In [11]:
pm.Utterance.objects.filter(search_matches__id=76).count()

9044

In [9]:
# delete searches
#pm.Search.objects.get(id=70).delete()

(1,
 {'parliament.Document_search_matches': 0,
  'parliament.Paragraph_search_matches': 0,
  'parliament.Search': 1,
  'parliament.Search_speaker_regions': 0,
  'parliament.Utterance_search_matches': 0})

In [22]:
# stopwords from overlap analysis
extra_stopwords = ["herr", "jahr", "mehr", "sag", "land", "schon", "neu", "ganz", "polit",
                   "heut", "antrag", "gibt", "kolleg", "geht", "berat", "frag", "mensch"]

# stopwords from overlap analysis + irrelevant topics
extra_stopwords = ['stimmt', 'ganz', 'schreib', 'drucksach', 'kolleg', 'gesagt', 'notwend', 'aufgab', 'parlamentar', 'rat', 'heut', 'bitt', 'tagesordnungspunkt', 'wunscht',
                       'neu', 'abstimm', 'zuzustimm', 'mensch', 'abgeordnet', 'vorschrift', 'ziff', 'beantwort', 'land', 'altestenrat', 'ausschuss', 'federfuhr', 'berichterstatt',
                       'beschlussempfehl', 'fraktion', 'bundnis', 'moglich', 'schriftlich', 'ander', 'handzeich',
                       'uberweisungsvorschlag', 'uberweis', 'welt', 'wer', 'regel', 'geht', 'verehrt', 'jahr', 'enthalt',
                       'polit', 'kind', 'mocht', 'vervielfaltigt', 'verordn', 'massnahm', 'antwort', 'prasidentin', 'vorlag',
                       'erst', 'tagesordn', 'sag', 'herrn', 'anfrag', 'gibt', 'besond', 'lieb', 'schon', 'umdruck', 'gegenprob',
                       'angenomm', 'kollegin', 'antrag', 'bereit', 'berat', 'frau', 'jung', 'betreff', 'brauch', 'dafur', 'word',
                       'eingebracht', 'ruf', 'uberwies', 'frag', 'beschlussfass', 'bericht', 'glaub', 'dam', 'mehr', 'beantwortet',
                       'herr']

# stopwords from overlap analysis + irrelevant topics + topwords of dominating dynamic topics
extra_stopwords = ['stimmt', 'ganz', 'schreib', 'drucksach', 'kolleg', 'gesagt', 'notwend', 'aufgab', 'parlamentar', 'rat', 'heut', 'bitt', 'tagesordnungspunkt', 'wunscht',
                       'neu', 'abstimm', 'zuzustimm', 'mensch', 'abgeordnet', 'vorschrift', 'ziff', 'beantwort', 'land', 'altestenrat', 'ausschuss', 'federfuhr', 'berichterstatt',
                       'beschlussempfehl', 'fraktion', 'bundnis', 'moglich', 'schriftlich', 'ander', 'handzeich',
                       'uberweisungsvorschlag', 'uberweis', 'welt', 'wer', 'regel', 'geht', 'verehrt', 'jahr', 'enthalt',
                       'polit', 'kind', 'mocht', 'vervielfaltigt', 'verordn', 'massnahm', 'antwort', 'prasidentin', 'vorlag',
                       'erst', 'tagesordn', 'sag', 'herrn', 'anfrag', 'gibt', 'besond', 'lieb', 'schon', 'umdruck', 'gegenprob',
                       'angenomm', 'kollegin', 'antrag', 'bereit', 'berat', 'frau', 'jung', 'betreff', 'brauch', 'dafur', 'word',
                       'eingebracht', 'ruf', 'uberwies', 'frag', 'beschlussfass', 'bericht', 'glaub', 'dam', 'mehr', 'beantwortet',
                       'herr', 'gut', 'gross', 'imm', 'thema', 'komm', 'deshalb']

In [29]:
# submit tasks for different number of topics
K_list = [20, 30, 40]

method = "BD" # Blei dynamic topic model
alpha = 0.01  # default is 0.01

for K in K_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "max_iter": 50,
            "extra_stopwords": extra_stopwords
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-05-31 13:07:25.671749 submitted run with K=20, alpha=0.01 and method=BD to queue "muef"
2019-05-31 13:07:26.176316 submitted run with K=30, alpha=0.01 and method=BD to queue "muef"
2019-05-31 13:07:26.681283 submitted run with K=40, alpha=0.01 and method=BD to queue "muef"


In [24]:
# submit tasks for different alpha
alpha_list = [0.005, 0.02]

method = "BD" # Blei dynamic topic model
K = 30

for alpha in alpha_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "extra_stopwords": extra_stopwords
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-05-31 13:05:52.831892 submitted run with K=30, alpha=0.005 and method=BD to queue "muef"
2019-05-31 13:05:53.336028 submitted run with K=30, alpha=0.02 and method=BD to queue "muef"


In [25]:
# submit tasks for top_chain_var
var_list = [0.01, 0.001]

method = "BD" # Blei dynamic topic model
K = 30

for var in var_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": 0.01,
            "dtm_path": dtm_path,
            "top_chain_var": var,
            "extra_stopwords": extra_stopwords
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-05-31 13:05:56.976474 submitted run with K=30, alpha=0.02 and method=BD to queue "muef"
2019-05-31 13:05:57.480283 submitted run with K=30, alpha=0.02 and method=BD to queue "muef"


In [4]:
# submit tasks for different rngs
rngs_list = [2]

method = "BD" # Blei dynamic topic model
alpha = 0.01  # default is 0.01
K = 30

for rngs in rngs_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "rng_seed": rngs,
            "max_iter": 500,
            "extra_stopwords": extra_stopwords
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-04-17 08:05:02.686884 submitted run with K=30, alpha=0.01 and method=BD to queue "muef"


In [4]:
# submit tasks for different max_iter
max_iter_list = [50, 500]

method = "BD" # Blei dynamic topic model
alpha = 0.01  # default is 0.01
K = 30

for max_iter in max_iter_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "rng_seed": 1,
            "max_iter": max_iter,
            "extra_stopwords": extra_stopwords,
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-04-16 12:23:56.548468 submitted run with K=30, alpha=0.01 and method=BD to queue "muef"
2019-04-16 12:23:57.052371 submitted run with K=30, alpha=0.01 and method=BD to queue "muef"


NMF
---

In [None]:
# submit tasks for different number of topics
K_list = [30, 40]

method = "NM" # Blei dynamic topic model
alpha = 0.01  # default is 0.01

for K in K_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "extra_stopwords": extra_stopwords
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

Check the topic models
----------------------------------

In [55]:
stats = tm.RunStats.objects.filter(method='BD').order_by('run_id')

In [56]:
for s in stats:
    if s.psearch:
        print(s.run_id)
        print(s.max_iter)
        print(s.psearch)
        print(s.psearch.project)
        print()
        

1040
200
Search object (36)
Coal-discourse

1041
200
Search object (36)
Coal-discourse

1331
200
Search object (44)
Coal-discourse

1332
200
Search object (44)
Coal-discourse

1333
200
Search object (44)
Coal-discourse

1334
200
Search object (44)
Coal-discourse

1335
200
Search object (44)
Coal-discourse

1348
200
Search object (44)
Coal-discourse

1350
200
Search object (44)
Coal-discourse

1351
200
Search object (44)
Coal-discourse

1352
200
Search object (44)
Coal-discourse

1353
200
Search object (44)
Coal-discourse

1368
200
Search object (44)
Coal-discourse

1369
200
Search object (44)
Coal-discourse

1370
200
Search object (44)
Coal-discourse

1537
200
Search object (44)
Coal-discourse



In [39]:
stats = tm.RunStats.objects.filter(psearch__id=44)

In [61]:
search = pm.Search.objects.get(id=43)
project = sm.Project.objects.get(id=134)

In [62]:
search.project=project
search.save()

In [8]:
# test use of Gemeinschaft für Kohle
par_search, created = pm.Search.objects.get_or_create(
                title="Test Europäische Gemeinschaft",
                text="(?<!Europäische )(?<!Europäischen )Gemeinschaft für Kohle",
                creator=user1,
                stop_date=date(1996,2,7),
                document_source="from https://www.bundestag.de/service/opendata",
                search_object_type=1,
                project=coal_project)
par_search.save()

if created:
    print("doing search")
    do_search(par_search.id)
print(par_search.par_count)
print(par_search.utterance_count)
print(par_search.runstats_set.all())
print(par_search.id)

doing search
18 paragraphs with search (?<!Europäische )(?<!Europäischen )Gemeinschaft für Kohle
18
14
None
None
<QuerySet []>
71
