In [1]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import time
import sys
import os

import seaborn as sns
import matplotlib.dates as mdates
from datetime import datetime, date

# preamble for jupyter notebook and django
import django
import platform

if platform.node() == "srv-mcc-apsis":
    sys.path.append("/home/muef/tmv/BasicBrowser/")
    #sys.path.append('/home/leey/tmv/BasicBrowser/')
    dtm_path = "/home/galm/software/dtm/dtm/main"
else:
    # local paths
    sys.path.append('/media/Data/MCC/tmv/BasicBrowser/')
    dtm_path = "/home/finn/dtm/dtm/main"

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

# import from appended path
import parliament.models as pm
from parliament.tasks import do_search, run_tm, combine_searches
import cities.models as cmodels
from django.contrib.auth.models import User
import tmv_app.models as tm
import scoping.models as sm
from utils.tm_mgmt import update_topic_scores
from django.db.models import Q, Count, Func, F, Sum, Avg, Value as V
from django.db.models.functions import TruncDate, TruncMonth, TruncYear

In [2]:
s = pm.Search.objects.get(id=89)

In [3]:
if s.search_object_type == 1:
    print(pm.Paragraph.objects.filter(search_matches=s).count())
else:
    print(pm.Utterance.objects.filter(search_matches=s).count())

9167


In [4]:
# stopwords from overlap analysis
extra_stopwords = ["herr", "jahr", "mehr", "sag", "land", "schon", "neu", "ganz", "polit",
                   "heut", "antrag", "gibt", "kolleg", "geht", "berat", "frag", "mensch"]

# stopwords from overlap analysis + irrelevant topics
extra_stopwords = ['stimmt', 'ganz', 'schreib', 'drucksach', 'kolleg', 'gesagt', 'notwend', 'aufgab', 'parlamentar', 'rat', 'heut', 'bitt', 'tagesordnungspunkt', 'wunscht',
                       'neu', 'abstimm', 'zuzustimm', 'mensch', 'abgeordnet', 'vorschrift', 'ziff', 'beantwort', 'land', 'altestenrat', 'ausschuss', 'federfuhr', 'berichterstatt',
                       'beschlussempfehl', 'fraktion', 'bundnis', 'moglich', 'schriftlich', 'ander', 'handzeich',
                       'uberweisungsvorschlag', 'uberweis', 'welt', 'wer', 'regel', 'geht', 'verehrt', 'jahr', 'enthalt',
                       'polit', 'kind', 'mocht', 'vervielfaltigt', 'verordn', 'massnahm', 'antwort', 'prasidentin', 'vorlag',
                       'erst', 'tagesordn', 'sag', 'herrn', 'anfrag', 'gibt', 'besond', 'lieb', 'schon', 'umdruck', 'gegenprob',
                       'angenomm', 'kollegin', 'antrag', 'bereit', 'berat', 'frau', 'jung', 'betreff', 'brauch', 'dafur', 'word',
                       'eingebracht', 'ruf', 'uberwies', 'frag', 'beschlussfass', 'bericht', 'glaub', 'dam', 'mehr', 'beantwortet',
                       'herr']

# stopwords from overlap analysis + irrelevant topics + topwords of dominating dynamic topics
extra_stopwords = ['stimmt', 'ganz', 'schreib', 'drucksach', 'kolleg', 'gesagt', 'notwend', 'aufgab', 'parlamentar', 'rat', 'heut', 'bitt', 'tagesordnungspunkt', 'wunscht',
                       'neu', 'abstimm', 'zuzustimm', 'mensch', 'abgeordnet', 'vorschrift', 'ziff', 'beantwort', 'land', 'altestenrat', 'ausschuss', 'federfuhr', 'berichterstatt',
                       'beschlussempfehl', 'fraktion', 'bundnis', 'moglich', 'schriftlich', 'ander', 'handzeich',
                       'uberweisungsvorschlag', 'uberweis', 'welt', 'wer', 'regel', 'geht', 'verehrt', 'jahr', 'enthalt',
                       'polit', 'kind', 'mocht', 'vervielfaltigt', 'verordn', 'massnahm', 'antwort', 'prasidentin', 'vorlag',
                       'erst', 'tagesordn', 'sag', 'herrn', 'anfrag', 'gibt', 'besond', 'lieb', 'schon', 'umdruck', 'gegenprob',
                       'angenomm', 'kollegin', 'antrag', 'bereit', 'berat', 'frau', 'jung', 'betreff', 'brauch', 'dafur', 'word',
                       'eingebracht', 'ruf', 'uberwies', 'frag', 'beschlussfass', 'bericht', 'glaub', 'dam', 'mehr', 'beantwortet',
                       'herr', 'gut', 'gross', 'imm', 'thema', 'komm', 'deshalb']

# new stopword list from Sept. 11, 2019
extra_stopwords = ['stimmt', 'ganz', 'schreib', 'drucksach', 'kolleg', 'gesagt', 'notwend', 'aufgab', 'parlamentar', 'rat', 'heut', 'bitt', 'tagesordnungspunkt', 'wunscht',
                       'neu', 'abstimm', 'zuzustimm', 'mensch', 'abgeordnet', 'vorschrift', 'ziff', 'beantwort', 'land', 'altestenrat', 'ausschuss', 'federfuhr', 'berichterstatt',
                       'beschlussempfehl', 'fraktion', 'bundnis', 'moglich', 'schriftlich', 'ander', 'handzeich',
                       'uberweisungsvorschlag', 'uberweis', 'welt', 'wer', 'regel', 'geht', 'verehrt', 'jahr', 'enthalt',
                       'kind', 'mocht', 'vervielfaltigt', 'antwort', 'prasidentin', 'vorlag',
                       'erst', 'tagesordn', 'sag', 'herrn', 'anfrag', 'gibt', 'besond', 'lieb', 'schon', 'umdruck', 'gegenprob',
                       'angenomm', 'kollegin', 'antrag', 'bereit', 'berat', 'frau', 'betreff', 'brauch', 'dafur', 'word',
                       'eingebracht', 'ruf', 'uberwies', 'frag', 'beschlussfass', 'bericht', 'glaub', 'dam', 'mehr', 'beantwortet',
                       'herr', 'gut', 'gross', 'imm', 'thema', 'komm', 'deshalb',
                       'gilt', 'letzt', 'seit']

print(", ".join(sorted(extra_stopwords)))

abgeordnet, abstimm, altestenrat, ander, anfrag, angenomm, antrag, antwort, aufgab, ausschuss, beantwort, beantwortet, berat, bereit, bericht, berichterstatt, beschlussempfehl, beschlussfass, besond, betreff, bitt, brauch, bundnis, dafur, dam, deshalb, drucksach, eingebracht, enthalt, erst, federfuhr, frag, fraktion, frau, ganz, gegenprob, geht, gesagt, gibt, gilt, glaub, gross, gut, handzeich, herr, herrn, heut, imm, jahr, kind, kolleg, kollegin, komm, land, letzt, lieb, mehr, mensch, mocht, moglich, neu, notwend, parlamentar, prasidentin, rat, regel, ruf, sag, schon, schreib, schriftlich, seit, stimmt, tagesordn, tagesordnungspunkt, thema, uberweis, uberweisungsvorschlag, uberwies, umdruck, verehrt, vervielfaltigt, vorlag, vorschrift, welt, wer, word, wunscht, ziff, zuzustimm


In [6]:
# submit tasks for different number of topics
K_list = [35]

method = "BD" # Blei dynamic topic model
alpha = 0.05  # default is 0.01
top_chain_var = 0.01
max_iter = 200

for K in K_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "max_iter": max_iter,
            "extra_stopwords": extra_stopwords,
            "top_chain_var": top_chain_var
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-09-11 10:06:15.807890 submitted run with K=35, alpha=0.05 and method=BD to queue "muef"


In [5]:
# submit tasks for different alpha
alpha_list = [0.01, 0.05]

method = "BD" # Blei dynamic topic model
K = 40

for alpha in alpha_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "extra_stopwords": extra_stopwords,
            "max_iter": 50,
            "top_chain_var": 0.01
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-06-27 20:43:53.481657 submitted run with K=40, alpha=0.01 and method=BD to queue "muef"
2019-06-27 20:43:53.987424 submitted run with K=40, alpha=0.05 and method=BD to queue "muef"


In [7]:
# submit tasks for top_chain_var
var_list = [0.1, 0.05, 0.01]

method = "BD" # Blei dynamic topic model
K = 40

for var in var_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": 0.01,
            "dtm_path": dtm_path,
            "top_chain_var": var,
            "extra_stopwords": extra_stopwords,
            "max_iter": 50
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-06-20 15:22:11.661317 submitted run with K=40, alpha=0.05 and method=BD to queue "muef"
2019-06-20 15:22:12.164534 submitted run with K=40, alpha=0.05 and method=BD to queue "muef"
2019-06-20 15:22:12.668167 submitted run with K=40, alpha=0.05 and method=BD to queue "muef"


In [4]:
# submit tasks for different rngs
rngs_list = [2]

method = "BD" # Blei dynamic topic model
alpha = 0.01  # default is 0.01
K = 30

for rngs in rngs_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "rng_seed": rngs,
            "max_iter": 500,
            "extra_stopwords": extra_stopwords
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-04-17 08:05:02.686884 submitted run with K=30, alpha=0.01 and method=BD to queue "muef"


In [4]:
# submit tasks for different max_iter
max_iter_list = [50, 500]

method = "BD" # Blei dynamic topic model
alpha = 0.01  # default is 0.01
K = 30

for max_iter in max_iter_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "rng_seed": 1,
            "max_iter": max_iter,
            "extra_stopwords": extra_stopwords,
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

2019-04-16 12:23:56.548468 submitted run with K=30, alpha=0.01 and method=BD to queue "muef"
2019-04-16 12:23:57.052371 submitted run with K=30, alpha=0.01 and method=BD to queue "muef"


In [7]:
# submit tasks for different alpha and top_chain_var
alpha_list = [0.01, 0.05, 0.1]
var_list = [0.01, 0.005, 0.05]

method = "BD" # Blei dynamic topic model
K = 45
max_iter = 200

for var in var_list:
    for alpha in alpha_list:


        if tm.RunStats.objects.filter(alpha=alpha, top_chain_var=var, K=K, method=method, psearch=s.id):
            print('model on s={} with K={}, alpha={}, top_chain_var={} and method={} already run'.format(s.id, str(K), str(alpha), str(var), method))
        else:
            run_tm.apply_async(
                args=[s.id],
                kwargs={
                    "K": K,
                    "method": method,
                    "alpha": alpha,
                    "dtm_path": dtm_path,
                    "top_chain_var": var,
                    "rng_seed": 1,
                    "max_iter": max_iter,
                    "extra_stopwords": extra_stopwords,
                },
                queue="muef"
            )
        
        print('{} submitted run on s={} with K={}, alpha={}, top_chain_var={} and method={} to queue "muef"'.format(str(datetime.now()), s.id, str(K), str(alpha), str(var), method))
        time.sleep(0.5) # to avoid problems with simultaneous start

2019-07-07 10:05:59.790756 submitted run on s=89 with K=45, alpha=0.01, top_chain_var=0.01 and method=BD to queue "muef"
2019-07-07 10:06:00.298292 submitted run on s=89 with K=45, alpha=0.05, top_chain_var=0.01 and method=BD to queue "muef"
2019-07-07 10:06:00.805662 submitted run on s=89 with K=45, alpha=0.1, top_chain_var=0.01 and method=BD to queue "muef"
2019-07-07 10:06:01.312689 submitted run on s=89 with K=45, alpha=0.01, top_chain_var=0.005 and method=BD to queue "muef"
2019-07-07 10:06:01.819214 submitted run on s=89 with K=45, alpha=0.05, top_chain_var=0.005 and method=BD to queue "muef"
2019-07-07 10:06:02.326950 submitted run on s=89 with K=45, alpha=0.1, top_chain_var=0.005 and method=BD to queue "muef"
2019-07-07 10:06:02.833448 submitted run on s=89 with K=45, alpha=0.01, top_chain_var=0.05 and method=BD to queue "muef"
2019-07-07 10:06:03.340766 submitted run on s=89 with K=45, alpha=0.05, top_chain_var=0.05 and method=BD to queue "muef"
2019-07-07 10:06:03.848942 subm

# delete runs

In [1]:
# delete runs that are not finished
#tm.RunStats.objects.filter(psearch__id=81, status=1).delete()

In [3]:
# delete old runs

tm.RunStats.objects.filter(run_id=2169).delete()

(52039,
 {'tmv_app.DocTopic': 0,
  'tmv_app.KFold': 0,
  'tmv_app.RunStats': 1,
  'tmv_app.RunStats_periods': 19,
  'tmv_app.Term_run_id': 20000,
  'tmv_app.TimeDocTotal': 0,
  'tmv_app.Topic': 45,
  'tmv_app.TopicARScores': 0,
  'tmv_app.TopicCorr': 0,
  'tmv_app.TopicDTopic': 0,
  'tmv_app.TopicIntrusion_real_topics': 0,
  'tmv_app.TopicTerm': 31974,
  'tmv_app.TopicTimePeriodScores': 0,
  'tmv_app.TopicYear': 0,
  'tmv_app.Topic_primary_dtopic': 0})

In [11]:
tm.RunStats.objects.filter(psearch__id=38).delete()

(527325,
 {'tmv_app.DocTopic': 0,
  'tmv_app.KFold': 0,
  'tmv_app.RunStats': 9,
  'tmv_app.RunStats_periods': 0,
  'tmv_app.Term_run_id': 39285,
  'tmv_app.TimeDocTotal': 0,
  'tmv_app.Topic': 360,
  'tmv_app.TopicARScores': 0,
  'tmv_app.TopicCorr': 0,
  'tmv_app.TopicDTopic': 0,
  'tmv_app.TopicIntrusion_real_topics': 0,
  'tmv_app.TopicTerm': 487671,
  'tmv_app.TopicTimePeriodScores': 0,
  'tmv_app.TopicYear': 0,
  'tmv_app.Topic_primary_dtopic': 0})

In [25]:
pm.Search.objects.filter(id=76).delete()

(849875,
 {'parliament.Document_search_matches': 0,
  'parliament.Paragraph_search_matches': 0,
  'parliament.Search': 1,
  'parliament.Search_speaker_regions': 0,
  'parliament.Utterance_search_matches': 8518,
  'tmv_app.DocTopic': 596260,
  'tmv_app.KFold': 0,
  'tmv_app.RunStats': 2,
  'tmv_app.RunStats_periods': 38,
  'tmv_app.Term_run_id': 40000,
  'tmv_app.TimeDocTotal': 38,
  'tmv_app.Topic': 70,
  'tmv_app.TopicARScores': 0,
  'tmv_app.TopicCorr': 0,
  'tmv_app.TopicDTopic': 0,
  'tmv_app.TopicIntrusion_real_topics': 0,
  'tmv_app.TopicTerm': 203618,
  'tmv_app.TopicTimePeriodScores': 1330,
  'tmv_app.TopicYear': 0,
  'tmv_app.Topic_primary_dtopic': 0})

NMF
---

In [None]:
# submit tasks for different number of topics
K_list = [30, 40]

method = "NM" # Blei dynamic topic model
alpha = 0.01  # default is 0.01

for K in K_list:
    # run_tm.delay(par_search_pdf_all.id, K=K, method='BT')
    # instead of using delay (which sends to the default queue), we use the more 
    # verbose way to call the funtion (apply async) where we can specify that we
    # want to send it to the long queue, which only has two workers

    run_tm.apply_async(
        args=[s.id],
        kwargs={
            "K": K,
            "method": method,
            "alpha": alpha,
            "dtm_path": dtm_path,
            "extra_stopwords": extra_stopwords
        },
        queue="muef"
    )
    print('{} submitted run with K={}, alpha={} and method={} to queue "muef"'.format(str(datetime.now()), str(K), str(alpha), method))
    time.sleep(0.5) # to avoid problems with simultaneous start

Check the topic models
----------------------------------

In [55]:
stats = tm.RunStats.objects.filter(method='BD').order_by('run_id')

In [56]:
for s in stats:
    if s.psearch:
        print(s.run_id)
        print(s.max_iter)
        print(s.psearch)
        print(s.psearch.project)
        print()

1040
200
Search object (36)
Coal-discourse

1041
200
Search object (36)
Coal-discourse

1331
200
Search object (44)
Coal-discourse

1332
200
Search object (44)
Coal-discourse

1333
200
Search object (44)
Coal-discourse

1334
200
Search object (44)
Coal-discourse

1335
200
Search object (44)
Coal-discourse

1348
200
Search object (44)
Coal-discourse

1350
200
Search object (44)
Coal-discourse

1351
200
Search object (44)
Coal-discourse

1352
200
Search object (44)
Coal-discourse

1353
200
Search object (44)
Coal-discourse

1368
200
Search object (44)
Coal-discourse

1369
200
Search object (44)
Coal-discourse

1370
200
Search object (44)
Coal-discourse

1537
200
Search object (44)
Coal-discourse



In [39]:
stats = tm.RunStats.objects.filter(psearch__id=44)

In [61]:
search = pm.Search.objects.get(id=43)
project = sm.Project.objects.get(id=134)

In [62]:
search.project=project
search.save()

In [8]:
# test use of Gemeinschaft für Kohle
par_search, created = pm.Search.objects.get_or_create(
                title="Test Europäische Gemeinschaft",
                text="(?<!Europäische )(?<!Europäischen )Gemeinschaft für Kohle",
                creator=user1,
                stop_date=date(1996,2,7),
                document_source="from https://www.bundestag.de/service/opendata",
                search_object_type=1,
                project=coal_project)
par_search.save()

if created:
    print("doing search")
    do_search(par_search.id)
print(par_search.par_count)
print(par_search.utterance_count)
print(par_search.runstats_set.all())
print(par_search.id)

doing search
18 paragraphs with search (?<!Europäische )(?<!Europäischen )Gemeinschaft für Kohle
18
14
None
None
<QuerySet []>
71
