In [1]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import time
import sys
import os

import seaborn as sns
import matplotlib.dates as mdates
from datetime import datetime, date

In [3]:
# preamble for jupyter notebook and django
import django
import platform

# Resolve machine-specific locations from the host name: the BasicBrowser
# project directory (appended to sys.path so that its Django apps can be
# imported below) and dtm_path, which is handed to run_tm further down.
if platform.node() != "srv-mcc-apsis":
    # local paths
    project_dir = '/media/Data/MCC/tmv/BasicBrowser/'
    dtm_path = "/home/finn/dtm/dtm/main"
else:
    # alternative server checkout, currently unused: '/home/leey/tmv/BasicBrowser/'
    project_dir = "/home/muef/tmv/BasicBrowser/"
    dtm_path = "/home/galm/software/dtm/dtm/main"
sys.path.append(project_dir)

# Select the project's settings module (only if the variable is not already
# set in the environment), then initialise Django so the models can be imported.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

# import from appended path
import parliament.models as pm
from parliament.tasks import do_search, run_tm, combine_searches
import cities.models as cmodels
from django.contrib.auth.models import User
from tmv_app.models import *
from utils.tm_mgmt import update_topic_scores
from django.db.models import Q, Count, Func, F, Sum, Avg, Value as V
from django.db.models.functions import TruncDate, TruncMonth, TruncYear

In [4]:
# Fetch the account that will be recorded as creator of the searches below,
# creating it first if it does not exist yet.
user_lookup = {"username": 'leey'}
user1, created = User.objects.get_or_create(**user_lookup)
user1.save()

In [5]:
# Simple search for utterances in the "GermaParlTEI" document source.
# The pattern matches "kohle" unless it is preceded by
# "Europäische Gemeinschaft für " or followed by one of
# "nwasser", "nstoff", "ndiox", "nmonox", "rnte".
# NOTE(review): search_object_type=2 presumably selects utterances - confirm
# against pm.Search.
par_search, created = pm.Search.objects.get_or_create(
    title="Kohle tei utterance",
    text='(?<!Europäische Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)',
    creator=user1,
    document_source="GermaParlTEI",
    search_object_type=2,
)
par_search.save()

if created:
    # Only run the search the first time this Search row is created.
    print("doing search")
    do_search(par_search.id)
    # do_search only receives the id, so this in-memory instance still holds
    # the field values from before the search (the counts below printed None
    # on the first run). Reload it so the summary reflects what is stored.
    # NOTE(review): assumes do_search persists its counts on the Search row.
    par_search.refresh_from_db()

# Summary of the search.
print(par_search.par_count)
print(par_search.utterance_count)
print(par_search.runstats_set.all())
print(par_search.id)

doing search
2962 utterances with search (?<!Europäische Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
32910
2962
None
None
<QuerySet []>
41


In [6]:
# Simple search for utterances, same pattern as the other "Kohle" searches
# but restricted to 2017-01-18 .. 2017-05-09 and to the PDF-scan document
# source.
# NOTE(review): the title suggests this window corresponds to sessions
# 18/211 - 18/245 - confirm.
par_search2, created = pm.Search.objects.get_or_create(
    title="Kohle pdf utterance 18/211 - 18/245",
    text='(?<!Europäische Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)',
    creator=user1,
    start_date=date(2017, 1, 18),
    stop_date=date(2017, 5, 9),
    document_source="from https.*scans of pdfs with xml metadata",
    search_object_type=2,
)
par_search2.save()

if created:
    # Only run the search the first time this Search row is created.
    print("doing search")
    do_search(par_search2.id)
    # do_search only receives the id, so this in-memory instance still holds
    # the field values from before the search (the counts below printed None
    # on the first run). Reload it so the summary reflects what is stored.
    # NOTE(review): assumes do_search persists its counts on the Search row.
    par_search2.refresh_from_db()

# Summary of the search.
print(par_search2.par_count)
print(par_search2.utterance_count)
print(par_search2.runstats_set.all())
print(par_search2.id)

doing search
24 utterances with search (?<!Europäische Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
185
24
None
None
<QuerySet []>
42


In [7]:
# Search for the current parliament: same pattern as the other "Kohle"
# searches, run against the Bundestag open-data XML document source.
par_search_xml, created = pm.Search.objects.get_or_create(
    title="Kohle XML utterance",
    text="(?<!Europäische Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)",
    creator=user1,
    document_source="from www.bundestag.de/service/opendata",
    search_object_type=2,
)
par_search_xml.save()

if created:
    # Only run the search the first time this Search row is created.
    print("doing search")
    do_search(par_search_xml.id)
    # do_search only receives the id, so this in-memory instance still holds
    # the field values from before the search (the counts below printed None
    # on the first run). Reload it so the summary reflects what is stored.
    # NOTE(review): assumes do_search persists its counts on the Search row.
    par_search_xml.refresh_from_db()

# Summary of the search.
print(par_search_xml.par_count)
print(par_search_xml.utterance_count)
print(par_search_xml.runstats_set.all())

doing search
292 utterances with search (?<!Europäische Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
4086
292
None
None
<QuerySet []>


In [8]:
# Try out combine_searches on the three "Kohle" searches defined above
# (TEI, PDF and XML document sources).
ids_to_combine = [
    par_search.id,
    par_search2.id,
    par_search_xml.id,
]
combine_searches(ids_to_combine)

all search object types identical
Created combined search: id = 44


In [9]:
# Load the combined search by the id that combine_searches reported when it
# was run (44), then count the utterances linked to it.
# NOTE(review): the id is hard-coded; re-running combine_searches presumably
# yields a different id - confirm before re-running the notebook.
combined_search_id = 44
s = pm.Search.objects.get(id=combined_search_id)
matching_utterances = pm.Utterance.objects.filter(search_matches=s)
print(matching_utterances.count())

3278


In [10]:
# Extra stopwords (stemmed): stems from the overlap analysis plus stems that
# only produced irrelevant topics.
# An earlier, shorter overlap-only list used to be assigned to this name just
# before this one and was immediately overwritten. All of its entries are
# contained in the set below, so the dead assignment has been removed.
extra_stopwords = {
    'stimmt', 'ganz', 'schreib', 'drucksach', 'kolleg', 'gesagt', 'notwend', 'aufgab', 'parlamentar', 'rat', 'heut', 'bitt', 'tagesordnungspunkt', 'wunscht',
    'neu', 'abstimm', 'zuzustimm', 'mensch', 'abgeordnet', 'vorschrift', 'ziff', 'beantwort', 'land', 'altestenrat', 'ausschuss', 'federfuhr', 'berichterstatt',
    'beschlussempfehl', 'fraktion', 'bundnis', 'moglich', 'schriftlich', 'ander', 'handzeich',
    'uberweisungsvorschlag', 'uberweis', 'welt', 'wer', 'regel', 'geht', 'verehrt', 'jahr', 'enthalt',
    'polit', 'kind', 'mocht', 'vervielfaltigt', 'verordn', 'massnahm', 'antwort', 'prasidentin', 'vorlag',
    'erst', 'tagesordn', 'sag', 'herrn', 'anfrag', 'gibt', 'besond', 'lieb', 'schon', 'umdruck', 'gegenprob',
    'angenomm', 'kollegin', 'antrag', 'bereit', 'berat', 'frau', 'jung', 'betreff', 'brauch', 'dafur', 'word',
    'eingebracht', 'ruf', 'uberwies', 'frag', 'beschlussfass', 'bericht', 'glaub', 'dam', 'mehr', 'beantwortet',
    'herr',
}

In [14]:
# Queue one topic-model run per topic count K for the combined search `s`.
# TODO (from the original author): this probably needs updating for max's
# new method to submit tasks.
K_list = [30, 40, 50, 60, 70]

for K in K_list:
    # Options forwarded to run_tm as keyword arguments.
    task_options = dict(K=K, method="BT", dtm_path=dtm_path)
    # apply_async is the more verbose form of run_tm.delay(...): delay sends
    # to the default queue, whereas here the task is routed explicitly to the
    # "long" queue (which, per the original note, only has two workers).
    # No sleep between submissions is needed because the queue schedules them.
    run_tm.apply_async(args=[s.id], kwargs=task_options, queue="long")