In [1]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import time
import sys
import os

import seaborn as sns
import matplotlib.dates as mdates
from datetime import datetime, date

# preamble for jupyter notebook and django
import django
import platform

if platform.node() == "srv-mcc-apsis":
    sys.path.append("/home/muef/tmv/BasicBrowser/")
    #sys.path.append('/home/leey/tmv/BasicBrowser/')
    dtm_path = "/home/galm/software/dtm/dtm/main"
else:
    # local paths
    sys.path.append('/media/Data/MCC/tmv/BasicBrowser/')
    dtm_path = "/home/finn/dtm/dtm/main"

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

# import from appended path
import parliament.models as pm
from parliament.tasks import do_search, run_tm, combine_searches
import cities.models as cmodels
from django.contrib.auth.models import User
import tmv_app.models as tm
import scoping.models as sm
from utils.tm_mgmt import update_topic_scores
from django.db.models import Q, Count, Func, F, Sum, Avg, Value as V
from django.db.models.functions import TruncDate, TruncMonth, TruncYear

In [2]:
user1 =  User.objects.get(username='muef')
coal_project = sm.Project.objects.get(id=134)

search_regex = '(?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)'
search_object_type = 2
search_object_text = 'utterance'

In [3]:
# search in older protocols
par_search_old, created = pm.Search.objects.get_or_create(
                title="Kohle pdf 01/01 - 13/85 {}".format(search_object_text),
                text=search_regex,
                creator=user1,
                stop_date=date(1996,2,7),
                document_source="from https.*scans of pdfs with xml metadata",
                search_object_type=search_object_type,
                project=coal_project)
par_search_old.save()

if created:
    print("doing search")
    do_search(par_search_old.id)
print(par_search_old.par_count)
print(par_search_old.utterance_count)
print(par_search_old.runstats_set.all())
print(par_search_old.id)

doing search
5607 utterances with search (?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
62076
5607
None
None
<QuerySet []>
82


In [4]:
# search in speeches parsed from TEI documents
par_search, created = pm.Search.objects.get_or_create(
                title="Kohle tei {}".format(search_object_text),
                text=search_regex,
                creator=user1,
                document_source="GermaParlTEI",
                search_object_type=search_object_type,
                project=coal_project)
par_search.save()

if created:
    print("doing search")
    do_search(par_search.id)
print(par_search.par_count)
print(par_search.utterance_count)
print(par_search.runstats_set.all())
print(par_search.id)

doing search
4542 paragraphs with search (?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
4542
2951
None
None
<QuerySet []>
77


In [5]:
# search for missing protocols from 18th period
par_search2, created = pm.Search.objects.get_or_create(
                title="Kohle pdf 18/211 - 18/245 {}".format(search_object_text),
                text=search_regex,
                creator=user1,
                start_date=date(2017,1,18),
                stop_date=date(2017,5,9),
                document_source="from https.*scans of pdfs with xml metadata",
                search_object_type=search_object_type,
                project=coal_project)
par_search2.save()

if created:
    print("doing search")
    do_search(par_search2.id)
print(par_search2.par_count)
print(par_search2.utterance_count)
print(par_search2.runstats_set.all())
print(par_search2.id)

doing search
34 paragraphs with search (?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
34
24
None
None
<QuerySet []>
78


In [6]:
# search for current parliament
par_search_xml, created = pm.Search.objects.get_or_create(
                title="Kohle XML {}".format(search_object_text),
                text=search_regex,
                creator=user1,
                start_date=date(2017,10,24),
                document_source="XML from www.bundestag.de/service/opendata",
                search_object_type=search_object_type,
                project=coal_project)
par_search_xml.save()

if created:
    print("doing search")
    do_search(par_search_xml.id)
print(par_search_xml.par_count)
print(par_search_xml.utterance_count)
print(par_search_xml.runstats_set.all())
print(par_search_xml.id)

doing search
980 paragraphs with search (?<!Gemeinschaft für )kohle(?!nwasser)(?!nstoff)(?!ndiox)(?!nmonox)(?!rnte)
980
510
None
None
<QuerySet []>
79


In [12]:
# combine function
#combine_searches([par_search.id,par_search2.id,par_search_xml.id, par_search_old.id])
combine_searches([72, 73, 74, 82])

all search object types identical
Created combined search: id = 83


In [11]:
pm.Utterance.objects.filter(search_matches=75).count()

5033

In [8]:
pm.Utterance.objects.filter(search_matches=82, document__parlperiod__n=12).count()

581

In [9]:
# delete searches
#pm.Search.objects.get(id=70).delete()

(1,
 {'parliament.Document_search_matches': 0,
  'parliament.Paragraph_search_matches': 0,
  'parliament.Search': 1,
  'parliament.Search_speaker_regions': 0,
  'parliament.Utterance_search_matches': 0})