## Parse

In [1]:
%load_ext autoreload
%autoreload 2
from gesetz import Gesetz, progressBar
import pickle

In [2]:
import sys
sys.path.append('..')
import collections
import statistics
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
%matplotlib inline
from typing import List, Dict
import re


## Topic model

In [3]:
with open("pickles/laws_links.pickle", "rb") as fp:   # Unpickling
    Gesetz.collected_laws = pickle.load(fp)

In [4]:
from nltk.tokenize import word_tokenize 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import unicodedata


In [5]:
def remove_symbols(text):
        text = text.replace("\xa0","")
        text = text.replace("–","")
        text = text.replace("§","")
        text = text.replace("'","")
        text = text.replace("“","")
        text = text.replace("„","")
        text = text.replace("."," ")
        text = text.replace("‒","")
        text = text.replace("--","")
        text = text.replace("+++","")
        return text

def remove_numbers(text):
    return re.sub('( |\\\n)\(?\d+[a-z]?\.?\)?( |\\\n)', ' ', text)

In [6]:
#Take care of stopwords, punctuation
from nltk.stem.cistem import Cistem


german_stopwords = stopwords.words('german')

exclude = set([])
exclude.add("absatz")
exclude.add("artikel")
exclude.add("paragraph")
exclude.add("gesetz")


def get_lemma(word):
    lemma = wordnet.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)


def prepare_content_for_lda(law:Gesetz):
    text = law.content
    text = remove_numbers(remove_symbols(text.casefold()))
    tokens = word_tokenize(text)
    stemmer = Cistem()
    tokens = [get_lemma2(token) for token in tokens if token not in german_stopwords and token not in set(string.punctuation) and len(token)>1]
    return tokens

In [7]:
corpus_clean = {}
for law in progressBar(Gesetz.collected_laws.values()):
    corpus_clean[law.name_short] = prepare_content_for_lda(law)

1663/1663 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% (ZweckVG)


In [8]:
import pickle
if len(corpus_clean)>1000:
    with open("pickles/corpus.pickle", "wb") as fp:   #Pickling
        pickle.dump(corpus_clean, fp)
else:
    print("Corpus is to short.")

## lets start with the topic modeling

In [1]:
import pickle

from gesetz import Gesetz, progressBar
with open("pickles/laws_links.pickle", "rb") as fp:   # Unpickling
    Gesetz.collected_laws = pickle.load(fp)

with open("pickles/corpus.pickle", "rb") as fp:   # Unpickling
    corpus_load = pickle.load(fp)

In [2]:
corpus_mirror = [key for key in corpus_load.keys()]
corpus_raw = [corpus_load[key] for key in corpus_mirror]

In [3]:
print(len(corpus_raw))

1663


In [4]:
print(corpus_raw[100])

['``', 'gesetz', 'rabatte', 'arzneimittel', 'dezember', 'bgbl', '2262', '2275', 'zuletzt', 'artikel', 'gesetzes', 'november', 'bgbl', 'geändert', 'worden', "''", 'stand', 'zuletzt', 'geändert', 'art', '11', 'fussnote', 'textnachweis', 'ab', 'wurde', 'artikel', '12', 'bundestag', 'beschlossen', 'gem', 'artikel', 'ab', 'kraft', 'getreten', 'anspruch', 'abschläge', 'pharmazeutischen', 'unternehmer', 'unternehmen', 'privaten', 'krankenversicherung', 'trägern', 'kosten', 'krankheits-', 'pflege-', 'geburtsfällen', 'beamtenrechtlichen', 'vorschriften', 'verschreibungspflichtige', 'arzneimittel', 'deren', 'kosten', 'ganz', 'teilweise', 'erstattet', 'anteil', 'kostentragung', 'abschläge', 'entsprechend', 'absatz', '1a', 'fünften', 'buches', 'sozialgesetzbuch', 'gewähren', 'gilt', 'sonstige', 'träger', 'kosten', 'krankheitsfällen', 'rahmen', 'absicherung', 'krankheitsfall', 'tragen', 'versicherungspflicht', 'absatz', 'satz', 'versicherungsvertragsgesetzes', 'absatz', 'nummer', 'fünften', 'buches

In [5]:
# Importing Gensim
import time

import gensim
from gensim import corpora
from gensim.models import LdaMulticore, LdaModel, TfidfModel

import pyLDAvis.gensim



In [6]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(corpus_raw)
print(len(dictionary))
dictionary.filter_extremes(no_below=0,no_above=0.1,keep_n=150000)
print(len(dictionary))

  and should_run_async(code)
148207
147091


In [7]:
# Converting list of documents (corpus) into a matrix which counts the occurence of each word in the dictionary in the specific document
corpus = [dictionary.doc2bow(law) for law in corpus_raw]

  and should_run_async(code)


In [8]:
# Set training parameters.
num_topics = 30
chunksize = 300
passes = 300




  and should_run_async(code)


In [9]:
import re
import math
import matplotlib.pyplot as plt

import os

import logging
logging.basicConfig(filename='gensim.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO)

  and should_run_async(code)
  from collections import MutableMapping
  from collections import Iterable, Mapping
  from collections import Iterable, Mapping
  from collections import Sized


In [10]:
from gensim.models.callbacks import CoherenceMetric

  and should_run_async(code)


In [11]:
from gensim.models.callbacks import CallbackAny2Vec
class EpochLogger(CallbackAny2Vec):
    logger="shell"
    init = 0
    def __init__(self):

        self.epoch = 1
    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
    def get_value(self,topics, model, other_model):
        if self.init == 0:
            self.init = time.perf_counter()
        elapsed = time.perf_counter() - self.init
        print("Epoch #"+str(self.epoch)+" after "+str(math.floor(elapsed/3600))+"hrs "+str(math.floor((elapsed%3600)/60))+ "min "+str(math.floor(elapsed%60))+"sec")

        if self.epoch % 10 ==0:
            print("saving image")
            p = re.compile(r"(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
            matches = [p.findall(l) for l in open('gensim.log')]
            matches = [m for m in matches if len(m) > 0]
            tuples = [t[0] for t in matches]
            perplexity = [float(t[1]) for t in tuples]
            liklihood = [float(t[0]) for t in tuples]
            iter = list(range(0,len(tuples)*10,10))
            plt.plot(iter,liklihood,c="black")
            plt.ylabel("log liklihood")
            plt.xlabel("iteration")
            plt.title("Topic Model Convergence")
            plt.grid()
            plt.savefig("training/convergence_likelihood_"+str(self.epoch)+".png")
            plt.close()

        if self.epoch % 50 ==0 or self.epoch == 10:
            print("creating pyldavis")
            try:
                visualization = pyLDAvis.gensim.prepare(model,corpus,dictionary, mds='mmds',sort_topics=False)
                pyLDAvis.save_html(visualization,"ldavis/vis-"+str(self.epoch)+".html")
            except TypeError:
                print("Oops!  Saving pyLDAvis for epoch "+self.epoch+" gave a TypeError...")
            print("finished.")

        self.epoch += 1
        return 0
epoch_logger = EpochLogger()

  and should_run_async(code)


In [None]:
# Running and training LDA model on the documents word_occurence_counting_matrix

start = time.perf_counter()
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    num_topics=num_topics,
    passes=passes,
    callbacks=[epoch_logger],
    random_state=2021,
)
end = time.perf_counter()

sec = end - start

print("Finished! "+str(sec/3600))

In [None]:
ldamodel.save(fname="savedmodel/model_30_300_fulldict")


## load the model

In [12]:
# names of topics
ldamodel = LdaModel.load(fname="savedmodels/model_30_300_fulldict")
topic_names = {
    2:(1,"GesR"),
    28:(2,"UmweltR"),
    21:(3,"ProzessR"),
    14:(4,"StrafR"),
    16:(5,"BahnR"),
    12:(6,"ArbR"),
    26:(7,"BeamtenR"),
    13:(8,"MedR"),
    7:(9,"SteuerR"),
    5:(10,"VersorgungsR"),
    25:(11,"SachenR"),
    8:(12,"OrganR"),
    29:(13,"VerkehrsR"),
    6:(14,"FinMarktR"),
    15:(15,"EntsorgungsR"),
    4:(16,"KommR"),
    20:(17,"WahlR"),
    24:(18,"BauR"),
    23:(19,"GebührenR"),
    3:(20,"BerufsR"),
    30:(21,"InsolvenzR"),
    27:(22,"StatistikR"),
    17:(23,"BankR"),
    10:(24,"PatentR"),
    1:(25,"ProduktR"),
    18:(26,"AgrarR"),
    19:(27,"KriegsfolgenR"),
    11:(28,"WohnR"),
    9:(29,"AsylR"),
    22:(30,"SonderVersR"),
   }

  and should_run_async(code)


In [None]:
visualization_unsorted = pyLDAvis.gensim.prepare(ldamodel,corpus,dictionary, mds='mmds',sort_topics=False)

In [None]:
pyLDAvis.show(visualization_unsorted)

# Do inference

In [13]:
new_doc_bow = dictionary.doc2bow(corpus_raw[830])
print(ldamodel.get_document_topics(new_doc_bow))

[(1, 0.025421666), (7, 0.033902537), (11, 0.61739963), (14, 0.030636057), (15, 0.12550537), (23, 0.033177197), (24, 0.057360116), (25, 0.030003246), (27, 0.025330858), (28, 0.015133855)]
  and should_run_async(code)


In [14]:
#categorize the corpus by the retrieved topics
speech_topic_matrix = []

for law in corpus:
    vector = ldamodel[law]
    speech_topic_matrix.append(vector)

  and should_run_async(code)


In [15]:
end = len(corpus_mirror)
#end = 5
Gesetz.topic_names = topic_names
for index in range(end):
    topics = speech_topic_matrix[index]
    #print(corpus_mirror[index]+str(topics))
    Gesetz.collected_laws[corpus_mirror[index]].set_topics(topics)

  and should_run_async(code)


In [16]:
print(Gesetz.get_topic_count())

{'OrganR': 116, 'BeamtenR': 85, 'VersorgungsR': 158, 'ArbR': 90, 'MedR': 80, 'KommR': 19, 'UmweltR': 219, 'KriegsfolgenR': 22, 'StatistikR': 44, 'BankR': 56, 'GebührenR': 10, 'EntsorgungsR': 21, 'FinMarktR': 16, 'InsolvenzR': 17, 'WohnR': 5, 'GesR': 174, 'BahnR': 66, 'SachenR': 71, 'WahlR': 12, 'ProduktR': 14, 'ProzessR': 106, 'StrafR': 106, 'SteuerR': 70, 'PatentR': 19, 'BauR': 23, 'AgrarR': 12, 'VerkehrsR': 19, 'BerufsR': 10, 'AsylR': 3}
  and should_run_async(code)


In [None]:
for (num,words) in ldamodel.show_topics(30,20,formatted=False):
    topic = topic_names[num+1][1]
    if topic in ["GesR","FinMarktR","OrganR","ArbR","WahlR","ProzessR","SachenR","UmweltR","InsolvenzR","StrafR"]:
        print((topic+": ").ljust(12)+str([word[0] for word in words]))

In [17]:
import pickle
with open("pickles/laws_links_topics.pickle", "wb") as fp:   #Pickling
    pickle.dump(Gesetz.collected_laws, fp)

  and should_run_async(code)


  and should_run_async(code)
GesR:       ['versicherer', 'aktien', 'handelsgesetzbuchs', 'versicherungsnehmer', 'genossenschaft', 'gesellschafter', 'ehegatten', 'kündigung', 'aif', 'verbraucher', 'rechtsträger', 'sache', 'verbindlichkeiten', 'unterabschnitt', 'aktiengesetzes', 'unternehmer', 'vermögensgegenstände', 'handelsregister', 'geschäftsjahr', 'gesellschaften']
FinMarktR:  ['aufsichtsbehörde', 'institut', 'versicherungsunternehmen', 'bundesbank', 'kunden', 'wertpapierdienstleistungsunternehmen', 'kreditwesengesetzes', 'anleger', 'inländischen', 'instituts', 'handel', 'gruppe', 'wertpapiere', 'risiken', 'institute', 'delegierten', 'finanzinstrumente', 'organisierten', '575/2013', 'aktien']
OrganR:     ['stiftung', 'vorstand', 'bundestages', 'frau', 'präsident', 'sitzungen', 'mitgliedschaft', 'dienststelle', 'betriebsrat', 'ausschuss', 'stellvertreter', 'amtszeit', 'präsidenten', 'verwaltungsrat', 'gewählt', 'vorsitzende', 'geschäftsordnung', 'stimmen', 'organe', 'vorstandes']
Arb