#### Steps
- Map the sentences and their sources.
- Choose some domain (themes of the news), e.g., "sport", "politik"
- Read and tokenize the sentences of each domain (only keep words)
- Pair the words with their lemmas using either the word_lemma list or a lemmatizer
- Count the lemmas and save lemmas and their counts into a dictionary

In [111]:
import os
import pandas as pd 
import re
from collections import Counter

Map sentences and their sources

In [112]:
data = []
with open(os.path.join("deu_news_2024_300K", "deu_news_2024_300k-sentences.txt")) as f1, open(os.path.join("deu_news_2024_300K", "deu_news_2024_300k-sources.txt")) as f2:
    for sent, src in zip(f1, f2):
        sent = sent.split()
        src = src.split()
        if sent[0] == src[0]: ## just double checking
            data.append([sent[0], " ".join(sent[1:]), src[1], src[2]])  
data_df = pd.DataFrame(data, columns=["id", "sentence", "source", "date"])     

In [113]:
data_df

Unnamed: 0,id,sentence,source,date
0,1,"""007 Action"" (ab 007. September) ist eine über...",https://www.kleinezeitung.at/wirtschaft/625303...,2024-01-05
1,2,«1.604 Euro Durchschnittsrente nach mindestens...,https://www.zdf.de/sport/dfb-pokal-ergebnisse-...,2024-10-30
2,3,«16 Betriebe haben sich nicht an die gesetzlic...,https://www.luzernerzeitung.ch/sport/sport-new...,2024-12-04
3,4,"""16 Prozent haben körperliche Gewalt erfahren ...",https://www.zdf.de/sport/fussball-champions-le...,2024-10-02
4,5,"""1883"" spielt im Jahr des Titels und folgt ein...",https://www.heise.de/bestenlisten/testsieger/t...,2024-12-07
...,...,...,...,...
265570,265571,Unsere AZ-Reporterinnen Andrea Zaschka und Len...,https://www.zeit.de/politik/ausland/2024-12/ji...,2024-12-31
265571,265572,Unsere Bar ist ein gutes Beispiel dafür.,https://www.zeit.de/politik/deutschland/2024-1...,2024-12-31
265572,265573,Unsere beiden Billig-Lader von Aliexpress aber...,https://www.zeit.de/sport/2024-12/doku-em-2024...,2024-12-31
265573,265574,Unsere Bevölkerung ist sehr jung.,https://www.zeit.de/video/2024-12/636657051311...,2024-12-31


Check the sources and decide on domains. <br>
sport, politik, finanzen+finanznachrichten+finanzierung, story, [leben, kultur, lokales]

In [114]:
domains = []
for src in data_df["source"]:
    src = src.split("/") # split by section delimiter "/"
    #NOTE: what about source of multiple domains?
    if "sport" in src:
        domains.append("sport")
    elif "politik" in src:
        domains.append("politics")
    elif "story" in src:
        domains.append("story")
    elif "leben" in src:
        domains.append("life")
    elif "kultur" in src:
        domains.append("culture")
    elif "lokales" in src:
        domains.append("locals")
    else:
        d = ""
        for t in src:
            if "finanz" in t:
                d = "finance"
        domains.append(d)
len(domains)

265575

In [115]:
data_df["domain"] = domains
data_df[data_df["domain"]!=""].shape

(86788, 5)

In [116]:
data_df.groupby(["domain"]).count()/data_df.shape[0]

Unnamed: 0_level_0,id,sentence,source,date
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.673207,0.673207,0.673207,0.673207
culture,0.023568,0.023568,0.023568,0.023568
finance,0.015551,0.015551,0.015551,0.015551
life,0.011737,0.011737,0.011737,0.011737
locals,0.06736,0.06736,0.06736,0.06736
politics,0.083942,0.083942,0.083942,0.083942
sport,0.101659,0.101659,0.101659,0.101659
story,0.022977,0.022977,0.022977,0.022977


Group and tokenize the sentences of each domain. <br>

In [117]:
data_sport = data_df[data_df["domain"]=="sport"]
data_politics = data_df[data_df["domain"]=="politics"]
data_finance = data_df[data_df["domain"]=="finance"]
data_life = data_df[data_df["domain"].isin(["locals", "life", "culture"])]
data_story = data_df[data_df["domain"] == "story"]

In [None]:
# Tokenization: 
# 1. remove digits
# 2. extract all other symbols and German words
# TODO. decompose compounds ???
def tokenize(domain_data):
    word_domain = []
    for sent in domain_data["sentence"]:
        # 1st lower-case and split by spaces TODO Should we lower-case all German words?
        word_list = sent.lower().split() 
        # Iterate over all words/tokens in the sentence
        for token in word_list:
            # For each word: remove the digits 
            token = re.sub("\d+", "", token) 
            # For each word: find all symbols that are not German letters
            matches = re.finditer("[\W_]", token, re.UNICODE) #NOTE, should we split by "-"?
            # For each word: replace all found non-German-letter symbols with '" " + itself + " "'
            if matches:
                for match in matches:
                    token = token.replace(match.group(), " "+match.group()+" ")
            # For each word: split each word by spaces and add to the domain word list
            word_domain += token.split()            
    return word_domain

In [119]:
words_sport = tokenize(data_sport)
words_politics = tokenize(data_politics)
words_finance = tokenize(data_finance)
words_life = tokenize(data_life)
words_story = tokenize(data_story)

Prepare the wordform_lemma list <br>
Add/map the lemma to/with each word of the domain

In [120]:
# key: form; value: (lemma, POS, freq)
form_lemma_dict = dict()
with open(os.path.join("DeReKo-2014-II-form_lemma_pos-list", "DeReKo-2014-II-MainArchive-STT.100000.freq")) as f:
    for line in f:
        line = line.split()
        if len(line) == 4:
            form, lemma, pos, freq = line[0], line[1], line[2], line[3]
        elif len(line) == 3:
            #print(line)
            form, lemma, pos, freq = line[0], line[0], line[1], line[2]
        #TODO Are there same forms with different lemmas?
        if form not in form_lemma_dict:
            form_lemma_dict[form] = (lemma, pos, freq)   
len(form_lemma_dict)

93948

In [121]:
#!python -m pip install HanTa
# the package that return German word lemma and pos
from HanTa import HanoverTagger as ht
tagger = ht.HanoverTagger('morphmodel_ger.pgz')
tagger.analyze("atomsprengköpfe")

('atomsprengköpfen', 'VV(FIN)')

In [122]:
# Map the words with their lemmas
# 1. retrieve from the "form_lemma_dict"
# 2. If not exists there, use German lemmatizer to get the lemma
def map_lemma(words_domain, lemmatizer):
    word_lemma_domain = []
    not_in_form_lemma_list = []
    for word in words_domain:
        if word in form_lemma_dict:
            lemma, pos = form_lemma_dict[word][0], form_lemma_dict[word][1]
            word_lemma_domain.append((word, lemma, pos))
        else:
            not_in_form_lemma_list.append(word)
            lemma, pos = lemmatizer.analyze(word)
            word_lemma_domain.append((word, lemma, pos))          
    return word_lemma_domain

In [123]:
# compute frequencies of lemmas and save them into a csv file
def freq_to_tsv(word_lemma_list, domain):
    print(f"Total lemma count of {domain}:", len(word_lemma_list))
    dict = Counter([(t[1], t[2]) for t in word_lemma_list]).most_common()
    dict = [(tuple[0][0], tuple[0][1], tuple[1]) for tuple in dict]
    dict = pd.DataFrame(dict, columns=["lemma", "pos", "abs_freq"])
    dict["rel_freq"] = dict["abs_freq"]/dict["abs_freq"].sum()
    dict.to_csv(f"frequency_dict_{domain}.tsv", sep="\t")
    return dict   

In [124]:
# Results of sport news
word_lemma_sport = map_lemma(words_sport, tagger)
sport_dict = freq_to_tsv(word_lemma_sport, "sport")

Total lemma count of sport: 490474


In [125]:
# Results of politics news
word_lemma_politics = map_lemma(words_politics, tagger)
politics_dict = freq_to_tsv(word_lemma_politics, "politics")

Total lemma count of politics: 402768


In [126]:
# Results of finance news
word_lemma_finance = map_lemma(words_finance, tagger)
finance_dict = freq_to_tsv(word_lemma_finance, "finance")
finance_dict

Total lemma count of finance: 74956


Unnamed: 0,lemma,pos,abs_freq,rel_freq
0,die,ART,6673,0.089026
1,.,$.,4257,0.056793
2,",","$,",3220,0.042959
3,eine,ART,1591,0.021226
4,und,KON,1430,0.019078
...,...,...,...,...
14297,proaktiv,ADJ(D),1,0.000013
14298,entschärfen,VVINF,1,0.000013
14299,matrosov,ADJ(D),1,0.000013
14300,Gründer,NN,1,0.000013


In [127]:
# Results of life news
word_lemma_life = map_lemma(words_life, tagger)
life_dict = freq_to_tsv(word_lemma_life, "life")
life_dict

Total lemma count of life: 492846


Unnamed: 0,lemma,pos,abs_freq,rel_freq
0,die,ART,44004,0.089285
1,.,$.,28349,0.057521
2,",","$,",20676,0.041952
3,eine,ART,10790,0.021893
4,und,KON,9819,0.019923
...,...,...,...,...
50254,beunruhigt,ADJD,1,0.000002
50255,Struber,NE,1,0.000002
50256,heimersheim,ADV,1,0.000002
50257,Südflorida,NN,1,0.000002


In [128]:
# Results of story news
word_lemma_story = map_lemma(words_story, tagger)
story_dict = freq_to_tsv(word_lemma_story, "story")
story_dict

Total lemma count of story: 110383


Unnamed: 0,lemma,pos,abs_freq,rel_freq
0,die,ART,9914,0.089815
1,.,$.,6379,0.057790
2,",","$,",4337,0.039290
3,eine,ART,2429,0.022005
4,und,KON,2159,0.019559
...,...,...,...,...
18507,Küstenstreifen,NN,1,0.000009
18508,Integrated,FM,1,0.000009
18509,Classification,FM,1,0.000009
18510,Ipc,NE,1,0.000009
