# DaNLP Test

Test the DaNLP software for linguistic processing of Danish text, in particular the dependency parsing. Links:
    
* Installation: https://pypi.org/project/danlp/
* Dependency parsing: https://github.com/alexandrainst/danlp/blob/master/docs/docs/tasks/dependency.md

In [90]:
import danlp
import os
import pandas as pd
import re

from danlp.models import load_spacy_model
from danlp.models import load_bert_ner_model

## 1. Syntactic analysis with DaNLP

### 1.1 POS tagging

In [None]:
# Load the spaCy pipeline provided by DaNLP; the cells below reuse it as `nlp`.
nlp = load_spacy_model()

In [5]:
# https://github.com/alexandrainst/danlp/blob/master/docs/docs/tasks/pos.md

# Run the pipeline on one example sentence and tabulate every token with its
# part-of-speech tag (token.pos_).
doc = nlp('Jeg hopper på en bil, som er rød sammen med Niels.')
analysis = []
for token in doc:
    analysis.append({"text": token.text, "pos": token.pos_})
pd.DataFrame(analysis)

Unnamed: 0,text,pos
0,Jeg,PRON
1,hopper,VERB
2,på,ADP
3,en,DET
4,bil,NOUN
5,",",PUNCT
6,som,ADP
7,er,AUX
8,rød,ADJ
9,sammen,ADV


### 1.2 Dependency parsing

In [75]:
def print_results(doc):
    """Tabulate the tokens of a parsed document.

    Despite the name, nothing is printed: the table is returned so that the
    notebook can render it.

    Args:
        doc: iterable of tokens exposing ``text``, ``lemma_``, ``head.i``,
            ``dep_`` and ``pos_``.

    Returns:
        pandas.DataFrame with one row per token and the columns ``text``,
        ``lemma``, ``head`` (index of the token's head), ``dep`` and ``pos``.
    """
    rows = [
        {
            "text": tok.text,
            "lemma": tok.lemma_,
            "head": tok.head.i,
            "dep": tok.dep_,
            "pos": tok.pos_,
        }
        for tok in doc
    ]
    return pd.DataFrame(rows)

In [76]:
# https://github.com/alexandrainst/danlp/blob/master/docs/docs/tasks/dependency.md

# Parse one example sentence and show lemma, head index, dependency label and
# POS tag for every token.
doc = nlp('Ordene sættes sammen til meningsfulde sætninger.')
print_results(doc)

Unnamed: 0,text,lemma,head,dep,pos
0,Ordene,Ordene,1,nsubj,NOUN
1,sættes,sættes,1,ROOT,VERB
2,sammen,sammen,1,advmod,ADV
3,til,til,5,case,ADP
4,meningsfulde,meningsfulde,5,amod,ADJ
5,sætninger,sætninger,2,obl,NOUN
6,.,.,1,punct,PUNCT


### 1.3 Named Entity Recognition

In [91]:
# Load DaNLP's BERT NER model; the recorded output below shows a file being
# downloaded when this cell ran.
bert = load_bert_ner_model()

Downloading file /tmp/tmpu9z17901


100% |#########################################################################|


In [125]:
# Tag a pre-tokenised sentence. With IOBformat=False the recorded result is a
# dict holding the joined text and a list of entity spans (type, text, offsets).
bert.predict([ "Som", "Paul", "W.", "Franks", ",", "professor", "i", "genetisk", "epidemiologi", "ved", "Lund", "Universitet", ",", "bemærker" ], IOBformat=False)

{'text': 'Som Paul W. Franks , professor i genetisk epidemiologi ved Lund Universitet , bemærker',
 'entities': [{'type': 'PER',
   'text': 'Paul W. Franks',
   'start_pos': 4,
   'end_pos': 18},
  {'type': 'ORG', 'text': 'Lund Universitet', 'start_pos': 59, 'end_pos': 75}]}

## 2. Process texts

In [30]:
def read_texts(directory, encoding="utf-8"):
    """Read every ``.txt`` file of a directory into memory.

    Args:
        directory: path of the directory to scan (not recursive).
        encoding: text encoding used to decode the files. Defaults to UTF-8 so
            that Danish characters are decoded the same way on every
            platform instead of depending on the locale's default encoding.

    Returns:
        dict mapping each file name (without the directory part) to the full
        contents of that file as a single string.

    Raises:
        OSError: if the directory cannot be listed or a file cannot be read.
        UnicodeDecodeError: if a file is not valid in ``encoding``.
    """
    texts = {}
    # os.listdir returns entries in arbitrary order; sorting makes the order
    # in which the dict is filled reproducible between runs.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".txt"):
            continue
        # The context manager closes the file even when reading fails.
        with open(os.path.join(directory, file_name), "r", encoding=encoding) as infile:
            texts[file_name] = infile.read()
    return texts

In [126]:
def make_ner_analysis(text_tokens, model=None):
    """Run named-entity recognition on a token stream, one segment at a time.

    The token stream is cut into segments at every "." token and at every
    whitespace-only token (line breaks, runs of spaces or blank lines). The
    separator tokens themselves are dropped, and empty segments are skipped.

    Args:
        text_tokens: iterable of token strings, e.g. the ``text`` of each
            token of a parsed document.
        model: object with a ``predict(tokens, IOBformat=False)`` method.
            When omitted, the notebook-level ``bert`` model is used, which
            keeps existing one-argument calls working.

    Returns:
        list holding the result of ``model.predict`` for each non-empty
        segment, in document order.
    """
    segments = [[]]
    for token in text_tokens:
        # Any whitespace-only token counts as a break, not only the exact
        # strings '\n\n', '\n' and ' ': a token such as '\n\n\n' would
        # otherwise be passed to the model as if it were a word.
        if token == "." or not token.strip():
            segments.append([])
        else:
            segments[-1].append(token)
    segments = [segment for segment in segments if segment]
    if not segments:
        return []
    if model is None:
        # Resolved only when there is something to predict, so that empty
        # input never requires the global model to be loaded.
        model = bert
    return [model.predict(segment, IOBformat=False) for segment in segments]

In [129]:
# Load all .txt files from ../danish, keyed by file name.
texts = read_texts("../danish")

There are three texts: "20200528.txt" (longest), "20201213.txt" (shortest) and "20211212.txt".

In [130]:
# Parse the text "20200528.txt" and show rows 25-30 of its token table.
doc = nlp(texts["20200528.txt"])
nlp_analysis = print_results(doc)
nlp_analysis[25:31]

Unnamed: 0,text,lemma,head,dep,pos
25,Oprindeligt,Oprindeligt,18,flat,ADV
26,udgivet,udgivet,6,acl:relcl,VERB
27,af,af,28,case,ADP
28,Gatestone,Gatestone,26,obl,PROPN
29,Institute,Institute,28,flat,PROPN
30,.,.,3,punct,PUNCT


In [131]:
# Run NER over the token texts of the parsed document and show the first
# three segment results.
ner_analysis = make_ner_analysis([ token.text for token in doc ])
ner_analysis[:3]

[{'text': 'Den svenske “ model ” til bekæmpelse af coronavirus',
  'entities': []},
 {'text': '28. maj 2020', 'entities': []},
 {'text': '24 NYT Sverige , Tophistorie',
  'entities': [{'type': 'LOC',
    'text': 'Sverige',
    'start_pos': 7,
    'end_pos': 14}]}]

## 3. Who, where, when, what, why and how?

In [58]:
def show_results(results_dict):
    """Turn a ``{key: count}`` dict into a ranked table.

    Args:
        results_dict: dict mapping a key to its count.

    Returns:
        pandas.DataFrame with the columns ``key``, ``count`` and
        ``best value``, sorted by descending count. ``best value`` holds
        "yes" for the keys selected by ``get_best_keys`` and "" otherwise.
    """
    ranked = sorted(results_dict.items(), key=lambda item: item[1], reverse=True)
    records = [
        {"key": name, "count": total, "best value": ""}
        for name, total in ranked
    ]
    results_df = pd.DataFrame(records)
    winners = get_best_keys(results_df)
    # Mark the winning rows one by one; an empty table has no index entries,
    # so nothing is touched in that case.
    for idx in results_df.index:
        if results_df.at[idx, "key"] in winners:
            results_df.at[idx, "best value"] = "yes"
    return results_df

In [60]:
def get_best_keys(results_df):
    """Return the key or keys with the highest count in a results table.

    Args:
        results_df: DataFrame with a ``key`` column of strings and a numeric
            ``count`` column. An empty frame (with or without columns) is
            accepted.

    Returns:
        list of the keys sharing the highest count, in table order. If at
        least one of them starts with an upper-case letter, only those
        capitalised keys are returned; otherwise all tied keys are returned.
    """
    if results_df.empty:
        return []
    best_count = -1
    best_keys = []
    for key, count in zip(results_df["key"], results_df["count"]):
        if count > best_count:
            best_count = count
            best_keys = [key]
        elif count == best_count:
            best_keys.append(key)
    # str.isupper is Unicode-aware, so Danish capitals such as Æ, Ø and Å
    # count as upper case; the ASCII-only pattern ^[A-Z] missed them.
    capitalised = [key for key in best_keys if key[:1].isupper()]
    return capitalised or best_keys

In [86]:
def get_actions(nlp_table_df):
    """Count how often each verb form occurs in a token table.

    Args:
        nlp_table_df: DataFrame with ``text`` and ``pos`` columns, one row per
            token.

    Returns:
        dict mapping the text of every token tagged ``VERB`` that contains at
        least one letter to its number of occurrences.
    """
    actions = {}
    for _, row in nlp_table_df.iterrows():
        text = row["text"]
        # Drop tokens tagged VERB that contain no letter at all (numbers,
        # punctuation). str.isalpha is Unicode-aware, so a token written only
        # with æ, ø or å is kept, unlike with the ASCII-only class [a-zA-Z].
        if row["pos"] == "VERB" and any(char.isalpha() for char in text):
            actions[text] = actions.get(text, 0) + 1
    return actions

In [87]:
def get_actors(nlp_table_df):
    """Count the tokens that serve as nominal subject.

    Args:
        nlp_table_df: DataFrame with ``text`` and ``dep`` columns, one row per
            token.

    Returns:
        dict mapping the text of every token whose ``dep`` is ``nsubj`` to
        its number of occurrences; tokens with empty text are ignored.
    """
    actors = {}
    for record in nlp_table_df.to_dict("records"):
        if record["dep"] != "nsubj":
            continue
        name = record["text"]
        if name == "":
            continue
        actors[name] = actors.get(name, 0) + 1
    return actors

In [123]:
def get_locations(ner_analysis):
    """Count the location entities found by the NER step.

    Args:
        ner_analysis: list of dicts, each with an ``entities`` list whose
            items carry ``type`` and ``text`` keys.

    Returns:
        dict mapping the text of every entity of type ``LOC`` to its number
        of occurrences; entities with empty text are ignored.
    """
    names = [
        entity["text"]
        for paragraph in ner_analysis
        for entity in paragraph["entities"]
        if entity["type"] == "LOC" and entity["text"] != ""
    ]
    locations = {}
    for name in names:
        locations[name] = locations.get(name, 0) + 1
    return locations

Still missing: extraction of times (when), causes (why) and manners (how).

In [67]:
# Five most frequent verb forms in the parsed text.
show_results(get_actions(nlp_analysis))[:5]

Unnamed: 0,key,count,best value
0,er,10,yes
1,har,5,
2,sagde,4,
3,mener,3,
4,have,3,


In [89]:
# Ten most frequent nominal subjects (dep == "nsubj") in the parsed text.
show_results(get_actors(nlp_analysis))[:10]

Unnamed: 0,key,count,best value
0,Sverige,13,yes
1,der,10,
2,vi,9,
3,de,8,
4,man,8,
5,Tegnell,6,
6,det,6,
7,han,4,
8,WHO,2,
9,Jeg,2,


In [124]:
# Five most frequent LOC entities found by the NER step.
show_results(get_locations(ner_analysis))[:5]

Unnamed: 0,key,count,best value
0,Sverige,21,yes
1,Iran,5,
2,Sveriges,5,
3,Danmark,3,
4,Norge,3,
