### Exploring dependency tree parsing using stanza and also CoreNLP

In [3]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import warnings
import nltk
import spacy
import stanza
import en_core_web_md
import matplotlib.pyplot as plt
import plotly.express as px
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict
from textblob import TextBlob
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
word_token = TreebankWordTokenizer()

2023-04-11 10:35:48.907092: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-11 10:35:49.066631: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-11 10:35:49.572226: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-11 10:35:49.572319: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

### Load up the speeches

In [4]:
# Load up the files
#paths = ['./Data/speeches/', './Data/NYTimes/', './Data/WSJ/'] 
paths = ['./Data/GWB/']
list_of_files = []

dates = pd.read_csv('./Data/genData/dateSpeeches.csv')
dates = pd.read_csv('./Data/genData/speech_and_date_gwb.csv')
for path in paths:
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.txt'):
                list_of_files.append(os.path.join(root,file))

speeches = []
for file in list_of_files:
    with open(file, encoding='utf-8') as f:
        #print(file)
        text = f.read()
    f.close()
    speeches.append([text, file])

#clean out goofy unicode  space characters 
speeches = [(unicodedata.normalize("NFKD", speech[0]), speech[1]) for speech in speeches if len(speech)>0 ]

# remove [stuff] in between square brackets
def remove_bracket(text):
    return re.sub('(\[[^w]*\]\s)', '',text)
speeches = [(remove_bracket(speech[0]), speech[1]) for speech in speeches]

def get_source(text):
    regex = "[^./][a-zA-Z]+[^/]"
    string = re.findall(regex, str(text))[0]
    if string == 'speeches': string = 'oba'
    if string == 'NYTimes': string = 'nyt'
    return string.lower()

def get_date(text):
    regex = "([0-9]+[\-][0-9]+[\-][0-9]+)"
    return re.findall(regex, str(text))[0]

def get_filename(text):
    regex = "[-]([a-zA-Z]+)"
    return re.findall(regex, str(text))[0]

cols = ['text', 'filepath']
text_df = pd.DataFrame(speeches, columns=cols)
# A couple tweaks for the GWB data
dates['file'] = [ file.replace('GWB/', './Data/GWB/') for file in dates['file'] ]
dates = dates.rename(columns={"file": "filepath"})
#text_df['date'] = text_df['filepath'].apply(get_date)
text_df = pd.merge(text_df, dates, how='left', on='filepath')
text_df['date'] = pd.to_datetime(text_df['date'], format='%Y-%m-%d')
text_df['source'] = text_df['filepath'].apply(get_source)

text_df['sentences'] = text_df['text'].apply(sent_tokenize)
text_df['words'] = text_df['text'].apply(word_token.tokenize)
text_df['num_sents'] = text_df['sentences'].apply(len)
text_df['num_words'] = text_df['words'].apply(len)
text_df['word_set'] = text_df['words'].apply(set)
text_df['num_unique_words'] = text_df['word_set'].apply(len)
text_df.head(3)

Unnamed: 0,text,filepath,date,source,sentences,words,num_sents,num_words,word_set,num_unique_words
0,"Thank you for coming. Prime Minister Olmert, P...",./Data/GWB/remarks-the-annapolis-conference-annapol...,2007-11-27,gwb,"[Thank you for coming., Prime Minister Olmert,...","[Thank, you, for, coming., Prime, Minister, Ol...",116,2461,"{commitment, assistance, toward, concluded, Is...",739
1,"Good evening. During the next few minutes, I w...",./Data/GWB/address-the-nation-the-proposed-departme...,2002-06-06,gwb,"[Good evening., During the next few minutes, I...","[Good, evening., During, the, next, few, minut...",71,1605,"{finish, thrust, assistance, emerging, conclud...",674
2,Our Nation is shocked and saddened by the news...,./Data/GWB/remarks-the-shootings-virginia-tech-blac...,2007-04-16,gwb,[Our Nation is shocked and saddened by the new...,"[Our, Nation, is, shocked, and, saddened, by, ...",10,207,"{are, pledged, be, American, have, ones, I, Sc...",128


<A HREF="https://textblob.readthedocs.io/en/latest/quickstart.html">TextBlob Quickstart guide</A>

In [5]:
text_df['TBsubjectivity']=[TextBlob(text).sentiment.subjectivity for text in text_df['text']]
text_df['TBpolarity']=[TextBlob(text).sentiment.polarity for text in text_df['text']]
#text_df.to_csv('./Data/genData/numwords_TBpolar_gwb.csv', index=False)

<A HREF="https://plotly.com/python/plotly-express/">Plotly Express</A><BR><A HREF="https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf">Pandas cheat sheet</A>

<A HREF="https://universaldependencies.org/u/pos/">Universal POS tags</A>

<A HREF="https://en.wikipedia.org/wiki/Interjection">Wikipedia - Interjections</A>

<A HREF="https://pypi.org/project/NRCLex/">NRCLex</A>

### Attempt 1. Stanza

### <A HREF="https://stanfordnlp.github.io/stanza/getting_started.html">Stanza quickstart guide</A>

In [6]:
nlp = stanza.Pipeline(lang="en") # Initialize the default English pipeline

2023-04-11 10:35:52 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-11 10:35:54 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

2023-04-11 10:35:54 INFO: Using device: cpu
2023-04-11 10:35:54 INFO: Loading: tokenize
2023-04-11 10:35:54 INFO: Loading: pos
2023-04-11 10:35:54 INFO: Loading: lemma
2023-04-11 10:35:54 INFO: Loading: constituency
2023-04-11 10:35:55 INFO: Loading: depparse
2023-04-11 10:35:55 INFO: Loading: sentiment
2023-04-11 10:35:55 INFO: Loading: ner
2023-04-11 10:35:56 INFO: Done loading processors!


In [7]:
intxt=stanza.Document([], text=[sentence for sentence in text_df['sentences']][20][20])
out=nlp(intxt)
out

[
  [
    {
      "id": 1,
      "text": "With",
      "lemma": "with",
      "upos": "ADP",
      "xpos": "IN",
      "head": 5,
      "deprel": "case",
      "start_char": 0,
      "end_char": 4,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 2,
      "text": "Al",
      "lemma": "Al",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 5,
      "deprel": "nmod:poss",
      "start_char": 5,
      "end_char": 7,
      "ner": "S-PERSON",
      "multi_ner": [
        "S-PERSON"
      ]
    },
    {
      "id": 3,
      "text": "'s",
      "lemma": "'s",
      "upos": "PART",
      "xpos": "POS",
      "head": 2,
      "deprel": "case",
      "start_char": 7,
      "end_char": 9,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "principled",
      "lemma": "principled",
      "upos": "ADJ",
      "xpos": "JJ",
      "feats": "Degree=Pos",
      "head": 5,
      "de

In [8]:
print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in out.sentences for word in sent.words], sep='\n')

id: 1	word: With	head id: 5	head: leadership	deprel: case
id: 2	word: Al	head id: 5	head: leadership	deprel: nmod:poss
id: 3	word: 's	head id: 2	head: Al	deprel: case
id: 4	word: principled	head id: 5	head: leadership	deprel: amod
id: 5	word: leadership	head id: 12	head: continue	deprel: obl
id: 6	word: ,	head id: 12	head: continue	deprel: punct
id: 7	word: the	head id: 8	head: Department	deprel: det
id: 8	word: Department	head id: 12	head: continue	deprel: nsubj
id: 9	word: of	head id: 10	head: Justice	deprel: case
id: 10	word: Justice	head id: 8	head: Department	deprel: nmod
id: 11	word: will	head id: 12	head: continue	deprel: aux
id: 12	word: continue	head id: 0	head: root	deprel: root
id: 13	word: this	head id: 15	head: mission	deprel: det
id: 14	word: important	head id: 15	head: mission	deprel: amod
id: 15	word: mission	head id: 12	head: continue	deprel: obj
id: 16	word: and	head id: 18	head: defend	deprel: cc
id: 17	word: will	head id: 18	head: defend	deprel: aux
id: 18	word: def

### Attempt 2. Stanford's CoreNLP

In [9]:
# Using Stanford's CoreNLP parser with NLTK
# 1. Download CoreNLP from https://stanfordnlp.github.io/CoreNLP/download.html
# 2. make sure Java is installed, otherwise download and install Java - https://www.java.com/en/download/windows_manual.jsp
# 3. Unzip/extract CoreNLP zip file to a directory
# 4. Go to that directory and open a command terminal, and run the following command...
# 4b. on my laptop its in C:\Users\peter\stanford-corenlp-4.5.2
# 5. java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# 6. Now for graphviz if you want to view the parse trees, download from https://graphviz.org/download/ then install
# 7. Now, can run the following python code

In [10]:
# Can try this here, but may have to run from cmd terminal before continuing
!java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

Error: Could not find or load main class edu.stanford.nlp.pipeline.StanfordCoreNLPServer
Caused by: java.lang.ClassNotFoundException: edu.stanford.nlp.pipeline.StanfordCoreNLPServer


In [11]:
from graphviz import Source
from nltk.parse.corenlp import CoreNLPDependencyParser
import os

In [12]:
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'

In [None]:
sdp = CoreNLPDependencyParser()
sentence = [sentence for sentence in text_df['sentences']][20][20]
result = list(sdp.raw_parse(sentence))
dep_tree_dot_repr = [parse for parse in result][0].to_dot()
source = Source(dep_tree_dot_repr, filename="dep_tree", format='png')
source.view()
# Opens in pop-under window... well isn't that nice!

In [None]:
# Graph image doesn't get saved, need to re-run the code
source

### Attempt 3. Spacy

<A HREF="https://stackoverflow.com/questions/64591644/how-to-get-height-of-dependency-tree-with-spacy">Followed this code</A> couldn't have done it without this!

In [14]:
# Try this just to the the height of the parse tree... using spacy
nlp = spacy.load("en_core_web_md")
doc = nlp(sentence)
depths = {}
def walk_tree(node, depth):
    depths[node.orth_] = depth
    if node.n_lefts + node.n_rights > 0:
        return [walk_tree(child, depth+1) for child in node.children]
[walk_tree(sent.root, 0) for sent in doc.sents]
print(depths)
print(max(depths.values()))

{'continue': 0, 'With': 1, 'leadership': 2, 'Al': 3, "'s": 4, 'principled': 3, ',': 1, 'Department': 1, 'the': 6, 'of': 6, 'Justice': 3, 'will': 2, 'mission': 1, 'this': 2, 'important': 2, 'and': 5, 'defend': 1, 'security': 2, 'Americans': 7, 'all': 8, 'liberty': 5, '.': 1}
8


In [28]:
# This prints noun phrases
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

Al's principled leadership leadership pobj With
the Department Department nsubj continue
Justice Justice pobj of
this important mission mission dobj continue
the security security dobj defend
all Americans Americans pobj of
the liberty liberty conj Americans
all Americans Americans pobj of


In [29]:
# Navigating the tree - from Spacy's site
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

With prep continue VERB [leadership]
Al poss leadership NOUN ['s]
's case Al PROPN []
principled amod leadership NOUN []
leadership pobj With ADP [Al, principled]
, punct continue VERB []
the det Department PROPN []
Department nsubj continue VERB [the, of]
of prep Department PROPN [Justice]
Justice pobj of ADP []
will aux continue VERB []
continue ROOT continue VERB [With, ,, Department, will, mission, and, defend, .]
this det mission NOUN []
important amod mission NOUN []
mission dobj continue VERB [this, important]
and cc continue VERB []
will aux defend VERB []
defend conj continue VERB [will, security]
the det security NOUN []
security dobj defend VERB [the, of]
of prep security NOUN [Americans]
all det Americans PROPN []
Americans pobj of ADP [all, and, liberty]
and cc Americans PROPN []
the det liberty NOUN []
liberty conj Americans PROPN [the, of]
of prep liberty NOUN [Americans]
all det Americans PROPN []
Americans pobj of ADP [all]
. punct continue VERB []


In [15]:
def walk_tree_depth(node, depth):
    if node.n_lefts + node.n_rights > 0:
        return max(walk_tree_depth(child, depth+1) for child in node.children )
    else:
        return depth
    
print([walk_tree_depth(sent.root, 0) for sent in doc.sents])

[8]


In [16]:
[walk_tree_depth(sent.root, 0) for sent in doc.sents][0]

8

#### This makes the actual depth dataframe

In [None]:
# Make data frame of sentences and parse tree depth of each
def walk_tree_depth(node, depth):
    if node.n_lefts + node.n_rights > 0:
        return max(walk_tree_depth(child, depth+1) for child in node.children )
    else:
        return depth
    
tree_depth = pd.DataFrame(columns = ['date', 'source', 'sentence', 'depth'])
for i, speech in enumerate(text_df['sentences']):
    for j, sentence in enumerate(speech):
        doc = nlp(sentence)
        depth = [walk_tree_depth(sent.root, 0) for sent in doc.sents][0]
        tree_depth.loc[len(tree_depth)] = [text_df['date'].iloc[i], text_df['source'].iloc[i], sentence, depth]
        

In [None]:
tree_depth.head()

In [None]:
tree_depth.shape

In [None]:
#tree_depth.to_csv('./Data/genData/sentence_depth_gwb.csv',index=False)

### Let's try spacy again for visualization

In [None]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("For everywhere in this country, there are first steps to be taken, there’s new ground to cover, there are more bridges to be crossed.")
# Since this is an interactive Jupyter environment, we can use displacy.render here
displacy.render(doc, style='dep')

In [None]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

In [None]:
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

In [None]:
# Finding a verb with a subject from below — good
from spacy.symbols import nsubj, VERB
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
print(verbs)

In [None]:
print([token.text for token in doc[2].lefts])  # ['bright', 'red']
print([token.text for token in doc[2].rights])  # ['on']
print(doc[2].n_lefts)  # 2
print(doc[2].n_rights)  # 1

In [None]:
root = [token for token in doc if token.head == token][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
    assert subject is descendant or subject.is_ancestor(descendant)
    print(descendant.text, descendant.dep_, descendant.n_lefts,
            descendant.n_rights,
            [ancestor.text for ancestor in descendant.ancestors])

In [None]:
displacy.serve(doc, style="dep")