In [None]:
! pip install swifter
! pip install pandas
! pip install textacy
! pip install spacy==2.1.0
! pip install neuralcoref --no-binary neuralcoef

In [None]:
# Download the "en_core_web_sm" spaCy model that spacy.load() reads further down.
! python -m spacy download en_core_web_sm

In [None]:
# Fetch the dataset archive from Harvard Dataverse and save it as data.tar.gz.
!wget -O data.tar.gz https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/6MZN76/CRUNF0
# Unpack it into the working directory.
# NOTE(review): presumably this yields Dail_debates_1919-2013.tab, which is read below — confirm.
!tar -xf data.tar.gz

In [3]:
import spacy
import pandas as pd
from tqdm.auto import tqdm

# Make DataFrame.plot / Series.plot render through plotly instead of the default backend.
pd.options.plotting.backend = "plotly"

In [4]:
en = spacy.load("en_core_web_sm") # spaCy pipeline object; applied to every sampled speech below

In [None]:
import neuralcoref
from spacy.tokens import Token

# Register the custom `corefs` token attribute that get_coref reads.
# force=True lets this cell be re-run in the same kernel without spaCy
# raising because the extension already exists.
Token.set_extension("corefs", default=[], force=True)

# Add the coreference component only once: calling add_pipe again with a name
# that is already in the pipeline raises, which broke re-running this cell.
if "neuralcoref" not in en.pipe_names:
    coref = neuralcoref.NeuralCoref(en.vocab)
    en.add_pipe(coref, name="neuralcoref")

In [9]:
# Load the tab-separated debates file into a DataFrame.
df = pd.read_csv('Dail_debates_1919-2013.tab', sep='\t')


In [12]:
# Parse the date column so the .dt accessor can be used on it.
df["date"] = pd.to_datetime(df["date"])
# Keep only speeches dated after 1999.
df2 = df.loc[df["date"].dt.year > 1999]
# Draw a reproducible 0.2% random sample of those speeches.
df3 = df2.sample(random_state=123, frac=0.002)
# Run the spaCy pipeline on each sampled speech; the result keeps df3's index.
docs = df3["speech"].apply(en)

In [14]:
# Lemmas that get_deontic treats as deontic (modal) verbs.
DEONTIC_VERBS = [
    "can",
    "could",
    "may",
    "might",
    "must",
    "shall",
    "should",
]

In [55]:
def get_deontic(sent):
  """Return the first token in `sent` whose lemma is listed in DEONTIC_VERBS.

  Returns None when no token of the sentence matches.
  """
  matches = (candidate for candidate in sent if candidate.lemma_ in DEONTIC_VERBS)
  return next(matches, None)


def get_verb(deontic):
  """Return the syntactic head of `deontic`, or an empty string when `deontic` is None."""
  if deontic is None:
    return ""
  return deontic.head


def get_child_dep(verb, dep):
  """Collect, in order, the direct children of `verb` whose `dep_` label equals `dep`."""
  matching = []
  for candidate in verb.children:
    if candidate.dep_ == dep:
      matching.append(candidate)
  return matching


def get_subjects_clausual(verb):
  """Return the `nsubj` children of every `csubj` child of `verb`, flattened into one list."""
  subjects = []
  for clause in get_child_dep(verb, "csubj"):
    subjects.extend(get_child_dep(clause, "nsubj"))
  return subjects

def get_coref(token):
  """Return the first entry of `token._.corefs` for a pronoun, otherwise an empty string.

  An empty string is returned when the token has no `corefs` entries or when
  its part of speech is not "PRON".

  NOTE(review): the `corefs` extension is registered with an empty default in
  this notebook and no visible code writes to it, so this may always return ""
  — confirm where it is meant to be populated.
  """
  candidates = token._.corefs
  if len(candidates) > 0 and token.pos_ == "PRON":
    return candidates[0]
  return ""

In [209]:
def _expand_conjuncts(tokens):
  """Return `tokens` plus every token reachable through `conj` children, without duplicates.

  Each token is immediately followed by its own conjunct chain, so a
  coordinated phrase such as "A, B and C" comes out as [A, B, C].
  """
  expanded = []
  pending = list(reversed(tokens))  # used as a stack; reversed to keep input order
  while pending:
    token = pending.pop()
    if token in expanded:
      continue
    expanded.append(token)
    pending.extend(reversed(get_child_dep(token, "conj")))
  return expanded


def ig_sent_tag(sent):
  """Extract the deontic verb of a sentence with its related verbs, subjects and objects.

  Args:
    sent: an iterable of spaCy-like tokens (one sentence).

  Returns:
    A dict with the keys:
      "deontic":    lemma of the first deontic verb found, or "" if there is none;
      "attributes": active (or clausal) subjects of the governed verbs, including
                    coordinated subjects; a pronoun is replaced by whatever
                    get_coref returns for it when that value is non-empty;
      "objects":    passive subjects of the governed verbs, including coordinated ones;
      "verbs":      the verb governing the deontic, followed by the verbs it is
                    conjoined to (walking up the `conj` chain).
  """
  deontic = get_deontic(sent)
  attributes, objects, verbs = [], [], []
  verb = get_verb(deontic)

  while verb:
    verbs.append(verb)

    subject = get_child_dep(verb, "nsubj")
    passive_subject = get_child_dep(verb, "nsubjpass")

    # Fall back to the subject of a clausal subject only when the verb has no
    # direct subject; extend (not assign) so earlier verbs' subjects are kept.
    if not subject and not passive_subject:
      attributes.extend(get_subjects_clausual(verb))
    attributes.extend(subject)
    objects.extend(passive_subject)

    # A conjoined verb ("came and must leave") carries its subject on the head
    # conjunct, so climb to it. The label is "conj"; the previous "conf" never
    # matched. The membership check guards against revisiting a verb.
    if verb.dep_ == "conj" and verb.pos_ == "VERB" and verb.head not in verbs:
      verb = verb.head
    else:
      verb = None

  # Add coordinated subjects/objects once, after all verbs were visited, so
  # nothing is duplicated.
  attributes = _expand_conjuncts(attributes)
  objects = _expand_conjuncts(objects)

  # Substitute resolved pronouns in the output instead of overwriting the
  # token's part-of-speech tag, which corrupted the parsed document.
  attributes = [get_coref(subj) or subj for subj in attributes]

  return {
    "deontic": deontic.lemma_ if deontic else "",
    "attributes": attributes,
    "objects": objects,
    "verbs": verbs,
  }

In [236]:
def ig_tagging(docs):
  """Tag every sentence that contains a deontic verb and return one row per sentence.

  Args:
    docs: a pandas Series of parsed documents (objects exposing `.sents`),
      indexed by labels that also exist in the module-level `df3` frame.

  Returns:
    A DataFrame with the keys produced by ig_sent_tag plus `doc_id` (the
    Series index label) and `party_name` (looked up in `df3`).
  """
  records = []
  # Series.items() works on old and new pandas; iteritems() was removed in pandas 2.0.
  for doc_id, doc in docs.items():
    for sent in doc.sents:
      if not get_deontic(sent):
        continue
      records.append({
        **ig_sent_tag(sent),
        "doc_id": doc_id,
        "party_name": df3.loc[doc_id].party_name,
      })
  return pd.DataFrame(records)


In [237]:
# Tag at most the first 500 parsed speeches.
igt = ig_tagging(docs.head(500))

In [None]:
# Display the tagged sentences (one row per sentence that contains a deontic verb).
igt

In [300]:
# Count tagged sentences per party (rows) and deontic verb (columns);
# combinations that never occur are filled with 0.
pivot = igt.pivot_table(
    index=['party_name'],
    columns='deontic',
    aggfunc='size',
    fill_value=0,
)

In [301]:
# Remember the deontic columns before the helper "sum" column is added.
col_names = list(pivot.columns)
# Sum deontics by party (row totals).
pivot['sum'] = pivot[col_names].sum(axis=1)
# Sum deontics by type (column totals, including the "sum" column).
rows_sum = pivot.sum(axis=0)
rows_sum.name = "all parties"
# DataFrame.append was removed in pandas 2.0; concatenate the totals as a
# one-row frame instead, keeping the index name of the pivot table.
pivot = pd.concat([pivot, rows_sum.to_frame().T.rename_axis(index=pivot.index.name)])

In [None]:
# Display the counts, including the "sum" column and the "all parties" row added above.
pivot

In [303]:
# Turn the raw counts into each row's share of its "sum" total (a fraction
# between 0 and 1), then discard the helper column.
pivot[col_names] = pivot[col_names].div(pivot["sum"], axis=0)
pivot = pivot.drop(columns="sum")

In [None]:
# Bar chart of the deontic shares, one group of bars per row of the table.
pivot.plot.bar()