# Clause Parser Algorithm with Custom Metrics

In [1]:
!python --version
!python -m spacy download en_core_web_md
print("Downloaded")
#TODO: use en_core_web_lg in a better machine. lg is running out of space in binder. 

Python 3.6.5
Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz (120.8MB)
[K    100% |████████████████████████████████| 120.9MB 40.3MB/s ta 0:00:01B/s eta 0:00:02    83% |██████████████████████████▊     | 100.8MB 17.0MB/s eta 0:00:02    84% |███████████████████████████     | 102.0MB 15.1MB/s eta 0:00:02
[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m

[93m    Linking successful[0m
    /srv/conda/lib/python3.6/site-packages/en_core_web_md -->
    /srv/conda/lib/python3.6/site-packages/spacy/data/en_core_web_md

    You can now load the model via spacy.load('en_core_web_md')

Downloaded


In [2]:
import spacy
import html
from spacy import displacy

nlp = spacy.load('en_core_web_md')
print("Loaded models")

Loaded models


In [3]:
from random import shuffle
def print_sample_list(arr):
    indices = [i for i in range(len(arr))]
    shuffle(indices)
    for x in indices[:min(5, len(arr))]:
        print(x, arr[x])

In [4]:
from io import StringIO
import pandas as pd, numpy as np

df = pd.read_csv("./input.csv")
print(df.columns)
df.sample(frac=1).head()

Index(['UID', 'survey_id', 'prompt_number', 'prompt_id', 'prompt', 'response',
       'score', 'selectionTag', 'AnalystComments'],
      dtype='object')


Unnamed: 0,UID,survey_id,prompt_number,prompt_id,prompt,response,score,selectionTag,AnalystComments
496,3151.36,3151,36,48,Sometimes I wish that,"We all stopped, fell into silent spaciousness,...",6.5,12,
93,1889.3,1889,30,30,If I were in charge,/if I were following...together are a unity wh...,6.5,15,
92,1889.28,1889,28,71,When developing strategies&hellip;,...is a subtle framing of an assumption of sep...,6.5,6,
88,1889.17,1889,17,17,When they avoided me,"their deviation was also their alignment, a un...",6.0,40,
37,1791.34,1791,34,47,Technology,might one day be the next step of evolution. I...,6.0,35,


In [5]:
if "prompt" in df.columns: #Original dataset
    df['sentence'] = df.apply(lambda row : "{} {}".format(row['prompt'], row['response']), axis = 1)

df.sample(frac=1).head()

Unnamed: 0,UID,survey_id,prompt_number,prompt_id,prompt,response,score,selectionTag,AnalystComments,sentence
345,2544.05,2544,5,5,Being with other people,is fun,2.0,9,,Being with other people is fun
515,3223.21,3223,21,21,I just can\'t stand people who,there are no people really\nno worth not stand...,5.0,39,,I just can\'t stand people who there are no pe...
352,2546.18,2546,18,42,Rules,are useful and sometimes not useful.,2.0,45,,Rules are useful and sometimes not useful.
368,2556.23,2556,23,23,I am,.\n,3.0,13,,I am .\n
422,2785.12,2785,12,12,A good boss,leads!,2.5,9,,A good boss leads!


In [6]:
import re, html
PATTERN = "[^a-zA-Z0-9\s]+"
rgx = re.compile(PATTERN, re.IGNORECASE)

df['preprocessed_sentence'] = df['sentence'].apply(lambda ip : re.sub('\s+', ' ', rgx.sub(' ', html.unescape(ip))))
print(df.columns)
df.sample(frac=1).head()

Index(['UID', 'survey_id', 'prompt_number', 'prompt_id', 'prompt', 'response',
       'score', 'selectionTag', 'AnalystComments', 'sentence',
       'preprocessed_sentence'],
      dtype='object')


Unnamed: 0,UID,survey_id,prompt_number,prompt_id,prompt,response,score,selectionTag,AnalystComments,sentence,preprocessed_sentence
19,1695.22,1695,22,43,At times I worry about,my dreams,3.5,18,,At times I worry about my dreams,At times I worry about my dreams
437,2843.25,2843,25,25,My main problem is,"at the moment, that Iâ€™m hesitant, almost pro...",4.0,16,,"My main problem is at the moment, that Iâ€™m h...",My main problem is at the moment that I m hesi...
33,1766.3,1766,30,30,If I were in charge,Of anything right now it would be a challenge ...,5.0,14,,If I were in charge Of anything right now it w...,If I were in charge Of anything right now it w...
126,2020.13,2020,13,40,We could make the world a better place if,If we are happy for people when successful. I...,3.5,30,,We could make the world a better place if If w...,We could make the world a better place if If w...
41,1806.23,1806,23,23,I am,amazed at how quickly the world gives way to t...,5.0,44,,I am amazed at how quickly the world gives way...,I am amazed at how quickly the world gives way...


In [7]:
df['nlp_doc'] = df['preprocessed_sentence'].apply(lambda ip : nlp(ip))
print(df.columns)
df.sample(frac=1).head()

Index(['UID', 'survey_id', 'prompt_number', 'prompt_id', 'prompt', 'response',
       'score', 'selectionTag', 'AnalystComments', 'sentence',
       'preprocessed_sentence', 'nlp_doc'],
      dtype='object')


Unnamed: 0,UID,survey_id,prompt_number,prompt_id,prompt,response,score,selectionTag,AnalystComments,sentence,preprocessed_sentence,nlp_doc
516,3227.23,3227,23,23,I am,...here. And lost inside of everyone and every...,6.0,16,,I am ...here. And lost inside of everyone and ...,I am here And lost inside of everyone and ever...,"(I, am, here, And, lost, inside, of, everyone,..."
490,3151.06,3151,6,6,The thing I like about myself is,is that I am eternal infinite stardust meeting...,6.0,2,,The thing I like about myself is is that I am ...,The thing I like about myself is is that I am ...,"(The, thing, I, like, about, myself, is, is, t..."
111,1975.32,1975,32,32,If I can\'t get what I want,I try to work out another way of getting what ...,3.5,21,,If I can\'t get what I want I try to work out ...,If I can t get what I want I try to work out a...,"(If, I, can, t, get, what, I, want, I, try, to..."
157,2164.03,2164,3,3,Change is,"Difficult, frustrating, fearful yet beautiful,...",5.5,48,,"Change is Difficult, frustrating, fearful yet ...",Change is Difficult frustrating fearful yet be...,"(Change, is, Difficult, frustrating, fearful, ..."
326,2537.24,2537,24,24,If I had more money,I would buy a swim with seals.,1.5,38,,If I had more money I would buy a swim with se...,If I had more money I would buy a swim with se...,"(If, I, had, more, money, I, would, buy, a, sw..."


#### Metrics

* Total % of sentences with correct reconstructions =  0.9061 . It's actually greater than 91% since complex first clauses followed by conjunctions put the conjuction with the parent clause in the first.
* Response expected = actual verbatim : 

In [8]:
def flatten_list(l):
    flat_list = [item for sublist in l for item in sublist]
    return flat_list

def get_children(doc):
    if len([x for x in doc.children]) == 0:
        return [doc]
    if doc.pos_ == "VERB" and doc.dep_ not in ["xcomp", "aux"]:
        return []

    op = flatten_list([get_children(l) for l in doc.lefts]) + [doc] + flatten_list([get_children(r) for r in doc.rights])
    return op

def postprocess(tokens_arr):
    if len(tokens_arr) == 1 and ( tokens_arr[0].dep_ in ["aux", "auxpass"] or tokens_arr[0].tag_ in ["VBG"]): 
        return []
    return tokens_arr

def get_text_from_tokens(tokens_arr):
    op = ' '.join([x.text for x in tokens_arr])
    op = op.replace(" nt", "nt").replace(" '", "'")
    return op

def clause_split_by_verbs(doc):
    op = []
    for token in doc:
        if token.pos_ == "VERB":
            arr = flatten_list([get_children(l) for l in token.lefts]) + [token] + flatten_list([get_children(r) for r in token.rights])
            arr = postprocess(arr)
            op.append(arr)
    if len(op)==0:
        op.append(doc)
    return op

df['split_by_verbs_arr'] = df['nlp_doc'].apply(clause_split_by_verbs)
df.sample(frac = 1).head()

Unnamed: 0,UID,survey_id,prompt_number,prompt_id,prompt,response,score,selectionTag,AnalystComments,sentence,preprocessed_sentence,nlp_doc,split_by_verbs_arr
365,2552.1,2552,10,10,When people are helpless,I try to make them not helpless,2.5,44,,When people are helpless I try to make them no...,When people are helpless I try to make them no...,"(When, people, are, helpless, I, try, to, make...","[[When, people, are, helpless], [I, try, to, m..."
408,2721.16,2721,16,16,I feel sorry,for the pain that all humans and all beings on...,4.5,3,,I feel sorry for the pain that all humans and ...,I feel sorry for the pain that all humans and ...,"(I, feel, sorry, for, the, pain, that, all, hu...","[[I, feel, sorry, for, the, pain], [], [that, ..."
409,2721.25,2721,25,25,My main problem is,This stem is irrelevant. I have no problem. ...,6.0,32,,My main problem is This stem is irrelevant. I...,My main problem is This stem is irrelevant I h...,"(My, main, problem, is, This, stem, is, irrele...","[[My, main, problem, is], [This, stem, is, irr..."
86,1889.15,1889,15,41,Privacy,inevitably has all of creation flowing through...,6.5,7,,Privacy inevitably has all of creation flowing...,Privacy inevitably has all of creation flowing...,"(Privacy, inevitably, has, all, of, creation, ...","[[Privacy, inevitably, has, all, of, creation]..."
247,2391.07,2391,7,38,My co-workers and I,Work very well together as we have a long hist...,4.5,40,,My co-workers and I Work very well together as...,My co workers and I Work very well together as...,"(My, co, workers, and, I, Work, very, well, to...","[[My, co, workers, and, I, Work, very, well, t..."


df postprocessing and the clause delimiting

In [12]:
def remove_prompts(df):
    prompt, tokens_arr = df.prompt, df.split_by_verbs_arr
    pdoc = nlp(prompt)
    ignore_indices = [x.i for x in pdoc]
    new_arr = []
    for clause in tokens_arr:
        new_clause = [t for t in clause if t.i not in ignore_indices]
        if len(new_clause) >= 0:
            new_arr.append(new_clause)
    return [x for x in new_arr if len(x) != 0]

def process_text_df(clauses_arr):
    new_arr = []
    # first pass
    first_pass = []
    tok_arr = [[ tok.i for tok in clause] for clause in clauses_arr]

    for i in range(len(tok_arr)):
        x = tok_arr[i]
        if len(x) ==  0:
            continue
        is_subset = False
        for y in tok_arr:
            if set(x).issubset(y) and not set(x) == set(y):
                is_subset = True
        if not is_subset:
            first_pass.append(clauses_arr[i])
    
    for clauses in first_pass:
        if len(clauses) == 0:
            continue
        txt = get_text_from_tokens(clauses)
        new_arr.append(txt)
    
    return new_arr
        
df['clauses_doc_final'] = df[['prompt', 'split_by_verbs_arr']].apply(remove_prompts, axis = 1) 
df['clauses_text_final'] = df['clauses_doc_final'].apply(process_text_df)
df['split_by_verbs_arr_cleaned'] = df['split_by_verbs_arr'].apply(process_text_df)
df.sample(frac = 1).head(20)

Unnamed: 0,UID,survey_id,prompt_number,prompt_id,prompt,response,score,selectionTag,AnalystComments,sentence,preprocessed_sentence,nlp_doc,split_by_verbs_arr,split_by_verbs_arr_cleaned,clauses_doc_final,clauses_text_final
100,1909.25,1909,25,25,My main problem is,"integrating emptiness with everyday life, find...",5.5,21,,My main problem is integrating emptiness with ...,My main problem is integrating emptiness with ...,"(My, main, problem, is, integrating, emptiness...","[[], [My, main, problem, is, integrating, empt...",[My main problem is integrating emptiness with...,"[[integrating, emptiness, with, everyday, life...","[integrating emptiness with everyday life, fin..."
511,3185.34,3185,34,47,Technology,is exciting,3.0,48,,Technology is exciting,Technology is exciting,"(Technology, is, exciting)","[[Technology, is, exciting]]",[Technology is exciting],"[[is, exciting]]",[is exciting]
265,2445.2,2445,20,44,Business and society,...are linked very closely. The 'business' of ...,6.0,3,,Business and society ...are linked very closel...,Business and society are linked very closely T...,"(Business, and, society, are, linked, very, cl...","[[], [Business, and, society, are, linked, ver...","[Business and society are linked very closely,...","[[are, linked, very, closely], [The, business,...","[are linked very closely, The business of Grac..."
497,3158.02,3158,2,2,When I am criticized,"I realise that my being or expression, my cons...",5.5,18,,When I am criticized I realise that my being o...,When I am criticized I realise that my being o...,"(When, I, am, criticized, I, realise, that, my...","[[], [When, I, am, criticized], [I, realise], ...","[When I am criticized, I realise, that my bein...","[[I, realise], [that, my, being, or, expressio...","[I realise, that my being or expression my con..."
333,2541.05,2541,5,5,Being with other people,is fun,2.0,16,,Being with other people is fun,Being with other people is fun,"(Being, with, other, people, is, fun)","[[Being, with, other, people], [is, fun]]","[Being with other people, is fun]","[[is, fun]]",[is fun]
428,2806.31,2806,31,31,My father,Died @ an early age.,2.5,32,,My father Died @ an early age.,My father Died an early age,"(My, father, Died, an, early, age)","[[My, father, Died, an, early, age]]",[My father Died an early age],"[[Died, an, early, age]]",[Died an early age]
445,2862.33,2862,33,33,When I am nervous,my breathing rate increases,2.5,36,,When I am nervous my breathing rate increases,When I am nervous my breathing rate increases,"(When, I, am, nervous, my, breathing, rate, in...","[[When, I, am, nervous, my, breathing, rate, i...",[When I am nervous my breathing rate increases],"[[my, breathing, rate, increases]]",[my breathing rate increases]
537,3356.16,3356,16,16,I feel sorry,for not keeping in contact with people that ar...,3.5,9,,I feel sorry for not keeping in contact with p...,I feel sorry for not keeping in contact with p...,"(I, feel, sorry, for, not, keeping, in, contac...","[[I, feel, sorry, for], [not, keeping, in, con...","[I feel sorry for, not keeping in contact with...","[[for], [not, keeping, in, contact, with, peop...","[for, not keeping in contact with people, that..."
429,2808.04,2808,4,37,"These days, work",gets me through the day,2.0,1,,"These days, work gets me through the day",These days work gets me through the day,"(These, days, work, gets, me, through, the, day)","[[These, days, work, gets, me, through, the, d...",[These days work gets me through the day],"[[me, through, the, day]]",[me through the day]
145,2102.15,2102,15,41,Privacy,is both illusion and actual.,5.0,36,,Privacy is both illusion and actual.,Privacy is both illusion and actual,"(Privacy, is, both, illusion, and, actual)","[[Privacy, is, both, illusion, and, actual]]",[Privacy is both illusion and actual],"[[is, both, illusion, and, actual]]",[is both illusion and actual]


In [15]:
a_poss, p_yn, p_beverb, p_get, a_def, undef = "A_pron_x", "P_yn", "P_bevb_x", "P_get_x", "A_def", "Undefined"

def voice_rule_engine(clause):
    if True not in [x.pos_ == "VERB" for x in clause]:
        return undef
    
    for x in clause:
        if x.dep_ == "poss":
            return a_poss
        
    ct = 0
    for x in clause:
        if x.text.lower().strip() in ['yes', 'no']:
            ct += 1
    if ct >= len(clause)/2:
        return p_yn

    BEING_VERBS = ['is', 'was', 'am', 'are', 'were', 'wasn', 'weren', 'isn']
    for x in clause:
        if x.text.lower().strip() in BEING_VERBS:
            return p_beverb
    
    for x in clause:
        if x.dep_ == "acomp":
            return p_get
    
    return a_def
    
def clauses_voice(arr_of_clauses):
    op = []
    for clause in arr_of_clauses:
        voice = voice_rule_engine(clause)
        op.append(voice)         
    return op

df['voice'] = df.clauses_doc_final.apply(clauses_voice)
df[['sentence', 'clauses_doc_final', 'voice']].sample(frac = 1).head()

Unnamed: 0,sentence,clauses_doc_final,voice
462,My main problem is getting over decades spent ...,"[[getting, over, decades], [spent, proving, my...","[A_def, A_def, A_def]"
114,A good boss is mature enough to set a clear st...,"[[is, mature, enough, to, set, a, clear, strat...","[P_bevb_x, A_def, A_def, A_pron_x, A_pron_x, A..."
203,Business and society can\'t have one without t...,"[[can, t, have, one, without, the, other]]",[A_def]
343,When they didn't let me join in I felt sad,"[[I, felt, sad]]",[P_get_x]
33,If I were in charge Of anything right now it w...,"[[Of, anything, right, now], [it, would, be, a...","[Undefined, A_def, P_get_x, P_get_x, A_def, A_..."


In [16]:
def htmlise(df):
    html_fs = """
    <html>
        <head>
            <title>{}</title>
        </head>
        <body>
            <div>{}</div>
            <div>{}</div>
            <div>{}</div>
        </body>
    </html>"""
    op = spacy.displacy.render(df.nlp_doc, style='dep')
    with open("./html/file_{}.html".format(df.idx), "w") as f:
        f.write(html_fs.format(df.prompt, df.response, df.clauses_text_final, op))
    return
        
df['idx'] = df.index
df.apply(htmlise, axis = 1)
print("HTML processing done")

HTML processing done


In [17]:
df[['prompt', 'response', 'clauses_text_final', 'voice', 'idx']].to_csv("./voice_classified.csv", index = False)