In [1]:
from __future__ import division
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl 
import numpy as np
import re
from nltk import word_tokenize
from nltk import pos_tag

In [2]:
df = pd.read_json("edited_edited_corpus.json")

In [3]:
# This makes the display show more info
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', -1)

# 4. The tell-DP-wh surprise-DP-wh errors
Several legitimate embedded verbs which include DP objects in addition to CP objects are being caught by the relative clause heuristic. In this section we try to save them.

Verbs that must be transferred to Embedded Clause:
- know
- tell
- surprise
- depe

In [4]:
rel = df.loc[df.questType.isin(["Relative Clause"])]

In [394]:
len(rel)

330027

### Check out the verbs to see whether there are any other obvious cases that we might have missed
First cut off the verbs at most frequent

In [4]:
rel.mat_verb.value_counts()

be          54267
know         9742
's           7282
have         6812
see          4923
            ...  
Cheesed         1
Target          1
sync            1
deluge          1
celibate        1
Name: mat_verb, Length: 7284, dtype: int64

In [22]:
rel_freq_vs = rel.groupby('mat_verb').filter(lambda x: len(x) >= 2000)

In [23]:
# rel_freq_vs.groupby(["mat_verb"])["mat_verb"].count()

rel_freq_vs.pivot_table(index=['mat_verb'], values='sentence', aggfunc=len).sentence.transform(lambda x: x/sum(x))

mat_verb
's      0.061813
ask     0.017181
be      0.460643
come    0.020109
do      0.021968
find    0.022112
get     0.026009
give    0.022070
go      0.025686
have    0.057823
know    0.082695
make    0.025720
say     0.040091
see     0.041789
take    0.024235
tell    0.031042
‘       0.019014
Name: sentence, dtype: float64

It looks like there are several good cases

## Know

In [27]:
know = rel.loc[rel.mat_verb.isin(["know"])]

In [28]:
len(know)

9742

In [29]:
# know.sentence

In [7]:
knowwh = rel.loc[rel.sentence.str.contains("know where|know how|know who")]
knewwh = rel.loc[rel.sentence.str.contains("knew where|knew how|knew who")]
knowswh = rel.loc[rel.sentence.str.contains("knows where|knows how|knows who")]

In [10]:
len(knewwh)

756

In [11]:
len(knowswh)

575

In [8]:
len(knowwh)

5869

In [27]:
# knowswh.sentence

In [25]:
# get indexes
k1 = knewwh.index.values.tolist()
k2 = knowwh.index.values.tolist() 
k3 = knowswh.index.values.tolist()

# put them together unpacked
know_indexes = [*k1,*k2,*k3]

In [26]:
# now edit the DF

df.at[know_indexes,'questType'] = 'Embeded Question'

## Become

In [58]:
become = rel.loc[rel.mat_verb.isin(["become"])]

In [59]:
len(become)

900

In [54]:
becomewh = rel.loc[rel.sentence.str.contains("become where|become how|become who")]

In [57]:
becomewh.sentence

109961    From everything she says, it's clear this gilgul's been on the scene for quite a while — there is no way she could have learned what she's learned, cleaned up her act and become who she is now if the only time she's had to do it in was the brief interval between our confrontation at the Mephistco stand and her appearance on the beach at Bournemouth.
Name: sentence, dtype: object

In [55]:
len(becomewh)

1

900

## Consider

In [14]:
consider = rel.loc[rel.mat_verb.isin(["consider"])]

In [15]:
len(consider)

646

In [28]:
considerwh = rel.loc[rel.sentence.str.contains("consider where|consider how|consider who")]
consideredwh = rel.loc[rel.sentence.str.contains("considered where|considered how|considered who")]
considerswh = rel.loc[rel.sentence.str.contains("considers where|considers how|considers who")]

In [30]:
len(considerswh)

15

In [32]:
# get indexes
c1 = considerwh.index.values.tolist()
c2 = consideredwh.index.values.tolist() 
c3 = consideredwh.index.values.tolist()

# put them together unpacked
consider_indexes = [*c1,*c2,*c3]

In [33]:
# now edit the DF
df.at[consider_indexes,'questType'] = 'Embeded Question'

## Explain

In [34]:
explain = rel.loc[rel.mat_verb.isin(["explain"])]

In [35]:
explainwh = rel.loc[rel.sentence.str.contains("explain where|explain how|explain who")]
explainedwh = rel.loc[rel.sentence.str.contains("explained where|explained how|explained who")]
explainswh = rel.loc[rel.sentence.str.contains("explains where|explains how|explains who")]

In [37]:
# get indexes
e1 = explainwh.index.values.tolist()
e2 = explainedwh.index.values.tolist() 
e3 = explainedwh.index.values.tolist()

# put them together unpacked
explain_indexes = [*e1,*e2,*e3]

In [38]:
# now edit the DF
df.at[explain_indexes,'questType'] = 'Embeded Question'

## Depend

In [39]:
depend = rel.loc[rel.mat_verb.isin(["depend"])]

In [19]:
len(depend)

682

In [40]:
dependwh = rel.loc[rel.sentence.str.contains("depend on where|depend on how|depend on who")]
dependedwh = rel.loc[rel.sentence.str.contains("depended on where|depended on how|depended on who")]
dependswh = rel.loc[rel.sentence.str.contains("depends on where|depends on how|depends on who")]

In [41]:

# get indexes
d1 = dependwh.index.values.tolist()
d2 = dependedwh.index.values.tolist() 
d3 = dependedwh.index.values.tolist()

# put them together unpacked
depend_indexes = [*d1,*d2,*d3]

In [42]:
# now edit the DF
df.at[depend_indexes,'questType'] = 'Embeded Question'

## Ask

In [43]:
ask = rel.loc[rel.mat_verb.isin(["ask"])]

In [44]:
len(ask)

2024

In [45]:
askwh = rel.loc[rel.sentence.str.contains("ask where|ask how|ask who")]
askedwh = rel.loc[rel.sentence.str.contains("asked where|asked how|asked who")]
askswh = rel.loc[rel.sentence.str.contains("asks where|asks how|asks who")]

# get indexes
a1 = askwh.index.values.tolist()
a2 = askedwh.index.values.tolist() 
a3 = askedwh.index.values.tolist()

# put them together unpacked
ask_indexes = [*a1,*a2,*a3]

In [46]:
# now edit the DF
df.at[ask_indexes,'questType'] = 'Embeded Question'

## Surprise

In [34]:
surprise = rel.loc[rel.sentence.str.contains("surprise")]

In [35]:
len(surprise)

965

In [36]:
surprise.sentence.head()

511     It ' s possible to create order by introducing...
670     The findings don ' t surprise Australian food ...
898     Common brown snakes like this one surprised th...
1129    Hardly surprising Canine behaviourist and trai...
1199    Meat and Livestock Australia says the average ...
Name: sentence, dtype: object

In [37]:
surprise2 = rel.loc[rel.mat_verb.isin(["surprise"])]

In [38]:
len(surprise2)

116

In [48]:
surprisewh = rel.loc[rel.sentence.str.contains("surprise where|surprise how|surprise who")]
surprisedwh = rel.loc[rel.sentence.str.contains("surprised where|surprised how|surprised who")]
surpriseswh = rel.loc[rel.sentence.str.contains("surprises where|surprises how|surprises who")]

# get indexes
s1 = surprisewh.index.values.tolist()
s2 = surprisedwh.index.values.tolist() 
s3 = surprisedwh.index.values.tolist()

# put them together unpacked
surprise_indexes = [*s1,*s2,*s3]

In [50]:
# now edit the DF
df.at[surprise_indexes,'questType'] = 'Embeded Question'

## Tell

In [56]:
tell = rel.loc[rel.mat_verb.isin(["tell"])]

In [40]:
# tell.iloc[200].sentence

In [44]:
s1 = tell.iloc[200].sentence
s1

"Dolly said one had got in, that he threatened to do her in if she didn't tell him where all the valuables were."

In [46]:
t1 = pos_tag(word_tokenize(s1))
print(t1)

[('Dolly', 'RB'), ('said', 'VBD'), ('one', 'CD'), ('had', 'VBD'), ('got', 'VBN'), ('in', 'IN'), (',', ','), ('that', 'IN'), ('he', 'PRP'), ('threatened', 'VBD'), ('to', 'TO'), ('do', 'VB'), ('her', 'PRP$'), ('in', 'IN'), ('if', 'IN'), ('she', 'PRP'), ('did', 'VBD'), ("n't", 'RB'), ('tell', 'VB'), ('him', 'PRP'), ('where', 'WRB'), ('all', 'PDT'), ('the', 'DT'), ('valuables', 'NNS'), ('were', 'VBD'), ('.', '.')]


In [60]:
# find all cases that minimize 
tell_emb = tell.loc[tell.sentence.str.contains("tell me/her/him/you/them/it where|who|how", regex=True)]
told_emb = tell.loc[tell.sentence.str.contains("told me/her/him/you/them/it where|who|how", regex=True)]
tells_emb = tell.loc[tell.sentence.str.contains("tells me/her/him/you/them/it where|who|how", regex=True)]

In [61]:
# get indexes
td1 = tell_emb.index.values.tolist()
td2 = told_emb.index.values.tolist() 
td3 = tells_emb.index.values.tolist()

tell_do_indexes = [*td1,*td2,*td3]

In [62]:
tell_emb.sentence

304       He tells the story of a woman who told him at a book signing that her son , once a high school drop - out and surf bum , decided to become a geneticist after listening to him on Triple J radio .                                                                                                              
627       It tells farmers who they are , where they ' ve been and where they might be going to as far as a place that encapsulates their cultural heritage ," she says . "                                                                                                                                               
736       These dodo bones , found recently on the island of Mauritius , tell a different story of how these birds became extinct Scientists who unearthed a mass dodo grave in Mauritius say they have found evidence showing a natural disaster killed the birds long before humans arrived on the Indian Ocean island .
753       We found that the features related to the red

In [51]:
# tell.to_csv("tell.csv", header = True)

In [52]:
tellwh = rel.loc[rel.sentence.str.contains("tell where|tell how|tell who")]
toldwh = rel.loc[rel.sentence.str.contains("told where|told how|told who")]
tellswh = rel.loc[rel.sentence.str.contains("tells where|tells how|tells who")]

# get indexes
t1 = tellwh.index.values.tolist()
t2 = toldwh.index.values.tolist() 
t3 = tellswh.index.values.tolist()

# put them together unpacked
tell_indexes = [*t1,*t2,*t3]

# Finish and save to .json

In [63]:
df.to_json("edited_edited2_corpus.json", orient='columns')