# Refining the labels for Inclusory Heuristics



### The goal of this script is to take care of the following problems:
1. The `strict_emb` heuristic is grabbing root questions that appear in quotes.

In [2]:
from __future__ import division
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl 
import numpy as np
import re
from nltk import word_tokenize
from nltk import pos_tag

## Make sure to load the dataset that was produced by the Exclusive heuristics notebook; this notebook writes a new .json file that is used for making graphs.

In [3]:
# Read the corpus that the rest of this notebook relabels.
corpus_path = "edited_edited_corpus.json"
df = pd.read_json(corpus_path)

In [4]:
# Map the capitalised modal spellings listed below onto their lowercase
# form in both verb columns.
modal_case_fixes = {
    'Can': 'can', 'CAN': 'can',
    'WILL': 'will', 'Will': 'will',
    'Could': 'could', 'Would': 'would', 'Should': 'should',
}
for verb_column in ('emb_verb', 'v1_after'):
    df[verb_column] = df[verb_column].replace(modal_case_fixes)

In [5]:
# Show every row and the full text of every cell when a frame is displayed.
pd.set_option('display.max_rows', None)
# pandas >= 1.0 spells "no column-width limit" as None and newer releases
# reject the old -1 sentinel; older releases only accept an int, so fall
# back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [6]:
# Split the corpus by question type.
# NOTE(review): "Embeded" presumably matches the label as it is stored in
# the data -- verify before correcting the spelling.
is_root_question = df["questType"] == "Root Question"
is_embedded_question = df["questType"] == "Embeded Question"
root = df[is_root_question]
emb = df[is_embedded_question]

## A. Embedded Questions
Break it up by heuristic.

In [7]:
# Number of embedded questions labelled by each heuristic.
emb_by_heuristic = emb.groupby("heuristic")
emb_by_heuristic["heuristic"].count()

heuristic
second_pass    335  
strict_emb     35024
v_before_wh    5135 
Name: heuristic, dtype: int64

In [7]:
# One frame per embedded-question heuristic.
emb_se = emb[emb["heuristic"].isin(["strict_emb"])]
emb_vbw = emb[emb["heuristic"].isin(["v_before_wh"])]
emb_sp = emb[emb["heuristic"].isin(["second_pass"])]

### 1. Strict Emb

In [8]:
# Row count of the strict_emb subset.
emb_se.shape[0]

34975

In [9]:
# strict_emb rows broken down by clause type.
emb_se_by_clause = emb_se.groupby("clauseType")
emb_se_by_clause["clauseType"].count()

clauseType
Finite        27886
Modal         4174 
Non-Finite    2915 
Name: clauseType, dtype: int64

In [11]:
# emb_se.sentence

### 2. v_before_wh

In [44]:
# Row count of the v_before_wh subset.
emb_vbw.shape[0]

5130

In [18]:
# Uh-oh: several of these sentences are actually root questions
# emb_vbw.sentence

In [19]:
# Keep the v_before_wh sentences that contain a literal question mark.
# regex=False matches "?" as plain text, which avoids the invalid "\?"
# escape in a non-raw string; na=False treats a missing sentence as
# "no match" so the boolean mask never contains NaN.
has_question_mark = emb_vbw["sentence"].str.contains("?", regex=False, na=False)
emb_vbw_qmark = emb_vbw.loc[has_question_mark]

In [20]:
# emb_vbw_qmark.sentence

In [None]:
# Share of v_before_wh sentences that contain a question mark.
# True division relies on `from __future__ import division` under Python 2.
emb_vbw_qmark.shape[0] / emb_vbw.shape[0]

In [21]:
# Row labels of the question-mark sentences, as a plain Python list.
qmark_labels = emb_vbw_qmark.index.values
emb_vbw_qmark_indexes = qmark_labels.tolist()

In [27]:
# Relabel the v_before_wh sentences that contain '?' as root questions and
# clear their matrix verb.
# `.at` is a scalar accessor (one row label, one column label); assigning
# through a list of row labels has to go through `.loc`.
df.loc[emb_vbw_qmark_indexes, 'questType'] = 'Root Question'
df.loc[emb_vbw_qmark_indexes, 'mat_verb'] = None

### 3. second pass

These look like they need to be reviewed more closely.

In [28]:
# Row count of the second_pass subset.
emb_sp.shape[0]

333

In [29]:
# Display the second_pass sentences for manual inspection.
emb_sp["sentence"]

2110      By better understanding how the brain is altered with this syndrome , we can develop better drugs in the future ."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

### Summary

Relabel the `v_before_wh` sentences that contain a '?' as "Root Question".

In [8]:
# df.at[emb_vbw_qmark_indexes, 'questType'] = 'Root Question'

## B. Root Questions

In [24]:
# Number of root questions labelled by each heuristic.
# NOTE(review): `root` was sliced from `df` before the v_before_wh rows were
# relabelled, so these counts presumably exclude the relabelled rows --
# confirm that is intended.
root_by_heuristic = root.groupby("heuristic")
root_by_heuristic["heuristic"].count()

heuristic
?              23219
amb            3677 
frag           817  
second_pass    2    
strict_emb     49   
sub_aux_inv    6867 
v_before_wh    5    
Name: heuristic, dtype: int64

In [26]:
# Inspect one root-question row (position chosen arbitrarily).
sample_position = 100
root.iloc[sample_position]

corpus        bnc                                                                                                                                                                                                                                                                                                                      
medium        print                                                                                                                                                                                                                                                                                                                    
sentence      Where are we to account for the hints, implicit purposes, assumptions, social attitudes and so on that are effectively communicated by the use of language, not to mention the figures of speech (e.g. metaphor, irony, rhetorical questions, understatement) that have preoccupied theorists of rhetoric and literature?
clauseType    Fi

In [34]:
# Root questions found by the '?' heuristic and by subject-aux inversion.
root_qmark = root[root["heuristic"].isin(["?"])]
root_sai = root[root["heuristic"].isin(["sub_aux_inv"])]

## ?

In [35]:
# Row count of the '?' subset.
root_qmark.shape[0]

23219

In [37]:
# '?' rows broken down by clause type.
root_qmark_by_clause = root_qmark.groupby("clauseType")
root_qmark_by_clause["clauseType"].count()

clauseType
Finite    18461
Modal     4758 
Name: clauseType, dtype: int64

In [40]:
# root_qmark.sentence

## Subject-Aux-Inversion

In [39]:
# Row count of the subject-aux-inversion subset.
root_sai.shape[0]

6867

In [42]:
# Subject-aux-inversion rows broken down by clause type.
root_sai_by_clause = root_sai.groupby("clauseType")
root_sai_by_clause["clauseType"].count()

clauseType
Finite    6649
Modal     218 
Name: clauseType, dtype: int64

In [43]:
# Display the subject-aux-inversion sentences for manual inspection.
root_sai["sentence"]

2319      How do aerosols influence ocean currents ?                                                                                                                                                                                                         
2374      How did these cells evolve ?                                                                                                                                                                                                                       
2400      How did they do it ?                                                                                                                                                                                                                               
2636      Who's in it now?                                                                                                                                                                                                                    

# Last step: save as new .json

In [45]:
# Write the relabelled corpus out for the graphing notebooks.
output_path = "final_corpus.json"
df.to_json(output_path, orient="columns")