#### This notebook compares two approaches for extracting questions from text: (1) a basic rule-based method using regex and sentence tokenization, and (2) constituency parse trees from CoreNLP.

- Manually annotate 300 samples from 4 subreddits (75 from each) and compare the output of each approach against the manually identified questions.

#### Useful links:
- CoreNLP Annotators: https://stanfordnlp.github.io/CoreNLP/annotators.html
- Penn Treebank tags: https://gist.github.com/nlothian/9240750

In [1]:
import pandas as pd
import pickle
import re
from pycorenlp import StanfordCoreNLP
from nltk.tokenize import sent_tokenize



#### Basic Regex approach:

In [3]:
def extract_questions_regex(comment):
    """
    Collect the sentences of a comment that end with a question mark.

    Punctuation runs are collapsed first, the text is then split into
    sentences with NLTK's ``sent_tokenize``, and only the sentences whose
    last character is "?" are kept.
    Note: no other cleaning (URL or non-ASCII removal) is done here.

    Parameters
    ----------
    comment: str
        Text to search.

    Returns
    -------
    list
        A list of strings, one per question, stripped of surrounding
        whitespace and in the order the tokenizer produced them.
    """
    # Any run of "!", ".", "," and "?" that contains at least one "?" becomes "? ".
    normalised = re.sub(r'[!.,?]*\?[!.,?]*', '? ', comment)
    # Two or more consecutive dots become ". ".
    normalised = re.sub(r'\.+\.', '. ', normalised)

    stripped = (sentence.strip() for sentence in sent_tokenize(normalised))

    # Keeping only sentences that end with "?" is meant to skip quoted questions such as:
    # Sometimes she's hot, then other times you're like, "did you just get out of a car accident?"
    return [sentence for sentence in stripped if sentence.endswith('?')]

#### CoreNLP parsing approach:

In [5]:
nlp = StanfordCoreNLP('http://localhost:9000') # connect to server (must already be running)

def annotate_comment(comment, annotators='parse', timeout=1000):
    """
    Send text to the CoreNLP server and return its JSON annotation.

    Parameters
    ----------
    comment: str
        Text to annotate.
    annotators: str, optional
        Value of the CoreNLP 'annotators' property. The default, 'parse',
        provides full syntactic analysis, using both the constituent and the
        dependency representations.
    timeout: int, optional
        Value of the CoreNLP 'timeout' property. Defaults to 1000, the value
        this notebook has always used (presumably milliseconds -- confirm
        against the CoreNLP server documentation). Raise it for long comments.

    Returns
    -------
    dict
        The decoded JSON response; callers read its 'sentences' entry.

    Raises
    ------
    ValueError
        If the client did not return a decoded JSON object, which is what
        happens when the server answers with an error message (for example
        after a timeout) instead of an annotation.
    """
    res = nlp.annotate(comment,
                       properties={
                           'annotators': annotators,
                           'outputFormat': 'json',
                           'timeout': timeout,
                       })

    # Fail here with a readable message rather than later with an opaque
    # TypeError when a caller indexes a plain string with 'sentences'.
    if not isinstance(res, dict):
        raise ValueError("CoreNLP did not return a JSON annotation: %r" % (res,))

    return res

def extract_questions_CoreNLP(comment):
    """
    Extract questions from a comment using CoreNLP constituency parses.

    A sentence is kept when the first constituent under ROOT in its parse
    string is labelled SBARQ or SQ.

    Parameters
    ----------
    comment: str

    Returns
    -------
    list
        One string per detected question, rebuilt by joining the sentence's
        CoreNLP tokens with single spaces (so the text follows CoreNLP's
        tokenization rather than the original spacing).
    """
    annotation = annotate_comment(comment)  # CoreNLP server round trip

    found = []
    for sentence in annotation['sentences']:
        # Splitting on "(" yields ['', 'ROOT...', '<first constituent> ...', ...]
        pieces = sentence['parse'].split('(')

        if pieces[0] != '' or not pieces[1].startswith('ROOT'):
            continue

        top_constituent = pieces[2]
        if top_constituent.startswith(('SBARQ', 'SQ')):
            found.append(" ".join(token["word"] for token in sentence["tokens"]))

    return found

In [6]:
def clean_comment(text):
    """
    Cleans the given text. Removes URL links. Removes non-ASCII characters.

    Parameters
    ----------
    text: str

    Returns
    -------
    str
        The text with every http:// or https:// link deleted (up to the next
        whitespace) and every character whose code point is 128 or above
        dropped. Whitespace around a removed link is left as is.
    """
    # Require the "://" so that ordinary words which merely contain "http"
    # (e.g. "https", "httpd") are no longer deleted along with real links.
    text = re.sub(r"https?://\S+", "", text) # remove URL links
    return ''.join(ch for ch in text if ord(ch) < 128)

### Test run:

In [6]:
comment = "Hey you. Can you do this for me!!?? Can't you do anything right..you shit. Is your pride too big? Involved much? \
           Not true, at all. https://www.youtube.com/watch?v=C6QEqoYgQxw   hey how are you?"

print "\nBasic ? approach: ", extract_questions_regex(clean_comment(comment))
print "\nCoreNLP Parsing approach: ", extract_questions_CoreNLP(clean_comment(comment))


Basic ? approach:  ['Can you do this for me?', 'Is your pride too big?', 'Involved much?', 'hey how are you?']

CoreNLP Parsing approach:  [u'Can you do this for me !!??', u"Ca n't you do anything right .", u'Is your pride too big ?']


## Extract questions from 4 subreddits:
- Using both approaches.
- Target: 75 samples from each subreddit for a total of 300 samples (the run recorded below produced 302 rows).

In [7]:
# Each name selects the '<name>-interactions.pickle' file read by the sampling loop.
SUBREDDITS = ['cringe', 'nfl', 'PoliticalDiscussion', 'The_Donald'] # Test subreddits for comparing the two approaches

In [8]:
inp_dict = {'Questions_Basic':[], 'Questions_CoreNLP': [], 'Comment_Text':[], 'Reply_Text':[], 'Comment_ID':[], 'Reply_ID':[], 
            'Subreddit':[]}

for subred in SUBREDDITS:    
    with open('../pickles/subreddit_interactions_withIDs/'+subred+'-interactions.pickle', 'rb') as f:
        data = pickle.load(f)
    print "Pickle loaded: ", subred
    
    counter = 0
    for user_tup, conversation in data.items():
        for interaction in conversation: # interaction is a (comment,reply) tuple so it's length is always 2
            comment_id = interaction[0][0]
            comment_text = interaction[0][1]

            reply_id = interaction[1][0]
            reply_text = interaction[1][1]

            reply_text = clean_comment(reply_text)
            
            try:
                questions_basic = extract_questions_regex(reply_text)
                questions_coreNLP = extract_questions_CoreNLP(reply_text)
            except:
                print "This didn't work: ", reply_text
                continue

            inp_dict['Questions_Basic'].append(questions_basic)
            inp_dict['Questions_CoreNLP'].append(questions_coreNLP)
            inp_dict['Comment_Text'].append(comment_text)
            inp_dict['Reply_Text'].append(reply_text)
            inp_dict['Comment_ID'].append(comment_id)
            inp_dict['Reply_ID'].append(reply_id)
            inp_dict['Subreddit'].append(subred)
            
            counter += 1
            
        if counter >= 75:
            break

Pickle loaded:  cringe
Pickle loaded:  nfl
Pickle loaded:  PoliticalDiscussion
Pickle loaded:  The_Donald


In [9]:
# Turn into a dataframe:
df = pd.DataFrame.from_dict(inp_dict, orient='columns')

cols = [u'Reply_Text', u'Questions_Basic', u'Questions_CoreNLP', u'Comment_Text', u'Comment_ID', u'Reply_ID', u'Subreddit']
df = df[cols]
print df.shape

(302, 7)


In [10]:
# Preview the first rows to sanity-check both extractors' output side by side.
df.head()

Unnamed: 0,Reply_Text,Questions_Basic,Questions_CoreNLP,Comment_Text,Comment_ID,Reply_ID,Subreddit
0,You're thinking of Mel Gibson.,[],[],His performance in Space Balls was top notch.,czb7buz,czba13l,cringe
1,that deescalated quickly,[],[],That went from 100 to 0 real fucking quick.,czc6hot,czci4z7,cringe
2,"""That was our one year cake""\n\n\nThat's ok de...",[],[],Start of a good marrage.,44rqt6,cztpx3n,cringe
3,Couldn't do it,[],[Could n't do it],3 Guys breaking up with girls they are not dating,4fy9zm,d2d1y4a,cringe
4,All I'm saying is that Trump was clearly talki...,[],[],&gt;The absolute vast majority of illegal immi...,czcwtxo,czcx098,cringe


In [11]:
#### df.to_csv('../coreNLP_vs_Regex_302.csv', index=None)

### Stats:
- Manual Analysis: 
    - Total of 302 comments. They contain a total of 697 sentences.
    - 241 comments have no questions in them.
    - 61 comments contain a total of 81 questions.
    - Out of these 81 questions, 7 did not have a question mark:
        1. "holy shit don't you have anything better to do." [22]
        2. "Jesus fucking Christ how is this not top of the sub." [35]
        3. "wat" [42]
        4. "Was wondering why this was up voted" [62] (Indirect Question)
        5. "DO WE CARE ABOUT CONCUSSIONS OR NOT!" [80]
        6. "Even if you disagree, why single out a single individual - you're being a bully by doing so." [257]
        7. "How to spot someone that works and pays taxes" [278]
        
#### CoreNLP Parsing Performance:
- CoreNLP has 41 sentences labeled as Questions.
- TP = 40
- FP = 1
- FN = 41
- TN = 615

#### Regex Performance:
- Regex has 74 sentences labeled as Questions.
- TP = 74
- FP = 0 (expected: the regex only flags sentences that end in '?', and every such sentence was annotated as a question)
- FN = 7
- TN = 616

#### A Note about True Negatives:
- Both approaches correctly label these as NotQuestion (because the question appears inside a quotation):
    - He said something like "How is anyone supposed to learn anything when all the Muslims kids won't stop clapping?" [297]
    - ..."We can't possibly stand toe-to-toe against them in a conventional, army to army fight, so how do we protect ourselves while still maintaining our rights as a sovereign nation to chart our own path?" [226]

## Examples where it fails:
#### CoreNLP Parsing:
- False positives (incorrectly classified as questions):
    - "Couldn't do it" [5]

- False negatives (questions that CoreNLP missed out on):
    - "So Canadians are rapists and drug dealers?" [7]
    - "[Grapefruit?" [15]
    - "holy shit don't you have anything better to do." [22]
    - "Jesus fucking Christ how is this not top of the sub." [35]
    - "wat" [42]
    - "Unless I misunderstood you (which frankly could be possible) are you trying to say that she isn't a credible authority because she says organic farms are bad?" [57]
    - "Was wondering why this was up voted" [62]
    - "DO WE CARE ABOUT CONCUSSIONS OR NOT!" [80]
    - "In the playoffs?" [86]
    - "Was Stewart mocking white dances or is he just a nerd?" [88]
    - "George Takei sitting in a throne doing an evil laugh?" [103]
    - "I know it's been a while, but didn't the Panthers used to have a good defense?" [105]
    - "Like, 2 weeks ago?" [105]
    - "Blaming shitty defense on the tablets?" [117]
    - "In this case, you would want the grower, right?" [125]
    - "Sure, he's a bit older but who can they field at center better than him?" [131]
    - "Why all the hate?" [132]
    - "Mike Tolbert?" [137]
    - "Bust?" [142]
    - "Live up to crazy expectations?" [142]
    - "Really?" [153]
    - "You've never heard Romney come off as stiff or robotic?" [153]
    - "And who are these people thinking Hillary should smile more?" [153]
    - "You think that's why they don't like Cruz?" [169]
    - "By your argument, no republican can win, so what does it matter who they support?" [170]
    - "Jesus, are there really Gaddafi apologists?" [178]
    - "Well, seeing how there is no evidence of Rice using RNC, coupled with lots of testimony now and during the time that she rarely used email and when she did she used the state department, why wouldn't you?" [180]
    - "I thought it was in the 40s?" [192]
    - "Problems with the concept now?" [212]
    - "I didn't see the video, but he just had to be joking right?" [214]
    - "What market force would cause minimum wage to increase?" [217]
    - "How?" [228]
    - "With force?" [228]
    - "Forever?" [228]
    - "We're going to spend billions, probably *trillions* in the long-run preventing other polities from discovering nuclear tech?" [228]
    - "What the fuck is up with /r/sweden Like who are the people posting in there?" [256]
    - "Even if you disagree, why single out a single individual - you're being a bully by doing so." [257]
    - "How to spot someone that works and pays taxes" [278]
    - "So why bring some bullshit argument you *know* is going to fail?" [285]
    - "I know the cuck in the middle, who are the other cucks?" [294]
    
#### Basic Regex:
- False negatives (questions that Regex missed): [questions within brackets; questions without a '?']. Note that it also correctly labels a quoted question as NotQuestion.
    - "holy shit don't you have anything better to do." [22]
    - "Jesus fucking Christ how is this not top of the sub." [35]
    - "wat" [42]
    - "Was wondering why this was up voted" [62]
    - "DO WE CARE ABOUT CONCUSSIONS OR NOT!" [80]
    - "Even if you disagree, why single out a single individual - you're being a bully by doing so." [257]
    - "How to spot someone that works and pays taxes" [278]
    
- False positives (incorrectly classified as questions): None

### Analyse parse trees:

In [2]:
def explore_tree(comment):
    # CoreNLP server:
    res = annotate_comment(comment)

    # Question if contains SBARQ and SQ
    for output in res['sentences']:
        parse_tree = output['parse']
        print "\n", parse_tree

In [3]:
# Sentences whose parse trees are inspected below. The commented-out entries
# are further examples, kept for reference.
test = [
    "So Canadians are rapists and drug dealers?",
    "holy shit don't you have anything better to do.",
]
#         "Jesus fucking Christ how is this not top of the sub.", "wat", "Couldn't do it", \
#         "Unless I misunderstood you (which frankly could be possible) are you trying to say that she isn't a credible \
#          authority because she says organic farms are bad?", "Was wondering why this was up voted", \
#         "DO WE CARE ABOUT CONCUSSIONS OR NOT!", "In the playoffs?", "Was Stewart mocking white dances or is he just a nerd?", \
#         "Sure, he's a bit older but who can they field at center better than him?", "Why all the hate?", "Mike Tolbert?", \
#         "Bust?", "Live up to crazy expectations?"]
len(test)

2

In [7]:
for comment in test:
    print "\n", comment
    explore_tree(comment)


So Canadians are rapists and drug dealers?

(ROOT
  (S (RB So)
    (NP (NNPS Canadians))
    (VP (VBP are)
      (NP
        (NP (NNS rapists))
        (CC and)
        (NP (NN drug) (NNS dealers))))
    (. ?)))

holy shit don't you have anything better to do.

(ROOT
  (S
    (NP (JJ holy) (NN shit))
    (VP (VBP do) (RB n't)
      (SBAR
        (S
          (NP (PRP you))
          (VP (VBP have)
            (S
              (NP (NN anything))
              (ADJP (JJR better))
              (S
                (VP (TO to)
                  (VP (VB do)))))))))
    (. .)))


In [8]:
# Reference case: a plain wh-question, to see the tree shape the SBARQ/SQ check looks for.
explore_tree("Why do you care?")


(ROOT
  (SBARQ
    (WHADVP (WRB Why))
    (SQ (VBP do)
      (NP (PRP you))
      (VP (VB care)))
    (. ?)))
