In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns

from nltk import word_tokenize, pos_tag
import nltk

%matplotlib inline

pal = sns.color_palette()

Useful links:
* https://www.kaggle.com/c/quora-question-pairs
* http://www.nltk.org/book/ch05.html
* http://stackoverflow.com/questions/6115677/english-grammar-for-parsing-in-nltk

# Approach 1: Grammatical Analysis

## General Stratetegy

1. Divide and conquer
    * Group questions by type: how to, difference between, opinion on, etc.
    * Identify grammatical rules
2. Reiterate

## Training Set

In [2]:
cwd = os.getcwd()
print(cwd)

/Users/luguccioni/Documents/kaggle_competition


In [13]:
df_train = pd.read_csv('train.csv')
df_train.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [12]:
df_train.loc[df_train['is_duplicate'] == 1].head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


Exploring individual questions:

In [None]:
print(df_train.iloc[29,3])
print(df_train.iloc[29,4])

##### Some Initial Groups

In [90]:
how_to = ("how can i", "what should i do to", "how can you", "what can make",
             "how should i", "how one should", "how do we", "how do i", "how to", "what are some special cares for someone")

difference = ()
opinion = ("how is the new", "how bad is the new", "what is the most", "what is the best")

In [91]:
def findmatches(q = 'question1', string_list = how_to):
    match_list = []
    for n in df_train[q]:
        match=0
        for i in string_list:
            try: # note: Nan in question cause error when applying .lower()
                if i in n.lower():
                    match =+ 1
            except:continue
        match_list.append(match > 0)
    return pd.Series(match_list)

### The "How To" Group

Focusing in on one group at the time

In [99]:
df_train['Q1_howto'] = findmatches()
df_train['Q2_howto'] = findmatches(q = 'question2')
df_train['how_to_Qs'] = (df_train['Q1_howto'] == True) & (df_train['Q2_howto'] == True)

df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Q1_howto,Q2_howto,how_to_Qs
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,False,False,False
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,False,False,False
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,True,True,True
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,True,False,False
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,False,False,False


In [103]:
df_train.loc[df_train['how_to_Qs'] == True].head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Q1_howto,Q2_howto,how_to_Qs
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,True,True,True
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,True,True,True
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1,True,True,True
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1,True,True,True
29,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1,True,True,True


In [105]:
print('Duplicate pairs in this group: {}%'.format(round(df_train.loc[df_train['how_to_Qs'] == True]['is_duplicate'].mean()*100, 2)))
print('Duplicate pairs overall: {}%'.format(round(df_train['is_duplicate'].mean()*100, 2)))

Duplicate pairs in this group: 56.42%
Duplicate pairs overall: 36.92%


Large difference implies that this group exhibits its own patterns.

In [108]:
print(df_train.iloc[2,3])
print(df_train.iloc[2,4])

How can I increase the speed of my internet connection while using a VPN?
How can Internet speed be increased by hacking through DNS?


Key words: VPN vs DNS. Note: These are the only proper nouns.

In [132]:
q1 = pos_tag(word_tokenize(df_train.iloc[2,3]))
print(q1)
q2 = pos_tag(word_tokenize(df_train.iloc[2,4]))
print(q2)

[('How', 'WRB'), ('can', 'MD'), ('I', 'PRP'), ('increase', 'VB'), ('the', 'DT'), ('speed', 'NN'), ('of', 'IN'), ('my', 'PRP$'), ('internet', 'NN'), ('connection', 'NN'), ('while', 'IN'), ('using', 'VBG'), ('a', 'DT'), ('VPN', 'NNP'), ('?', '.')]
[('How', 'WRB'), ('can', 'MD'), ('Internet', 'VB'), ('speed', 'VBN'), ('be', 'VB'), ('increased', 'VBN'), ('by', 'IN'), ('hacking', 'NN'), ('through', 'IN'), ('DNS', 'NNP'), ('?', '.')]


In [134]:
how_to_df = df_train.loc[df_train['how_to_Qs'] == True].loc[:,'id':'is_duplicate']
how_to_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
29,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1


In [145]:
print([item[0].lower() for item in q1 if item[1] == 'NNP'])
print([item[0].lower() for item in q2 if item[1] == 'NNP'])

[item[0].lower() for item in q1 if item[1] == 'NNP'] == [item[0].lower() for item in q2 if item[1] == 'NNP']


['vpn']
['dns']


False

Check for proper noun matches

In [194]:
NNP_match = []
for n in how_to_df['id']:
    q1 = how_to_df.loc[n,'question1']
    q2 = how_to_df.loc[n,'question2']
    q1_pos = pos_tag(word_tokenize(q1))
    q2_pos = pos_tag(word_tokenize(q2))
    #print(how_to_df.loc[n,'question1'])
    #print(how_to_df.loc[n,'question2'])
    NNP1 = [item[0].lower() for item in q1_pos if item[1] == 'NNP']
    NNP2 = [item[0].lower() for item in q2_pos if item[1] == 'NNP']
    #print(NNP1)
    #print(NNP2)
    match = int(NNP1 == NNP2)
    for i in NNP1:
        if i in q2: match = 1
    for m in NNP2:
        if m in q1: match = 1
    NNP_match.append(match)
#how_to_df['NNP_match'] = pd.Series(NNP_match)

In [195]:
how_to_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,NNP_match
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,True
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,False
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1,False
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1,False
29,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1,False


In [196]:
print('Recall: Duplicate pairs in this group: {}%'.format(round(df_train.loc[df_train['how_to_Qs'] == True]['is_duplicate'].mean()*100, 2)))

Recall: Duplicate pairs in this group: 56.42%


Confusion Matrix

In [197]:
y_actu = pd.Series(how_to_df['is_duplicate'], name='Actual')
y_pred = pd.Series(how_to_df['NNP_match'], name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)

In [199]:
print('Accuracy Rate: {}%'.format(round(pd.Series(y_actu == y_pred).mean()*100,2)))

Accuracy Rate: 4.06%


In [198]:
['physics'][0] in 'How can you make physics easy to learn?'

True

## Exploration of Grammar Tools

Selecting sample questions.

In [35]:
Q1 = df_train.iloc[5,3]
print(Q1)

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?


In [36]:
Q2 = df_train.iloc[5,4]
print(Q2)

I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?


### POS Tagging

In [37]:
pos_tag(word_tokenize(Q1))

[('Astrology', 'NNP'),
 (':', ':'),
 ('I', 'PRP'),
 ('am', 'VBP'),
 ('a', 'DT'),
 ('Capricorn', 'NNP'),
 ('Sun', 'NNP'),
 ('Cap', 'NNP'),
 ('moon', 'NN'),
 ('and', 'CC'),
 ('cap', 'NN'),
 ('rising', 'VBG'),
 ('...', ':'),
 ('what', 'WP'),
 ('does', 'VBZ'),
 ('that', 'IN'),
 ('say', 'VB'),
 ('about', 'IN'),
 ('me', 'PRP'),
 ('?', '.')]

In [120]:
nltk.help.upenn_tagset('NNP') # Query documentation for tags

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


### Grammatical Structure

In [68]:
print(Q1)
print(Q2)

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?


In [76]:
from spacy.en import English
nlp = English()

doc1 = nlp(Q1[11:])
sub_toks1 = [tok for tok in doc1 if (tok.dep_ == "nsubj") ]
doc2 = nlp(Q2)
sub_toks2 = [tok for tok in doc2 if (tok.dep_ == "nsubj") ]

print(sub_toks1)
print(sub_toks2)

[I, that]
[I, What, this]


In [77]:
print("tok" + "\t\t" + "tok.dep_")
for tok in doc1:
    print(str(tok) + "\t\t" + str(tok.dep_))

tok		tok.dep_
I		nsubj
am		ROOT
a		det
Capricorn		compound
Sun		compound
Cap		compound
moon		attr
and		cc
cap		conj
rising		acl
...		punct
what		dobj
does		aux
that		nsubj
say		ccomp
about		prep
me		pobj
?		punct


In [69]:
print("tok" + "\t\t" + "tok.dep_")
for tok in doc2:
    print(str(tok) + "\t\t" + str(tok.dep_))

tok		tok.dep_
I		nsubj
'm		ROOT
a		det
triple		amod
Capricorn		attr
(		punct
Sun		appos
,		punct
Moon		conj
and		cc
ascendant		amod
in		prep
Capricorn		pobj
)		punct
What		nsubj
does		aux
this		nsubj
say		ccomp
about		prep
me		pobj
?		punct


### Grammar

http://stackoverflow.com/questions/6115677/english-grammar-for-parsing-in-nltk

 Symbol | Meaning | Example 
------|------|------
 S | sentence | the man walked 
NP | noun phrase | a dog
VP | verb phrase | saw a park
PP | prepositional phrase| with a telescope
Det | determiner | the
N | noun | dog
V | verb | walked
P | preposition | in


## Testing Set

In [8]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?
