In [73]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns

from nltk import word_tokenize, pos_tag
import nltk

%matplotlib inline

pal = sns.color_palette()

from spacy.en import English
nlp = English()

Useful links:
* https://www.kaggle.com/c/quora-question-pairs
* http://www.nltk.org/book/ch05.html
* http://stackoverflow.com/questions/6115677/english-grammar-for-parsing-in-nltk

# Approach 1: Grammatical Analysis

## General Stratetegy

1. Divide and conquer
    * Group questions by type: how to, difference between, opinion on, etc.
    * Identify grammatical rules
2. Reiterate

## Training Set

In [2]:
cwd = os.getcwd()
print(cwd)

/Users/luguccioni/Documents/kaggle_competition


In [3]:
df_train = pd.read_csv('train.csv')
df_train.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
df_train.loc[df_train['is_duplicate'] == 1].head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


Exploring individual questions:

In [6]:
print(df_train.iloc[29,3])
print(df_train.iloc[29,4])

How should I prepare for CA final law?
How one should know that he/she completely prepare for CA final exam?


##### Some Initial Groups

In [7]:
how_to = ("how can i", "what should i do to", "how can you", "what can make",
             "how should i", "how one should", "how do we", "how do i", "how to", "what are some special cares for someone")

difference = ()
opinion = ("how is the new", "how bad is the new", "what is the most", "what is the best")

In [8]:
def findmatches(q = 'question1', string_list = how_to):
    match_list = []
    for n in df_train[q]:
        match=0
        for i in string_list:
            try: # note: Nan in question cause error when applying .lower()
                if i in n.lower():
                    match =+ 1
            except:continue
        match_list.append(match > 0)
    return pd.Series(match_list)

### The "How To" Group

Focusing in on one group at the time

In [9]:
df_train['Q1_howto'] = findmatches()
df_train['Q2_howto'] = findmatches(q = 'question2')
df_train['how_to_Qs'] = (df_train['Q1_howto'] == True) & (df_train['Q2_howto'] == True)

df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Q1_howto,Q2_howto,how_to_Qs
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,False,False,False
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,False,False,False
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,True,True,True
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,True,False,False
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,False,False,False


In [10]:
df_train.loc[df_train['how_to_Qs'] == True].head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Q1_howto,Q2_howto,how_to_Qs
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,True,True,True
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,True,True,True
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1,True,True,True
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1,True,True,True
29,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1,True,True,True


In [11]:
print('Duplicate pairs in this group: {}%'.format(round(df_train.loc[df_train['how_to_Qs'] == True]['is_duplicate'].mean()*100, 2)))
print('Duplicate pairs overall: {}%'.format(round(df_train['is_duplicate'].mean()*100, 2)))

Duplicate pairs in this group: 56.42%
Duplicate pairs overall: 36.92%


Large difference implies that this group exhibits its own patterns.

In [12]:
print(df_train.iloc[2,3])
print(df_train.iloc[2,4])

How can I increase the speed of my internet connection while using a VPN?
How can Internet speed be increased by hacking through DNS?


Key words: VPN vs DNS. Note: These are the only proper nouns.

In [13]:
q1 = pos_tag(word_tokenize(df_train.iloc[2,3]))
print(q1)
q2 = pos_tag(word_tokenize(df_train.iloc[2,4]))
print(q2)

[('How', 'WRB'), ('can', 'MD'), ('I', 'PRP'), ('increase', 'VB'), ('the', 'DT'), ('speed', 'NN'), ('of', 'IN'), ('my', 'PRP$'), ('internet', 'NN'), ('connection', 'NN'), ('while', 'IN'), ('using', 'VBG'), ('a', 'DT'), ('VPN', 'NNP'), ('?', '.')]
[('How', 'WRB'), ('can', 'MD'), ('Internet', 'VB'), ('speed', 'VBN'), ('be', 'VB'), ('increased', 'VBN'), ('by', 'IN'), ('hacking', 'NN'), ('through', 'IN'), ('DNS', 'NNP'), ('?', '.')]


In [14]:
how_to_df = df_train.loc[df_train['how_to_Qs'] == True].loc[:,'id':'is_duplicate']
how_to_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
29,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1


In [15]:
print([item[0].lower() for item in q1 if item[1] == 'NNP'])
print([item[0].lower() for item in q2 if item[1] == 'NNP'])

[item[0].lower() for item in q1 if item[1] == 'NNP'] == [item[0].lower() for item in q2 if item[1] == 'NNP']


['vpn']
['dns']


False

Check for proper noun matches

In [16]:
NNP_match = []
for n in how_to_df['id']:
    q1 = how_to_df.loc[n,'question1']
    q2 = how_to_df.loc[n,'question2']
    q1_pos = pos_tag(word_tokenize(q1))
    q2_pos = pos_tag(word_tokenize(q2))
    #print(how_to_df.loc[n,'question1'])
    #print(how_to_df.loc[n,'question2'])
    NNP1 = [item[0].lower() for item in q1_pos if item[1] == 'NNP']
    NNP2 = [item[0].lower() for item in q2_pos if item[1] == 'NNP']
    #print(NNP1)
    #print(NNP2)
    match = int(NNP1 == NNP2)
    for i in NNP1:
        if i in q2: match = 1
    for m in NNP2:
        if m in q1: match = 1
    NNP_match.append(match)

In [24]:
how_to_df.index = range(len(how_to_df))

In [25]:
pd.Series(NNP_match).head()
how_to_df['NNP_match'] = pd.Series(NNP_match)

In [26]:
how_to_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,NNP_match
0,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0
1,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,1
2,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1,1
3,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1,1
4,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1,1


In [28]:
print('Recall: Duplicate pairs in this group: {}%'.format(round(df_train.loc[df_train['how_to_Qs'] == True]['is_duplicate'].mean()*100, 2)))

Recall: Duplicate pairs in this group: 56.42%


Confusion Matrix

In [29]:
y_actu = pd.Series(how_to_df['is_duplicate'], name='Actual')
y_pred = pd.Series(how_to_df['NNP_match'], name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred, margins = True)
df_confusion

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3353,9184,12537
1,1873,14355,16228
All,5226,23539,28765


In [30]:
print('Accuracy Rate: {}%'.format(round(pd.Series(y_actu == y_pred).mean()*100,2)))

Accuracy Rate: 61.56%


Focus on Predicted as True, but actual False:

In [66]:
how_to_df[(y_actu == 0) & (y_pred == 1)].head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,NNP_match
6,36,73,74,I'm a 19-year-old. How can I improve my skills...,I am a 19 year old guy. How can I become a bil...,0,1
15,181,363,364,How can I stop being addicted to love?,How do I stop being addicted to someone?,0,1
19,222,445,446,How can I find job in Japan?,How can I find an IT job in Japan?,0,1
23,268,536,537,How do I love my body as a guy?,"How can I make my whole body more fair, if I a...",0,1
25,282,564,565,How do I get over a friend with whom I haven't...,How do I get over someone I loved now that we ...,0,1
27,300,600,601,How should I start small business effectively?,How should I start a small business in constru...,0,1
31,352,703,704,How do I send message from one Android phone t...,How do I transfer my contacts from one Android...,0,1
32,363,724,725,How do I edit my devices on my Google play acc...,How do I add a second device to a Google Play ...,0,1
39,525,1048,1049,How can I treat a severe foot sprain?,How can I get a severe foot sprain?,0,1
43,568,1134,1135,How do I start writing again?,How do I start writing?,0,1


In [86]:
Q1 = how_to_df.iloc[43,3]
print(Q1)

How do I start writing again?


In [87]:
Q2 = how_to_df.iloc[43,4]
print(Q2)

How do I start writing?


In [83]:
pos_tag(word_tokenize(Q1))

[('How', 'WRB'),
 ('do', 'VBP'),
 ('I', 'PRP'),
 ('edit', 'VB'),
 ('my', 'PRP$'),
 ('devices', 'NNS'),
 ('on', 'IN'),
 ('my', 'PRP$'),
 ('Google', 'NNP'),
 ('play', 'NN'),
 ('account', 'NN'),
 ('?', '.')]

In [80]:
for tok in nlp(Q1):
    print(str(tok) + "\t\t" + str(tok.dep_))

How		advmod
do		aux
I		nsubj
send		ROOT
message		dobj
from		prep
one		nummod
Android		compound
phone		pobj
to		prep
another		det
Android		compound
phone		pobj
through		prep
bluetooth		pobj
?		punct


# Next Steps

Combine end_match with NNP_match. Use end_match when NNP_match is true as a result of empty sets.

# Ideas
* at least one NN in common (per sentence)
* similar word count
* if no NNP => at least 2 NN
* Number of NN in common dependent on number of words
* qualifiers: in + word at the end.
* same dobj ?

In [132]:
how_to_df.head(8)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,NNP_match
0,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0
1,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,1
2,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1,1
3,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1,1
4,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1,1
5,31,63,64,What are some special cares for someone with a...,How can I keep my nose from getting stuffy at ...,1,1
6,36,73,74,I'm a 19-year-old. How can I improve my skills...,I am a 19 year old guy. How can I become a bil...,0,1
7,38,77,78,How do we prepare for UPSC?,How do I prepare for civil service?,1,0


In [142]:
end_match = []
for i in range(len(how_to_df)):
    end1 = how_to_df['question1'][i].strip('?').split()[-2:]
    end2 = how_to_df['question2'][i].strip('?').split()[-2:]
    end_match_i = 0
    for n in end1:
        if n in end2: end_match_i = 1
    for n in end2:
        if n in end1: end_match_i = 1
    end_match.append(end_match_i)
how_to_df['end_match'] = pd.Series(end_match)

In [144]:
how_to_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,NNP_match,end_match
0,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0,0
1,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,1,1
2,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1,1,1
3,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1,1,1
4,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1,1,1


In [145]:
print('Accuracy Rate: {}%'.format(round(pd.Series(y_actu == how_to_df['end_match']).mean()*100,2)))

Accuracy Rate: 62.01%


In [46]:
how_to_df.iloc[6,4].split('.')

['I am a 19 year old guy',
 ' How can I become a billionaire in the next 10 years?']

In [60]:
nltk.help.upenn_tagset('PRP')

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


__Pairs with only one question in the "how to group" (partially in group)__

In [32]:
part_how_to_df = df_train.loc[(df_train['how_to_Qs'] != True) &
                              ((df_train['Q1_howto'] == True) | 
                               (df_train['Q2_howto'] == True))].loc[:,'id':'is_duplicate']

In [34]:
part_how_to_df.head()
print('Duplicate pairs in this group: {}%'.format(round(part_how_to_df['is_duplicate'].mean()*
                                                        100, 2)))

Duplicate pairs in this group: 35.4%


Not very different from average. Ignore this group for now.

Next steps:
* Use CM to improve grammar rules

## Exploration of Grammar Tools

Selecting sample questions.

In [35]:
Q1 = df_train.iloc[5,3]
print(Q1)

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?


In [36]:
Q2 = df_train.iloc[5,4]
print(Q2)

I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?


### POS Tagging

In [37]:
pos_tag(word_tokenize(Q1))

[('Astrology', 'NNP'),
 (':', ':'),
 ('I', 'PRP'),
 ('am', 'VBP'),
 ('a', 'DT'),
 ('Capricorn', 'NNP'),
 ('Sun', 'NNP'),
 ('Cap', 'NNP'),
 ('moon', 'NN'),
 ('and', 'CC'),
 ('cap', 'NN'),
 ('rising', 'VBG'),
 ('...', ':'),
 ('what', 'WP'),
 ('does', 'VBZ'),
 ('that', 'IN'),
 ('say', 'VB'),
 ('about', 'IN'),
 ('me', 'PRP'),
 ('?', '.')]

In [120]:
nltk.help.upenn_tagset('NNP') # Query documentation for tags

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


### Grammatical Structure

In [68]:
print(Q1)
print(Q2)

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?


In [76]:
doc1 = nlp(Q1[11:])
sub_toks1 = [tok for tok in doc1 if (tok.dep_ == "nsubj") ]
doc2 = nlp(Q2)
sub_toks2 = [tok for tok in doc2 if (tok.dep_ == "nsubj") ]

print(sub_toks1)
print(sub_toks2)

[I, that]
[I, What, this]


In [77]:
print("tok" + "\t\t" + "tok.dep_")
for tok in doc1:
    print(str(tok) + "\t\t" + str(tok.dep_))

tok		tok.dep_
I		nsubj
am		ROOT
a		det
Capricorn		compound
Sun		compound
Cap		compound
moon		attr
and		cc
cap		conj
rising		acl
...		punct
what		dobj
does		aux
that		nsubj
say		ccomp
about		prep
me		pobj
?		punct


In [69]:
print("tok" + "\t\t" + "tok.dep_")
for tok in doc2:
    print(str(tok) + "\t\t" + str(tok.dep_))

tok		tok.dep_
I		nsubj
'm		ROOT
a		det
triple		amod
Capricorn		attr
(		punct
Sun		appos
,		punct
Moon		conj
and		cc
ascendant		amod
in		prep
Capricorn		pobj
)		punct
What		nsubj
does		aux
this		nsubj
say		ccomp
about		prep
me		pobj
?		punct


### Grammar

http://stackoverflow.com/questions/6115677/english-grammar-for-parsing-in-nltk

 Symbol | Meaning | Example 
------|------|------
 S | sentence | the man walked 
NP | noun phrase | a dog
VP | verb phrase | saw a park
PP | prepositional phrase| with a telescope
Det | determiner | the
N | noun | dog
V | verb | walked
P | preposition | in


## Testing Set

In [8]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?
