# Quora Dataset Parsing

In [11]:
from __future__ import unicode_literals
import csv
import json
import spacy
import io
import os
import pandas as pd
import numpy as np

In [2]:
# Converter by Sam Bowman (bowman@nyu.edu)
#
# Purpose:     converts quora_duplicate_questions.csv, as distributed, to SNLI's formats.
# Source data: https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs
# Caveat:      no parsing yet, just crude tokenization.
#
# To run: install spaCy, move into the same directory as the source file, run.

# SNLI-style label names (not referenced by the cells shown in this notebook).
LABELS = ['entailment', 'neutral']

# spaCy English pipeline; used further down to tokenise the question text.
NLP = spacy.load('en')

In [60]:
# Read only the columns this notebook uses; the two question columns are
# requested as str so that their text is not type-inferred by pandas.
wanted_columns = ['id', 'qid1', 'qid2',
                  'question1', 'question2',
                  'is_duplicate']
question_dtypes = {'question1': str, 'question2': str}

quora_data = pd.read_csv("train.csv",
                         header=0,  # first line of the file holds the column names
                         usecols=wanted_columns,
                         dtype=question_dtypes)

In [65]:
# Preview the first rows of the freshly loaded frame as a sanity check.
quora_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [69]:
# Tokenise both question columns with spaCy and store the result as a single
# space-separated string per question in `<column>_parse`.
#
# Only tokenisation is wanted, so the tagger, parser and entity recogniser are
# switched off. The options MUST be unpacked with `**`: handing the dict over
# positionally would bind the whole dict to one parameter of `pipe` instead of
# delivering tag/parse/entity/n_threads as keyword arguments.
args = dict(tag=False, parse=False, entity=False, n_threads=3)

for column in ('question1', 'question2'):
    # astype(str) guarantees every value handed to spaCy is a string
    # (missing values become the literal text 'nan').
    docs = NLP.pipe(quora_data[column].astype(str), **args)
    quora_data[column + '_parse'] = [' '.join(w.text for w in doc) for doc in docs]

In [70]:
# Preview again: the frame now also carries the question1_parse / question2_parse columns.
quora_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_parse,question2_parse
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,What is the story of Kohinoor ( Koh - i - Noor...,What would happen if the Indian government sto...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,Why am I mentally very lonely ? How can I solv...,Find the remainder when [ math]23^{24}[/math ]...
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"Which one dissolve in water quikly sugar , sal...",Which fish would survive in salt water ?


In [137]:
def train_dev_test_split(df, train_split, dev_split, test_split, seed=1234):
    '''Randomly partition the rows of a dataframe into train, dev and test sets.

    Every row is assigned independently at random, so the resulting sizes only
    approximate the requested proportions. Each row ends up in exactly one of
    the three returned frames, and the original index labels are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are to be split.
    train_split, dev_split, test_split : number
        Relative weights of the three parts. They are normalised by their sum,
        so (0.7, 0.2, 0.1) and (7, 2, 1) request the same proportions.
    seed : int, default 1234
        Seed passed to ``np.random.seed`` (this reseeds numpy's global
        generator) so that the split is reproducible.

    Returns
    -------
    (train_df, dev_df, test_df) : tuple of pandas.DataFrame

    Raises
    ------
    ZeroDivisionError
        If the three weights sum to zero.
    '''

    # normalise split values (float() keeps this a true division for int weights)
    split_sum   = float(train_split + dev_split + test_split)
    train_split = train_split/split_sum
    dev_split   = dev_split  /split_sum
    test_split  = test_split /split_sum

    # Share of the non-train rows that goes to dev. When dev and test are both
    # zero there is nothing left to divide, so avoid the 0/0 division.
    remainder = dev_split + test_split
    dev_share = dev_split/remainder if remainder > 0 else 0.0

    # set random seed
    np.random.seed(seed)

    # mask1 picks the train rows; mask2 splits whatever is left into dev/test
    mask1 = np.random.random(size=len(df)) < train_split
    mask2 = np.random.random(size=len(df)) < dev_share

    # .loc with a boolean mask replaces the removed DataFrame.ix indexer
    train_df = df.loc[mask1, :]
    dev_df   = df.loc[np.logical_and(~mask1, mask2), :]
    test_df  = df.loc[np.logical_and(~mask1, ~mask2), :]

    return train_df, dev_df, test_df

In [143]:
quora_train, quora_dev, quora_test = train_dev_test_split(quora_data, 0.7, 0.2, 0.1)

# Report each split's share of the full dataset as a rounded percentage.
total_rows = len(quora_data)
for template, split_df in (("Train split:\t%.f%%", quora_train),
                           ("Dev split:\t%.f%%",   quora_dev),
                           ("Test split:\t%.f%%",  quora_test)):
    print(template %(len(split_df)/total_rows * 100))

# Show the first rows of the test split.
quora_test.head()

Train split:	70%
Dev split:	20%
Test split:	10%


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_parse,question2_parse
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0,Motorola ( company ) : Can I hack my Charter M...,How do I hack Motorola DCX3400 for free intern...
18,18,37,38,Why are so many Quora users posting questions ...,Why do people ask Quora questions which can be...,1,Why are so many Quora users posting questions ...,Why do people ask Quora questions which can be...
32,32,65,66,What Game of Thrones villain would be the most...,What Game of Thrones villain would you most li...,1,What Game of Thrones villain would be the most...,What Game of Thrones villain would you most li...
39,39,79,80,What is the stall speed and AOA of an f-14 wit...,Why did aircraft stop using variable-sweep win...,0,What is the stall speed and AOA of an f-14 wit...,Why did aircraft stop using variable - sweep w...
55,55,111,112,How difficult is it get into RSI?,Do you apply for programs like RSI when you're...,0,How difficult is it get into RSI ?,Do you apply for programs like RSI when you 'r...


In [109]:
# Write each split to its own CSV file in the working directory.
for split_frame, csv_name in ((quora_train, "quora_train.csv"),
                              (quora_dev,   "quora_dev.csv"),
                              (quora_test,  "quora_test.csv")):
    split_frame.to_csv(csv_name)