# Quora Questions with Spacy and Neo4j

Now I can combine spaCy and Neo4j to try to build an interesting graph database for analysing the Quora Questions dataset.

In [1]:
# The usual imports
import os, matplotlib, seaborn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
seaborn.set()

In [2]:
# Load the question-pair dataset from data/train.csv into a DataFrame.
q_data = pd.read_csv('data/train.csv')

In [3]:
def train_dev_test_split(data, a, b, c, seed=1234):
    """Shuffle the rows of ``data`` and cut them into train / dev / test parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Rows are selected by index label.
    a : float
        Fraction of rows that go to the training split (the first
        ``int(len(data) * a)`` shuffled rows).
    b : float
        Nominal fraction for the dev split. The dev split is defined as
        "everything between the train and test boundaries", so ``b`` does not
        influence the cut points; it is only checked for consistency with
        ``a`` and ``c``.
    c : float
        Fraction of rows that go to the test split (the shuffled rows from
        position ``int(len(data) * (1 - c))`` onwards).
    seed : int, optional
        Seed passed to ``random.seed`` before shuffling, so the same seed
        always yields the same split.

    Returns
    -------
    tuple
        ``(train, dev, test)`` frames with the same columns as ``data``.
    """
    import math
    import random
    import warnings

    # `b` never enters the boundary computation, so make a mismatch visible
    # instead of silently producing a dev split of a different size.
    if not math.isclose(a + b + c, 1.0, abs_tol=1e-9):
        warnings.warn(
            "split fractions a + b + c = {} do not sum to 1; the dev split "
            "is taken as the rows between the train and test boundaries "
            "and `b` is ignored".format(a + b + c)
        )

    # Seed the module-level RNG exactly as before so existing splits are
    # reproduced for a given seed.
    random.seed(seed)
    index = list(data.index)
    random.shuffle(index)

    n_rows = len(index)
    train_end = int(n_rows * a)
    test_start = int(n_rows * (1 - c))

    ix_train = index[:train_end]
    ix_dev = index[train_end:test_start]
    ix_test = index[test_start:]

    # `.ix` has been removed from pandas; `.loc` does the label-based
    # selection that is needed here.
    return data.loc[ix_train, :], data.loc[ix_dev, :], data.loc[ix_test, :]

In [4]:
# 70% train / 20% dev / 10% test, shuffled with the function's default seed (1234).
q_train, q_dev, q_test = train_dev_test_split(q_data, 0.7, 0.2, 0.1)

Now I perform all of the processing on the training set, and package this so that it can be performed on the dev and test sets later for validation.

In [5]:
# Preview the first rows of the training split.
q_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
214837,214837,320541,320542,Will projectile weapons ever go obsolete?,Are projectile weapons ever largely going to b...,1
352538,352538,481451,274326,What is the easiest way to make an atheist bel...,How do I convince an atheist that there is a God?,0
251562,251562,365705,118438,How do I lose weight in two month?,How do I lose weight in weeks?,0
294831,294831,416786,416787,Why are onion and garlic considered non-vegeta...,Why do some Hindus avoid onion and garlic in t...,0
180664,180664,276940,276941,What are the uses of electrophoresis in biology?,Does spirolactone work for acne?,0


Now I want to make a neo4j database with all of the questions.

In [6]:
# Persist the three splits as CSV so they can be read back later in the
# notebook and loaded outside of it. A file that already exists is left
# untouched, which keeps a full re-run cheap; delete the files to regenerate
# them after changing the split.
for split_name, split_frame in [('q_train', q_train),
                                ('q_dev', q_dev),
                                ('q_test', q_test)]:
    split_path = 'data/{}.csv'.format(split_name)
    if not os.path.exists(split_path):
        split_frame.to_csv(split_path, index=False, encoding='utf-8')

I can turn the training data into a graph database.

In [7]:
# Read the saved training split back from data/q_train.csv, using the `id`
# column as the index, and preview it. The file must already exist on disk.
pd.read_csv("data/q_train.csv", index_col='id').head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
214837,320541,320542,Will projectile weapons ever go obsolete?,Are projectile weapons ever largely going to b...,1
352538,481451,274326,What is the easiest way to make an atheist bel...,How do I convince an atheist that there is a God?,0
251562,365705,118438,How do I lose weight in two month?,How do I lose weight in weeks?,0
294831,416786,416787,Why are onion and garlic considered non-vegeta...,Why do some Hindus avoid onion and garlic in t...,0
180664,276940,276941,What are the uses of electrophoresis in biology?,Does spirolactone work for acne?,0


The problem here is coming up with nice representations of the questions that a model can sink its teeth into. The first step in that regard is to do some cleaning.

In [8]:
# Pool both question columns of the training split into one flat Python list:
# all question1 values first, followed by all question2 values.
qs = list(pd.concat([q_train.question1, q_train.question2]))

In [9]:
# Show only the first few questions; echoing the whole list would dump every
# pooled question into the notebook output.
qs[:16]

['Will projectile weapons ever go obsolete?',
 'What is the easiest way to make an atheist believe in god(s)?',
 'How do I lose weight in two month?',
 'Why are onion and garlic considered non-vegetarian by some Hindus?',
 'What are the uses of electrophoresis in biology?',
 'Can I hack a Facebook account when I am logged in but don’t have password or email?',
 'How can I clear my doubts in quantitative aptitude for banking exams?',
 'What do you collect and why?',
 'How should I train and eat to get a body like Alexis Ren?',
 'What kinds of cells produce antibodies? How are they produced?',
 'I want to know all thing related to free body diagram..how to draw fbd of this question and why we drawing reation direction like this?',
 'How can I get rid of dark spots on my face?',
 'How is it being a JP Morgan Summer Analyst in India?',
 'What are the best blogs?',
 'What are the greatest contributions of America in literature and culture?',
 'What are the most cliche crime thriller plots?'

In [10]:
# Load spaCy's English pipeline; the module-level `nlp` object is what
# lemmatized_sentence_corpus uses to parse the questions.
# NOTE(review): whether the 'en' shortcut resolves depends on the installed
# spaCy version and model link — confirm in the target environment.
import spacy
nlp = spacy.load('en')

In [97]:
def punct_space(token):
    """Tell whether ``token`` is flagged as punctuation or as whitespace.

    Returns ``token.is_punct`` when that is truthy, otherwise
    ``token.is_space``.
    """
    flagged = token.is_punct
    if not flagged:
        flagged = token.is_space
    return flagged

def line_review(qs):
    """Lazily yield each question with escaped newlines restored.

    Every literal two-character sequence backslash + ``n`` in a question is
    replaced by a real newline character; everything else is left as is.
    """
    yield from (text.replace('\\n', '\n') for text in qs)
            
def lemmatized_sentence_corpus(qs):
    """Yield one lemmatized string per sentence found in the questions.

    Each entry of ``qs`` is converted with ``str``, passed through
    ``line_review`` and parsed by the module-level ``nlp`` pipeline in batches
    of 10000 with ``n_threads=4``. For every sentence of every parsed
    question, the lemmas of the tokens that ``punct_space`` does not flag are
    joined with single spaces and yielded.
    """
    texts = line_review(map(str, qs))
    parsed_questions = nlp.pipe(texts, batch_size=10000, n_threads=4)
    for parsed_q in parsed_questions:
        for sentence in parsed_q.sents:
            lemmas = [tok.lemma_ for tok in sentence if not punct_space(tok)]
            yield u' '.join(lemmas)

In [104]:
int_path = 'intermediate/'
# open(..., 'w') does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(int_path, exist_ok=True)
# One lemmatized sentence per line, UTF-8 encoded.
with open(int_path + "lemmatized_sentence_corpus.txt", 'w', encoding="utf-8") as f:
    for sent in lemmatized_sentence_corpus(qs):
        f.write(sent + '\n')

In [26]:
# NOTE(review): scratch cell. The backslash before the second quote escapes
# it, so the string literal is never closed and the cell raises SyntaxError.
' hi there    \'.strip()

SyntaxError: EOL while scanning string literal (<ipython-input-26-3a989fca0676>, line 1)

In [27]:
def grepl(pattern, col, regex=True):
    """Return a boolean mask of the entries of ``col`` that contain ``pattern``.

    Parameters
    ----------
    pattern : str
        Text to look for. With ``regex=True`` (the default, as before) it is
        interpreted as a regular expression, so characters such as ``?``,
        ``(`` or ``.`` act as metacharacters.
    col : pandas.Series
        String column to search.
    regex : bool, optional
        Pass ``False`` to match ``pattern`` as a literal substring.

    Returns
    -------
    pandas.Series
        Result of ``col.str.contains`` for every entry of ``col``.
    """
    return col.str.contains(pattern, regex=regex)
# Note: used as a regex, the trailing "?" makes the final "r" optional rather
# than matching a literal question mark.
t = "How difficult is it to get an H1B visa as a embedded systems developer?"

In [28]:
# `.ix` has been removed from pandas; `.loc` accepts the same boolean row mask
# and column label. Row label 79250 is then picked from the matching rows.
q_data.loc[grepl(t, q_data.question1), 'question2'][79250]

'Who would win, Lobo vs all the predators and aliens?'

In [None]:
# Cypher statement kept as a plain Python string (evaluating the cell only
# echoes the text): declares that `qid` is unique across :Question nodes.
"CREATE CONSTRAINT ON (q:Question) ASSERT q.qid IS UNIQUE"

In [None]:
# Cypher import statement kept as a plain Python string (evaluating the cell
# only echoes the text). For every CSV row it merges the two :Question nodes
# and links them in both directions with an IS_DUPLICATE relationship whose
# `is_duplicate` property carries the label.
#
# Fixes relative to the earlier draft:
#   * `is_duplicate` is carried through the second WITH; without that it is
#     out of scope when the SET clauses run.
#   * LOAD CSV yields strings and toBoolean("0") / toBoolean("1") evaluate to
#     null, so the flag is derived from the integer value instead.
#   * Nodes are merged on `qid` alone (the property covered by the unique
#     constraint) and `text` is set on creation; merging on a map that
#     contains a null `text` is an error.
#   * toInt() is replaced by toInteger().
# NOTE(review): this loads the full train.csv rather than the q_train.csv
# split — confirm that is intended.
"""USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///data/train.csv" AS line

WITH toInteger(line.qid1) AS qid1,
     toInteger(line.qid2) AS qid2,
     line.question1 AS question1,
     line.question2 AS question2,
     (toInteger(line.is_duplicate) = 1) AS is_duplicate

MERGE (q1:Question {qid: qid1})
  ON CREATE SET q1.text = question1
MERGE (q2:Question {qid: qid2})
  ON CREATE SET q2.text = question2

WITH q1, q2, is_duplicate

CREATE (q1)-[r1:IS_DUPLICATE]->(q2)
CREATE (q2)-[r2:IS_DUPLICATE]->(q1)

SET r1.is_duplicate = is_duplicate
SET r2.is_duplicate = is_duplicate"""