# Quora Questions - CS224

Here I'll be using some of the ideas from Stanford University's CompSci 224 course "Deep Learning for Natural Language Processing".

In [2]:
import os, re, matplotlib, seaborn
import itertools as it
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
seaborn.set()

In [3]:
# Load the question-pair training data; the path is relative to the working directory.
q_data = pd.read_csv('data/train.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
q_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


Now we form a list of all of the questions in the dataset.

In [5]:
# Pool both question columns into one sorted list of unique question strings.
# Missing questions are dropped first: str(NaN) would otherwise add the bogus
# question text 'nan' to the corpus.
qs = pd.concat([q_data.question1, q_data.question2]).dropna()
qs = sorted({str(q) for q in qs})

In [6]:
# Show only the first few questions; displaying the whole list floods the notebook output.
qs[:10]

[' Any Canadian teachers (B.Ed. holders) teaching in U.S. schools?',
 ' Are there any underlying psychological tricks/tactics that are used when designing the lines for rides at amusement parks?',
 ' Can I pay with a debit card on PayPal?',
 ' Does New York state have a flagship university?',
 ' Failures haunt me all the time.How do I cope up?',
 ' How can I improve my sex life?',
 ' How do I make the time lapse images using an EOS 70D, with an intervalometer?',
 ' How will you interpret my dream?',
 " I am a 5 letter word.  I am normally below u  If u remove my 1st letter   u'll find me above u  If u remove my 1st & 2nd letters  u cant see me  Answer is really very interesting  Let us see who solves this.... ⏰Time limit :- today U can also send to other grps if I?",
 " I didn't file a police report for a car accident that happened over a month ago. My insurance company won't pay for damage, what do I do?",
 ' I have an offer from Manchester for mechatronics engineering and from Southa

# Spacy

Now we can use spaCy to clean up the text.

In [8]:
import spacy
# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): the recorded output below says only the tokenizer was loaded, so the
# lemma and sentence information used later may be unavailable — confirm the model is installed.
nlp = spacy.load('en')



    Only loading the 'en' tokenizer.



In [7]:
def clean_lemma(token):
    """Return `token` with every character that is not an ASCII letter removed.

    Args:
        token: The lemma text to clean.

    Returns:
        The cleaned string; empty when `token` contains no ASCII letters.
    """
    # The range A-z also spans the punctuation between 'Z' and 'a'
    # ([ \ ] ^ _ `), so the two letter ranges are spelled out explicitly.
    return re.sub(r"[^A-Za-z]", "", token)

def lemmatize(nlp_qs):
    """Yield, for each parsed question, the flat list of its cleaned lemmas.

    Every token's `lemma_` is stripped of all characters matched by the
    pattern below; lemmas that end up empty are dropped, and questions left
    with no lemmas at all are skipped.

    Args:
        nlp_qs: Iterable of parsed documents exposing `.sents`, where each
            sentence iterates over tokens that have a `lemma_` attribute.

    Yields:
        One list of lemma strings per question, in token order.
    """
    unwanted = re.compile(r"[^A-z]|[\[\\\\\]\^_`]")
    for doc in nlp_qs:
        stripped = (
            unwanted.sub("", token.lemma_)
            for sentence in doc.sents
            for token in sentence
        )
        doc_lemmas = [lemma for lemma in stripped if lemma]
        if doc_lemmas:
            yield doc_lemmas

In [8]:
# Run every question through the spaCy pipeline in batches.
nlp_qs = nlp.pipe(qs, batch_size=50000, n_threads=3)
# `lemmatize` is a generator function, so `lemmas` is a one-shot iterator rather than a list.
lemmas = lemmatize(nlp_qs)

Now that I have easy access to the lemmatized corpus (via the `nlp` objects), I can try to form a co-occurrence matrix. It should be sparse, so I might need to store it as a sparse matrix; we'll have to see.

In [9]:
import pickle

In [10]:
# Replace `lemmas` with a previously pickled copy read from lemmas.pkl.
# NOTE(review): no visible cell writes lemmas.pkl, so a fresh run depends on the file
# already existing. pickle.load can execute arbitrary code — only load trusted files.
with open("lemmas.pkl", "rb") as f:
    lemmas = pickle.load(f)

In [11]:
# Gather the distinct lemmas across all questions, sort them, and report how many there are.
unique_lemmas = set()
for question_lemmas in lemmas:
    unique_lemmas.update(question_lemmas)
all_lemmas = sorted(unique_lemmas)
len(all_lemmas)

80892

Now the data is finally clean enough to create a co-occurrence matrix. Let's have a go and see what I find. Now that we have indexed all of the lemmas, we can use the index to help form the co-occurrence matrix.

In [12]:
# lemma_list = it.chain.from_iterable(lemmas)
# lemma_list = list(lemma_list)

In [13]:
def get_neighbours(i, lem, window_rad):
    """Return the slice of `lem` lying within `window_rad` positions of index `i`.

    The window is clipped at both ends of the sequence and includes the
    element at position `i` itself.

    Args:
        i: Index of the centre element.
        lem: Sequence to take the window from.
        window_rad: Number of positions to include on each side of `i`.
    """
    start = max(0, i - window_rad)
    stop = 1 + min(len(lem), i + window_rad)
    return lem[start:stop]

In [14]:
# Rebuild the sorted lemma vocabulary and map each lemma to its position in it.
all_lemmas = sorted({lemma for question in lemmas for lemma in question})
lemma_dict = {lemma: idx for idx, lemma in enumerate(all_lemmas)}

In [15]:
# Record one (centre lemma, neighbour lemma) index pair for every token/neighbour
# combination inside the window: xs holds the centre index, ys the neighbour index.
# NOTE(review): get_neighbours returns a window that includes the centre token itself,
# so each token is also paired with itself — confirm that is intended.
window_rad = 1    # How far should the window be on each side
xs = []; ys = []
lem_cnt = len(lemmas)
for i, lem in enumerate(lemmas):
    if i % 1000 == 0:
        print("\rIteration %d out of %d (%.1f%%)" %(i, lem_cnt, i/lem_cnt*100), end = "")
    for j, tok in enumerate(lem):
        tok_idx = lemma_dict[tok]
        for nbr in get_neighbours(j, lem, window_rad):
            # Append to both lists together: they are zipped into pairs afterwards,
            # so they must stay the same length and in step with each other.
            xs.append(tok_idx)
            ys.append(lemma_dict[nbr])
print("\rDone!")

Done!


In [16]:
# Count how many times each (x, y) index pair was recorded.
index_pairs = pd.Series(list(zip(xs, ys)))
pair_cnts = index_pairs.value_counts()

In [17]:
# Split the counted pairs into three parallel lists: the count of each pair,
# the first index of the pair and the second index of the pair.
vals = [count for count in pair_cnts]
rows = [pair[0] for pair in pair_cnts.index]
cols = [pair[1] for pair in pair_cnts.index]

In [18]:
# from scipy.sparse import coo_matrix
# shape = (len(lemma_dict), len(lemma_dict))
# m = coo_matrix((vals, (rows, cols)), shape=shape)

In [19]:
def plot_coo_matrix(m):
    """Scatter-plot the stored entries of a sparse matrix that equal 1 (blue) or 0 (red).

    Args:
        m: A scipy.sparse.coo_matrix, or anything coo_matrix() can convert.

    Returns:
        The matplotlib Axes the points were drawn on.
    """
    # Imported locally because the notebook's own import of coo_matrix is commented out.
    from scipy.sparse import coo_matrix

    if not isinstance(m, coo_matrix):
        m = coo_matrix(m)
    fig = plt.figure()
    # `axisbg` is no longer accepted by matplotlib; `facecolor` is its replacement.
    ax = fig.add_subplot(111, facecolor='white')
    for value, colour in ((1, 'blue'), (0, 'red')):
        mask = m.data == value
        ax.plot(m.row[mask], m.col[mask], 's', color=colour, ms=0.15)
    # Row indices are drawn along x and column indices along y, so the axis
    # limits are taken from the matching dimensions of the matrix.
    ax.set_xlim(0, m.shape[0])
    ax.set_ylim(0, m.shape[1])
    ax.set_aspect('equal')
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xticks([])
    ax.set_yticks([])
    return ax

In [20]:
# plot_coo_matrix(m)

In [21]:
import scipy.sparse.linalg as la

In [22]:
# svds = la.svds(m.asfptype(),2)

I suppose I can use these svds to do something else later. I should probably read http://gramatica.usc.es/~gamallo/artigos-web/LRE2010Web.pdf first though.

## Neo4j

Now I'd quite like to have a go at converting the sparse matrix model into a graph. I know now that I can use `py2neo` in order to do this.

In [23]:
from py2neo import Graph, Node

In [24]:
import functools as func

In [25]:
%load_ext cypher

In [36]:
# Connect to the Neo4j database that stores the word graph.
# The password is read from the NEO4J_PASSWORD environment variable so that the
# credential is not stored in (or shared along with) the notebook.
# NOTE(review): the first argument is a local Windows store path — confirm that this
# py2neo version accepts it, or replace it with the server URI.
graph = Graph("C:\\Users\\caleb\\Documents\\Neo4j\\quora_questions.graphdb",
              password=os.environ["NEO4J_PASSWORD"])

# Delete all nodes from the graph
graph.delete_all()

In [26]:
# List the files currently present in the data directory.
os.listdir('data')

['sample_submission.csv',
 'sample_submission.csv.zip',
 'tokens.csv',
 'train.csv',
 'train.csv.zip']

In [37]:
def create_nodes(nodes):
    """Write `nodes` to the graph in up to ten batches.

    Node j is placed in batch j % 10; every non-empty batch is merged with
    `|` into a single object and written with one `graph.create` call.

    Args:
        nodes: Iterable of nodes to create; may be empty.
    """
    nodes = list(nodes)
    batch_cnt = 10
    for i in range(batch_cnt):
        # Stride slicing picks the same members as testing j % batch_cnt == i
        # for every node, without rescanning the whole list once per batch.
        batch = nodes[i::batch_cnt]
        if not batch:
            # reduce() without an initial value raises TypeError on an empty batch.
            continue
        graph.create(func.reduce(lambda n1, n2: n1 | n2, batch))

In [None]:
nodes = map(lambda t: Node("Word", text=t), all_lemmas)
%time create_nodes(nodes)

In [29]:
def create_relationships(tups):
    """Create one weighted OCCURS_WITH relationship per co-occurrence tuple.

    Args:
        tups: Iterable of (weight, first_index, second_index) tuples, where
            both indices are positions in the global `all_lemmas` list.

    For each tuple the two Word nodes are matched by their `text` property
    and a relationship first -> second is created with `weight` set on it.
    Tuples are processed in 100 interleaved batches and progress is printed
    once per batch.
    """
    tups = list(tups)
    batch_cnt = 100
    # The lemma texts and the weight are passed as query parameters instead of
    # being formatted into the Cypher text, so quotes or backslashes inside a
    # lemma cannot break or alter the query.
    # NOTE(review): `$name` parameters need Neo4j 3.0 or newer — confirm the server version.
    query = "\n".join([
        "MATCH (w1:Word {text: $token1}), (w2:Word {text: $token2})",
        "CREATE (w1)-[o:OCCURS_WITH]->(w2)",
        "SET o.weight = $weight",
    ])
    for key in range(batch_cnt):
        # Stride slicing gives the tuples whose position satisfies j % batch_cnt == key.
        batch = tups[key::batch_cnt]
        print("\rProcessing batch %d out of %d" % (key + 1, batch_cnt), end="")
        for val, row, col in batch:
            # int() turns numpy integer counts into plain Python ints for the driver.
            graph.run(query,
                      token1=all_lemmas[row],
                      token2=all_lemmas[col],
                      weight=int(val))

In [31]:
# Save each pair's count together with its two lemma indices to a CSV file.
relationships = pd.DataFrame(
    {'freqs': vals, 'word1': rows, 'word2': cols},
    columns=['freqs', 'word1', 'word2'],
)
relationships.to_csv("data/relationships.csv")

In [30]:
# Pair each count with its two lemma indices and write the relationships to the graph.
# The recorded output below shows this run was stopped by hand (KeyboardInterrupt).
tups = zip(vals, rows, cols)
create_relationships(tups)

Processing batch 1 out of 100 (favorite, from)

KeyboardInterrupt: 

In [None]:
"""
MATCH (p:Person)-[:LIKES]->(drink:Drink)
WHERE p.name = {name}
RETURN p.name AS name, AVG(drink.calories) AS avg_calories
"""

In [35]:
# Write the lemma vocabulary to CSV for Neo4j's LOAD CSV WITH HEADERS.
# The header row is requested explicitly and the column is named 'token',
# because the Cypher import query in this notebook reads `line.token`.
pd.Series(all_lemmas, name='token').to_csv("data/tokens.csv", header=True)

# Cypher Queries

I finally made a query that worked - jeez

In [None]:
%%cypher http://neo4j:yCHgrQeDcDOeIVXU@localhost:7474/db/data
// Read the tokens CSV row by row and MERGE one Token node per value of its `token` column.
// NOTE(review): the connection URL above embeds the database password — move it out of the notebook.
// NOTE(review): nodes are labelled Token here but Word in the py2neo cells — confirm which is intended.
LOAD CSV WITH HEADERS FROM 'file:///data/tokens.csv' AS line
WITH line.token as token
MERGE (x:Token {text:token})