In [1]:
import os
import re
import string

import nltk
from nltk.corpus import stopwords
from py2neo import Graph

# Fetch the NLTK "stopwords" corpus, which is read later through stopwords.words('english').
# The call returns True; the recorded output shows it does nothing more when the package
# is already up to date.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/philgooch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# connect to Neo4j instance using py2neo - default running locally
# The connection URI (including credentials) can be supplied through the NEO4J_URL
# environment variable so that real passwords never have to be saved in the notebook.
# When the variable is unset, the original local-development default is used.
graphdb = Graph(os.environ.get('NEO4J_URL', 'http://neo4j:pass@localhost:7474/db/data'))

In [3]:
# define some parameterized Cypher queries

# Data insertion. Expects a `wordPairs` parameter: a list of two-element lists
# [leftWord, rightWord]. For every pair, both Word nodes are MERGEd (reused if a node with
# that `word` property already exists) and a NEXT_WORD relationship is CREATEd from the
# left word to the right word. Because the relationship uses CREATE rather than MERGE,
# a pair that occurs several times produces several parallel NEXT_WORD relationships.
# NOTE(review): `{wordPairs}` is the legacy Cypher parameter syntax; newer Neo4j servers
# expect `$wordPairs` -- confirm against the server version in use.
INSERT_QUERY = '''
    FOREACH (t IN {wordPairs} | 
        MERGE (w0:Word {word: t[0]})
        MERGE (w1:Word {word: t[1]})
        CREATE (w0)-[:NEXT_WORD]->(w1)
        )
'''

# Get the words that appear immediately to the left of a specified word in the text corpus,
# i.e. every Word `w` with a NEXT_WORD relationship pointing at the node matching `word`.
# Expects a `word` parameter. One row is returned per relationship, so the result may
# contain repeats; callers that want a set must deduplicate.
LEFT1_QUERY = '''
    MATCH (s:Word {word: {word}})
    MATCH (w:Word)-[:NEXT_WORD]->(s)
    RETURN w.word as word
'''

# Get the words that appear immediately to the right of a specified word in the text corpus,
# i.e. every Word `w` that the node matching `word` points at through NEXT_WORD.
# Expects a `word` parameter. One row is returned per relationship, so the result may
# contain repeats; callers that want a set must deduplicate.
RIGHT1_QUERY = '''
    MATCH (s:Word {word: {word}})
    MATCH (w:Word)<-[:NEXT_WORD]-(s)
    RETURN w.word as word
'''

In [4]:
# Matches any single punctuation character: everything in string.punctuation (escaped so
# that regex metacharacters are taken literally) plus the four typographic quote marks.
PUNCTUATION = re.compile('[%s’‘“”]' % re.escape(string.punctuation))

In [5]:
# convert a sentence string into a list of lists of adjacent word pairs
# (lower-cased, punctuation stripped, English stopwords removed before pairing)
# arrifySentence("Hi there, Bob!") = [["hi", "bob"]]   ("there" is dropped as a stopword)
# Stopword lists already loaded from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _stopwordSet(language='english'):
    """Return NLTK's stopword list for `language` as a frozenset, loading it at most once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def arrifySentence(sentence):
    """Convert a sentence into a list of adjacent word pairs.

    The sentence is lower-cased, stripped of surrounding whitespace and of every
    character matched by PUNCTUATION, split on whitespace, and filtered against the
    NLTK English stopword list. Each remaining word is then paired with its successor.

    Example (assuming "the" is the only stopword present):
        arrifySentence("The quick brown fox!")
            -> [["quick", "brown"], ["brown", "fox"]]

    Args:
        sentence: the raw text of one sentence/line.

    Returns:
        A list of two-element lists [word, nextWord]. The list is empty when fewer
        than two words survive the filtering.
    """
    cleaned = PUNCTUATION.sub('', sentence.lower().strip())
    # Look the stopwords up once per call (and load them once per kernel) instead of
    # re-reading the corpus for every single word, and test membership against a set.
    stopWords = _stopwordSet()
    wordArray = [word for word in cleaned.split() if word not in stopWords]
    # Pairs are emitted as lists (not tuples), matching the shape the original produced
    # and that INSERT_QUERY indexes as t[0] / t[1].
    return [[left, right] for left, right in zip(wordArray, wordArray[1:])]

In [6]:
def loadFile(filepath, batchSize=300):
    """Load a text file into the graph, one line at a time.

    Every line is converted to adjacent word pairs with arrifySentence and inserted
    with INSERT_QUERY. Statements are grouped into transactions of `batchSize`
    statements each; whatever remains at the end is committed in a final transaction.

    Args:
        filepath: path of the text file to read.
        batchSize: number of insert statements to send before committing and starting
            a new transaction.

    Note:
        INSERT_QUERY creates a new NEXT_WORD relationship for every pair, so loading
        the same file twice duplicates its relationships.
    """
    # Open the file first: if it is missing, no transaction is left dangling.
    with open(filepath, "r") as f:
        tx = graphdb.begin()
        pending = 0
        for line in f:
            wordPairs = arrifySentence(line)
            if not wordPairs:
                # FOREACH over an empty list would do nothing; skip the round trip.
                continue
            tx.run(INSERT_QUERY, {'wordPairs': wordPairs})
            tx.process()
            pending += 1
            if pending >= batchSize:
                tx.commit()
                tx = graphdb.begin()
                pending = 0
    # The `with` block has already closed the file; only the last batch is left to commit.
    tx.commit()

In [7]:
# Populate the graph from the corpus file.
# NOTE: this cell is not idempotent -- INSERT_QUERY uses CREATE for the NEXT_WORD
# relationship, so running it again adds another copy of every relationship.
loadFile('resources/scandal-full.txt')

In [8]:
# return a set of all words that appear to the left of `word`
def left1(word):
    """Return the set of words found immediately to the left of `word`.

    `word` is lower-cased before the lookup. LEFT1_QUERY is run in its own
    transaction, and every value of every returned record is collected into a set.
    """
    queryArgs = {'word': word.lower()}
    transaction = graphdb.begin()
    cursor = transaction.run(LEFT1_QUERY, queryArgs)
    transaction.commit()
    return {value for record in cursor for value in record}

In [9]:
# return a set of all words that appear to the right of `word`
def right1(word):
    """Return the set of words found immediately to the right of `word`.

    `word` is lower-cased before the lookup. RIGHT1_QUERY is run in its own
    transaction, and every value of every returned record is collected into a set.
    """
    queryArgs = {'word': word.lower()}
    transaction = graphdb.begin()
    cursor = transaction.run(RIGHT1_QUERY, queryArgs)
    transaction.commit()
    return {value for record in cursor for value in record}

In [10]:
# compute Jaccard coefficient
def jaccard(a, b):
    """Compute the Jaccard coefficient |a & b| / |a | b| of two collections.

    Args:
        a, b: sets, or any iterables of hashable items (they are converted to sets).

    Returns:
        A float in [0.0, 1.0]. When both collections are empty the coefficient is
        mathematically undefined (0/0); 0.0 is returned in that case so that words
        with no recorded neighbours count as "not similar" instead of raising
        ZeroDivisionError.
    """
    a, b = set(a), set(b)
    unionSize = len(a | b)
    if unionSize == 0:
        return 0.0
    return len(a & b) / unionSize

In [11]:
# we define paradigmatic similarity as the average of the Jaccard coefficients of the `left1` and `right1` sets
def paradigSimilarity(w1, w2):
    """Return the paradigmatic similarity of two words.

    This is the mean of two Jaccard coefficients: one comparing the words' left
    neighbours (left1) and one comparing their right neighbours (right1).
    """
    leftScore = jaccard(left1(w1), left1(w2))
    rightScore = jaccard(right1(w1), right1(w2))
    return (leftScore + rightScore) / 2.0

In [54]:
# example: compare the left/right neighbour sets of 'king' and 'kingdom'
paradigSimilarity('king', 'kingdom')

0.03125

In [64]:
# example: compare the left/right neighbour sets of 'woman' and 'photograph'
paradigSimilarity('woman', 'photograph')

0.041245791245791245

In [65]:
# example: compare the left/right neighbour sets of 'holmes' and 'bohemia'
paradigSimilarity('holmes', 'bohemia')

0.023809523809523808