In [1]:
!pip install py2neo
!pip install wikipedia
!pip install spacy==3.0.3



In [2]:
import json
import re
import urllib
from pprint import pprint
import time
from tqdm import tqdm

from py2neo import Node, Graph, Relationship, NodeMatcher
from py2neo.bulk import merge_nodes

import numpy as np
import pandas as pd
import wikipedia
from sklearn.metrics.pairwise import cosine_similarity

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token

print(spacy.__version__)

3.0.3


In [3]:
# Download the en_core_web_md model that the later spacy.load('en_core_web_md') calls use.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1 MB)
     --------------------------------------- 47.1/47.1 MB 18.7 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [4]:
import os
from getpass import getpass

# Connection settings are taken from the environment so that no credential is
# stored in the notebook. The URI default is the server this notebook was
# written against; the password is prompted for when NEO4J_PASSWORD is unset.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://52.91.35.107:7687")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

graph = Graph(NEO4J_URI, name="neo4j", password=NEO4J_PASSWORD)  # graph data science
nodes_matcher = NodeMatcher(graph)

In [5]:
# Dependency labels (compared against spaCy's `token.dep_`) that assign a token
# to the subject, verb or object list in create_svo_lists.
# "nsubjpass" replaces a duplicated "nsubj" entry so passive subjects are matched.
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
VERBS = ['ROOT', 'advcl']
OBJECTS = ["dobj", "dative", "attr", "oprd", 'pobj']
ENTITY_LABELS = ['PERSON', 'NORP','FACILITY', 'GPE', 'ORG', 'FAC', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART','DATE','TIME','ORDINAL','LAW']
#ENTITY_LABELS = ['CHARACTER','PLACE','HOUSE','TERMS','Family','Historical Event','RULE','SYMBOL','MOTTO','Title','Age','STATUS','ROLE']

# Read the API key from a local file; close the handle and drop surrounding
# whitespace (a trailing newline would otherwise end up in the request URL).
with open('.api_key') as key_file:
    api_key = key_file.read().strip()

# Plain pipeline, used for stop-word / punctuation removal.
non_nc = spacy.load('en_core_web_md')

# Second pipeline with noun chunks merged into single tokens, used for SVO extraction.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('merge_noun_chunks')

print(non_nc.pipe_names)
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'merge_noun_chunks']


In [6]:
def query_google(query, api_key, limit=10, indent=True, return_lists=True):
    """Search the Google Knowledge Graph API (kgsearch.googleapis.com) for ``query``.

    Parameters
    ----------
    query : str
        Search text sent as the ``query`` URL parameter.
    api_key : str
        Key sent as the ``key`` URL parameter.
    limit : int
        Sent as the ``limit`` URL parameter.
    indent : bool
        Sent as the ``indent`` URL parameter.
    return_lists : bool
        When true, return three parallel lists; otherwise return the decoded
        JSON response unchanged.

    Returns
    -------
    (text_ls, node_label_ls, url_ls) or the decoded response
        One entry per element of ``itemListElement``: the result's
        ``detailedDescription.articleBody``, ``@type`` and
        ``detailedDescription.url``. A field that is missing is stored as ``''``.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not load them.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    def _field(element, *path):
        # Walk ``path`` through nested containers; '' when any step is missing.
        value = element
        try:
            for key in path:
                value = value[key]
        except (KeyError, IndexError, TypeError):
            return ''
        return value

    params = {
        'query': query,
        'limit': limit,
        'indent': indent,
        'key': api_key,
    }

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    url = service_url + '?' + urlencode(params)

    # The context manager closes the HTTP response once the body has been read.
    with urlopen(url) as http_response:
        response = json.loads(http_response.read())

    if not return_lists:
        return response

    text_ls = []
    node_label_ls = []
    url_ls = []

    # A response without 'itemListElement' is treated as "no results".
    for element in response.get('itemListElement', []):
        node_label_ls.append(_field(element, 'result', '@type'))
        text_ls.append(_field(element, 'result', 'detailedDescription', 'articleBody'))
        url_ls.append(_field(element, 'result', 'detailedDescription', 'url'))

    return text_ls, node_label_ls, url_ls

In [7]:
def remove_special_characters(text):
    """Return ``text`` with each newline, carriage return and tab replaced by a space."""
    return re.sub(r'[\n\r\t]', " ", text)


def remove_stop_words_and_punct(text, print_text=False):
    """Drop stop-word and punctuation tokens from ``text``.

    ``text`` is tokenised with the ``non_nc`` pipeline. The tokens that are
    neither stop words nor punctuation are returned joined by single spaces.
    With ``print_text`` set, every token is printed with its ``is_stop`` flag,
    followed by a separator line.
    """
    tokens = list(non_nc(text))

    if print_text:
        for tok in tokens:
            print(tok, tok.is_stop)
            print('--------------')

    kept_words = [str(tok) for tok in tokens if not (tok.is_stop or tok.is_punct)]
    return ' '.join(kept_words)


def create_svo_lists(doc, print_lists):
    """Sort the tokens of ``doc`` into subject, verb and object lists.

    A token is classified by its ``dep_`` label against SUBJECTS, VERBS and
    OBJECTS (checked in that order). Subjects and objects are stored as
    ``(token.lower_, token.idx)``, verbs as ``(token.lemma_, token.idx)``.
    When ``print_lists`` is true the three lists are printed.

    Returns the tuple ``(subject_ls, verb_ls, object_ls)``.
    """
    subject_ls, verb_ls, object_ls = [], [], []

    for tok in doc:
        role = tok.dep_
        if role in SUBJECTS:
            subject_ls.append((tok.lower_, tok.idx))
            continue
        if role in VERBS:
            verb_ls.append((tok.lemma_, tok.idx))
            continue
        if role in OBJECTS:
            object_ls.append((tok.lower_, tok.idx))

    if print_lists:
        print('SUBJECTS: ', subject_ls)
        print('VERBS: ', verb_ls)
        print('OBJECTS: ', object_ls)

    return subject_ls, verb_ls, object_ls


def remove_duplicates(tup, tup_posn):
    """Keep only the first entry of ``tup`` for each distinct value at index ``tup_posn``.

    The relative order of the surviving entries is unchanged.
    """
    seen_keys = set()
    unique_entries = []

    for entry in tup:
        key = entry[tup_posn]
        if key in seen_keys:
            continue
        seen_keys.add(key)
        unique_entries.append(entry)

    return unique_entries


def remove_dates(tup_ls):
    """Return the entries of ``tup_ls`` whose third element is not made up only of digits."""
    return [entry for entry in tup_ls if not entry[2].isdigit()]


def create_svo_triples(text, print_lists=False):
    """Extract ``(subject, verb, object)`` triples from ``text``.

    The text is cleaned with remove_special_characters, parsed with ``nlp`` and
    split into subject / verb / object lists by create_svo_lists. Every subject
    is paired with every object, and each object is given the verb whose ``idx``
    is closest to the object's. Stop words and punctuation are stripped from
    subjects and objects only after parsing, so the parser sees full sentences.
    Pairs whose subject or object is empty after stripping are dropped.

    The result is de-duplicated on the object (first triple wins) and triples
    whose object is purely numeric are removed.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    print_lists : bool
        Forwarded to create_svo_lists to print the intermediate lists.

    Returns
    -------
    list of (str, str, str)
    """
    clean_text = remove_special_characters(text)
    doc = nlp(clean_text)
    subject_ls, verb_ls, object_ls = create_svo_lists(doc, print_lists=print_lists)

    # Without any verb there is nothing to label an edge with.
    if not verb_ls:
        return []

    # Stop-word removal runs a spaCy pipeline, so do it once per subject and once
    # per object instead of once per (subject, object) pair.
    clean_subjects = []
    for subj in subject_ls:
        no_sw_subj = remove_stop_words_and_punct(subj[0])
        if no_sw_subj:
            clean_subjects.append(no_sw_subj)

    # The nearest verb depends only on the object, so resolve it once per object.
    # min() returns the first verb on ties.
    verb_object_pairs = []
    for obj in object_ls:
        no_sw_obj = remove_stop_words_and_punct(obj[0])
        if not no_sw_obj:
            continue
        nearest_verb = min(verb_ls, key=lambda verb: abs(obj[1] - verb[1]))
        verb_object_pairs.append((nearest_verb[0], no_sw_obj))

    # Subject-major order, objects in document order within each subject.
    graph_tup_ls = [
        (subj_text, verb_text, obj_text)
        for subj_text in clean_subjects
        for verb_text, obj_text in verb_object_pairs
    ]

    dedup_tup_ls = remove_duplicates(graph_tup_ls, 2)
    clean_tup_ls = remove_dates(dedup_tup_ls)

    return clean_tup_ls

In [8]:
def get_obj_properties(tup_ls):
    """Attach Knowledge Graph properties to the object of each triple.

    For every ``(subject, verb, object)`` in ``tup_ls`` the object is looked up
    with query_google (``limit=1``). The result is the 6-tuple
    ``(subject, verb, object, description, node_labels, url)``, built from the
    first search hit. When the lookup fails or returns no hit, the last three
    fields are empty lists.

    The lookup is deliberately best-effort, but only ``Exception`` is caught so
    that KeyboardInterrupt can still stop the loop.
    """
    init_obj_tup_ls = []

    for tup in tup_ls:

        try:
            text, node_label_ls, url = query_google(tup[2], api_key, limit=1)
            new_tup = (tup[0], tup[1], tup[2], text[0], node_label_ls[0], url[0])
        except Exception:
            new_tup = (tup[0], tup[1], tup[2], [], [], [])

        init_obj_tup_ls.append(new_tup)

    return init_obj_tup_ls


def add_layer(tup_ls):
    """Build a further layer of triples from the descriptions stored in ``tup_ls``.

    Each non-empty description (index 3) is run through create_svo_triples; the
    collected triples are then passed to get_obj_properties and returned.
    """
    svo_tup_ls = []

    for description in (tup[3] for tup in tup_ls):
        if description:
            svo_tup_ls += create_svo_triples(description)

    return get_obj_properties(svo_tup_ls)
        

def subj_equals_obj(tup_ls):
    """Drop entries whose subject (index 0) equals their object (index 2).

    Each surviving entry is returned as a tuple of its first six elements.
    """
    return [
        (edge[0], edge[1], edge[2], edge[3], edge[4], edge[5])
        for edge in tup_ls
        if edge[0] != edge[2]
    ]


def check_for_string_labels(tup_ls):
    """Keep only the entries whose element at index 2 is a list.

    This guards against an edge case where an object was not fully populated
    and its node labels ended up as a string instead of a list.
    """
    return [el for el in tup_ls if isinstance(el[2], list)]


def create_word_vectors(tup_ls):
    """Append a vector to every 6-element entry of ``tup_ls``.

    When the description (index 3) is non-empty the vector is ``nlp(description).vector``.
    Otherwise it is a random vector of 300 values drawn uniformly from [-1.0, 1.0).
    """
    new_tup_ls = []

    for tup in tup_ls:
        description = tup[3]
        if description:
            vector = nlp(description).vector
        else:
            # No text to embed: fall back to a random placeholder vector.
            vector = np.random.uniform(low=-1.0, high=1.0, size=(300,))
        new_tup_ls.append((tup[0], tup[1], tup[2], description, tup[4], tup[5], vector))

    return new_tup_ls

In [9]:
import os

# Folder that receives the downloaded text files.
folder_name = "wikidata4"
# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# when the folder is already there.
os.makedirs(folder_name, exist_ok=True)

# define the text content
#text_content =wikipedia.summary('Fire and Blood')
# save the text content in a txt file inside the folder



In [10]:
def dedup(tup_ls):
    """Keep the first entry for each distinct name (index 0).

    Every kept entry is returned as a tuple of its first five elements, in the
    order the names first appear.
    """
    first_by_name = {}

    for tup in tup_ls:
        name = tup[0]
        if name not in first_by_name:
            first_by_name[name] = (name, tup[1], tup[2], tup[3], tup[4])

    return list(first_by_name.values())


def convert_vec_to_ls(tup_ls):
    """Replace the vector at index 4 of every entry with a plain list of floats.

    The first four elements are carried over unchanged; the result entries are
    5-tuples.
    """
    converted = []

    for el in tup_ls:
        float_values = list(map(float, el[4]))
        converted.append((*el[:4], float_values))

    return converted


def add_nodes(tup_ls):
    """Merge ``tup_ls`` into the graph as 'Node' nodes keyed on ``name``.

    Each entry supplies, in order, the values for ``name``, ``description``,
    ``node_labels``, ``url`` and ``word_vec``. Afterwards the number of 'Node'
    nodes in the graph is printed.
    """
    property_keys = ['name', 'description', 'node_labels', 'url', 'word_vec']
    merge_nodes(graph.auto(), tup_ls, ('Node', 'name'), keys=property_keys)

    node_count = graph.nodes.match('Node').count()
    print('Number of nodes in graph: ', node_count)

In [11]:
def add_edges(edge_ls):
    """Create one relationship per entry of ``edge_ls``, typed by the entry's verb.

    Parameters
    ----------
    edge_ls : list of tuple
        Entries shaped ``(subject, verb, object, ...)``. When an entry has at
        least seven elements, indices 4, 5 and 6 are used as the ``node_labels``,
        ``url`` and ``word_vec`` properties of a newly created target node.

    Entries are grouped by verb and each group is written in its own
    transaction. Source and target nodes are looked up by ``name``; a node that
    does not exist yet is created. Relationships that cannot be built are
    skipped, and a summary of the skipped ones is printed at the end.
    """
    # Group the *full* tuples by verb. Truncating them to three elements here
    # would make the target-node properties (indices 4-6) unreachable below.
    edge_dc = {}
    for tup in edge_ls:
        edge_dc.setdefault(tup[1], []).append(tup)

    # Nodes created inside a still-open transaction are not visible to the
    # matcher's separate query, so remember them to avoid creating duplicates.
    created_nodes = {}
    skipped = []

    def _find_node(name):
        # Prefer a node created earlier in this call, then fall back to the graph.
        if name in created_nodes:
            return created_nodes[name]
        return nodes_matcher.match(name=name).first()

    for edge_label, tup_ls in tqdm(edge_dc.items()):

        tx = graph.begin()

        for el in tup_ls:
            source_node = _find_node(el[0])
            if source_node is None:
                source_node = Node('Node', name=el[0])
                tx.create(source_node)
                created_nodes[el[0]] = source_node

            target_node = _find_node(el[2])
            if target_node is None:
                properties = {'name': el[2]}
                if len(el) > 6:
                    properties['node_labels'] = el[4]
                    properties['url'] = el[5]
                    # Store the vector as a plain list of floats.
                    properties['word_vec'] = [float(v) for v in el[6]]
                target_node = Node('Node', **properties)
                tx.create(target_node)
                created_nodes[el[2]] = target_node

            try:
                rel = Relationship(source_node, edge_label, target_node)
            except Exception as err:
                # Best effort: record the failure and carry on with the next edge.
                skipped.append((el[0], edge_label, el[2], err))
                continue
            tx.create(rel)

        tx.commit()

    if skipped:
        print('Skipped edges: ', len(skipped))
        for subj, label, obj, err in skipped:
            print('  ', (subj, label, obj), '->', repr(err))

    return


In [12]:
#full
def edge_tuple_creation(text):
    """Turn ``text`` into edge tuples that carry a vector.

    Triples are extracted from ``text`` and given object properties, a second
    layer is derived from them with add_layer, and both layers are
    concatenated. Entries whose subject equals their object are removed, and a
    vector is appended to each entry by create_word_vectors.
    """
    first_layer = get_obj_properties(create_svo_triples(text))
    second_layer = add_layer(first_layer)
    distinct_edges = subj_equals_obj(first_layer + second_layer)

    return create_word_vectors(distinct_edges)


def node_tuple_creation(edges_word_vec_ls):
    """Derive node tuples from the edge tuples made by edge_tuple_creation.

    The subject of the first edge becomes a node labelled ``['Subject']`` with a
    random 300-value vector. Every edge contributes its object as a node built
    from indices 2-6. Entries whose labels are not a list are dropped by
    check_for_string_labels, and vectors are converted to plain lists by
    convert_vec_to_ls.

    Returns an empty list when ``edges_word_vec_ls`` is empty (no triples were
    extracted), instead of failing on ``edges_word_vec_ls[0]``.
    """
    if not edges_word_vec_ls:
        return []

    root_name = edges_word_vec_ls[0][0]
    orig_node_tup_ls = [(root_name, '', ['Subject'], '', np.random.uniform(low=-1.0, high=1.0, size=(300,)))]
    obj_node_tup_ls = [(tup[2], tup[3], tup[4], tup[5], tup[6]) for tup in edges_word_vec_ls]
    full_node_tup_ls = orig_node_tup_ls + obj_node_tup_ls
    cleaned_node_tup_ls = check_for_string_labels(full_node_tup_ls)
    # De-duplication is left to the caller (dedup is applied to the combined list later).
    node_tup_ls = convert_vec_to_ls(cleaned_node_tup_ls)

    return node_tup_ls

In [13]:
%%time
fireandblood_text = wikipedia.summary('Fire & Blood (A Targaryen History, #1)')
file_path = os.path.join(folder_name, "t50.txt")
with open(file_path, 'w') as file:
    file.write(fireandblood_text)
fireandblood_edges_word_vec_ls = edge_tuple_creation(fireandblood_text)
fireandblood_node_tup_ls = node_tuple_creation(fireandblood_edges_word_vec_ls)

Wall time: 7.99 s


In [14]:
%%time
text2 = wikipedia.summary('Everything You Should Know About the Fire and Blood Book')
file_path = os.path.join(folder_name, "t51.txt")
with open(file_path, 'w') as file:
    file.write(text2)
text2_edges_word_vec_ls = edge_tuple_creation(text2)
text2_node_tup_ls = node_tuple_creation(text2_edges_word_vec_ls)

Wall time: 6.8 s


In [15]:
%%time
text3 = wikipedia.summary('House Baratheon')
file_path = os.path.join(folder_name, "t52.txt")
with open(file_path, 'w') as file:
    file.write(text3)
text3_edges_word_vec_ls = edge_tuple_creation(text3)
text3_node_tup_ls = node_tuple_creation(text3_edges_word_vec_ls)

Wall time: 9.69 s


In [16]:
%%time
text4 = wikipedia.summary('Doom of Valyria')
file_path = os.path.join(folder_name, "t53.txt")
with open(file_path, 'w') as file:
    file.write(text4)
text4_edges_word_vec_ls = edge_tuple_creation(text4)
text4_node_tup_ls = node_tuple_creation(text4_edges_word_vec_ls)

Wall time: 24.1 s


In [17]:
%%time
text5 = wikipedia.summary('Fire and Blood Study Guide')
file_path = os.path.join(folder_name, "t54.txt")
with open(file_path, 'w') as file:
    file.write(text5)
text5_edges_word_vec_ls = edge_tuple_creation(text5)
text5_node_tup_ls = node_tuple_creation(text5_edges_word_vec_ls)

Wall time: 11.9 s


In [18]:
%%time
text6 = wikipedia.summary('BLOOD OF DRAGONS A SONG OF ICE AND FIRE MUSH')
file_path = os.path.join(folder_name, "t55.txt")
with open(file_path, 'w') as file:
    file.write(text6)
text6_edges_word_vec_ls = edge_tuple_creation(text6)
text6_node_tup_ls = node_tuple_creation(text6_edges_word_vec_ls)

Wall time: 2.61 s


In [19]:
%%time
text7 = wikipedia.summary('Fire and Blood by George RR Martin review')
file_path = os.path.join(folder_name, "t56.txt")
with open(file_path, 'w') as file:
    file.write(text7)
text7_edges_word_vec_ls = edge_tuple_creation(text7)
text7_node_tup_ls = node_tuple_creation(text7_edges_word_vec_ls)

Wall time: 19.6 s


In [20]:
%%time
text8 = wikipedia.summary('Aegon III')
file_path = os.path.join(folder_name, "t57.txt")
with open(file_path, 'w') as file:
    file.write(text8)
text8_edges_word_vec_ls = edge_tuple_creation(text8)
text8_node_tup_ls = node_tuple_creation(text8_edges_word_vec_ls)

Wall time: 12.9 s


In [21]:
%%time
text9 = wikipedia.summary("Fire & Blood: 300 Years Before A Game of Thrones")
file_path = os.path.join(folder_name, "t58.txt")
with open(file_path, 'w') as file:
    file.write(text9)
text9_edges_word_vec_ls = edge_tuple_creation(text9)
text9_node_tup_ls = node_tuple_creation(text9_edges_word_vec_ls)

Wall time: 13.8 s


In [22]:
%%time
text10 = wikipedia.summary('A Feast for Crows')
file_path = os.path.join(folder_name, "t60.txt")
with open(file_path, 'w') as file:
    file.write(text10)
text10_edges_word_vec_ls = edge_tuple_creation(text10)
text10_node_tup_ls = node_tuple_creation(text10_edges_word_vec_ls)

Wall time: 15.5 s


In [23]:
%%time
text11 = wikipedia.summary('A Storm of Swords')
file_path = os.path.join(folder_name, "t59.txt")
with open(file_path, 'w') as file:
    file.write(text11)
text11_edges_word_vec_ls = edge_tuple_creation(text11)
text11_node_tup_ls = node_tuple_creation(text11_edges_word_vec_ls)


Wall time: 24.2 s


In [24]:
%%time
text12 = wikipedia.summary('Dance with Dragons: Dreams and Dust (A Song of Ice and Fire)')
file_path = os.path.join(folder_name, "t61.txt")
with open(file_path, 'w') as file:
    file.write(text12)
text12_edges_word_vec_ls = edge_tuple_creation(text12)
text12_node_tup_ls = node_tuple_creation(text12_edges_word_vec_ls)


Wall time: 7.35 s


In [25]:
%%time
text13 = wikipedia.summary('The Ice Dragon')
file_path = os.path.join(folder_name, "t62.txt")
with open(file_path, 'w') as file:
    file.write(text13)
text13_edges_word_vec_ls = edge_tuple_creation(text13)
text13_node_tup_ls = node_tuple_creation(text13_edges_word_vec_ls)


Wall time: 6.41 s


In [26]:
%%time
text14 = wikipedia.summary('A Knight of the Seven Kingdoms: A Song of Ice and Fire')
file_path = os.path.join(folder_name, "t63.txt")
with open(file_path, 'w') as file:
    file.write(text14)
text14_edges_word_vec_ls = edge_tuple_creation(text14)
text14_node_tup_ls = node_tuple_creation(text14_edges_word_vec_ls)

Wall time: 14.9 s


In [27]:
# Concatenate the per-article node and edge lists in article order, then keep
# one node tuple per name.
full_node_ls = sum(
    [
        fireandblood_node_tup_ls,
        text2_node_tup_ls,
        text3_node_tup_ls,
        text4_node_tup_ls,
        text5_node_tup_ls,
        text6_node_tup_ls,
        text7_node_tup_ls,
        text8_node_tup_ls,
        text9_node_tup_ls,
        text10_node_tup_ls,
        text11_node_tup_ls,
        text12_node_tup_ls,
        text13_node_tup_ls,
        text14_node_tup_ls,
    ],
    [],
)
full_edge_ls = sum(
    [
        fireandblood_edges_word_vec_ls,
        text2_edges_word_vec_ls,
        text3_edges_word_vec_ls,
        text4_edges_word_vec_ls,
        text5_edges_word_vec_ls,
        text6_edges_word_vec_ls,
        text7_edges_word_vec_ls,
        text8_edges_word_vec_ls,
        text9_edges_word_vec_ls,
        text10_edges_word_vec_ls,
        text11_edges_word_vec_ls,
        text12_edges_word_vec_ls,
        text13_edges_word_vec_ls,
        text14_edges_word_vec_ls,
    ],
    [],
)
full_dedup_node_tup_ls = dedup(full_node_ls)
print(len(full_node_ls), len(full_dedup_node_tup_ls))

467 362


In [28]:
#full
# Write the de-duplicated nodes to the graph first, then the edges between them.
add_nodes(full_dedup_node_tup_ls)
add_edges(full_edge_ls)

Number of nodes in graph:  566


  tx.commit()
100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [00:48<00:00,  1.49it/s]


In [29]:
# Preview the first six edge tuples (each one ends with its vector).
full_edge_ls[0:6]

[('fire',
  'be',
  'fantasy book',
  [],
  [],
  [],
  array([ 0.9056491 , -0.62179116,  0.47717725, -0.46243974,  0.4144077 ,
          0.41114337,  0.07060562, -0.57808566, -0.50273112,  0.21066428,
          0.14046367, -0.09478547,  0.57531051, -0.52409349, -0.07865505,
         -0.77526176,  0.38037188, -0.62075516, -0.52982366, -0.50639233,
          0.54245179, -0.22996293,  0.38186088,  0.47201245,  0.37130138,
          0.52304594,  0.59891927,  0.81382899,  0.55467894, -0.01841774,
         -0.40735599,  0.76901752, -0.76738794, -0.24038902,  0.10929141,
          0.49913668,  0.61598318,  0.7898087 , -0.90302829,  0.31578638,
          0.73990447,  0.38965508, -0.85168834, -0.90905168, -0.52896556,
         -0.5999538 , -0.52016672, -0.48469637, -0.82701952, -0.36737768,
          0.3973398 ,  0.61186701,  0.97970217,  0.63608849, -0.17930163,
          0.66778751, -0.12269533,  0.77900315, -0.79750612, -0.91119992,
          0.81652292,  0.09278741, -0.56808167,  0.8247029

import numpy as np
def get_word_vec_similarity(node1, node2, node_ls):
    """Cosine similarity between the vectors of two named nodes.

    Parameters
    ----------
    node1, node2 : str
        Node names, compared with index 0 of each entry in ``node_ls``.
    node_ls : list of tuple
        Node tuples whose index 4 holds the node's vector.

    Returns
    -------
    The result of ``cosine_similarity`` for the two vectors (one sample each).

    Raises
    ------
    ValueError
        If either name is not present in ``node_ls``.
    """
    def _vector_for(name):
        # Use the first entry with this name, shaped as ONE sample with all
        # components as features. reshape(-1, 1) would instead treat every
        # component as its own one-dimensional sample.
        for tup in node_ls:
            if tup[0] == name:
                return np.asarray(tup[4], dtype=float).reshape(1, -1)
        raise ValueError(f"no node named {name!r} in node_ls")

    node1_vec = _vector_for(node1)
    node2_vec = _vector_for(node2)

    return cosine_similarity(node1_vec, node2_vec)

# `dedup_node_tup_ls` exists only as a local inside node_tuple_creation; the
# notebook-level de-duplicated node list is `full_dedup_node_tup_ls`.
cs = get_word_vec_similarity('fire and blood', 'Fire &blood', full_dedup_node_tup_ls)
print(cs)

In [30]:
# Display the de-duplicated node tuples.
# NOTE(review): this shows every entry together with its full vector; consider slicing before display.
full_dedup_node_tup_ls

[('fire',
  '',
  ['Subject'],
  '',
  [0.24939157505742005,
   0.786107377524504,
   0.9333316396939821,
   -0.08126990273328683,
   -0.3096771106220162,
   0.8299149361251872,
   0.10507408786800343,
   -0.818816535019016,
   -0.8460190134253887,
   -0.8768332531555714,
   0.8333148292650441,
   0.1671337704393976,
   -0.33813847358430804,
   0.1027707556944657,
   0.08271933156786049,
   -0.9383177805863334,
   -0.41266721035050247,
   0.7909253427001035,
   -0.5949091975442873,
   -0.5573515472999033,
   -0.058329936306343644,
   -0.6922734995124045,
   0.044061752833732815,
   0.8629398007137341,
   -0.9568342854288594,
   -0.1979603310597038,
   -0.2721482017470296,
   -0.446282049046018,
   0.9261314612783349,
   -0.19027227780273304,
   0.32788914346707876,
   -0.8401636794718657,
   0.21777736257619118,
   -0.6601096725074223,
   -0.20881757913757726,
   0.19749259804268138,
   0.6358660823816311,
   -0.4467639021579084,
   -0.6907808557238979,
   0.4071459929215895,
   0.1744