# Mining Wikipedia for Semantic Relations

I define some functions to extract some semantic relations from Wikipedia text.  Specifically, they are relations that could be useful for crafting "automatic allusions"---i.e., given a `target word` such as `"multitudinous"`, return a snippet of text containing an old proper noun, such as `"as Ashurbanipal's army"`. 

## Mining ⛏️

### Helper Functions

In [1]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [2]:
def test_contiguous(numbers):
    """
    just make sure that ints are contiguous
    useful for making sure a span of text isn't missing any words
    """
    for en,n in enumerate(numbers[:-1]):
        if n+1!=numbers[en+1]:
            return False
    return True

print(test_contiguous([123,124,125,126]))
print(test_contiguous([123,20]))

True
False


In [3]:
def get_chunk(word,tempspacy):
    """
    get all ancestors of a word
    exclude bits that are connected by appos dependency
    """
    descendants = [i for i in tempspacy if (word.is_ancestor(i) or i==word)]
    to_ban = [i for i in descendants if (i.dep_ in ["appos"] and i!=word)]
    descendants_valid = [i for i in descendants if i not in to_ban]
    descendants_valid = [i for i in descendants_valid if not any([banned.is_ancestor(i) for banned in to_ban])]
    return descendants_valid

s = "Moatt of Carthage, a wise king, is very tired."
sp = nlp(s)
print(sp[0])
get_chunk(sp[0],sp)

Moatt


[Moatt, of, Carthage, ,]

In [4]:
def rclip(fragment,bad_ending_pos=["PUNCT","SPACE"]):#ok_ending_pos=["NOUN","PROPN"]):
    """
    remove everything from the right of a spacy sequence if doesn't end correctly
    """
    while len(fragment)>0 and (fragment[-1].pos_ in bad_ending_pos and fragment[-1].text!=")"):
            fragment.pop()
    return fragment


In [5]:
rclip(get_chunk(sp[0],sp))

[Moatt, of, Carthage]

### Functions for Mining Relations from Text Parsed with `spaCy`

In [6]:
def noun_of_nounphrase(tempspacy):
    """
    """
    to_return = []
    for t in tempspacy:
        if t.head.text.lower()=="of" and t.pos_ in ["NOUN","PROPN"]:
            fragment = rclip([i for i in tempspacy if (t.head.is_ancestor(i))])
            fragment_pos = [i.pos_ for i in fragment]
            fragment_indices = [i.i for i in fragment]
            if test_contiguous(fragment_indices):
                if "PROPN" in fragment_pos:
                    to_return.append({
                        "type":"of",
                        "target":str(t.head.head).lower(),
                        #"target_original":str(t.head.head),
                        "other_words":"|".join([child.text.lower() for child in t.head.head.children if child.dep_ in ['amod',"conj","advmod"]]),
                        "allusion":"of "+str(tempspacy[fragment_indices[0]:fragment_indices[-1]+1]),
                        "allusion2":None,
                        "target_pos":t.head.head.pos_,

                    })
    return to_return

noun_of_nounphrase(nlp("Beyond the old cat's stinky legs and ugly face.  The semi-ugly library of the cat of Alexandria.  And there was a Library of Alexandria. The cat's beautiful felt. A ball of yarn. The Shining Scales of Agoatus."))

[{'type': 'of',
  'target': 'library',
  'other_words': 'semi|-|ugly',
  'allusion': 'of the cat of Alexandria',
  'allusion2': None,
  'target_pos': 'NOUN'},
 {'type': 'of',
  'target': 'cat',
  'other_words': '',
  'allusion': 'of Alexandria',
  'allusion2': None,
  'target_pos': 'NOUN'},
 {'type': 'of',
  'target': 'library',
  'other_words': '',
  'allusion': 'of Alexandria',
  'allusion2': None,
  'target_pos': 'PROPN'},
 {'type': 'of',
  'target': 'scales',
  'other_words': 'shining',
  'allusion': 'of Agoatus',
  'allusion2': None,
  'target_pos': 'PROPN'}]

In [7]:
def verb2np(tempspacy):
    to_return = []
    for t in tempspacy:
        if t.dep_=="nsubj":
            subtree = list(t.head.subtree)
            span = tempspacy[subtree[0].i:subtree[-1].i+1]
            if span[0].dep_=="mark":
                span = tempspacy[subtree[1].i:subtree[-1].i+1]
            fragment = [i for i in tempspacy if (t.is_ancestor(i) or i==t)]
            fragment_pos = [i.pos_ for i in fragment]
            fragment_indices = [i.i for i in fragment]
            if test_contiguous(fragment_indices):
                if "PROPN" in fragment_pos:
                    if t.head.lemma_ not in ["be"]: # boring
                        to_return.append({"type":"v2np",
                                          "target":t.head.lemma_,
                                          "other_words":"|".join([child.lemma_ for child in t.head.children if child.dep_ in ["dobj","conj","advmod","prt"]]),
                                          "allusion":str(span),
                                          "allusion2":str(tempspacy[fragment_indices[0]:fragment_indices[-1]+1]),
                                          "target_pos":t.head.pos_,
                                         })
    return to_return

verb2np(nlp("I have heard that Ashurbanurpal hurt my friend. I have heard a rumor that Ashurbanipal's army first advanced south and secured the city of Der, and there were people there"))

[{'type': 'v2np',
  'target': 'hurt',
  'other_words': 'friend',
  'allusion': 'Ashurbanurpal hurt my friend',
  'allusion2': 'Ashurbanurpal',
  'target_pos': 'VERB'},
 {'type': 'v2np',
  'target': 'advance',
  'other_words': 'first|south|secure',
  'allusion': "Ashurbanipal's army first advanced south and secured the city of Der",
  'allusion2': "Ashurbanipal's army",
  'target_pos': 'VERB'}]

In [8]:
from collections import defaultdict

def adj_to_noun_phrase(tempspacy):
    to_return = []
    for t in tempspacy:
        if t.dep_=="amod":
            if "neg" not in [c.dep_ for c in t.children]:
                if (t.head.pos_=="NOUN"):
                    #fragment = rclip([i for i in tempspacy if (t.head.is_ancestor(i) or i==t.head)])
                    fragment = rclip(get_chunk(t.head,tempspacy))
                    fragment_pos = [i.pos_ for i in fragment]
                    fragment_indices = [i.i for i in fragment]
                    if test_contiguous(fragment_indices):
                        if "PROPN" in fragment_pos:
                            to_return.append({
                                "target":str(t).lower(),
                                "allusion":str(tempspacy[fragment_indices[0]:fragment_indices[-1]+1]),
                                "type":"adj2np",
                                "allusion2":None,
                                "other_words":None,
                                "target_pos":"ADJ",
                            })
    return to_return
        
adj_to_noun_phrase(nlp("I saw the rebellious, stinky friends of Azerooit and his merciless ilk, a bad team.  And I saw the shining scales of Aotis."))

[{'target': 'rebellious',
  'allusion': 'the rebellious, stinky friends of Azerooit and his merciless ilk',
  'type': 'adj2np',
  'allusion2': None,
  'other_words': None,
  'target_pos': 'ADJ'},
 {'target': 'stinky',
  'allusion': 'the rebellious, stinky friends of Azerooit and his merciless ilk',
  'type': 'adj2np',
  'allusion2': None,
  'other_words': None,
  'target_pos': 'ADJ'},
 {'target': 'shining',
  'allusion': 'the shining scales of Aotis',
  'type': 'adj2np',
  'allusion2': None,
  'other_words': None,
  'target_pos': 'ADJ'}]

In [9]:
def attr_or_acomp_relation(tempspacy):
    to_return = []
    for t in tempspacy: 
        if (t.dep_=="nsubj" and t.head.pos_=="AUX"):  ## looking for nouns that are in nsubj relationship to aux verb
            if "neg" not in [i.dep_ for i in t.head.children]: ## exclude negatives
                attrs = [ch for ch in [tok for tok in tempspacy if tok.head==t.head] if ch.dep_ in ["attr","acomp"]]
                if attrs!=[]: ## if there is at least 1 attr
                    attr = attrs[0] ## just deal with the 0th one
                    attr_or_acomp = attr.dep_
                    ## get the noun ph
                    #np_fragment = [i for i in tempspacy if (t.is_ancestor(i) or i==t)]
                    np_fragment = rclip(get_chunk(t,tempspacy))
                    np_fragment_pos = [i.pos_ for i in np_fragment]
                    np_fragment_indices = [i.i for i in np_fragment]
                    noun_phrase = str(tempspacy[np_fragment_indices[0]:np_fragment_indices[-1]+1])
                    if "PROPN" in np_fragment_pos:     
                        ## get attr phrase
                        #attr_fragment = [i for i in tempspacy if (attr.is_ancestor(i) or i==attr)]
                        attr_fragment = rclip((get_chunk(attr,tempspacy)))
                        attr_fragment_indices = [i.i for i in attr_fragment]
                        attr_phrase = str(tempspacy[attr_fragment_indices[0]:attr_fragment_indices[-1]+1])
                        ## get attr phrase simple
                        valid_deps = ["det","amod"]
                        simple_fragment = [i for i in tempspacy if ((i.head==attr and i.dep_ in valid_deps) or i==attr)]
                        simple_fragment_indices = [i.i for i in simple_fragment]
                        simple_phrase = str(tempspacy[simple_fragment_indices[0]:simple_fragment_indices[-1]+1])
                        if test_contiguous(np_fragment_indices) and test_contiguous(attr_fragment_indices):
                            for n in [w for w in simple_fragment if w.pos_ in ["NOUN","ADJ"]]:
                                to_return.append({
                                    "type":attr_or_acomp,
                                    "allusion":noun_phrase,
                                    "target":n.lemma_,
                                    "allusion2":simple_phrase,
                                    "other_words":"|".join([child.text.lower() for child in n.children if child.dep_ in ['amod']]),
                                    "target_pos":n.pos_
                                })
    return to_return

In [10]:
attr_or_acomp_relation(nlp("John is uick to speak. A mind is a terrible thing to puree. The feet of the Big King were a simple nomadic people who moved through the earth and ate things, and the Dull Forest was a stupid place where old people went. Mausoolus was a wise king of bitter sadness, a good person.  John was a big lawyer.  Mausoolus of Xeria was going to the store."))

[{'type': 'acomp',
  'allusion': 'John',
  'target': 'uick',
  'allusion2': 'uick',
  'other_words': '',
  'target_pos': 'ADJ'},
 {'type': 'attr',
  'allusion': 'The feet of the Big King',
  'target': 'simple',
  'allusion2': 'a simple nomadic people',
  'other_words': '',
  'target_pos': 'ADJ'},
 {'type': 'attr',
  'allusion': 'The feet of the Big King',
  'target': 'nomadic',
  'allusion2': 'a simple nomadic people',
  'other_words': '',
  'target_pos': 'ADJ'},
 {'type': 'attr',
  'allusion': 'The feet of the Big King',
  'target': 'people',
  'allusion2': 'a simple nomadic people',
  'other_words': 'simple|nomadic',
  'target_pos': 'NOUN'},
 {'type': 'attr',
  'allusion': 'the Dull Forest',
  'target': 'stupid',
  'allusion2': 'a stupid place',
  'other_words': '',
  'target_pos': 'ADJ'},
 {'type': 'attr',
  'allusion': 'the Dull Forest',
  'target': 'place',
  'allusion2': 'a stupid place',
  'other_words': 'stupid',
  'target_pos': 'NOUN'},
 {'type': 'attr',
  'allusion': 'Mausool

In [11]:
def appos_relation(tempspacy):
    to_return = []
    for t in tempspacy:
        if t.dep_=="appos":
            appos_fragment = get_chunk(t,tempspacy) ##[i for i in tempspacy if (t.is_ancestor(i) or i==t)]
            appos_fragment_indices = [i.i for i in appos_fragment]
            appos_phrase = str(tempspacy[appos_fragment_indices[0]:appos_fragment_indices[-1]+1])
            ## get the noun ph
            noun_or_proper_noun = t.head
            valid_deps = ['compound']
            np_fragment = rclip(get_chunk(noun_or_proper_noun,tempspacy))#
            np_fragment_pos = [i.pos_ for i in np_fragment]
            np_fragment_indices = [i.i for i in np_fragment]
            noun_phrase = str(tempspacy[np_fragment_indices[0]:np_fragment_indices[-1]+1])
            if "PROPN" in np_fragment_pos:    
                if test_contiguous(np_fragment_indices) and test_contiguous(appos_fragment_indices):
                    for n in [w for w in appos_fragment if w.pos_ in ["NOUN","ADJ"]]:
                        to_return.append({
                            "type":"appos",
                            "allusion":noun_phrase,
                            "allusion2":appos_phrase,
                            "target":n.lemma_,
                            "other_words":"|".join([w.lemma_ for w in appos_fragment if w.pos_=="ADJ"]),
                            "target_pos":n.pos_,
                        })                   
    return to_return

appos_relation(nlp(" King Mausaillus of Carthage, a wise and fat king, is my friend. Mausoolus was going to the store. The old King Mausoolaic of Wales, a wise and smelly king, is my friend."))#ppos_relation(nlp("King Mausaillus of Carthage, a wise person, is my friend. Mausoolus was going to the store. The old King Mausoolaic of Wales, a wise king, is my friend."))

[{'type': 'appos',
  'allusion': ' King Mausaillus of Carthage',
  'allusion2': 'a wise and fat king',
  'target': 'wise',
  'other_words': 'wise|fat',
  'target_pos': 'ADJ'},
 {'type': 'appos',
  'allusion': ' King Mausaillus of Carthage',
  'allusion2': 'a wise and fat king',
  'target': 'fat',
  'other_words': 'wise|fat',
  'target_pos': 'ADJ'},
 {'type': 'appos',
  'allusion': ' King Mausaillus of Carthage',
  'allusion2': 'a wise and fat king',
  'target': 'king',
  'other_words': 'wise|fat',
  'target_pos': 'NOUN'},
 {'type': 'appos',
  'allusion': 'The old King Mausoolaic of Wales',
  'allusion2': 'a wise and smelly king',
  'target': 'wise',
  'other_words': 'wise|smelly',
  'target_pos': 'ADJ'},
 {'type': 'appos',
  'allusion': 'The old King Mausoolaic of Wales',
  'allusion2': 'a wise and smelly king',
  'target': 'smelly',
  'other_words': 'wise|smelly',
  'target_pos': 'ADJ'},
 {'type': 'appos',
  'allusion': 'The old King Mausoolaic of Wales',
  'allusion2': 'a wise and sm

In [12]:
def noun_to_compound(tempspacy):
    to_return = []
    for t in tempspacy:
        if (t.pos_=="PROPN" and t.head.pos_=="NOUN" and t.dep_=="compound"):
            to_return.append({
                            "type":"compound",
                            "allusion":str(t),
                            "allusion2":None,
                            "target":t.head.lemma_,
                            "other_words":None,
                            "target_pos":t.pos_,
                        })     
    return to_return

noun_to_compound(nlp("The Norbertine lace was on my feet."))

[{'type': 'compound',
  'allusion': 'Norbertine',
  'allusion2': None,
  'target': 'lace',
  'other_words': None,
  'target_pos': 'PROPN'}]

In [13]:
mining_funcs = [
    noun_of_nounphrase,
    verb2np,
    adj_to_noun_phrase,
    attr_or_acomp_relation,
    appos_relation,
    noun_to_compound,
]

## Wikipedia Article Tester

I want to focus on things from Wikipedia that are old.  I will use some gnarly regexes to test whether some section of a Wikipedia text contains references to dates in the 19th century or prior and none from later.

In [14]:
import re

In [15]:
old_stuff_regex = r'\b(?:in|from|c\.|year|by|since|circa) ?(?:late|early)? ?(?:\d{3}|1[0-7]\d\d)\b|\d+ [ab]\.?[dc]\b|\b(?:[1-8]|1[0-8])(?:st|nd|th) centur\w+\b|\b(?!tw)(?!ninetee)[a-z]+(?:th|nd|st) centur\w+\b|\b\d{3,4}\b ?- ?\b(?:\d{3}|1[0-7]\d\d)\b|\((?:\d{3}|1[0-7]\d\d)\b\)|\b(?:(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug)[a-z.]*|spring|summer|fall|winter),? (?:of )?(?:\d{3}|1[0-7]\d\d)\b'

In [16]:
new_stuff_regex = r'\b(?:in|from|c\.|year|by|since|circa) ?(?:late|early)? ?(?:1[89]|2\d)\d\d\b|\d+ a\.?d\b|\b(?:19|2\d)(?:st|nd|th) centur\w+\b|\b(?:tw|ninet)[a-z-]+(?:th|nd|st) centur\w+\b|\b\d{3,4}\b ?- ?\b(?:\d{3}|1[0-8]\d\d)\b|\((?:1[89]|2\d)\d\d\b\)|\b(?:(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug)[a-z.]*|spring|summer|fall|winter),? (?:of )?(?:1[89]|2\d)\d\d\b'

In [17]:
test_string = """
(c. 334 – c. 2602 BC)
18th century
20th century
2nd century
first century
ninth century
nineteeth century
twentieth century
from 1923
from 1520
(119)
(1953)
(1834)
1995
the year 1895
in late 234
in late 1943
in early 1734

June, 1924
July of 2345
June, 1892
June, 234l234

spring of 1233

by 23445
c.234 
c. 1234
1523-1934
234
1523453245
"""

In [18]:
re.findall(old_stuff_regex,test_string)

['c. 334',
 '18th century',
 '2nd century',
 'first century',
 'ninth century',
 'from 1520',
 '(119)',
 'in late 234',
 'in early 1734',
 'spring of 1233',
 'c.234',
 'c. 1234']

In [19]:
re.findall(new_stuff_regex,test_string)

['c. 2602',
 '20th century',
 'nineteeth century',
 'twentieth century',
 'from 1923',
 '(1953)',
 '(1834)',
 'year 1895',
 'in late 1943',
 'June, 1924',
 'July of 2345',
 'June, 1892']

In [20]:
def test_time_of_full_text(text,times_more=5):
    """
    makes sure proper ratio of olds/new 
    """
    olds = re.findall(old_stuff_regex,text,flags=re.I)
    news = re.findall(new_stuff_regex,text,flags=re.I)
    if len(olds)>len(news)*times_more:
            return True
    return False

In [21]:
def test_time_of_section(text):
    olds = re.findall(old_stuff_regex,text,flags=re.I)
    news = re.findall(new_stuff_regex,text,flags=re.I)
    if len(olds)>0:
        if len(news)==0:
            return True
    return False

In [22]:
test_time_of_section("in the 2nd CENTuRY ")

True

In [23]:
test_time_of_section("in 1923 and the 2nd century")

False

***

In [24]:
import re

def clean_wiki_text(text):
    text = re.sub(r"\'\'",'',text) ## links
    text = re.sub(r"poly[ \d]+","",text) ## ?
    text = re.sub(r"=+[^=]+=+\n"," ",text) ## section headings
    text = re.sub(r"\|alt=.+\n"," ",text) ## alt text
    text = text.rstrip(r"\n")
    return text

clean_wiki_text('====Union military strategy====\nLincoln took executive control of the war and shaped the Union military strategy.   |alt=Large group of people\n  sdfl;kasdf\n')

' Lincoln took executive control of the war and shaped the Union military strategy.      sdfl;kasdf\n'

In [25]:
from gensim import utils

import json

In [26]:
#!rm wiki.db ## just re-initialize each time this notebook runs

In [27]:
import sqlite3
connection = sqlite3.connect("wiki.db")
cursor = connection.cursor()

In [28]:
cursor.execute("CREATE TABLE allusions (target TEXT, type TEXT, allusion TEXT, allusion2 TEXT, other_words TEXT, target_pos TEXT, entry TEXT, id NUMBER)")

<sqlite3.Cursor at 0x7faa8a1428f0>

In [29]:
def push_to_db(data):
    sql = "INSERT INTO allusions(target,type,allusion,allusion2,other_words,target_pos,entry,id) VALUES(?,?,?,?,?,?,?,?)"  
    for d in data:
        #print(d.keys())
        task = (d['target'],d['type'],d['allusion'],d['allusion2'],d['other_words'],d['target_pos'],d['entry'],d['id'])
        ## replace empty strings with None
        task = tuple(t if t!="" else None for t in task)
        #print(task)
        cursor.execute(sql,task)
        connection.commit()
#         for k in d:
#             print(k,d[k])
            #task = (k[])
            #cur.execute(sql, task)

In [30]:
import random

In [31]:
def get_part_of_full_text_for_time_test(article):
    """
    returns most of article text as single string, trying to get rid of bibliography, etc.
    this output is meant to be test
    """
    bad_words = ['reference','cite','note','see also','source','adapt','popular culture','biblio','links']
    ok_sections = [section_text for section_title,section_text in zip(article['section_titles'][:-2], article['section_texts']) if any([bw in section_title.lower() for bw in bad_words])==False]
    full_text = "\n\n".join(ok_sections)
    return full_text

In [32]:
%%time

# c=0
# max_n = 100

idnum = 0

prob_of_processing_article = 1.0 

with utils.open('enwiki-latest.json.gz', 'rb') as f:
    for line in f:
        if random.random()<prob_of_processing_article:
            article = json.loads(line)
            full_text = get_part_of_full_text_for_time_test(article)
            if test_time_of_full_text(full_text):
                for section_title, section_text in zip(article['section_titles'][:-2], article['section_texts'][:2]):
                    text = section_text.strip()
                    if test_time_of_section(text):              
                        text = clean_wiki_text(text)
                        ok_sents = [s for s in text.split("\n\n") if (len(s)>=300 and s[-1]==".")]
                        for oks in ok_sents:
                            spacied = nlp(oks)
                            for func in mining_funcs:
                                try:
                                    results = func(spacied)
                                    [r.update({"entry":article['title']}) for r in results] ## just update all the dicts
                                    for r in results:
                                        r.update({"id":idnum})
                                        idnum+=1
                                    push_to_db(results)
                                except:
                                    pass
#         if c==max_n:
#             break
#         c+=1

CPU times: user 3h 48min 59s, sys: 23min 45s, total: 4h 12min 44s
Wall time: 4h 35min 41s


In [33]:
def query_db(target):
    cursor.execute("SELECT * FROM allusions WHERE target=?", (target,))
    rows = cursor.fetchall()
    return rows

query_db("smile")

[('smile',
  'of',
  'of the Canaanite deity El',
  None,
  'benevolent',
  'NOUN',
  'Isaac',
  19340),
 ('smile',
  'of',
  'of the Canaanite deity El',
  None,
  'benevolent',
  'NOUN',
  'Isaac',
  19341),
 ('smile', 'compound', 'Samuel', None, None, 'PROPN', 'Jakob Abbadie', 21559),
 ('smile',
  'v2np',
  'Musashi just smiled.',
  'Musashi',
  'just',
  'VERB',
  'Sasaki Kojirō',
  230500),
 ('smile',
  'v2np',
  'Captain Glass to smile pleasantly',
  'Captain Glass',
  'pleasantly',
  'VERB',
  'Capture of Guam',
  456054),
 ('smile',
  'v2np',
  'Dante to smile',
  'Dante',
  None,
  'VERB',
  'Belacqua',
  784686),
 ('smile',
  'v2np',
  'Maurecia celebrates, Ron is not smiling.',
  'Ron',
  None,
  'VERB',
  'Wayside School Beneath the Cloud of Doom',
  2116326)]

In [34]:
cp wiki.db wiki_to_use.db

***