# Extracting triples from Wikipedia

- We'll use rules incorporating linguistic features to extract structured information from Wikipedia

# Prepare our tools

In [1]:
from bs4 import BeautifulSoup
import requests

from processors import *

In [2]:
# Client used for every annotation / Odin call in the rest of the notebook.
# NOTE(review): port 8886 presumably has to match the port the processors
# server is listening on, and keep_alive=True presumably leaves that server
# running after this session -- confirm against the py-processors docs.
API = ProcessorsAPI(port=8886, keep_alive=True)

Using default
Connection with server established!


# Retrieve wiki text

In [3]:
# Download the article and collect the text of every non-empty <p> element.
url = "https://wikipedia.org/wiki/Barack_Obama"
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply rather than parsing an error page as the article.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")
# `if p.text` drops paragraphs whose text is the empty string.
paragraph_text = [p.text for p in soup.find_all("p") if p.text]
# Bare expression so the notebook displays the list as the cell's output.
paragraph_text

['President of the United States\nIncumbent',
 'First term',
 'Cabinet\nClimate change\nEconomic\nEnergy\nJudicial Appointments\nForeign\n(Obama Doctrine)\nForeign trips\nPardons\nSocial\nSpace',
 'Second term',
 'Barack Hussein Obama II (US i/bəˈrɑːk huːˈseɪn oʊˈbɑːmə/;[1][2] born August 4, 1961) is an American politician serving as the 44th President of the United States. He is the first African American to hold the office, as well as the first president born outside of the continental United States. Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he served as president of the Harvard Law Review. He was a community organizer in Chicago before earning his law degree. He worked as a civil rights attorney and taught constitutional law at the University of Chicago Law School between 1992 and 2004. He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, and ran unsuccessfully in the Democratic primar

# Run Odin IE rules on a couple of paragraphs  

- ###### https://gist.github.com/myedibleenso/0c6a5c99070b506992c6343375cd7ebd

In [4]:
# Raw URL of the rule file (triples.yaml) from the gist linked above,
# pinned to one specific revision of that gist.
rules_url = "https://gist.githubusercontent.com/myedibleenso/0c6a5c99070b506992c6343375cd7ebd/raw/8e08411078a6ddcc5bfb776ff9ac8555ad10db13/triples.yaml"
# pick a couple of paragraphs: items 4 and 5 (0-based) joined by a newline.
# NOTE(review): these indices depend on the page layout at fetch time --
# re-check them against the paragraph_text output if the article changes.
sample_text = "\n".join(paragraph_text[4:6])
# Apply the rules to the sample text; the result is iterated as a collection
# of mentions in the following cells.
mentions = API.odin.extract_from_text(sample_text, rules_url)

# Find our triples

In [5]:
# Keep only the mentions whose label is exactly "Triple".
triples = list(filter(lambda mention: mention.label == "Triple", mentions))

### Get the distinct triples by the text span of each argument

In [6]:
# Reduce each triple to a (subject text, trigger text, object text) tuple,
# taking the first 'subject' and first 'object' argument; the set drops
# tuples that are textually identical.
distinct_triples = set(
    map(
        lambda mention: (
            mention.arguments['subject'][0].text,
            mention.trigger.text,
            mention.arguments['object'][0].text,
        ),
        triples,
    )
)

print("triples of the form (SUBJECT, PREDICATE, OBJECT)\n")
# Order by the predicate (middle element) only; triples sharing a predicate
# keep whatever relative order the set yields.
for t in sorted(distinct_triples, key=lambda spo: spo[1]):
    print("\t{}".format(t))

triples of the form (SUBJECT, PREDICATE, OBJECT)

	('He', 'began', 'campaign')
	('He', 'began', 'presidential campaign')
	('He', 'defeated', 'nominee John McCain')
	('He', 'defeated', 'John McCain')
	('He', 'defeated', 'Republican nominee John McCain')
	('He', 'hold', 'office')
	('Obama', 'is', 'graduate')
	('He', 'is', 'African American')
	('He', 'is', 'first African American')
	('Obama', 'received', 'attention')
	('Obama', 'received', 'keynote address')
	('Obama', 'received', 'national attention')
	('He', 'served', 'terms')
	('He', 'taught', 'constitutional law')
	('He', 'taught', 'law')
	('He', 'was', 'community organizer')
	('He', 'was inaugurated', 'president')
	('he', 'won', 'delegates')
	('he', 'won', 'sufficient delegates')


# More on Odin and rule-based IE

- http://arxiv.org/abs/1509.07513

- https://github.com/clulab/odin-examples

- http://clulab.cs.arizona.edu/papers/lrec2016-odin.pdf

# More on syntactic dependencies

- http://nlp.stanford.edu/software/dependencies_manual.pdf