In [162]:
import json
import urllib
import sys
from collections import defaultdict

In [163]:
# remove '_' if it is concatenating two words
# remove '_' and append s if it is plural
def sanitize_relation(relation):
    tokens = relation.split('_')
    if tokens[-1] == 's':
        tokens[-2] = tokens[-2] + 's'
        tokens.pop()

    return " ".join(tokens)

def sanitize_compound_arg(arg):
    tokens = arg.split(' - ')
    del tokens[-2:]
    return " ".join(tokens)

In [164]:
def search_api_request(api_key, query):
    service_url = 'https://www.googleapis.com/freebase/v1/search'
    params = {
        'key': api_key,
        'query': query,
    }
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    return response

In [165]:
def extract_first_result(response):
    first_result = response['result'][0]
    topic_name = str(first_result['name'])
    topic_id = str(first_result['mid']) # To be used by the topic API in scraping relations
    return (topic_name, topic_id)

In [166]:
def topic_api_request(api_key, topic):
    service_url = 'https://www.googleapis.com/freebase/v1/topic'
    params = {
      'key': api_key,
    }
    topic_name = topic[0]
    topic_id = topic[1]
    url = service_url + topic_id + '?' + urllib.urlencode(params)
    topic = json.loads(urllib.urlopen(url).read())
    return topic

In [167]:
def print_tuples(tuples):
    for t in tuples:
        print t

In [168]:
def construct_ppdb_from_file(file_name):
    ppdb = defaultdict(list)
    count = 0
    with open(file_name) as f:
        for line in f:
            data = line.split(" ||| ")
            pos_tag = data[0]
            source = data[1]
            target = data[2]
            ppdb[source].append(target)
            count += 1
            
    print str(count) + " paraphrases added to PPDB"
    return ppdb


def expand_tuple_for_phrase(t, phrase, ppdb, tuples):
    print "** expand_tuple_for_phrase ** tuple: "
    print t
    print "and phrase: " + phrase
    new_tuples = []
    for paraphrase in ppdb[phrase]:
        new_tuple = (t[0], paraphrase, t[2])
        new_tuples.append(new_tuple)
    
    print new_tuples
    print ""
    
    return new_tuples

def expand_tuples_with_ppdb(ppdb, tuples):
    new_tuples = []
    for t in tuples:
        phrase = t[1]
        new_tuples += expand_tuple_for_phrase(t, phrase, ppdb, tuples)
        for sub_phrase in phrase.split(' '):
            new_tuples+= expand_tuple_for_phrase(t, sub_phrase, ppdb, tuples)
    tuples += new_tuples

In [169]:
def construct_tuples(name, response):
    tuples = []
    for property in response['property']:
        simple_prop = str(property.split('/')[-1])
        simple_prop = sanitize_relation(simple_prop)
        
        for value in response['property'][property]['values']:
            arg = value['text'].encode("utf8") 
            if response['property'][property]['valuetype'] == 'compound':
                arg = sanitize_compound_arg(arg)     
           
            tuples.append((name, simple_prop, str(arg)))
            
    return tuples

In [171]:
def allow_tuple(tuple):
    disallowed_relations = ['key', 'type']
    # If t contains '/' we skip.
    # This is indicative of a Freebase topic-topic link or url.
    if not tuple[2] or '/' in tuple[2]:
        return False
    if tuple[1] in disallowed_relations:
        return False
     
    return True

def sanitize_tuples(tuples):
    allowed_tuples = []
    blocked_tuples = []
    for t in tuples:
        if allow_tuple(t):
            allowed_tuples.append(t)
        else:
            blocked_tuples.append(t)
            
#     print "== Start Blocked Tuples =="
#     print_tuples(blocked_tuples)
#     print "== End Blocked Tuples == \n"

    return allowed_tuples


In [181]:
api_key = open(".api_key").read()

query = "Stanford" # Join all arguments to form one search query
search_result = search_api_request(api_key, query)

topic = extract_first_result(search_result)

print "***"
print "Query: \"" + query + "\" => found topic_name: " + topic[0] + " topic_id:  " + topic[1]
print "***"

response = topic_api_request(api_key, topic)
tuples = construct_tuples(topic[0], response)
tuples = sanitize_tuples(tuples)

print_tuples(tuples)

***
Query: "Stanford" => found topic_name: Stanford University topic_id:  /m/06pwq
***
('Stanford University', 'daylife topic', '03F28nhaTT5ue')
('Stanford University', 'daylife topic', '05RjadO4sRdmu')
('Stanford University', 'board members', 'Sharon Percy Rockefeller')
('Stanford University', 'board members', '1990 John E. Bryson 2000')
('Stanford University', 'software', 'blacklight')
('Stanford University', 'software', 'Folding@home')
('Stanford University', 'geolocation', '37.43 -122.17')
('Stanford University', 'projects', '2005 Stanford Multidisciplinary Teaching and Research Projects Host Organization')
('Stanford University', 'projects', '2013 Stanford StartX Fund Sponsor')
('Stanford University', 'description', 'Leland Stanford Junior University, or more commonly Stanford University, is a private research...')
('Stanford University', 'faculty', '1807 2007')
('Stanford University', 'faculty', '3228 2010 nces_university_attr')
('Stanford University', 'phone number', '(650) 723-

In [182]:
json_string = json.dumps(response, ensure_ascii=False, indent=2)
print "==="
print json_string
print "==="

===
{
  "property": {
    "/common/topic/topical_webpage": {
      "count": 1.0, 
      "valuetype": "uri", 
      "values": [
        {
          "lang": "", 
          "text": "http://topics.nytimes.com/top/reference/timestopics/organizations/s/stanford_university/index.html", 
          "timestamp": "2011-11-01T03:19:51.000Z", 
          "value": "http://topics.nytimes.com/top/reference/timestopics/organizations/s/stanford_university/index.html", 
          "creator": "/user/gardening_bot"
        }
      ]
    }, 
    "/education/university/international_tuition": {
      "count": 0.0, 
      "status": "has_value", 
      "values": []
    }, 
    "/common/identity/daylife_topic": {
      "count": 2.0, 
      "valuetype": "string", 
      "values": [
        {
          "lang": "", 
          "text": "03F28nhaTT5ue", 
          "timestamp": "2009-11-30T19:57:21.002Z", 
          "value": "03F28nhaTT5ue", 
          "creator": "/user/linkbot"
        }, 
        {
          "lang": "

In [175]:
ppdb = construct_ppdb_from_file("ppdb-1.0-s-all")

6977679 paraphrases added to PPDB


In [176]:
expand_tuples_with_ppdb(ppdb, tuples)

** expand_tuple_for_phrase ** tuple: 
('University of California, Santa Barbara', 'geolocation', '34.41254 -119.84813')
and phrase: geolocation
[]

** expand_tuple_for_phrase ** tuple: 
('University of California, Santa Barbara', 'geolocation', '34.41254 -119.84813')
and phrase: geolocation
[]

** expand_tuple_for_phrase ** tuple: 
('University of California, Santa Barbara', 'colors', 'Blue')
and phrase: colors
[('University of California, Santa Barbara', 'colours', 'Blue'), ('University of California, Santa Barbara', 'colours', 'Blue'), ('University of California, Santa Barbara', 'colours', 'Blue'), ('University of California, Santa Barbara', 'the colours', 'Blue')]

** expand_tuple_for_phrase ** tuple: 
('University of California, Santa Barbara', 'colors', 'Blue')
and phrase: colors
[('University of California, Santa Barbara', 'colours', 'Blue'), ('University of California, Santa Barbara', 'colours', 'Blue'), ('University of California, Santa Barbara', 'colours', 'Blue'), ('Universit

In [177]:
print_tuples(tuples)

('University of California, Santa Barbara', 'geolocation', '34.41254 -119.84813')
('University of California, Santa Barbara', 'colors', 'Blue')
('University of California, Santa Barbara', 'colors', 'Gold')
('University of California, Santa Barbara', 'faculty', '1101 2010 nces_university_attr')
('University of California, Santa Barbara', 'containedby', 'United States of America')
('University of California, Santa Barbara', 'containedby', 'California')
('University of California, Santa Barbara', 'containedby', 'Goleta')
('University of California, Santa Barbara', 'containedby', 'Santa Barbara')
('University of California, Santa Barbara', 'campuses', 'University of California, Santa Barbara')
('University of California, Santa Barbara', 'employees', 'David Gross')
('University of California, Santa Barbara', 'employees', 'J. Gordon Melton')
('University of California, Santa Barbara', 'employees', '1982 Alan J. Heeger')
('University of California, Santa Barbara', 'employees', 'Finn E. Kydlan