### 1) Create a new database in PostgreSQL

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
from collections import defaultdict

#set this user line 
user = 'jared_local'

PARALLEL = 4 # assuming a quad-core machine
ATTRIBUTE = "entity_phone"
os.environ['SNORKELDBNAME'] = "location_extraction"

if user == 'accenture':
    os.environ['SNORKELDB'] = 'postgresql://localhost:5432/' + os.environ['SNORKELDBNAME']
    sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/fonduer/memex/')
elif user == 'jared':
    os.environ['SNORKELDB'] = 'postgres://jdunnmon:123@localhost:5432/' + os.environ['SNORKELDBNAME']
    sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/fonduer/chtap/')
elif user == 'jared_local':
    os.environ['SNORKELDB'] = 'postgres://jdunnmon:genpass2014@localhost:5432/' + os.environ['SNORKELDBNAME']
    sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/fonduer/chtap/')
    
#from sqlalchemy import create_engine
#snorkeldb = create_engine('postgresql://localhost:5432/', isolation_level="AUTOCOMMIT")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


 ## 1.1 Defining a Candidate Schema2) Candidate Schema

In [2]:
from snorkel.contrib.fonduer import SnorkelSession

session = SnorkelSession()

In [3]:
import os
from snorkel.contrib.fonduer.models import candidate_subclass

Location_Extraction = candidate_subclass('location_extraction', ["location"])


## 1.2 Parsing and Transforming the Input Documents into Unified Data Models

### Configuring an `HTMLPreprocessor`

In [4]:
from snorkel.contrib.fonduer import HTMLPreprocessor, OmniParser

if user == 'accenture':
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/fonduer/memex/data/profiles_chtap/'
elif user == 'jared':
    docs_path = '/lfs/local/0/jdunnmon/chtap/data/s3/chtap_profiles_20170928/'
elif user == 'jared_local':
    docs_path = '/home/jdunnmon/research/re/projects/memex/data/profiles/crawl_october_2017/texas_profiles_data'

doc_preprocessor = HTMLPreprocessor(docs_path)

### Configuring an `OmniParser`

In [12]:
corpus_parser = OmniParser(structural=True, lingual=True)
%time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

Clearing existing...
Running UDF...
CPU times: user 9.54 s, sys: 112 ms, total: 9.65 s
Wall time: 16min 38s


In [5]:
from snorkel.contrib.fonduer.models import Document, Phrase,Table

print "Documents:", session.query(Document).count()
print "Phrases:", session.query(Phrase).count()
print "Table", session.query(Table).count()

Documents: 342
Phrases: 71150
Table 832


## 1.3 Dividing the Corpus into Test and Train

In [6]:
docs = session.query(Document).order_by(Document.name).all()
ld   = len(docs)

train_docs = set()
dev_docs   = set()
test_docs  = set()
splits = (0.8, 0.9)
data = [(doc.name, doc) for doc in docs]
data.sort(key=lambda x: x[0])
for i, (doc_name, doc) in enumerate(data):
    if i < splits[0] * ld:
        train_docs.add(doc)
    elif i < splits[1] * ld:
        dev_docs.add(doc)
    else:
        test_docs.add(doc)
from pprint import pprint
#pprint([x.name for x in train_docs])
print "train:",len(train_docs)
print "dev:" ,len(dev_docs)
print "test:",len(test_docs)
# from pprint import pprint
# pprint([x.name for x in train_docs])

train: 274
dev: 34
test: 34


### Phase 2: Candidate Extraction & Multimodal Featurization

In [7]:
from snorkel.matchers import *
location_matcher = LocationMatcher(longest_match_only=True) 

####Define a relation's ContextSpaces

from snorkel.contrib.fonduer.fonduer.candidates import OmniNgrams
location_ngrams = OmniNgrams(n_max=6, split_tokens=[])

### Defining candidate Throttlers

In [8]:
from snorkel.contrib.fonduer.lf_helpers import *
import re
from snorkel.lf_helpers import *


    
def location_currencies_filter(location):
    list_currencies = [ "dollar", "dollars", "lira","kwacha","rials","rial","dong","dongs","fuerte","euro",
                       "euros","vatu","som","peso","sterling","sterlings","soms","pestos",
                       "pounds", 
                  "pound","dirham","dirhams","hryvnia","manat","manats","liras","lira",
                       "dinar","dinars","pa'anga","franc","baht","schilling",
                  "somoni","krona","lilangeni","rupee","rand","shilling","leone","riyal","dobra",
                  "tala","ruble","zloty","peso","sol","quarani","kina","guinean","balboa","krone","naira",
                  "cordoba","kyat","metical","togrog","leu","ouguiya","rufiyaa","ringgit","kwacha",
                  "ariary","denar","litas","loti","lats","kip","som","won","tenge","yen","shekel","rupiah",
                  "forint","lempira","gourde","quetzal","cedi","lari","dalasi","cfp","birr","kroon","nakfa",
                  "cfa","Peso","koruna","croatian","colon","yuan","escudo","cape","riel","lev","real"
                  ,"real","mark","boliviano","ngultrum","taka","manat","dram","kwanza","lek","afghani","renminbi"]

    
    cand_right_tokens = list(get_right_ngrams(location,window=2))
    for cand in cand_right_tokens:
        if cand not in list_currencies:
            return location
    
candidate_filter = location_currencies_filter

In [9]:
from snorkel.contrib.fonduer.candidates import CandidateExtractor

candidate_extractor = CandidateExtractor(Location_Extraction,
                                         [location_ngrams], [location_matcher],
                                         candidate_filter=candidate_filter)

%time candidate_extractor.apply(train_docs, split=0, parallelism=PARALLEL)

Clearing existing...
Running UDF...
CPU times: user 64 ms, sys: 32 ms, total: 96 ms
Wall time: 27.2 s


In [10]:
train_cands = session.query(Location_Extraction).filter(Location_Extraction.split == 0).all()
print "Number of candidates:", len(train_cands) 

Number of candidates: 1932


### Exploring the candidate 

In [11]:
from snorkel.contrib.fonduer.fonduer.lf_helpers import*
from snorkel.contrib.fonduer.candidates import*

In [12]:
cand_16= train_cands[16]
print cand_16
cand_18= train_cands[18]
print cand_18
cand_19= train_cands[19]
print cand_19

location_extraction(Span("Dallas", sentence=135702, chars=[35,40], words=[11,11]))
location_extraction(Span("Dallas", sentence=190784, chars=[13,18], words=[3,3]))
location_extraction(Span("United States", sentence=151353, chars=[0,12], words=[0,1]))


In [13]:
cand_16= train_cands[16]
print "text for the 16th candidate:\n", cand_16.get_parent()
print "16th candidate\n:",cand_16
ance_16 = get_ancestor_tag_names(cand_16)
print "ancestor of 16th candidate\n:", ance_16 
print "***************************************************"
cand_17= train_cands[17]
print "text for the 17th candidate:\n", cand_17.get_parent()
print "17th candidate:",cand_17
ance_17 = get_ancestor_tag_names(cand_17)
print "ancestor of 17th candidate\n:", ance_17
print "***************************************************"

cand_19= train_cands[19]
print "text for the 19th candidate:\n", cand_19.get_parent()
print "19th candidate:",cand_19
ance_19 = get_ancestor_tag_names(cand_18)
print "ancestor of 19th candidate\n:", ance_19

text for the 16th candidate:
Phrase (Doc: a2de2646-0ee6-40b0-8631-d9188ea6d96b, Index: 74, Text: var as_sid = '16';     var ad_loc='Dallas';   var aspublisher_width = "200";  var aspublisher_height = "700";  var aspublis_color_bg = "ffffff";  var aspublis_color_border = "ffffff";  var aspublis_color_link = "006621";  var aspublis_color_text = "000000";  var aspublis_color_url = "1a0dab";  as_show_ad('page_ads_2', as_sid);)
16th candidate
: location_extraction(Span("Dallas", sentence=135702, chars=[35,40], words=[11,11]))
ancestor of 16th candidate
: ['html', 'body', 'div', 'div', 'script']
***************************************************
text for the 17th candidate:
Phrase (Doc: 0d40b595-921b-4063-84f1-01a1e44be0f4, Index: 28, Text: Dallas, Texas Female Escort & GFE available for Incall.)
17th candidate: location_extraction(Span("Dallas", sentence=119801, chars=[0,5], words=[0,0]))
ancestor of 17th candidate
: ['html', 'body', 'div', 'div', 'div']
***********************************

### Repeating for development and test splits

In [14]:
%%time
for i, docs in enumerate([dev_docs, test_docs]):
    candidate_extractor.apply(docs, split=i+1)
    print "Number of candidates:", session.query(Location_Extraction).filter(Location_Extraction.split == i+1).count()

Clearing existing...
Running UDF...
Number of candidates: 229
Clearing existing...
Running UDF...
Number of candidates: 266
CPU times: user 5.77 s, sys: 104 ms, total: 5.87 s
Wall time: 14.6 s


In [15]:
session.rollback()

In [16]:
dev_cands = session.query(Location_Extraction).filter(Location_Extraction.split == 1).all()
print "Number of candidates:", len(train_cands)
dev_cand1= dev_cands[10]
print get_ancestor_tag_names(dev_cand1)
print dev_cand1.get_parent()

Number of candidates: 1932
['html', 'head', 'title']
Phrase (Doc: e89be8b6-b029-4edd-a61b-f9cc806c1b38, Index: 0, Text: 737-204-4588 Kendra James  Austin, Texas Female Escorts)


In [17]:
can_tmp = []
dc_ind=50
for can in dev_cands:
    if can.get_parent().document.name == dev_cands[dc_ind].get_parent().document.name:
        can_tmp.append(can.get_parent())
        print can.location
for cn in can_tmp:
    print cn

Span("United States", sentence=153831, chars=[0,12], words=[0,1])
Span("Texas", sentence=153752, chars=[29,33], words=[4,4])
Span("Houston", sentence=191159, chars=[13,19], words=[3,3])
Span("Houston", sentence=153840, chars=[0,6], words=[0,0])
Span("Houston", sentence=153859, chars=[0,6], words=[0,0])
Span("Houston", sentence=154013, chars=[35,41], words=[11,11])
Phrase (Doc: df84365f-614e-4689-b12d-57580ff8c059, Index: 18, Text: United States »)
Phrase (Doc: df84365f-614e-4689-b12d-57580ff8c059, Index: 0, Text: 832-739-8609 Brenda Houston, Texas Female Escorts)
Phrase (Doc: df84365f-614e-4689-b12d-57580ff8c059, Table: 0, Row: 0, Col: 0, Index: 0, Text: Back To All  Houston, Texas Female Escort)
Phrase (Doc: df84365f-614e-4689-b12d-57580ff8c059, Index: 20, Text: Houston    »)
Phrase (Doc: df84365f-614e-4689-b12d-57580ff8c059, Index: 26, Text: Houston, Texas Female Escort available for Incall.)
Phrase (Doc: df84365f-614e-4689-b12d-57580ff8c059, Index: 64, Text: var as_sid = '16';     v

## Getting Place Names and Locationsfrom Google API

In [18]:
#getting google place and geocoding APIs
import googlemaps as gm
import gmaps
import pandas as pd
import numpy as np
from shapely.geometry import MultiPoint
import warnings
warnings.filterwarnings('ignore')
maps_api_key = 'AIzaSyA0Veo5Lc6JOwDjNgQvPEhQB4AiZcrYQGI'
gmaps.configure(api_key=maps_api_key)

def get_possible_locations(plc):
    """
    INPUTS
    plc: string describing place to match

    OUTPUTS
    qo: full json structure returned from API call
    cl: list of candidate location strings
    """ 
    api_key = 'AIzaSyDbk3lLZHuQVKDRBN99_oz-p4AJjIzhA0w'
    gms = gm.Client(key=api_key)
    qo = gm.places.places_autocomplete(gms,plc)
    cl = [a['description'] for a in qo]
    return qo,cl

def get_geocode(plc):
    """
    INPUTS
    plc: string describing place to match

    OUTPUTS
    qo full json structure returned from API call
    (lat,lon): lat-lon tuple
    """
    api_key = 'AIzaSyBlLyOaasYMgMxFGUh2jJyxIG0_pZFF_jM'
    gms = gm.Client(key=api_key)
    qo = gm.geocoding.geocode(gms,plc)
    lat = qo[0]['geometry']['location']['lat']
    lng = qo[0]['geometry']['location']['lng']
    return qo,(lat,lng)

def slice_pd_by_cont(dfm,col,val,pres=True,lower=False,union=False):
    """
    Returns dataframe where column values include/exclude values in provided list
    
    INPUTS:
    dfm: dataframe
    col: column header
    val: list of strings to include/ignore
    pres: true to include, false to exclude
    union: include union of these values
    """
    if union:
        val = ['|'.join(val)]
    for vl in val:
        if ~lower:
            if pres:
                dfm = dfm.loc[dfm[col].str.contains(vl,na=False)]
            else:
                dfm = dfm.loc[~dfm[col].str.contains(vl,na=False)]
        else:
            if pres:
                dfm = dfm.loc[dfm[col].str.lower().str.contains(vl,na=False)]
            else:
                dfm = dfm.loc[~dfm[col].str.lower().str.contains(vl,na=False)]
    return dfm

def map_candidates_and_centroid(dfm):
    """
    INPUT
    dfm: dataframe containing at least latitude, longitude
    
    OUTPUT
    centroid: np array of lat/lon of location centroid
    """
    df_cans = dfm
    df_cans_map = dfm[['latitude','longitude']]
    df_cans['lat_long'] = df_cans[['latitude', 'longitude']].apply(tuple, axis=1)
    point_tup_lst = df_cans['lat_long'].tolist()
    points = MultiPoint(point_tup_lst)
    cent = np.array(points.centroid)
    cent_df = pd.DataFrame([cent]) #this is a rough centroid estimate
    fig = gmaps.Map()
    can_layer = gmaps.symbol_layer(
    df_cans_map, fill_color="green", stroke_color="green", scale=2)
    cent_layer = gmaps.symbol_layer(
    cent_df, fill_color="red", stroke_color="red", scale=2)
    fig.add_layer(can_layer)
    fig.add_layer(cent_layer)
    fig
    return cent,fig

state_dict = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}
state_add_dict = {v: k for k, v in state_dict.iteritems()}

In [113]:
#creating a candidate dictionary keyed by document name
doc_dict = defaultdict(list)
loc_dict = defaultdict(list)
can_loc_dict = defaultdict(list)
for can in dev_cands:
    doc_dict[can.get_parent().document.name].append(can)

    #calling API for each location
for ky in doc_dict.keys():
    #print doc_dict[ky]
    for can in doc_dict[ky]:
        loc_can = can.location.get_span()
        can_loc_dict[ky].append(loc_can)
    for plc in list(set(can_loc_dict[ky])):
        _,loc_out = get_possible_locations(plc)
        loc_dict[ky] = loc_dict[ky]+loc_out
  #  print loc_dict[ky] 
  #  print can_loc_dict[ky]

In [142]:
import pycountry
from collections import Counter
    
def get_attr(obj):
    out = [a for a in dir(obj) if not a.startswith('__') and not callable(getattr(obj,a))]
    return out

def most_common(lt):
    data = Counter(lt)
    return data.most_common(1)[0][0]

def get_common_country(lt):
    country_lst = []
    country_els = []
    for it in lt:
        try:
            country = pycountry.countries.lookup(it.lower())
            country_lst.append(country.alpha_3)
            country_els.append(it)
        except:
            country = None 
    if country_lst == []:
        return 'none',[],[]
    return most_common(country_lst),country_lst, country_els

def get_common_state(lt):
    state_lst = []
    state_els = []
    for it in lt:
        if it in state_add_dict.keys():
            state_lst.append(it)
            state_els.append(it)
        elif it in state_add_dict.values():
            state_lst.append(state_dict[it])
            state_els.append(it)
    if state_lst == []:
        return 'none',[],[]
    else:
        return most_common(state_lst), state_lst, state_els

def get_possible_locale(lt,cn,st,cn_lst,st_lst):
    locale_list = []
    a = [b for b in lt if b not in cn_lst and b not in st_lst]
    for b in a:
        locales = get_possible_locations(b)
        locales = [c for c in locales if cn in b and st in b]
        locale_list.append(locales)
    return locale_list

def lookup_country_name(cn):
    try:
        out = pycountry.countries.lookup(cn).name
    except:
        out = 'no country'
    return out

def lookup_state_abbrev(cn):
    try:
        out = state_add_dict[cn]
    except:
        out = 'no state'
    return out

def lookup_country_alpha3(cn):
    return pycountry.countries.lookup(cn).alpha_3

In [147]:
#getting most common locale
out_locales = defaultdict(list)
for ky in can_loc_dict.keys():
    #getting country names
    probable_country,country_list, country_els = get_common_country(can_loc_dict[ky])
    if pycountry.countries.lookup(probable_country).alpha_3 == 'USA' and len(can_loc_dict[ky]) >1:
        #getting state names
        probable_state,state_list,state_els = get_common_state(can_loc_dict[ky])
    else:
        probable_state,state_list,state_els = 'none',[],[]
    
    #getting state names
    locale_list = []
    a = [b for b in can_loc_dict[ky] if b not in country_els and b not in state_els] #need lookup here
    print a
    if a == []:
        if probable_state != 'none' and probable_country != 'none' and a == []:
            locale_list = ['none,none,'+state_add_dict[probable_state]+','+probable_country]
    else:
        most_common_locale = most_common(a)
        aset = list(set(a))
        #print aset
        for b in aset:
                #print b
                locale_tmp = []
                qo,locales = get_possible_locations(b)
                not_exact = 1
                count = 0
                while not_exact and count<len(locales):
                    print('Checking Locale %d of %d' %(count,len(locales)))
                    c = locales[count]
                    spl =  [str(x.strip().lower()) for x in c.split(',')]
                    if lookup_country_name(probable_country).lower() in spl:
                        if lookup_state_abbrev(probable_state).lower() in spl: 
                            if spl[0].lower() == most_common_locale.lower() and len(spl) == 3:
                                locale_list = ['none']+spl
                                locale_list = [','.join(locale_list)]
                                not_exact = 0
                                print 'Exact City Found'
                            elif spl[0].lower() == most_common_locale.lower() and len(spl) == 4:
                                locale_list = [','.join(spl)]
                                not_exact = 0
                                print 'Exact Location Found'
                            else:             
                                locale_list.append(','.join(spl))  
                                count = count+1
                        else:
                            count = count+1         
                    else:
                        count = count+1
        
    #reformatting for labeling comparison
    locale_list_out = []
    for c in locale_list:
        b = c.split(',')
        print b
        b[-1] = str(lookup_country_alpha3(b[-1]).lower())
        b[-2] = state_dict[b[-2].upper()].lower()
        locale_list_out.append(','.join(b)) 
    out_locales[ky] = locale_list_out

[u'Houston', u'Angel Houston', u'Houston', u'Houston', u'Houston']
Checking Locale 0 of 5
Exact City Found
Checking Locale 0 of 5
Checking Locale 1 of 5
Checking Locale 2 of 5
Checking Locale 3 of 5
Checking Locale 4 of 5
['none', 'houston', 'tx', 'united states']
['angel lane', 'houston', 'tx', 'united states']
['angel fire lane', 'houston', 'tx', 'united states']
['angel shores', 'houston', 'tx', 'united states']
['angel falls lane', 'houston', 'tx', 'united states']
['angel island lane', 'houston', 'tx', 'united states']
[u'Dallas', u'Dallas', u'Dallas']
Checking Locale 0 of 5
Exact City Found
['none', 'dallas', 'tx', 'united states']
[]
[u'none', u'none', u'TX', u'USA']
[u'Dallas', u'Dallas', u'SEATTLE ROCK', u'Dallas', u'Anhellica Dallas', u'Dallas']
Checking Locale 0 of 1
Checking Locale 0 of 5
Exact City Found
['none', 'dallas', 'tx', 'united states']
[]
[u'none', u'none', u'TX', u'USA']
[u'Houston', u'Houston', u'Houston', u'Houston']
Checking Locale 0 of 5
Exact City Found
['n

In [154]:
for ii in out_locales.keys():
    if out_locales[ii] == []:
        out_locales[ii] = ['none','none','none','none']

In [156]:
f = open('extracted_loc_tsv.tsv','w')
for ky in can_loc_dict.keys():
    line = ky+"\t"+out_locales[ky][0]+'\n'
    f.write(line)
f.close()