In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import logging
import os
import sys

import ipywidgets as widgets
from fonduer import Meta, init_logging
from fonduer.candidates import CandidateExtractor
from fonduer.candidates import MentionExtractor
from fonduer.candidates import MentionNgrams
from fonduer.candidates.matchers import LambdaFunctionMatcher, Intersect, Union
from fonduer.candidates.models import candidate_subclass
from fonduer.candidates.models import Mention
from fonduer.candidates.models import mention_subclass
from fonduer.parser import Parser
from fonduer.parser.models import Document, Sentence
from fonduer.parser.preprocessors import HTMLDocPreprocessor
from fonduer.utils.data_model_utils import get_row_ngrams
from ipywidgets import interact, interact_manual
from kanren import Relation, facts, var, run
from pyparsing import ParseException, Word, alphas

In [2]:
# Degree of parallelism passed as `parallelism=` to the Fonduer steps in this notebook.
PARALLEL = 4

# The connection string is read from the environment so it is not stored in the notebook.
DSN = os.environ.get('DSN')
if not DSN:
    # os.environ.get returns None when the variable is unset; stop here with an
    # actionable message instead of handing an empty value to Meta.init.
    raise RuntimeError(
        "Environment variable DSN is not set; "
        "export the database connection string before running this notebook."
    )

init_logging(log_dir="share/logs")

# Database session shared by every query below.
session = Meta.init(DSN).Session()

[2019-10-04 15:16:47,387][INFO] fonduer.meta:50 - Setting logging directory to: logs/2019-10-04_15-16-47
[2019-10-04 15:16:47,428][INFO] fonduer.meta:134 - Connecting user:None to db:5432/postgres
[2019-10-04 15:16:47,570][INFO] fonduer.meta:161 - Initializing the storage schema


In [3]:
# Location of the input documents handed to HTMLDocPreprocessor.
docs_path = "data/perfomers/"
doc_preprocessor = HTMLDocPreprocessor(docs_path)

# Parse only when the Document table is empty, so re-running this cell does
# not parse the corpus a second time.
if session.query(Document).count() == 0:
    print("Documents not found: initing...")
    corpus_parser = Parser(session, structural=True, lingual=True)
    # %time prints how long the parsing call takes.
    %time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

# Report what is stored in the database at this point.
print(f"Documents: {session.query(Document).count()}")
print(f"Sentences: {session.query(Sentence).count()}")

Documents: 2
Sentences: 1060


In [4]:
# Every stored document, ordered by name; `ld` holds how many there are.
docs = session.query(Document).order_by(Document.name).all()
ld = len(docs)

In [5]:
# One mention class per kind of span extracted in this notebook:
# the performer's name and the performer's origin.
Performername = mention_subclass("Performername")
Origin = mention_subclass("Origin")

In [6]:
def mention_span_matches_file_name(mention):
    """Tell whether the mention text equals the name of its document.

    Underscores in the document name are replaced by spaces before the
    comparison, which is exact and case-sensitive.

    Args:
        mention: Object providing ``get_span()`` and ``sentence.document.name``.

    Returns:
        True when both strings are identical, otherwise False.
    """
    span_text = mention.get_span()
    expected_title = mention.sentence.document.name.replace("_", " ")
    return span_text == expected_title

# Wrap the predicate above in a Fonduer matcher so it can be handed to the MentionExtractor.
performer_name_matcher = LambdaFunctionMatcher(func=mention_span_matches_file_name)

In [7]:
def is_in_origin_table_row(mention):
    """Tell whether the mention sits in a table row that contains "origin".

    Args:
        mention: Object providing ``sentence.is_tabular()``; it is also
            passed to ``get_row_ngrams``.

    Returns:
        False for mentions whose sentence is not tabular. Otherwise True
        exactly when the lower-cased row n-grams include ``"origin"``.
    """
    if mention.sentence.is_tabular():
        # Collect the complete set of n-grams found in the mention's row.
        row_ngrams = set(get_row_ngrams(mention, lower=True))
        return {"origin"}.issubset(row_ngrams)
    return False


def origin_left_aligned_to_punctuation(mention):
    """Tell whether the mention text starts a comma-separated part of its cell.

    Every sentence of the mention's table cell is split on "," and the
    mention matches when any resulting part begins with the mention text.

    Args:
        mention: Object providing ``get_span()`` and ``sentence.cell``; the
            cell must expose ``sentences``, each with a ``text`` string.

    Returns:
        True when some part starts with the mention text, otherwise False.
        Also False when the sentence has no cell (``cell`` is None), so the
        function no longer raises AttributeError in that case.
    """
    cell = mention.sentence.cell
    if cell is None:
        return False

    # The span does not change while scanning, so fetch it only once.
    span_text = mention.get_span()

    # NOTE(review): parts after the first keep whatever whitespace followed the
    # comma (e.g. " Germany"), so they only match a span that starts with that
    # whitespace - confirm this is intended.
    return any(
        part.startswith(span_text)
        for cell_sentence in cell.sentences
        for part in cell_sentence.text.split(",")
    )


def no_commas_in_origin(mention):
    """Tell whether the mention text is free of commas.

    Args:
        mention: Object providing ``get_span()``.

    Returns:
        True when the text holds no "," character, otherwise False.
    """
    return "," not in mention.get_span()

In [8]:
# One matcher per origin predicate defined above.
origin_in_labeled_row_matcher = LambdaFunctionMatcher(
    func=is_in_origin_table_row
)
# NOTE(review): the flag is set on a matcher that is only used as a child of
# the Intersect below - confirm the combined matcher actually honours it.
origin_in_labeled_row_matcher.longest_match_only = False
origin_no_commas_matcher = LambdaFunctionMatcher(func=no_commas_in_origin)
origin_left_aligned_matcher = LambdaFunctionMatcher(
    func=origin_left_aligned_to_punctuation
)

# An origin mention has to pass all three predicates.
# NOTE(review): the row check comes first and origin_left_aligned_to_punctuation
# reads mention.sentence.cell; presumably Intersect evaluates its children in
# this order and stops at the first failure - TODO confirm.
origin_matcher = Intersect(
    origin_in_labeled_row_matcher,
    origin_no_commas_matcher,
    origin_left_aligned_matcher,
)

In [9]:
# Candidate span spaces: n-grams of size 2 to 4 for performer names,
# n-grams of size up to 3 for origins.
performer_name_ngrams = MentionNgrams(n_max=4, n_min=2)
origin_ngrams = MentionNgrams(n_max=3)

In [10]:
# The three lists are given in the same order (performer name first, origin
# second): mention classes, their n-gram spaces, and their matchers.
mention_extractor = MentionExtractor(
    session,
    [Performername, Origin],
    [performer_name_ngrams, origin_ngrams, ],
    [performer_name_matcher, origin_matcher, ],
)

In [11]:
# Run mention extraction over every document, then count what was stored.
mention_extractor.apply(docs, parallelism=PARALLEL)
num_names = session.query(Performername).count()
num_origins = session.query(Origin).count()

# Label each count separately; both numbers used to share the single label "names".
print(
    f"Total Mentions: {session.query(Mention).count()} "
    f"({num_names} names, {num_origins} origins)"
)
# List each mention type under its own heading.
print(f'Origins {list(session.query(Origin))}')
print(f'Names {list(session.query(Performername))}')

[2019-10-04 15:16:48,800][INFO] fonduer.candidates.mentions:459 - Clearing table: performername
[2019-10-04 15:16:48,884][INFO] fonduer.candidates.mentions:459 - Clearing table: origin
[2019-10-04 15:16:48,917][INFO] fonduer.utils.udf:54 - Running UDF...


HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


Total Mentions: 4 (2 2 names)
Names [Origin(SpanMention("Germany", sentence=1519, chars=[0,6], words=[0,0])), Origin(SpanMention("Germany", sentence=2092, chars=[0,6], words=[0,0]))] [Performername(SpanMention("Qntal - Wikipedia", sentence=582, chars=[0,16], words=[0,2])), Performername(SpanMention("Blutengel - Wikipedia", sentence=1059, chars=[0,20], words=[0,2]))]


In [12]:
# A candidate pairs one Performername mention with one Origin mention.
PerformernameOrigin = candidate_subclass(
    "PerformernameOrigin", [Performername, Origin]
)

# Extract the candidates from every document into split 0.
candidate_extractor = CandidateExtractor(session, [PerformernameOrigin])
candidate_extractor.apply(docs, split=0, parallelism=PARALLEL)

# Report how many candidates were stored, then list them.
print(
    f"Number of Candidates: {session.query(PerformernameOrigin).count()}"
)

print(f'Candidats {list(session.query(PerformernameOrigin))}')
# c = list(session.query(PerformernameOrigin))[0]

[2019-10-04 15:16:49,888][INFO] fonduer.candidates.candidates:125 - Clearing table performername_origin (split 0)
[2019-10-04 15:16:49,950][INFO] fonduer.utils.udf:54 - Running UDF...


HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


Number of Candidates: 2
Candidats [PerformernameOrigin(Performername(SpanMention("Qntal - Wikipedia", sentence=582, chars=[0,16], words=[0,2])), Origin(SpanMention("Germany", sentence=1519, chars=[0,6], words=[0,0]))), PerformernameOrigin(Performername(SpanMention("Blutengel - Wikipedia", sentence=1059, chars=[0,20], words=[0,2])), Origin(SpanMention("Germany", sentence=2092, chars=[0,6], words=[0,0])))]


In [13]:
# kanren relation that will hold one (origin, performer) fact per candidate.
origination = Relation()

# Both texts are lower-cased; the origin comes first, the performer name second.
performer_origins = [
    (
        candidate.origin.context.get_span().lower(),
        candidate.performername.context.get_span().lower(),
    )
    for candidate in session.query(PerformernameOrigin)
]

facts(origination, *performer_origins)

In [14]:
@interact
def answer(question='What is origin for qntal?'):
    """Answer a question of the form ``What is origin for <performer>?``.

    The question is parsed with a small pyparsing grammar. For an "origin"
    question the performer is looked up in the ``origination`` relation and
    the outcome is printed.

    Args:
        question: Text of the question. The asked-for property and the
            performer each have to be a single alphabetic word.

    Returns:
        None. Every outcome, including errors, is reported with ``print``.
    """
    # Grammar: the literal 'What is', a word (the property asked for), the
    # literal 'for', a word (the performer) and a closing '?'.
    what = (
        'What is'
        + Word(alphas).setResultsName('object')
        + 'for'
        + Word(alphas).setResultsName('subject')
        + "?"
    )

    try:
        res = what.parseString(question)
    except ParseException:
        # Only a failed parse means the question is malformed; any other
        # error is a real problem and is left to surface.
        print('Incorrect question.\nInput question that like to "What is origin for performer name?"')
        return

    asked_property = res['object']
    if asked_property != 'origin':
        # Tell the user instead of finishing without any output.
        print(f'Unsupported question about "{asked_property}". Only "origin" can be answered.')
        return

    subject = res['subject'].lower()
    x = var()
    # The relation is queried with "<performer> - wikipedia" as its second term.
    origins = run(3, x, origination(x, f"{subject} - wikipedia"))
    if origins:
        print(f'Performer {subject} is originated from {origins[0]}')
        return

    # Unknown performer: list the known ones, using the text before the first '-'.
    performers = ', '.join(
        cand.performername.context.get_span().lower().split('-')[0].strip()
        for cand in session.query(PerformernameOrigin)
    )
    print(f'Performer {subject} not found. Known performers are {performers}')


interactive(children=(Text(value='What is origin for qntal?', description='question'), Output()), _dom_classes…