In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import logging
import pathlib
import requests

from bs4 import BeautifulSoup
from fonduer import Meta, init_logging
from fonduer.candidates import CandidateExtractor
from fonduer.candidates.models import candidate_subclass
from fonduer.candidates.models import Mention
from fonduer.candidates import MentionExtractor
from fonduer.candidates import MentionNgrams
from fonduer.utils.data_model_utils import get_row_ngrams
from fonduer.candidates.matchers import LambdaFunctionMatcher, Intersect, Union
from fonduer.candidates.models import mention_subclass
from fonduer.parser.models import Document, Sentence
from fonduer.parser.preprocessors import HTMLDocPreprocessor
from fonduer.parser import Parser
from pyparsing import Word, alphas
from kanren import Relation, facts, var, run, eq
from ipywidgets import interact, interact_manual
import ipywidgets as widgets
from tqdm import tqdm

In [2]:
# Parallelism level passed as `parallelism=` to the Fonduer parser and extractors below.
PARALLEL = 4

# Database connection string, read from the DSN environment variable.
# NOTE(review): `os.environ.get` yields None when DSN is unset; confirm how
# `Meta.init` behaves in that case before relying on it.
DSN = os.environ.get('DSN')

# Write Fonduer logs below share/logs.
init_logging(log_dir="share/logs")

# Database session shared by every query in this notebook.
session = Meta.init(DSN).Session()

[2019-10-08 19:08:04,159][INFO] fonduer.meta:50 - Setting logging directory to: share/logs/2019-10-08_19-08-04
[2019-10-08 19:08:04,198][INFO] fonduer.meta:134 - Connecting user:None to localhost:5432/ds_2019_03_pw
[2019-10-08 19:08:04,349][INFO] fonduer.meta:161 - Initializing the storage schema


In [3]:
# Upper bound on how many per-nationality list links from the category page are followed.
NATIONALITY_LIMIT = 25

def full_path(path):
    """Turn a site-relative Wikipedia path into an absolute URL.

    Any leading slashes on ``path`` are dropped before it is appended to the
    ``https://en.wikipedia.org/`` prefix.
    """
    relative = path.lstrip('/')
    return f'https://en.wikipedia.org/{relative}'

def href(element):
    """Return the ``href`` entry of ``element.attrs``.

    The lookup is a plain subscript, so a missing ``href`` propagates the
    mapping's own lookup error to the caller.
    """
    attributes = element.attrs
    return attributes['href']

def _fetch_soup(url):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    A timeout keeps a stalled connection from hanging the notebook, and HTTP
    error responses raise instead of being parsed as if they were list pages.
    The parser is named explicitly so the result does not depend on which
    optional parsers happen to be installed (and bs4 does not warn about it).
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


# Category page linking to the per-nationality "list of musicians" pages.
performer_list_soup = _fetch_soup('https://en.wikipedia.org/wiki/Category:Lists_of_musicians_by_nationality')

# Follow the first NATIONALITY_LIMIT list pages and collect the absolute URL of
# every link found in their column layout.
performer_urls = [
    full_path(href(a))
    for list_url in tqdm(performer_list_soup.select('div.mw-category div.mw-category-group ul li a')[:NATIONALITY_LIMIT])
    for a in _fetch_soup(full_path(href(list_url))).select('div.columns ul li a')
]

100%|██████████| 25/25 [00:23<00:00,  1.10it/s]


In [4]:
# Only the first PERFORMER_LIMIT collected performer URLs are downloaded.
PERFORMER_LIMIT=17
# Directory for the downloaded performer pages; also handed to the HTML preprocessor later.
docs_path = pathlib.Path("data/perfomers/")

def save(url):
    """Download ``url`` into ``docs_path`` unless a copy is already present.

    The file is named after the last path segment of ``url`` with the suffix
    `` - Wikipedia.html``. Nothing is returned.

    The page is fetched *before* the file is opened, so a failed request can
    no longer leave behind an empty file that later runs would treat as
    already downloaded. HTTP error responses raise ``requests.HTTPError``
    instead of being stored as documents.
    """
    doc_path = docs_path / (url.split('/')[-1] + ' - Wikipedia.html')
    if os.path.exists(doc_path):
        return
    print(f'save from {url}')
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # The target directory may not exist on a fresh checkout.
    os.makedirs(docs_path, exist_ok=True)
    # Explicit encoding: the platform default may be unable to encode page text.
    with open(doc_path, 'wt', encoding='utf-8') as f:
        f.write(response.text)
        
# Download at most PERFORMER_LIMIT performer pages; `save` skips files that already exist.
for performer_url in performer_urls[:PERFORMER_LIMIT]:
    save(performer_url)

In [5]:
# Preprocessor that reads the HTML files stored under docs_path.
doc_preprocessor = HTMLDocPreprocessor(docs_path)

# Names of the downloaded HTML files; used only by the optional check below.
doc_paths = [path for path in os.listdir(docs_path) if path.endswith('.html')]
# Parsing is disabled by default, so the cell only reports what is already stored.
update = False
# update = session.query(Document).count() < len(doc_paths)  # Uncomment for the initial population of the database
if update:
    print(f"Some documents not found (only {session.query(Document).count()} from {len(doc_paths)}): initing...")
    corpus_parser = Parser(session, structural=True, lingual=True)
    %time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

# Report how many documents and sentences the database currently holds.
print(f"Documents: {session.query(Document).count()}")
print(f"Sentences: {session.query(Sentence).count()}")

Documents: 23
Sentences: 17965


In [6]:
# All stored documents, ordered by name; input to mention and candidate extraction below.
docs = session.query(Document).order_by(Document.name).all()

In [7]:
# One mention class per kind of span extracted from a performer page.
# The spelling "Perfomance" is used consistently throughout the notebook
# (class name, candidate attribute, table name in the logs), so it must not be
# corrected in one place only.
Performer = mention_subclass("Performer")
Origin = mention_subclass("Origin")
Genre = mention_subclass("Genre")
Beginning = mention_subclass("Beginning")
Perfomance = mention_subclass("Perfomance")

In [8]:
def mention_span_matches_file_name(mention):
    """Decide whether a span is a performer name taken from the page heading.

    The span is accepted when all of the following hold:

    * the sentence carries the HTML attribute ``class=firstHeading``;
    * the span does not contain the text ``band``;
    * the span contains neither ``(`` nor ``)``.

    Returns ``True`` or ``False``.
    """
    span = mention.get_span()
    if 'class=firstHeading' not in mention.sentence.html_attrs:
        return False
    if 'band' in span:
        return False
    # Reject spans carrying a parenthesised qualifier.
    return not any(bracket in span for bracket in '()')

# Matcher for performer names, driven by the heading predicate defined above.
performer_name_matcher = LambdaFunctionMatcher(func=mention_span_matches_file_name)
# Set explicitly on the matcher instance after construction.
performer_name_matcher.longest_match_only = True

In [9]:
def is_in_origin_table_row(mention):
    """Tell whether the span sits in a table row that mentions "origin".

    Non-tabular sentences are rejected outright. Otherwise the lower-cased
    n-grams of the span's row are collected and the span is accepted when
    ``origin`` is among them.
    """
    if mention.sentence.is_tabular():
        row_words = set(get_row_ngrams(mention, lower=True))
        return {"origin"}.issubset(row_words)
    return False


def origin_left_aligned_to_punctuation(mention):
    """Tell whether the span starts one of the comma-separated parts of its cell.

    Every sentence of the span's table cell is split on ``","``; the span is
    accepted when at least one resulting part starts with the span text.

    Returns ``False`` (rather than raising ``AttributeError``) when the
    sentence has no cell, matching how the other cell-based predicates in this
    notebook treat that case.

    NOTE(review): parts that follow a comma keep their leading space (for
    example ``" Germany"``), so a span can only match them if it also starts
    with that space. Confirm whether stripping the parts was intended.
    """
    cell = mention.sentence.cell
    if not cell:
        return False
    span = mention.get_span()  # loop-invariant, so read once
    return any(
        part.startswith(span)
        for sentence in cell.sentences
        for part in sentence.text.split(",")
    )


def no_commas_in_origin(mention):
    """Accept a span only when its text contains no comma."""
    span = mention.get_span()
    return "," not in span

In [10]:
# Predicate matchers for origin spans, one per helper function defined above.
origin_in_labeled_row_matcher = LambdaFunctionMatcher(
    func=is_in_origin_table_row
)
# Only the row matcher has longest_match_only set to False; the other two keep their default.
origin_in_labeled_row_matcher.longest_match_only = False
origin_no_commas_matcher = LambdaFunctionMatcher(func=no_commas_in_origin)
origin_left_aligned_matcher = LambdaFunctionMatcher(
    func=origin_left_aligned_to_punctuation
)

# Combined origin matcher: the intersection of the three predicates above.
origin_matcher = Intersect(
    origin_in_labeled_row_matcher,
    origin_no_commas_matcher,
    origin_left_aligned_matcher,
)

In [11]:
def is_in_genre_table_row(mention):
    """Tell whether the span sits in a table row that mentions "genres".

    Non-tabular sentences are rejected outright. Otherwise the lower-cased
    n-grams of the span's row are collected and the span is accepted when
    ``genres`` is among them.
    """
    if mention.sentence.is_tabular():
        row_words = set(get_row_ngrams(mention, lower=True))
        return {"genres"}.issubset(row_words)
    return False
    
# Matcher for genre spans, driven by the row predicate defined above.
genre_in_labeled_row_matcher = LambdaFunctionMatcher(
    func=is_in_genre_table_row
)
genre_in_labeled_row_matcher.longest_match_only = True

# Intersect over a single matcher; kept in the same shape as the other *_matcher definitions.
genre_matcher = Intersect(
    genre_in_labeled_row_matcher,
)

In [12]:
def is_in_years_active_table_row(mention):
    """Tell whether the span is the leading part of a "years active" row value.

    The span is accepted when its sentence is tabular, the lower-cased n-grams
    of its row include both ``years`` and ``active``, and the span equals the
    sentence text up to the first ``-``.
    """
    sentence = mention.sentence
    if not sentence.is_tabular():
        return False
    row_words = set(get_row_ngrams(mention, lower=True))
    if not {'years', 'active'} <= row_words:
        return False
    # Keep only the part before the first hyphen, e.g. the start of a range.
    return mention.get_span() == sentence.text.split('-')[0]
    
# Matcher for the starting year of activity, driven by the row predicate defined above.
beginning_in_labeled_row_matcher = LambdaFunctionMatcher(
    func=is_in_years_active_table_row
)
beginning_in_labeled_row_matcher.longest_match_only = True

# Intersect over a single matcher; kept in the same shape as the other *_matcher definitions.
beginning_matcher = Intersect(
    beginning_in_labeled_row_matcher,
)

In [13]:
def cell_in_title_column(mention):
    """Tell whether the span's cell belongs to a column whose header says "title".

    The header text is taken from ``cell.table.cells[cell.col_start]`` and
    compared case-insensitively.

    Always returns a ``bool``; previously every negative path fell off the end
    of the function and returned ``None``, unlike the other predicates here.

    NOTE(review): indexing ``table.cells`` by ``col_start`` presumes the first
    cells of the table are the header row in column order. Confirm against the
    parser's cell ordering.
    """
    cell = mention.sentence.cell
    if not cell:
        return False
    header_cell = cell.table.cells[cell.col_start]
    header = ''.join(sentence.text for sentence in header_cell.sentences)
    return 'title' in header.lower()

def mention_fill_whole_cell(mention):
    """Tell whether the span is the entire text of its table cell.

    The texts of all sentences in the cell are concatenated and compared with
    the span case-insensitively.

    Always returns a ``bool``; previously every negative path fell off the end
    of the function and returned ``None``, unlike the other predicates here.
    """
    span = mention.get_span()
    cell = mention.sentence.cell
    if not cell:
        return False
    cell_text = ''.join(sentence.text for sentence in cell.sentences)
    return cell_text.lower() == span.lower()

# Predicate matchers for performance (release title) spans.
performance_cell_in_title_column = LambdaFunctionMatcher(func=cell_in_title_column)
performance_mention_fill_whole_cell = LambdaFunctionMatcher(func=mention_fill_whole_cell)

# Combined performance matcher: the intersection of the two predicates above.
performance_matcher = Intersect(
    performance_cell_in_title_column,
    performance_mention_fill_whole_cell,
)

In [14]:
# One n-gram space per mention type; n_min / n_max bound the n-gram length considered.
performer_name_ngrams = MentionNgrams(n_max=4, n_min=1)
origin_ngrams = MentionNgrams(n_max=3)
genre_ngrams = MentionNgrams(n_max=2)
# NOTE(review): '-' is given as a split token, presumably so the parts of a
# "start-end" range become separate spans; confirm against the MentionNgrams docs.
beginning_ngrams = MentionNgrams(n_max=2, split_tokens=['-'])
performance_ngrams = MentionNgrams(n_min=1, n_max=8)

In [15]:
# The three lists are positional and of equal length: entry i of each list
# (mention class, n-gram space, matcher) belongs to the same mention type.
mention_extractor = MentionExtractor(
    session,
    [Performer, Origin, Genre, Beginning, Perfomance],
    [performer_name_ngrams, origin_ngrams, genre_ngrams, beginning_ngrams, performance_ngrams],
    [performer_name_matcher, origin_matcher, genre_matcher, beginning_matcher, performance_matcher],
)

In [16]:
# Extract mentions from every document.
mention_extractor.apply(docs, parallelism=PARALLEL)
# Per-type mention counts for the summary line printed below.
num_names = session.query(Performer).count()
num_origins = session.query(Origin).count()
num_genres = session.query(Genre).count()
num_beginnings = session.query(Beginning).count()
num_perfomances = session.query(Perfomance).count()

# One-line summary: overall mention count followed by the per-type breakdown.
print(
    f'Total Mentions: {session.query(Mention).count()} ('
    f' {num_names} names'
    f' {num_origins} origins'
    f' {num_genres} genres'
    f' {num_beginnings} beginning'
    f' {num_perfomances} perfomance'
    f')'
)

[2019-10-08 19:08:32,828][INFO] fonduer.candidates.mentions:459 - Clearing table: performer
[2019-10-08 19:08:32,928][INFO] fonduer.candidates.mentions:459 - Clearing table: origin
[2019-10-08 19:08:32,974][INFO] fonduer.candidates.mentions:459 - Clearing table: genre
[2019-10-08 19:08:33,088][INFO] fonduer.candidates.mentions:459 - Clearing table: beginning
[2019-10-08 19:08:33,141][INFO] fonduer.candidates.mentions:459 - Clearing table: perfomance
[2019-10-08 19:08:33,187][INFO] fonduer.utils.udf:54 - Running UDF...


HBox(children=(IntProgress(value=0, max=23), HTML(value='')))


Total Mentions: 372 ( 24 names 37 origins 244 genres 18 beginning 31 perfomance)


In [17]:
# Pairwise candidate classes: a performer mention combined with one property mention.
PerformerOrigin = candidate_subclass(
    "PerformerOrigin", [Performer, Origin]
)

PerformerGenre = candidate_subclass(
    "PerformerGenre", [Performer, Genre]
)

PerformerBeginning = candidate_subclass(
    "PerformerBeginning", [Performer, Beginning]
)

PerformerPerfomance = candidate_subclass(
    "PerformerPerfomance", [Performer, Perfomance]
)

# Five-way candidate class combining a performer with one mention of every property type.
PerformerAndAllAllAll = candidate_subclass(
    "PerformerAndAllAllAll", [Performer, Origin, Genre, Beginning, Perfomance]
)

# Extract all five candidate classes from every document into split 0.
candidate_extractor = CandidateExtractor(session, [
    PerformerOrigin, PerformerGenre, PerformerBeginning, PerformerPerfomance, PerformerAndAllAllAll
])
candidate_extractor.apply(docs, split=0, parallelism=PARALLEL)

# Only the five-way candidates are counted here, not the pairwise ones.
print(
    f"Number of Candidates: {session.query(PerformerAndAllAllAll).count()}"
)


[2019-10-08 19:08:52,523][INFO] fonduer.candidates.candidates:125 - Clearing table performer_origin (split 0)
[2019-10-08 19:08:52,704][INFO] fonduer.candidates.candidates:125 - Clearing table performer_genre (split 0)
[2019-10-08 19:08:52,814][INFO] fonduer.candidates.candidates:125 - Clearing table performer_beginning (split 0)
[2019-10-08 19:08:52,870][INFO] fonduer.candidates.candidates:125 - Clearing table performer_perfomance (split 0)
[2019-10-08 19:08:52,901][INFO] fonduer.candidates.candidates:125 - Clearing table performer_and_all_all_all (split 0)
[2019-10-08 19:08:52,980][INFO] fonduer.utils.udf:54 - Running UDF...


HBox(children=(IntProgress(value=0, max=23), HTML(value='')))


Number of Candidates: 335


In [18]:
def _span_pairs(candidate_class, attribute):
    """Build ``(property span, performer span)`` tuples, both lower-cased.

    One tuple is produced per stored candidate of ``candidate_class``;
    ``attribute`` names the candidate field that holds the non-performer
    mention.
    """
    pairs = []
    for candidate in session.query(candidate_class):
        property_mention = getattr(candidate, attribute)
        pairs.append((
            property_mention.context.get_span().lower(),
            candidate.performer.context.get_span().lower(),
        ))
    return pairs


# One kanren relation per extracted property; each fact is (value, performer).
origin = Relation()
genre = Relation()
beginning = Relation()
perfomance = Relation()

facts(origin, *_span_pairs(PerformerOrigin, "origin"))
facts(genre, *_span_pairs(PerformerGenre, "genre"))
facts(beginning, *_span_pairs(PerformerBeginning, "beginning"))
facts(perfomance, *_span_pairs(PerformerPerfomance, "perfomance"))


In [19]:
# Maximum number of values requested from a relation query in `answer`.
LIMIT = 7

# Default question shown in the interactive text box.
sample = 'Where from "blutengel"'

@interact
def answer(question=sample):
    """Answer a templated English question about a performer.

    The question is matched against a fixed set of pyparsing templates. Each
    template yields the performer name (the quoted ``subject``) and the
    property asked about (``origin``, ``genre``, ``beginning``, ``perfomance``
    or ``similiar``). The generic "What is <property> for "<name>"?" template
    reads the property from the question itself.

    The answer is printed and also returned:

    * ``origin`` / ``beginning``: the first matching value, or ``None``;
    * ``genre`` / ``perfomance``: a tuple of up to ``LIMIT`` values;
    * ``similiar``: a set of other performers sharing at least one genre.

    ``None`` is returned (after printing an explanation) when the question
    matches no template, the performer is unknown, or the property is unknown.
    """
    # OneOrMore is not covered by the notebook-level pyparsing import.
    from pyparsing import OneOrMore, ParseException

    object_var = Word(alphas).setResultsName('object')
    subject_var = ('"' + OneOrMore(Word(alphas)) + '"').setResultsName('subject')
    # Property name -> templates asking for it. The None key is the generic
    # template whose property comes from the parsed question.
    templates = {
        None: [
            'What' + ' ' + 'is' + object_var + 'for' + subject_var + "?",
        ],
        'origin': [
            'Where from' + subject_var,
        ],
        'genre': [
            'Who does' + subject_var + 'belong' + 'to',
        ],
        'beginning': [
            'When was' + subject_var + 'created'+ 'at' + '?',
        ],
        'perfomance': [
            'What does' + subject_var + 'execute?',
        ],
        'similiar': [
            'What is' + subject_var + 'similiar' + 'to' + '?',
            'What similiar' + subject_var ,
        ],
    }

    subject_name = None
    object_name = None

    # The first template that parses the question wins.
    for template_object, object_templates in templates.items():
        for template in object_templates:
            try:
                parsed = template.parseString(question)
            except ParseException:
                # Not this template; anything other than a parse failure propagates.
                continue
            # The subject tokens include the surrounding quotes; drop them.
            subject_name = ' '.join(parsed['subject']).strip('" ').lower()
            if template_object is not None:
                object_name = template_object
            else:
                object_name = parsed['object'].lower()
            break
        if object_name is not None:
            break

    if object_name is None:
        print(f'Incorrect question ({question}). Input correct question')
        return None

    known_performers = {
        performer.context.get_span().lower()
        for performer
        in session.query(Performer)
    }

    if subject_name not in known_performers:
        listing = ', '.join(sorted(known_performers))
        print(f'Performer "{subject_name}" not found. Known performers are {listing}. Question {question}.')
        # Stop here instead of going on to print an empty answer for an unknown performer.
        return None

    x = var()

    if object_name == 'origin':
        # Appending (None,) makes [0] safe when the query yields nothing.
        origin_name = (run(LIMIT, x, origin(x, subject_name)) + (None,))[0]
        print(f'Performer {subject_name} originate from {origin_name}')
        return origin_name

    if object_name == 'genre':
        genre_names = run(LIMIT, x, genre(x, subject_name))
        print(f'Performer {subject_name} belongs to {", ".join(genre_names)}')
        return genre_names

    if object_name == 'beginning':
        beginning_date = (run(LIMIT, x, beginning(x, subject_name)) + (None,))[0]
        print(f'Performer {subject_name} is created at {beginning_date}')
        return beginning_date

    if object_name == 'perfomance':
        perfomance_names = run(LIMIT, x, perfomance(x, subject_name))
        print(f'Performer {subject_name} execute {", ".join(perfomance_names)}')
        return perfomance_names

    if object_name == 'similiar':
        a = var()
        b = var()
        c = var()
        # Performers a and b are similar when they share some genre c. Up to
        # 77 pairs are requested, self-pairs are dropped, and the first LIMIT
        # remaining names are kept.
        pairs = run(77, (a, b), eq(a, subject_name), genre(c, a), genre(c, b))
        similiar_names = set([v2 for v1, v2 in pairs if v1 != v2][:LIMIT])
        print(f'Performer {subject_name} is similiar to {", ".join((similiar_names))}')
        return similiar_names

    # Only reachable through the generic template with an unsupported property.
    print(f'Unknown property "{object_name}". Question {question}.')
    return None


interactive(children=(Text(value='Where from "blutengel"', description='question'), Output()), _dom_classes=('…

In [20]:
# Smoke test: take the performer of the first stored five-way candidate and
# check that each question template below produces a non-None answer.
# `list(...)` materialises the whole query before the slice keeps one element.
perfomers = [cand.performer.context.get_span().lower() for cand in list(session.query(PerformerAndAllAllAll))[:1]]

# Without at least one candidate there is nothing to ask about.
assert len(perfomers) > 0

performer = perfomers[0]

assert answer(question=f'What is "{performer}" similiar to?') is not None
assert answer(question=f'When was "{performer}" created at?') is not None
assert answer(question=f'Who does "{performer}" belong to?') is not None
assert answer(question=f'What does "{performer}" execute?') is not None

Performer qntal is similiar to los fabulosos cadillacs, blutengel, heilung
Performer qntal is created at 1991
Performer qntal belongs to darkwave, industrial music, industrial, music, neofolk
Performer qntal execute qntal ii, qntal i, nihil, qntal iv: ozymandias, qntal iii: tristan und isolde, qntal v: silver swan, illuminate
