In [16]:
from fastcoref import spacy_component
import spacy

# Load the small English spaCy pipeline, leaving out the components listed in `exclude`.
nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "ner", "textcat"])
# Append the "fastcoref" component to the pipeline (presumably registered as a spaCy factory
# by the `fastcoref.spacy_component` import - TODO confirm). As the last expression of the
# cell, the component returned by add_pipe is what the notebook displays.
nlp.add_pipe("fastcoref")

11/04/2022 17:08:54 - INFO - 	 missing_keys: []
11/04/2022 17:08:54 - INFO - 	 unexpected_keys: []
11/04/2022 17:08:54 - INFO - 	 mismatched_keys: []
11/04/2022 17:08:54 - INFO - 	 error_msgs: []
11/04/2022 17:08:54 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M


<fastcoref.spacy_component.spacy_component.FastCorefResolver at 0x7f010aa36170>

In [17]:
import logging

# Dedicated logger for this notebook, writing timestamped records through its own stream handler.
logger = logging.getLogger('bgg_predict')
# Re-running the cell must not stack a second handler on the same logger.
logger.handlers.clear()
handler = logging.StreamHandler()
formatter = logging.Formatter(
        '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
# Keep records from travelling up to ancestor loggers: when the root logger already has a
# handler installed, every message would otherwise be printed twice (once per handler).
logger.propagate = False

# Smoke test: exactly one formatted line should appear.
logger.debug('test')

2022-11-04 17:09:51,207 bgg_predict  DEBUG    test
11/04/2022 17:09:51 - DEBUG - 	 test


In [18]:
import re
from typing import List
from dataclasses import dataclass

# Matches e-mail-like tokens: word characters optionally continued by ".word" groups,
# an '@', then a word followed by at least one ".word" group.
regex_mail = re.compile(r'\w+(?:\.\w+)*?@\w+(?:\.\w+)+')

@dataclass
class Sentence:
    '''A piece of text together with the inclusive character range it covers.'''
    content: str  # the sentence text itself
    start: int    # offset of the first character
    end: int      # offset of the last character (inclusive)

    def does_include_pos(self, pos: int) -> bool:
        '''Tell whether ``pos`` falls inside ``[start, end]``, both bounds included.'''
        not_before_start = pos >= self.start
        not_after_end = pos <= self.end
        return not_before_start and not_after_end

def clean_text(text: str) -> str:
    '''Return ``text`` with every match of ``regex_mail`` deleted.'''
    without_mails = re.sub(regex_mail, '', text)
    return without_mails

def get_sentences_from_text(text: str) -> List[Sentence]:
    '''Split ``text`` on '.' and record the character range of every piece.

    Each piece becomes a ``Sentence`` whose ``start``/``end`` are the inclusive offsets of
    the piece inside ``text`` (the separating dot belongs to no piece). A trailing empty
    piece, as produced when the text ends with a dot, is dropped.
    '''
    # the dataset is expected to contain no runs of consecutive dots (removed when it was built)
    pieces = []
    offset = 0
    for chunk in text.split('.'):
        length = len(chunk)
        pieces.append(Sentence(chunk, offset, offset + length - 1))
        # skip the piece itself plus the dot that terminated it
        offset += length + 1

    last_piece = pieces[-1]
    if last_piece.content == '':
        del pieces[-1]

    return pieces

In [19]:
from bisect import bisect_left
from typing import List, Tuple
from dataclasses import dataclass

# necessary to use bisect_left with ranges
@dataclass
class Interval:
    '''Inclusive ``[start, end]`` range with the comparisons needed by ``bisect_left``.

    ``a < b`` means "``a`` ends before ``b`` starts"; ``a == b`` means "``b`` starts
    inside ``a``". These are deliberately not the field-wise dataclass comparisons.
    '''
    start: int
    end: int

    def __lt__(self, other) -> bool:
        # Only the end of this range matters. Additionally requiring start < end would
        # make one-position ranges (start == end) and empty ranges (end == start - 1)
        # never compare as smaller, which sends a binary search to the wrong side.
        return self.end < other.start

    def __eq__(self, other) -> bool:
        # True when the other range begins somewhere within this one.
        return self.start <= other.start <= self.end

def get_sentences_from_clusters(clusters: List[List[Tuple[int, int]]], sentences: List[Sentence]) -> List[List[int]]:
    '''Map every mention of every cluster to a sentence index.

    Each mention is a ``(start, end)`` pair. Its index is the position returned by
    ``bisect_left`` when the mention, wrapped in an ``Interval``, is searched among the
    sentences viewed as ``Interval(start, end)`` objects. The result has one list per
    cluster, with one index per mention, in the original order.
    '''
    def as_interval(sentence):
        return Interval(sentence.start, sentence.end)

    located = []
    for mentions in clusters:
        indices = []
        for mention in mentions:
            target = Interval(mention[0], mention[1])
            indices.append(bisect_left(sentences, target, key=as_interval))
        located.append(indices)

    return located

# Worked example: a two-sentence text and one cluster whose mentions are given as
# (start, end) character offsets into that text.
text = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'
sentences = get_sentences_from_text(text)
clusters = [[(0, 5), (39, 42), (79, 82)]]
# Last expression of the cell: the notebook displays the sentence index found for each mention.
get_sentences_from_clusters(clusters, sentences)

[[0, 1, 1]]

In [25]:
from typing import List, Set
from itertools import groupby
from operator import itemgetter
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

def get_rule_groups_from_sentence_clusters(sentences: List["Sentence"], sentence_clusters: List[List[int]]) -> List[List[int]]:
    '''Group sentence indices linked by shared clusters into runs of consecutive sentences.

    Every sentence is a node of an undirected graph. Within each cluster the first
    sentence index is linked to all the other indices of that cluster. The connected
    components of the graph are computed and each component is then split into runs
    of consecutive indices.

    Args:
        sentences: the sentences of the text; only their count is used.
        sentence_clusters: for every cluster, the sentence index of each of its mentions.

    Returns:
        One list of consecutive sentence indices per group. Sentences that are linked
        to nothing form single-element groups. Empty when ``sentences`` is empty.

    Raises:
        IndexError: if a linked index is outside ``range(len(sentences))``.
    '''
    def normalize_group(group: Set[int]) -> List[List[int]]:
        '''Split a set of indices into sorted runs of consecutive integers.'''
        res = []

        # https://stackoverflow.com/a/23861347/5587393
        # (position - value) is constant along a run of consecutive values
        for _, run in groupby(enumerate(sorted(group)), lambda pair: pair[0] - pair[1]):
            res.append(list(map(itemgetter(1), run)))

        return res

    n_sentences = len(sentences)
    if n_sentences == 0:
        # nothing to group; also avoids handing a non-square matrix to connected_components
        return []

    # Collect the links as coordinates instead of filling a dense n x n list of lists:
    # memory then grows with the number of links, not with the square of the sentence count.
    edges = set()
    for cluster in sentence_clusters:
        for sentence in cluster[1:]:
            for index in (cluster[0], sentence):
                if not 0 <= index < n_sentences:
                    raise IndexError(f'sentence index {index} out of range for {n_sentences} sentences')
            if sentence != cluster[0]:  # a self-link never changes the components
                edges.add((cluster[0], sentence))

    ordered_edges = sorted(edges)
    rows = [head for head, _ in ordered_edges]
    cols = [tail for _, tail in ordered_edges]
    graph = csr_matrix(([1] * len(ordered_edges), (rows, cols)), shape=(n_sentences, n_sentences))

    # find the connected components, treating every link as undirected
    n_components, labels = connected_components(csgraph=graph, directed=False, return_labels=True)
    groups = [set() for _ in range(n_components)]
    for i, label in enumerate(labels):
        groups[label].add(i)

    return [norm_group for group in groups for norm_group in normalize_group(group)]

In [21]:
from dataclass_csv import DataclassReader
from models.BoardGameData import *
from extensions.dataclass_csv_extension import *
from typing import Type, TypeVar, Generator

# Type variable tying the dataclass type passed to get_dc_from_csv to the type of the rows it yields.
DataclassType = TypeVar('DataclassType')
# Location of the dataset CSV, relative to the working directory.
CSV_PATH = 'data/dataset.csv'

def get_dc_from_csv(f, dc_type: Type[DataclassType]) -> Generator[DataclassType, None, None]:
    '''Lazily yield the rows that ``DataclassReader(f, dc_type)`` produces from ``f``.

    ``f`` has to stay open for as long as the generator is being consumed.
    '''
    yield from DataclassReader(f, dc_type)

# Rebind DataclassReader to whatever the project helper returns for it (going by the helper's
# name, a reader that supports nested dataclasses - TODO confirm). get_dc_from_csv looks the
# name up at call time, so it picks up this replacement.
DataclassReader = extend_DataClassReader_with_nested_dataclass(DataclassReader)
# Smoke test: read and print the first row of the dataset.
with open(CSV_PATH) as f:
    print(next(get_dc_from_csv(f, BoardGame)))

BoardGame(info=BoardGameInfo(id='10', name='Elfenland', numweights=703, averageweight=2.1579, playingtime=60, minage=10), rulebook='Corrections or constructive criticisms?     Email: delta1119@earthlink.net Michael Weston, 2004 Elfenland Quick Reference Setup 1   Each player chooses a color.Place the Boot on Elvenhold and 1 marker in each other city on the board.2   Shuffle the cards.Deal 8 to each player.3   Place the 4 Round # cards on the board in order with the  1  on top.4   Mix tiles face down.Turn 5 tiles face up.5   Each player gets a Summary card and an Obstacle tile.6   Choose a start player and give them the Start Player card Turn Overview  - each phase starts with the Start Player and goes clockwise.1) Refill everyone s hand back up to 8 cards.2) Each player draws a face-down tile.It is kept secret from other players.3) Each player draws either a face-down or face-up tile, and places it face-up in front of them.If a face-up tile was drawn, replace it.Repeat this until each 

In [27]:
import itertools
from models.BoardGameFeatures import *

def get_rules(text: str) -> List[str]:
    '''Split a rulebook text into rules made of sentences connected by coreference.

    The text is cleaned with ``clean_text`` and split into sentences, the ``nlp`` pipeline
    is run on the cleaned text, and the coreference clusters it reports are mapped to
    sentence indices and merged into groups of consecutive sentences.

    Args:
        text: the raw rulebook text.

    Returns:
        One string per group: the contents of the group's sentences joined with '. '.
    '''
    text = clean_text(text)
    sentences = get_sentences_from_text(text)

    # NOTE(review): 'resolve_text' presumably makes fastcoref also build a resolved text,
    # which is not read here - confirm whether the option can be dropped.
    doc = nlp(text, component_cfg = { "fastcoref": {'resolve_text': True} })
    coref_clusters = doc._.coref_clusters
    logger.debug(coref_clusters)

    sentence_clusters = get_sentences_from_clusters(coref_clusters, sentences)
    rule_groups = get_rule_groups_from_sentence_clusters(sentences, sentence_clusters)

    return ['. '.join(sentences[s_index].content for s_index in group) for group in rule_groups]

bgs_with_features = []
with open(CSV_PATH) as f:
    # islice(..., 2, None) skips the first two rows, so this is the third board game of the CSV
    bg = next(itertools.islice(get_dc_from_csv(f, BoardGame), 2, None))
    rules = get_rules(bg.rulebook)
    rule_count = len(rules)
    # Rulebook characters per rule. The rulebook is a string, so its length has to be
    # taken before dividing; a rulebook that yields no rule gets 0.0 instead of a
    # division by zero.
    average_rule_len = len(bg.rulebook) / rule_count if rule_count else 0.0
    # NOTE(review): the BoardGame repr printed by an earlier cell shows averageweight and
    # playingtime nested under `info` - confirm these attribute paths (and `family`).
    bgs_with_features.append(BoardGameFeatures(averageweight=bg.averageweight,
                                               playingtime=bg.playingtime,
                                               family=bg.family,
                                               rule_count=rule_count,
                                               average_rule_len=average_rule_len
                                              )
                            )

bgs_with_features

11/04/2022 17:13:36 - INFO - 	 Tokenize 1 texts...


  0%|          | 0/1 [00:00<?, ?ba/s]

11/04/2022 17:13:38 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

2022-11-04 17:13:46,108 bgg_predict  DEBUG    [[(0, 10), (223, 237), (371, 385), (413, 427), (526, 540), (1050, 1060), (1098, 1107), (3873, 3882), (4043, 4057), (4487, 4501), (4555, 4569), (5364, 5378), (6750, 6764), (6816, 6820), (6859, 6864), (7408, 7422), (10059, 10073), (10791, 10805), (12411, 12426), (12699, 12709), (16121, 16135), (16433, 16442), (16833, 16842), (17095, 17104), (17358, 17367), (17617, 17631), (17657, 17667), (17723, 17732), (17905, 17914), (18422, 18431), (18528, 18542), (19082, 19096), (19645, 19654)], [(189, 215), (283, 285)], [(133, 163), (313, 325)], [(491, 507), (550, 574), (585, 589)], [(154, 163), (692, 701), (766, 775), (844, 853), (1772, 1781), (4159, 4168), (4384, 4393), (12198, 12207), (17677, 17686), (19535, 19544)], [(678, 701), (752, 775)], [(734, 748), (784, 786), (1895, 1909)], [(648, 674), (796, 815)], [(947, 960), (966, 971)], [(854, 866), (1001, 1013)], [(1247, 1255), (1349, 1359)], [(1389, 1401), (1433, 1438)], [(1453, 1476), (1537, 1547)], [(

94
