<a href="https://colab.research.google.com/github/marco-luzzara/boardgame-complexity-predictor/blob/master/src/extract_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from IPython.display import clear_output
import os
WORKING_LOCALLY = bool(os.getenv('WORKING_LOCALLY'))

if WORKING_LOCALLY:
    DATASET_FILE_PATH = 'data/dataset.csv'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    DATASET_FILE_PATH = '/content/drive/My Drive/Projects/IRBoardGameComplexity/dataset.csv'
    !pip install fastcoref==2.0.*
    !pip install spacy-transformers
    !python3 -m pip install coreferee==1.3.*
    !python3 -m coreferee install en
    !python -m spacy download en_core_web_lg
    !python -m spacy download en_core_web_trf
    clear_output(wait=False)


In [2]:
from IPython.display import clear_output
import spacy
from spacy import displacy

## +++++++++++ with fastcoref
# from fastcoref import spacy_component
# nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "ner", "textcat"])
# nlp.add_pipe("fastcoref")
#              #config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref'})

# # to remove tqdm progress bar: https://stackoverflow.com/questions/37091673/silence-tqdms-output-while-running-tests-or-running-the-code-via-cron
from tqdm.auto import tqdm
from functools import partialmethod
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True, ncols=0, nrows=0, gui=False, bar_format='', leave=False)

## +++++++++++ with coreferee
import coreferee
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe("coreferee")

clear_output(wait=False)

In [42]:
import logging

logger = logging.getLogger('bgg_predict')
logger.handlers.clear()
handler = logging.StreamHandler()
formatter = logging.Formatter(
        '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.debug('test')

In [4]:
import re

regex_mail = re.compile(r'\w+(?:\.\w+)*?@\w+(?:\.\w+)+')
# modified from https://stackoverflow.com/a/163684/5587393
regex_link = re.compile(r'(?:\b(?:(?:https?|ftp|file)://|www))[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#%=~_|]')
# in a sentence there must be at least 4 words of length 2 each
regex_at_least_4_words_in_sentence = re.compile(r'^(?=.*?(?:[,:;()]?[a-zA-Z]{2,}[,:;()]?(?: |$)[^a-zA-Z]*?){4,})')
# a string like "first.Second" could be misinterpreted by the tokenizer as a single token
# with the regex it becomes "first. Second"
regex_distance_between_period_and_following_word = re.compile(r'\.(?!\s|$)')
# compress consecutive whitespaces
regex_multiple_spaces = re.compile(r'\s{2,}')
# interrupted words usually have a "- " at the end before the new line, 'inter- rupted' -> 'interrupted'
# NOTE: must be after whitespace compression
regex_interrupted_word = re.compile(r'([a-zA-Z])- ')
# remove page numbers, that are usually enclosed in characters like = or -, for example "-12-"
regex_consecutive_meaningless_chars = re.compile(r'[^\.a-zA-Z0-9\s()]{2,} *(?:\d+)?|(?P<prepage>[^a-zA-Z\s\d\.])\d+(?P=prepage)')
# remove paragraphs id, '1.2.3' -> ''
regex_dot_separated_digits = re.compile(r'(?:\d+\.)+\d+')
# remove meaningless chars after sentence start, '. (- start' -> '. start'
regex_clean_start = re.compile(r'\.(\s?)[^a-zA-Z\s]+')

def clean_from_short_sentences(text: str) -> str:
    return '.'.join(sentence for sentence in text.split('.') if regex_at_least_4_words_in_sentence.match(sentence) is not None)

def clean_text(text: str) -> str:
    for clean_function in [lambda x: regex_mail.sub('', x),
                           lambda x: regex_link.sub('', x),
                           lambda x: regex_dot_separated_digits.sub('', x),
                           lambda x: regex_consecutive_meaningless_chars.sub('', x),
                           lambda x: regex_clean_start.sub(r'.\1', x),
                           # everything that is remove should be placed before this line so that 
                           # eventual spaces are compressed with regex_multiple_space
                           lambda x: regex_multiple_spaces.sub(' ', x),
                           lambda x: regex_interrupted_word.sub(r'\1', x),
                           lambda x: clean_from_short_sentences(x),
                           lambda x: regex_distance_between_period_and_following_word.sub('. ', x)]:
        text = clean_function(text)
    return text

test_text = 'this is a test (me@gmail.it) -12- that wi-  ll be   cleaned. with 2 5 6 not valid. two sentences is good enough http://or.not.'
cleaned_text = clean_text(test_text)
print(cleaned_text)
assert cleaned_text == 'this is a test () that will be cleaned. two sentences is good enough '

this is a test () that will be cleaned. two sentences is good enough 


In [7]:
import pandas as pd
import ast



def get_document_by_line(file_path: str, line: int) -> str:
    ''' the line includes the header too '''
    with pd.read_csv(file_path, chunksize=1, converters={ 'family': ast.literal_eval }) as reader:
        # -1 because the first line contains the header
        for _ in range(line - 1):
            df = next(reader)
        return df['rulebook'].iloc[0]

def get_document_by_id(file_path: str, id: int) -> str:
     with pd.read_csv(file_path, chunksize=1, converters={ 'family': ast.literal_eval }) as reader:
        while True:
            df = next(reader)
            bg_id = df['info.id'].iloc[0]
            if bg_id == id:
                return df['rulebook'].iloc[0]

assert get_document_by_id(DATASET_FILE_PATH, 2310) == get_document_by_line(DATASET_FILE_PATH, 40)

In [None]:
from fastcoref import spacy_component

text = clean_text(get_document_by_line(DATASET_FILE_PATH, 138))
nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "ner", "textcat"])
nlp.add_pipe("fastcoref", config={'max_tokens_in_batch': 200})
doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
#displacy.render(doc, style='ent', jupyter=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
import coreferee
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe("coreferee")

text = clean_text(get_document_by_line(DATASET_FILE_PATH, 138))
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe("coreferee")
doc = nlp(text)



In [34]:
for cluster in convert_result_to_cluster(doc, nlp.pipeline):
    print([doc.text[entity[0]:entity[1] + 1] for entity in cluster])

['game', 'game']
['rules', 'rules']
['BBEG', 'it', 'BBEG']
['columns', 'them']
['Tower', 'Tower']
['room', 'room', 'room']
['Potion', 'it']
['spot', 'it']
['die', 'its']
['Potion', 'Potion']
['Monster', 'it']
['Monster', 'it', 'it']
['ShaunGamer', 'his']
['Warrior', 'Warrior', 'Warrior']
['Bat', 'Bat', 'Bat', 'Bat']
['Elf', 'Elf', 'Elf', 'Elf']
['Lord', 'Lord', 'Lord']
['player', 'player']
['Level', 'Level', 'Level', 'Level', 'Level']


In [29]:
for cluster in doc._.coref_clusters:
    print([doc.text[entity[0]:entity[1] + 1] for entity in cluster])


['Solo Tower Hack ', 'This print-and-play game ', 'This game ']
['these rules ', 'these rules,']
['the BBEG* ', 'it.', 'the BBEG ']
['a single room ', 'the room s ']
['up to 20 points of Damage.', 'this level of Damage ']
['the Tower ', 'Tower ', 'the Tower,', 'the Tower.', 'the Tower ']
['six columns in the Tower grid,', 'them ']
['the stairs to Level 2 ', 'those places.']
['the next room in your path through the Tower.', 'the room s ', 'the room ', 'the room ']
['1D6 ', 'this ']
['Potion,', 'it ']
['that spot on the map ', 'it ']
['a room ', 'its ']
['Room ', 'the room ', 'Room ']
['Ben Nelson ', 'Ben Nelson ', 'Ben Nelson ']
['2007 ', '2007 ']
['Tower Level ', 'Tower Level ']
['a Monster ', 'it ', 'the Monster ', 'the Monster ', 'the Monster ', 'it ']
['Each Monster ', 'it.']
['you ', 'you ', 'you ']
['ShaunGamer ', 'his ']
['Devil Bat ', 'Devil Bat ', 'Devil Bat ', 'Devil Bat ']
['this player ', 'this player ']
['2007 ', '2007 ']
['Tower Level Damage ', 'Tower Level Damage ']


In [None]:
from typing import List
from dataclasses import dataclass

@dataclass
class Sentence:
    content: str
    start: int
    end: int
    
    def does_include_pos(self, pos: int) -> bool:
        # + 1 because the sentence include terminal period. some tokens include this period too
        return self.start <= pos <= self.end + 1

def get_sentences_from_text(text: str) -> List[Sentence]:
    # assert no continuous dots because of text cleared while building the dataset
    sentences = text.split('.')
    
    res = []
    char_accumulator = 0
    for sentence in sentences:
        res.append(Sentence(sentence, char_accumulator, char_accumulator + len(sentence) - 1))
        # + 1 because of the dot at the end of the sentence
        char_accumulator += len(sentence) + 1
        
    if res[-1].content == '':
        res.pop()
        
    return res

test_text = 'first sentence. second sentence. third sentence'
test_sentences = get_sentences_from_text(test_text)
assert len(test_sentences) == 3
test_sentences

[Sentence(content='first sentence', start=0, end=13),
 Sentence(content=' second sentence', start=15, end=30),
 Sentence(content=' third sentence', start=32, end=46)]

In [None]:
from typing import Tuple

def get_sentences_from_clusters(clusters: List[List[Tuple[int, int]]], sentences: List[Sentence]) -> List[List[int]]:
    # + sentences[0] because sentences are built from the entire text and not from the current group
    clusters_on_sentences = [[next(filter(lambda x: x[1].does_include_pos(entity[0] + sentences[0].start), enumerate(sentences)))[0] 
                              for entity in cluster]
                             for cluster in clusters]

    return clusters_on_sentences

# text = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'
# sentences = get_sentences_from_text(text)
# clusters = [[(0, 5), (39, 42), (79, 82)]]
sentences = [Sentence(content=' A boom unit is destroyed when it has received 5 floatation hits,  and is removed from play, clearing the hex for unobstructed vessel  movement', start=65348, end=65490), 
             Sentence(content=' If a boom unit destroyed on the same game turn it is attacked, the  attacking vessel (A', start=65492, end=65579), 
             Sentence(content=' is not subject to a die roll on the Vessel  Fouling Table (Combat Table No', start=65581, end=65655), 
             Sentence(content=' 13) and continues its movement', start=65657, end=65687)]
clusters = [[(8, 11), (31, 32)], [(192, 193), (231, 232), (308, 308)], [(306, 307), (328, 330)]]        
get_sentences_from_clusters(clusters, sentences)

[[0, 0], [1, 1, 2], [2, 3]]

In [None]:
from typing import List, Set
from itertools import groupby
from operator import itemgetter
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

def get_rule_groups_from_sentence_clusters(sentences: List[Sentence], sentence_clusters: List[List[int]]) -> List[List[int]]:
    def normalize_group(group: Set[int]) -> List[List[int]]:
        '''each group could contain multiple consecutive sublists. this method split these sublists'''
        res = []

        # https://stackoverflow.com/a/23861347/5587393
        for k, g in groupby(enumerate(sorted(list(group))), lambda x: x[0] - x[1]):
            res.append(list(map(itemgetter(1), g)))

        return res
    # the graph is built as a directed sparse graph where the first element of each cluster
    # is connected to the other elements in the same cluster
    graph = [[0 for _ in range(len(sentences))] for __ in range(len(sentences))]
    for cluster in sentence_clusters:
        for sentence in cluster[1:]:
            graph[cluster[0]][sentence] = 1

    # find the connected components of the graph created from the clusters returned after coref     
    graph = csr_matrix(graph)
    n_components, labels = connected_components(csgraph=graph, directed=False, return_labels=True)
    groups = [set() for _ in range(n_components)]
    for i, label in enumerate(labels):
        groups[label].add(i)

    return [norm_group for group in groups for norm_group in normalize_group(group)]

In [11]:
from typing import List, Tuple
def convert_result_to_cluster(result, pipeline) -> List[List[Tuple[int, int]]]:
    component_names = [x[0] for x in pipeline]
    if 'coreferee' in component_names:
        return [[(result[entity[0]].idx, result[entity[0]].idx + len(result[entity[0]]) - 1) 
                 for entity in chain] for chain in result._.coref_chains]
    elif 'fastcoref' in component_names:
        return result._.coref_clusters

result = nlp("Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much.")
convert_result_to_cluster(result, nlp.pipeline)



[[(9, 10), (31, 33), (41, 45), (69, 70), (76, 78)],
 [(35, 38), (65, 66)],
 [(69, 70), (93, 96), (116, 119), (148, 151)],
 [(134, 138), (163, 169)]]

In [None]:
from typing import List, Tuple
import pandas as pd
import ast

def get_rules(text: str) -> List[str]:
    text = clean_text(text)
    sentences = get_sentences_from_text(text)
    
    GROUP_STEP_OFFSET = 2
    # I create groups of 4 sentences to speed up the process of finding connected sentences
    # and to make sure to find connected sentences not immediately adjacent
    sentences_groups = [sentences[i:min(i+4, len(sentences))] for i in range(0, len(sentences) - 2, GROUP_STEP_OFFSET)]
    doc_groups = nlp.pipe(['.'.join(map(lambda s: s.content, group)) for group in sentences_groups])

    cluster_groups = []
    for i, group in enumerate(sentences_groups):
        group_text = next(doc_groups)
        group_coref_clusters = convert_result_to_cluster(group_text, nlp.pipeline)
        group_sentence_clusters = get_sentences_from_clusters(group_coref_clusters, group)
        # + i * GROUP_STEP_OFFSET to retrieve the actual index of the sentence
        cluster_groups.extend([sentence_id + i * GROUP_STEP_OFFSET for sentence_id in gsc] 
                               for gsc in group_sentence_clusters)
        
    rule_groups = get_rule_groups_from_sentence_clusters(sentences, cluster_groups)                                                                                
    
    return ['. '.join([sentences[s_index].content for s_index in group]) for group in rule_groups]

with pd.read_csv(DATASET_FILE_PATH, chunksize=1, converters={ 'family': ast.literal_eval }) as reader:
    while True:
        df = next(reader)
        id = df['info.id'].iloc[0]
        if id == 24770:
            break
    doc = df['rulebook'].iloc[0]
    print(doc)
    rules = get_rules(doc)

rules

In [None]:
from typing import Tuple
import pandas as pd
import ast
import os
from IPython.display import clear_output

def get_rules_features(text: str) -> Tuple[int, float]:
    rules = get_rules(text)
    rule_count = len(rules)
    return rule_count, len(text) / rule_count

def remove_columns_prefix(df) -> None:
    '''remove prefix 'info.' from the columns of df'''
    df.rename(columns=lambda c: c.rsplit('.', 1)[-1], inplace=True)

PROCESSED_DATASET_FILE_PATH = 'data/processed_dataset.csv' if WORKING_LOCALLY \
    else '/content/drive/My Drive/Projects/IRBoardGameComplexity/processed_dataset.csv'

if os.path.exists(PROCESSED_DATASET_FILE_PATH):
    file_mode = 'a'

df_features = pd.DataFrame()
# ast.literal_eval converts the family column string into a python array
with pd.read_csv(DATASET_FILE_PATH, chunksize=5, converters={ 'family': ast.literal_eval }) as reader:
    for df in reader:
        remove_columns_prefix(df)
        logger.info(f'processing board games {df[["id"]].to_string(index=False)}')
        df_rules_features = df.apply(lambda x: pd.Series(get_rules_features(x.rulebook), 
                                     index=['rule_count', 'avg_rule_len']), axis='columns')
        df_features = pd.concat([df_features, df[['averageweight', 'playingtime', 'family']].join(df_rules_features)])
        clear_output(wait=True)
        
# one-hot encoding "family" field 
# from https://stackoverflow.com/questions/71401193/one-hot-encoding-in-python-for-array-values-in-a-dataframe
df_features = df_features.join(df_features.pop('family').apply('|'.join).str.get_dummies())
df_features.head()

df_features.to_csv(PROCESSED_DATASET_FILE_PATH, header=True, index=False, mode='w')    
if not WORKING_LOCALLY:
    drive.flush_and_unmount()

In [None]:
PROCESSED_DATASET_FILE_PATH = 'data/processed_dataset.csv' if WORKING_LOCALLY \
    else '/content/drive/My Drive/Projects/IRBoardGameComplexity/processed_dataset.csv'

df_features.to_csv(PROCESSED_DATASET_FILE_PATH, header=True, index=False, mode='w')