<a href="https://colab.research.google.com/github/marco-luzzara/boardgame-complexity-predictor/blob/master/src/extract_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from IPython.display import clear_output
import os
# Environment switch. Any non-empty value of the WORKING_LOCALLY environment variable
# (even "0" or "false") selects the local branch, because bool() of a non-empty string is True.
WORKING_LOCALLY = bool(os.getenv('WORKING_LOCALLY'))

if WORKING_LOCALLY:
    # local run: the dataset path is relative to the current working directory
    DATASET_FILE_PATH = 'data/dataset.csv'
else:
    # Colab run: the dataset is read from the mounted Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    DATASET_FILE_PATH = '/content/drive/My Drive/Projects/IRBoardGameComplexity/dataset.csv'
    # install the NLP dependencies used by the cells below
    # NOTE(review): only coreferee is version-pinned here; en_core_web_sm, which later cells
    # load, is not downloaded by this cell - presumably it is preinstalled on Colab, confirm
    !pip install spacy-transformers
    !python3 -m pip install coreferee==1.3.*
    !python3 -m coreferee install en
    !python -m spacy download en_core_web_lg
    !python -m spacy download en_core_web_trf
    !pip install git+https://github.com/LIAAD/yake
    !pip install rake-nltk
    # hide the installation logs
    clear_output(wait=False)


In [2]:
from IPython.display import clear_output
import spacy
from spacy import displacy

## +++++++++++ with fastcoref
# from fastcoref import spacy_component
# nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "ner", "textcat"])
# nlp.add_pipe("fastcoref")
#              #config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref'})

# # to remove tqdm progress bar: https://stackoverflow.com/questions/37091673/silence-tqdms-output-while-running-tests-or-running-the-code-via-cron
# from tqdm.auto import tqdm
# from functools import partialmethod
# tqdm.__init__ = partialmethod(tqdm.__init__, disable=True, ncols=0, nrows=0, gui=False, bar_format='', leave=False)

## +++++++++++ with coreferee
import coreferee
# transformer-based English pipeline with the coreferee component appended at the end.
# NOTE: later cells rebind the global `nlp` to 'en_core_web_sm' (without coreferee)
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe("coreferee")

# hide whatever was printed while loading the pipeline
clear_output(wait=False)

In [4]:
import logging

# Dedicated logger for this notebook. Existing handlers are cleared first so that
# re-running the cell does not stack one more StreamHandler at every execution.
logger = logging.getLogger('bgg_predict')
logger.handlers.clear()
# Do not forward records to the root logger: when the environment already installed
# a root handler every message would be printed twice (once per handler).
logger.propagate = False

handler = logging.StreamHandler()
formatter = logging.Formatter(
        '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

# smoke test: exactly one formatted line is expected
logger.debug('test')

2022-11-22 00:07:42,257 bgg_predict  DEBUG    test
DEBUG:bgg_predict:test


In [5]:
import re

# e-mail addresses, for example 'me@gmail.it'
regex_mail = re.compile(r'\w+(?:\.\w+)*?@\w+(?:\.\w+)+')
# links starting with http(s)://, ftp://, file:// or www
# modified from https://stackoverflow.com/a/163684/5587393
regex_link = re.compile(r'(?:\b(?:(?:https?|ftp|file)://|www))[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#%=~_|]')
# a sentence is kept only when it contains at least 4 words of 2 or more letters each
regex_at_least_4_words_in_sentence = re.compile(r'^(?=.*?(?:[,:;()]?[a-zA-Z]{2,}[,:;()]?(?: |$)[^a-zA-Z]*?){4,})')
# a period that is not followed by whitespace or by the end of the string:
# a string like "first.Second" could be misinterpreted by the tokenizer as a single token,
# with the regex it becomes "first. Second"
regex_distance_between_period_and_following_word = re.compile(r'\.(?!\s|$)')
# runs of 2 or more consecutive whitespace characters (compressed to one space by clean_text)
regex_multiple_spaces = re.compile(r'\s{2,}')
# interrupted words usually have a "- " at the end before the new line, 'inter- rupted' -> 'interrupted'
# NOTE: must be applied after whitespace compression
regex_interrupted_word = re.compile(r'([a-zA-Z])- ')
# remove page numbers, which are usually enclosed in characters like = or -, for example "-12-"
regex_consecutive_meaningless_chars = re.compile(r'[^\.a-zA-Z0-9\s()]{2,} *(?:\d+)?|(?P<prepage>[^a-zA-Z\s\d\.])\d+(?P=prepage)')
# remove paragraph ids, '1.2.3' -> ''
regex_dot_separated_digits = re.compile(r'(?:\d+\.)+\d+')
# remove meaningless chars after a sentence start, '. (- start' -> '. start'
regex_clean_start = re.compile(r'\.(\s?)[^a-zA-Z\s]+')

def clean_from_short_sentences(text: str) -> str:
    '''Return `text` without the period-separated sentences that do not satisfy
    `regex_at_least_4_words_in_sentence`; the surviving sentences are joined back with "."'''
    kept_sentences = []
    for candidate in text.split('.'):
        if regex_at_least_4_words_in_sentence.match(candidate) is None:
            continue
        kept_sentences.append(candidate)
    return '.'.join(kept_sentences)

def clean_text(text: str) -> str:
    '''Normalize a raw rulebook text by applying the cleaning steps below in a fixed order.'''
    # 1. removals: e-mails, links, paragraph ids, page numbers / symbol runs, junk after a sentence start
    text = regex_mail.sub('', text)
    text = regex_link.sub('', text)
    text = regex_dot_separated_digits.sub('', text)
    text = regex_consecutive_meaningless_chars.sub('', text)
    text = regex_clean_start.sub(r'.\1', text)
    # 2. whitespace compression: every removal must happen before this step so that
    #    the spaces left behind are compressed too
    text = regex_multiple_spaces.sub(' ', text)
    # 3. steps that rely on single spaces: join interrupted words, drop short sentences
    text = regex_interrupted_word.sub(r'\1', text)
    text = clean_from_short_sentences(text)
    # 4. make sure every period inside the text is followed by a space
    return regex_distance_between_period_and_following_word.sub('. ', text)

# sanity check of clean_text: the sample contains an e-mail, a page number, an interrupted word,
# repeated spaces, a sentence with fewer than 4 words and a link
test_text = 'this is a test (me@gmail.it) -12- that wi-  ll be   cleaned. with 2 5 6 not valid. two sentences is good enough http://or.not.'
cleaned_text = clean_text(test_text)
print(cleaned_text)
assert cleaned_text == 'this is a test () that will be cleaned. two sentences is good enough '

this is a test () that will be cleaned. two sentences is good enough 


In [8]:
import pandas as pd
import ast

def remove_columns_prefix(df: pd.DataFrame) -> None:
    '''Rename, in place, every column of `df` to the part of its name that follows
    the last dot (e.g. 'info.id' -> 'id'); names without a dot are left unchanged.'''
    def last_dot_segment(column_name):
        return column_name.rsplit('.', 1)[-1]

    df.rename(columns=last_dot_segment, inplace=True)

def get_df_with_docs(file_path: str, nrows=None, skiprows=1) -> pd.DataFrame:
    '''Load the dataset CSV into a dataframe whose column names have no dotted prefix.

    At most `nrows` data rows are read (all of them when None). File rows 1 .. skiprows-1
    are skipped, so `skiprows` counts the header too, which is always kept.
    The 'info.family' column is parsed with `ast.literal_eval`.'''
    # the range starts from 1 because row 0 is the header
    data_rows_to_skip = range(1, skiprows)
    family_converter = { 'info.family': ast.literal_eval }
    df_dataset = pd.read_csv(file_path,
                             converters=family_converter,
                             nrows=nrows,
                             skiprows=data_rows_to_skip)
    remove_columns_prefix(df_dataset)
    return df_dataset

def get_document_by_line(file_path: str, line: int) -> str:
    '''Return the rulebook stored at `line` of the dataset file; the line count includes the header.'''
    # get_df_with_docs skips rows from 1 so that the header row is kept
    # https://stackoverflow.com/a/27325729/5587393
    rows_to_skip = line - 1
    single_row_df = get_df_with_docs(file_path, nrows=1, skiprows=rows_to_skip)
    return single_row_df['rulebook'].iloc[0]

def get_document_by_id(file_path: str, id: int) -> str:
    '''Return the rulebook of the board game whose 'info.id' equals `id`.

    The file is scanned in chunks and only the two columns that are needed are parsed,
    so the whole dataset is never loaded at once.

    Raises:
        ValueError: if no row of the file has the requested id.'''
    with pd.read_csv(file_path, usecols=['info.id', 'rulebook'], chunksize=256) as reader:
        for chunk in reader:
            matching_rulebooks = chunk.loc[chunk['info.id'] == id, 'rulebook']
            if not matching_rulebooks.empty:
                # first match wins, as with a row-by-row scan
                return matching_rulebooks.iloc[0]

    raise ValueError(f'no board game with id {id} found in {file_path}')

# consistency check between the two lookups: the game with id 2310 is expected
# at line 40 of the dataset file (header included in the count)
assert get_document_by_id(DATASET_FILE_PATH, 2310) == get_document_by_line(DATASET_FILE_PATH, 40)

In [7]:
from typing import List, Tuple

def get_sentences_from_clusters(clusters: List[List[Tuple[int, int]]], sentences: List['Sentence']) -> List[List[int]]:
    '''Map every coreference mention to the index of the sentence that contains it.

    Args:
        clusters: coreference clusters; each mention is a (start, end) pair of character
            offsets relative to the beginning of the first sentence of `sentences`.
        sentences: consecutive sentences of one group. Each one must expose `start` and
            `does_include_pos(pos)`, both expressed in offsets of the entire text.

    Returns:
        one list per cluster holding, for each mention, the index in `sentences`
        of the first sentence that includes the start of the mention.

    Raises:
        ValueError: if a mention does not fall inside any sentence.'''
    def sentence_index_of(mention_start: int) -> int:
        # + sentences[0].start because sentences are built from the entire text
        # and not from the current group
        absolute_pos = mention_start + sentences[0].start
        for index, sentence in enumerate(sentences):
            if sentence.does_include_pos(absolute_pos):
                return index
        raise ValueError(f'no sentence includes the mention starting at position {absolute_pos}')

    return [[sentence_index_of(entity[0]) for entity in cluster]
            for cluster in clusters]

# Worked example: four consecutive sentences (with offsets into the entire text) and three
# coreference clusters whose offsets are relative to the start of the first sentence.
# NOTE(review): `Sentence` and `get_sentences_from_text` are not defined in the cells shown
# here - presumably they come from a cell that is no longer in the notebook, confirm
# text = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'
# sentences = get_sentences_from_text(text)
# clusters = [[(0, 5), (39, 42), (79, 82)]]
sentences = [Sentence(content=' A boom unit is destroyed when it has received 5 floatation hits,  and is removed from play, clearing the hex for unobstructed vessel  movement', start=65348, end=65490), 
             Sentence(content=' If a boom unit destroyed on the same game turn it is attacked, the  attacking vessel (A', start=65492, end=65579), 
             Sentence(content=' is not subject to a die roll on the Vessel  Fouling Table (Combat Table No', start=65581, end=65655), 
             Sentence(content=' 13) and continues its movement', start=65657, end=65687)]
clusters = [[(8, 11), (31, 32)], [(192, 193), (231, 232), (308, 308)], [(306, 307), (328, 330)]]        
get_sentences_from_clusters(clusters, sentences)

[[0, 0], [1, 1, 2], [2, 3]]

In [8]:
from typing import List, Set
from itertools import groupby
from operator import itemgetter
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

def get_rule_groups_from_sentence_clusters(sentences: List['Sentence'], sentence_clusters: List[List[int]]) -> List[List[int]]:
    '''Group the sentences that are linked, directly or transitively, by a coreference cluster.

    Args:
        sentences: all the sentences of the text; only their number is used.
        sentence_clusters: clusters expressed as lists of sentence indexes.

    Returns:
        lists of consecutive sentence indexes. Every sentence index appears exactly once;
        a connected group whose indexes are not contiguous is split into contiguous runs.'''
    def normalize_group(group: Set[int]) -> List[List[int]]:
        '''each group could contain multiple consecutive sublists. this method splits these sublists'''
        # consecutive values share the same (position - value) difference
        # https://stackoverflow.com/a/23861347/5587393
        return [list(map(itemgetter(1), run))
                for _, run in groupby(enumerate(sorted(group)), lambda pair: pair[0] - pair[1])]

    n_sentences = len(sentences)
    if n_sentences == 0:
        return []

    # the graph is built as a directed sparse graph where the first element of each cluster
    # is connected to the other elements in the same cluster. Only the edges are stored,
    # so no dense n x n matrix is ever materialized
    sources, targets = [], []
    for cluster in sentence_clusters:
        for sentence in cluster[1:]:
            sources.append(cluster[0])
            targets.append(sentence)

    if sources:
        # repeated edges are summed by the constructor, which leaves them non-zero
        graph = csr_matrix(([1] * len(sources), (sources, targets)),
                           shape=(n_sentences, n_sentences))
    else:
        graph = csr_matrix((n_sentences, n_sentences), dtype=int)

    # find the connected components of the graph created from the clusters returned after coref
    n_components, labels = connected_components(csgraph=graph, directed=False, return_labels=True)
    groups = [set() for _ in range(n_components)]
    for i, label in enumerate(labels):
        groups[label].add(i)

    return [norm_group for group in groups for norm_group in normalize_group(group)]

In [9]:
from typing import List, Tuple
def convert_result_to_cluster(result, pipeline) -> List[List[Tuple[int, int]]]:
    '''Convert the coreference output attached to `result` into clusters of character spans.

    Args:
        result: the processed document.
        pipeline: the pipeline that produced it, as a sequence of (name, component) pairs.

    Returns:
        with coreferee, one list per chain holding, for each mention, the pair
        (first char, last char) of the first token of the mention, both inclusive;
        with fastcoref, `result._.coref_clusters` unchanged.

    Raises:
        ValueError: if the pipeline contains neither 'coreferee' nor 'fastcoref'.'''
    component_names = [component[0] for component in pipeline]

    if 'coreferee' in component_names:
        clusters = []
        for chain in result._.coref_chains:
            spans = []
            for entity in chain:
                # only the first token of a mention is used
                first_token = result[entity[0]]
                spans.append((first_token.idx, first_token.idx + len(first_token) - 1))
            clusters.append(spans)
        return clusters

    if 'fastcoref' in component_names:
        return result._.coref_clusters

    # previously this case fell through and returned None, failing later in the caller
    raise ValueError(f'no supported coreference component in the pipeline: {component_names}')

# example: coreference clusters, as character spans, found in a short text
result = nlp("Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much.")
convert_result_to_cluster(result, nlp.pipeline)



[[(9, 10), (31, 33), (41, 45), (69, 70), (76, 78)],
 [(35, 38), (65, 66)],
 [(69, 70), (93, 96), (116, 119), (148, 151)],
 [(134, 138), (163, 169)]]

In [None]:
from typing import List, Tuple
import pandas as pd
import ast

def get_rules(text: str) -> List[str]:
    '''Split a rulebook into rules, a rule being a run of consecutive sentences
    that are connected through coreference.

    The text is cleaned and split into sentences, coreference is resolved on overlapping
    windows of sentences, and the sentences linked by a cluster are merged.

    Returns:
        one string per rule, made of its sentences joined with '. ';
        an empty list when the cleaned text has no sentence.'''
    GROUP_SIZE = 4
    GROUP_STEP_OFFSET = 2

    text = clean_text(text)
    sentences = get_sentences_from_text(text)
    if not sentences:
        return []

    # Windows of GROUP_SIZE sentences speed up the search for connected sentences while
    # still finding connections between sentences that are not immediately adjacent.
    # The starts stop early because a trailing window would be fully contained in the
    # previous one; max(..., 1) keeps one window for texts of 1 or 2 sentences.
    window_overlap = GROUP_SIZE - GROUP_STEP_OFFSET
    window_starts = range(0, max(len(sentences) - window_overlap, 1), GROUP_STEP_OFFSET)
    sentences_groups = [sentences[start:start + GROUP_SIZE] for start in window_starts]
    doc_groups = nlp.pipe(['.'.join(sentence.content for sentence in group)
                           for group in sentences_groups])

    cluster_groups = []
    for i, (group, group_doc) in enumerate(zip(sentences_groups, doc_groups)):
        group_coref_clusters = convert_result_to_cluster(group_doc, nlp.pipeline)
        group_sentence_clusters = get_sentences_from_clusters(group_coref_clusters, group)
        # + i * GROUP_STEP_OFFSET to retrieve the actual index of the sentence
        cluster_groups.extend([sentence_id + i * GROUP_STEP_OFFSET for sentence_id in gsc]
                              for gsc in group_sentence_clusters)

    rule_groups = get_rule_groups_from_sentence_clusters(sentences, cluster_groups)

    return ['. '.join(sentences[s_index].content for s_index in group) for group in rule_groups]

# example: extract the rules of the board game with id 24770
text = get_document_by_id(DATASET_FILE_PATH, 24770)
rules = get_rules(text)

rules

In [None]:
import itertools
from collections import Counter, defaultdict
from typing import Dict, List, Tuple

def filter_tokens_as_components(doc: spacy.tokens.Doc) -> Dict[str, List[spacy.tokens.Token]]:
    '''Collect the tokens of `doc` that may name a game component, grouped by lowercase lemma.

    A token is kept when it has at least 3 characters, is a noun or a proper noun
    and its dependency label is one of the accepted ones.'''
    accepted_pos = ['NOUN', 'PROPN']
    accepted_deps = ['nsubj', 'dobj', 'nsubjpass', 'pobj', 'compound']

    components = defaultdict(list)
    for candidate in doc:
        if len(candidate) < 3:
            continue
        if candidate.pos_ not in accepted_pos:
            continue
        if candidate.dep_ not in accepted_deps:
            continue
        components[candidate.lemma_.lower()].append(candidate)

    return components

def find_n_most_common_nouns(n, docs: List[spacy.tokens.Doc]) -> List[Tuple[str, int]]:
    '''Return the `n` component lemmas that appear in the highest number of documents.

    Args:
        n: how many lemmas to return.
        docs: the processed documents; any iterable is accepted and consumed once.

    Returns:
        (lemma, number of documents containing it) pairs, most common first.'''
    # each lemma counts once per document: the keys of the components dict are already
    # unique, and iterating them in insertion order makes the order of ties reproducible
    document_frequency = Counter(lemma
                                 for doc in docs
                                 for lemma in filter_tokens_as_components(doc))
    return document_frequency.most_common(n)
    

# NOTE: from here on the global `nlp` is the small English pipeline, without coreferee
nlp = spacy.load('en_core_web_sm')
# 10 rulebooks, read starting from line 50 of the dataset file (header included in the count)
df_dataset = get_df_with_docs(DATASET_FILE_PATH, 10, 50)
# lazy stream of processed documents, consumed once by the call below
docs = nlp.pipe(map(clean_text, df_dataset['rulebook'].values))

find_n_most_common_nouns(10, docs)

In [15]:
from spacy import displacy
from collections import namedtuple
from spacy.matcher import Matcher, DependencyMatcher

# number of matches found by get_luck_matches for each source of luck
LuckMetrics = namedtuple('LuckMetrics', ['dice_based', 'drawing_based', 'shuffling_based', 'random_based'])

def get_luck_matches(doc: spacy.tokens.Doc) -> LuckMetrics:
    '''Count the luck-related constructs found in `doc`.

    Four kinds of matches are counted: dice rolling, card drawing, shuffling and
    explicit mentions of randomness. The matchers are rebuilt at every call from
    the vocabulary of `doc`.'''
    vocab = doc.vocab

    def build_token_matcher(name, token_patterns):
        '''token-based matcher holding `token_patterns` under `name`'''
        matcher = Matcher(vocab)
        matcher.add(name, token_patterns)
        return matcher

    def build_dependency_matcher(name, dependency_patterns):
        '''dependency-based matcher holding `dependency_patterns` under `name`'''
        matcher = DependencyMatcher(vocab)
        matcher.add(name, dependency_patterns)
        return matcher

    def verb_with_child(verb_id, verb_lemma, child_id, child_attrs):
        '''pattern made of an anchor verb and one of its direct children'''
        return [
            {"RIGHT_ID": verb_id, "RIGHT_ATTRS": {"LEMMA": verb_lemma, "POS": "VERB"}},
            {"LEFT_ID": verb_id, "REL_OP": ">", "RIGHT_ID": child_id, "RIGHT_ATTRS": child_attrs}
        ]

    # ---------- random ----------
    random_matcher = build_token_matcher("random", [
        [{"LEMMA": { "IN": ["random", "randomly"]}}]
    ])

    # ---------- shuffle ----------
    shuffle_matcher = build_token_matcher("shuffle", [
        [{"LEMMA": "shuffle", "POS": "VERB"}]
    ])

    # ---------- card drawing: the verb "draw" governing a "card" ----------
    drawing_matcher = build_dependency_matcher("drawing", [
        verb_with_child("drawing", "draw", "card", {
            "LEMMA": "card",
            "POS": "NOUN",
            "DEP": { "IN": ['dobj', 'nsubjpass', 'compound'] }
        })
    ])

    # ---------- dice rolling: a rolling verb governing a die/dice or a number ----------
    dice_matcher = build_dependency_matcher("diceroll", [
        verb_with_child("rolling", { "IN": ["use", "throw", "roll"]}, "dice_or_die", {
            "LEMMA": { "IN": ["die", "dice"]},
            "POS": "NOUN",
            "DEP": { "IN": ['nsubj', 'dobj', 'nsubjpass', 'compound'] }
        }),
        verb_with_child("rolling", { "IN": ["use", "throw", "roll"]}, "number", {
            "IS_DIGIT": True,
            "DEP": { "IN": ['dobj'] }
        })
    ])

    dice_count = len(dice_matcher(doc))
    draw_count = len(drawing_matcher(doc))
    shuffle_count = len(shuffle_matcher(doc))
    random_count = len(random_matcher(doc))

    return LuckMetrics(dice_count, draw_count, shuffle_count, random_count)

# example: lemmas, luck metrics and dependency tree of a short hand-written text
# text = get_document_by_line(DATASET_FILE_PATH, 153)
text = '''you could use the random choice or randomly pick a card. some dice are thrown. next you roll 12 when the dice roll'''

# NOTE: rebinds the global `nlp` to the small English pipeline
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
print([token.lemma_ for token in doc])
print(get_luck_matches(doc))

# draw the dependency tree inline
displacy.render(doc, style='dep', jupyter=True)

['you', 'could', 'use', 'the', 'random', 'choice', 'or', 'randomly', 'pick', 'a', 'card', '.', 'some', 'dice', 'be', 'throw', '.', 'next', 'you', 'roll', '12', 'when', 'the', 'dice', 'roll']
LuckMetrics(dice_based=3, drawing_based=0, shuffling_based=0, random_based=2)


In [None]:
from typing import List
import coreferee

def _get_new_token_from_resolve(token: spacy.tokens.Token, 
                                chains: coreferee.data_model.ChainHolder) -> str:
    '''Return the text that replaces `token` once its coreference is resolved.

    Args:
        token: a token of the document.
        chains: the coreference chains of the same document.

    Returns:
        the token text followed by its own whitespace when `chains.resolve` returns None;
        otherwise the texts of the resolved tokens joined with ' and ', followed by the
        whitespace of the replaced token.'''
    resolved_tokens = chains.resolve(token)
    if resolved_tokens is None:
        return token.text_with_ws

    # The whitespace of the replaced token is kept, not the one of the resolved tokens:
    # joining `text_with_ws` produced doubled spaces before ' and ' and could either
    # insert a space before the following punctuation or glue the next word.
    return ' and '.join(res_token.text for res_token in resolved_tokens) + token.whitespace_

def _process_doc_for_coref(doc: spacy.tokens.Doc) -> str:
    '''Rebuild the text of `doc`, passing each token through
    `_get_new_token_from_resolve` together with the coreference chains of `doc`.'''
    coref_chains = doc._.coref_chains
    return ''.join(_get_new_token_from_resolve(tok, coref_chains) for tok in doc)

def preprocess_texts(texts: List[str]) -> List[str]:
    '''Clean every text and rewrite it through `_process_doc_for_coref`.

    The 'en_core_web_trf' pipeline with the coreferee component is loaded at every
    call and kept local, so the global `nlp` is neither used nor modified.'''
    coref_nlp = spacy.load('en_core_web_trf')
    coref_nlp.add_pipe("coreferee")

    cleaned_texts = list(map(clean_text, texts))

    resolved_texts = []
    for processed_doc in coref_nlp.pipe(cleaned_texts):
        resolved_texts.append(_process_doc_for_coref(processed_doc))
    return resolved_texts
        
# example: coreference resolution on a short hand-written text
# text = get_document_by_line(DATASET_FILE_PATH, 40)
text = '''Although he was very busy with his work, the magical Peter had had enough of it. 
    He and his wife decided they needed a holiday. 
    this couple travelled to Spain because it loves the country very much.'''
preprocess_texts([text])

In [None]:
# scratch cell: inspect the cleaned text of the rulebook at line 155 of the dataset file.
# `nlp` is whichever pipeline was bound last by the cells run before this one
doc = nlp(clean_text(get_document_by_line(DATASET_FILE_PATH, 155)))
doc.text
# print(doc[12].dep_)
#doc.text.find('Tinners Trail Player')

In [None]:
import re
from collections import Counter
from typing import List, Set, Dict
from rake_nltk import Rake
from nltk.util import ngrams
import yake
import numpy as np
from string import punctuation

# stopwords = nlp.Defaults.stop_words
# rake = Rake(stopwords=stopwords, punctuations={ c for c in punctuation },
#             sentence_tokenizer=lambda txt: txt.split('.'))

# word boundary; not referenced by the active code shown in this notebook
regex_word_within_boundaries = re.compile(r'\b')
# minimum number of tokens sharing a lemma for the lemma to be kept as a component
MIN_TOKEN_TO_BE_CONSIDERED_COMPONENT = 4
# NOTE(review): not referenced by the code shown in this notebook - presumably an upper
# bound on the number of components, confirm
MAX_COMPONENTS = 100

def _get_ngrams_components(doc: spacy.tokens.Doc, \
                           components: Dict[str, List[spacy.tokens.Token]],
                           n_grams: int):
    '''Placeholder: not implemented yet, it does nothing and returns None.

    NOTE(review): presumably meant to derive components made of `n_grams` consecutive
    tokens from `components` - confirm the intended behavior before implementing.'''
    pass


def _get_bg_components_by_deps_inspection(doc: spacy.tokens.Doc) -> Dict[str, List[spacy.tokens.Token]]:
    '''Select the lemmas of `doc` that are likely to name a board game component.

    Starting from the candidates returned by `filter_tokens_as_components`, a lemma is
    kept when it is not a generic rulebook word and it occurs at least
    `MIN_TOKEN_TO_BE_CONSIDERED_COMPONENT` times.

    Returns:
        the kept lemmas mapped to the list of their tokens.'''
    # generic words that appear in most rulebooks; a frozenset gives constant-time lookups
    words_to_leave_out = frozenset([
        'beginning', 'board', 'book', 'case', 'clarification', 'design',
        'effect', 'end', 'example', 'game', 'number',
        'overview', 'order', 'play', 'player', 'purpose', 'reference',
        'result', 'rule', 'rulebook', 'section', 'set', 'setup', 'summary',
        'start', 'step', 'thing', 'type', 'time', 'total', 'use', 'value', 'version', 'way'])

    return {lemma: tokens
            for lemma, tokens in filter_tokens_as_components(doc).items()
            if lemma not in words_to_leave_out
            and len(tokens) >= MIN_TOKEN_TO_BE_CONSIDERED_COMPONENT}

# def _get_lemmas_given_keywords_group(group: str, doc: spacy.tokens.Doc) -> List[str]:
#     kw_match = re.search(r'\b' + group + '\\b', doc.text)
#     if kw_match is None:
#         return []

#     group_span = doc.char_span(kw_match.start(0), kw_match.end(0))
#     return [token.lemma_.lower() for token in group_span]

# def _get_bg_components_by_keyword_analysis(doc: spacy.tokens.Doc, max_keywords: int) -> List[str]:
#     kw_extractor = yake.KeywordExtractor(top=max_keywords)
#     keywords_info = kw_extractor.extract_keywords(doc.text)
#     keyword_groups = [keyword_info[0] for keyword_info in keywords_info if keyword_info[1] < 0.1]

#     return [lemma for keyword_group in keyword_groups 
#             for lemma in _get_lemmas_given_keywords_group(keyword_group, doc)]

def get_bg_components(doc: spacy.tokens.Doc) -> Dict[str, List[spacy.tokens.Token]]:
    '''Return the board game components found in `doc`, as lemmas mapped to their tokens.

    Only the dependency-based inspection is active at the moment; the keyword-based
    analysis that was meant to be intersected with it is disabled.'''
    components_by_deps = _get_bg_components_by_deps_inspection(doc)
    # print(components_by_deps)
    # components_by_kws = _get_bg_components_by_keyword_analysis(doc, len(components_by_deps))
    # print(components_by_kws)

    # return set(components_by_deps).intersection(set(components_by_kws))
    return components_by_deps

def get_doc_variance(doc: spacy.tokens.Doc, components_dict: Dict[str, List[spacy.tokens.Token]]) -> float:
    '''Measure how much the components interleave in the text. A high value could mean
    that rules involve many components and are therefore more complex.

    The variance of the token positions (`token.i`) is computed with `np.var` for each
    component. Each variance is weighted by the relative frequency of its component,
    the weighted variances are summed and the sum is divided by the number of tokens
    of the document.

    Returns:
        the normalized variance; 0.0 when the document is empty or no component token
        is available.'''
    doc_len = len(doc)
    tokens_count = sum(len(token_list) for token_list in components_dict.values())
    if doc_len == 0 or tokens_count == 0:
        # nothing to measure: avoids a division by zero
        return 0.0

    # components without tokens are skipped: np.var of an empty list is NaN
    weighted_variance = sum((len(tokens) / tokens_count) * np.var([token.i for token in tokens])
                            for tokens in components_dict.values() if tokens)
    return weighted_variance / doc_len

# example: components and variance of the rulebook at line 138 of the dataset file
# NOTE: rebinds the global `nlp` to the small English pipeline
nlp = spacy.load('en_core_web_sm')
doc = nlp(clean_text(get_document_by_line(DATASET_FILE_PATH, 138)))
# number of characters of the cleaned text
print(len(doc.text))
components = get_bg_components(doc)
print(components)
print(get_doc_variance(doc, components))


# rake.extract_keywords_from_text(doc.text)
# print(rake.get_word_frequency_distribution())
# for keyword in rake.get_ranked_phrases_with_scores():
#     print(keyword)
# print(rake.get_word_degrees())

In [None]:
from typing import Tuple
import pandas as pd
import ast
import os
from IPython.display import clear_output

def get_rules_features(id: int, doc: spacy.tokens.Doc) -> Tuple[int, float]:
    '''Compute the rule-based features of the rulebook `doc` of the board game `id`.

    Work in progress: the components of the rulebook are extracted and printed, but the
    returned pair (rule_count, avg_rule_len) is still the placeholder (0, 0).'''
    logger.info(f'processing board game {id}')
    bg_components = get_bg_components(doc)
    print(bg_components)

    return 0, 0
    # TODO planned computation, to be enabled once the rule extraction is wired in:
    # rules = get_rules(text)
    # rule_count = len(rules)
    # return rule_count, len(text) / rule_count

def apply_for_rulebook_features(row, docs_dict):
    '''Row-wise callback: take the next (id, doc) pair from the iterator `docs_dict`,
    check that it belongs to `row` and return the rule features of the doc as a Series
    indexed by 'rule_count' and 'avg_rule_len'.'''
    doc_info = next(docs_dict)
    # rows and documents must be consumed in the same order
    assert doc_info[0] == row.id
    rule_features = get_rules_features(row.id, doc_info[1])
    feature_names = ['rule_count', 'avg_rule_len']
    return pd.Series(rule_features, index=feature_names)

# destination of the processed dataset (only used by the commented-out export at the bottom)
PROCESSED_DATASET_FILE_PATH = 'data/processed_dataset.csv' if WORKING_LOCALLY \
    else '/content/drive/My Drive/Projects/IRBoardGameComplexity/processed_dataset.csv'

# ast.literal_eval converts the family column string into a python list
# with pd.read_csv(DATASET_FILE_PATH, chunksize=5, converters={ 'info.family': ast.literal_eval }) as reader:
#     for df in reader:
# NOTE: only the first data row is read (nrows=1)
df_dataset = pd.read_csv(DATASET_FILE_PATH, converters={ 'info.family': ast.literal_eval }, nrows=1)
remove_columns_prefix(df_dataset)
# despite its name this is a one-shot iterator of (id, doc) pairs, consumed row by row
# by apply_for_rulebook_features
docs_dict = zip(df_dataset['id'].values, 
                nlp.pipe(map(clean_text, df_dataset['rulebook'].values)))

df_rules_features = df_dataset.apply(lambda x: apply_for_rulebook_features(x, docs_dict),
                                     axis='columns')
df_features = df_dataset[['averageweight', 'playingtime', 'family']].join(df_rules_features)
        
# one-hot encoding of the "family" field
# from https://stackoverflow.com/questions/71401193/one-hot-encoding-in-python-for-array-values-in-a-dataframe
df_features = df_features.join(df_features.pop('family').apply('|'.join).str.get_dummies())
df_features.head()

# df_features.to_csv(PROCESSED_DATASET_FILE_PATH, header=True, index=False, mode='w')
# if not WORKING_LOCALLY:
#     drive.flush_and_unmount()