<a href="https://colab.research.google.com/github/marco-luzzara/boardgame-complexity-predictor/blob/master/src/extract_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from IPython.display import clear_output
import os
# Execution-environment switch. NOTE: bool() of a string is True for ANY
# non-empty value, so WORKING_LOCALLY=0 or WORKING_LOCALLY=false still
# selects the local branch; only an unset/empty variable selects Colab.
WORKING_LOCALLY = bool(os.getenv('WORKING_LOCALLY'))

if WORKING_LOCALLY:
    # paths relative to the current working directory
    DATASET_FILE_PATH = 'data/dataset.csv'
    CLEANED_DATASET_FILE_PATH = 'data/cleaned_dataset.csv'
else:
    # `drive` is only defined in this branch: later cells that use it must
    # not run when WORKING_LOCALLY is set
    from google.colab import drive
    drive.mount('/content/drive')
    DATASET_FILE_PATH = '/content/drive/My Drive/Projects/IRBoardGameComplexity/dataset.csv'
    CLEANED_DATASET_FILE_PATH = '/content/drive/My Drive/Projects/IRBoardGameComplexity/cleaned_dataset.csv'
    # !pip install git+https://github.com/LIAAD/yake
    # !pip install rake-nltk
    # hide the output produced by the mount step
    clear_output(wait=False)

In [2]:
from IPython.display import clear_output
import spacy
from spacy import displacy

In [3]:
import logging

# Dedicated logger for this notebook.
logger = logging.getLogger('bgg_predict')
# Re-running the cell must not stack a new handler on top of the old ones.
logger.handlers.clear()
handler = logging.StreamHandler()
formatter = logging.Formatter(
        '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
# Do not forward records to the root logger: when the root logger also has a
# handler, every message would otherwise be printed twice (once by the
# handler above and once, unformatted, by the root handler).
logger.propagate = False

logger.debug('test')

2022-11-27 22:02:43,286 bgg_predict  DEBUG    test
DEBUG:bgg_predict:test


In [4]:
import pandas as pd
import ast

def get_df_with_docs(file_path: str, nrows=None, skiprows=1) -> pd.DataFrame:
    '''Load (part of) the dataset CSV into a dataframe.

    The header (file line 0) is always kept. The file lines with 0-based
    index from 1 up to `skiprows - 1` are skipped, then at most `nrows` rows
    are read (all the remaining ones when `nrows` is None). The `family`
    column is parsed from its string form with `ast.literal_eval`.
    '''
    lines_to_skip = range(1, skiprows)
    return pd.read_csv(file_path,
                       nrows=nrows,
                       skiprows=lines_to_skip,
                       converters={'family': ast.literal_eval})

def get_document_by_line(file_path: str, line: int) -> str:
    '''Return the rulebook stored at file line `line` (1-based, the header
    being line 1).'''
    # the skipped range starts from 1 so that the header row is kept
    # https://stackoverflow.com/a/27325729/5587393
    single_row_df = get_df_with_docs(file_path, nrows=1, skiprows=line - 1)
    rulebooks = single_row_df['rulebook']
    return rulebooks.iloc[0]

def get_document_by_id(file_path: str, id: int) -> str:
    '''Return the rulebook of the boardgame whose `id` column equals `id`.

    The CSV is streamed one row at a time, so the whole dataset is never
    loaded in memory; the scan stops at the first matching row.

    Raises:
        KeyError: if no row of the file has the requested id.
    '''
    with pd.read_csv(file_path, chunksize=1, converters={'family': ast.literal_eval}) as reader:
        # iterating the reader ends cleanly at EOF, whereas calling next()
        # in an endless loop leaked a bare StopIteration for unknown ids
        for df in reader:
            if df['id'].iloc[0] == id:
                return df['rulebook'].iloc[0]

    raise KeyError(f'no boardgame with id {id} found in {file_path}')

# Sanity check: looking a rulebook up by boardgame id and by file line must
# return the same document (this asserts that id 2310 sits on line 40 of the
# cleaned dataset).
assert get_document_by_id(CLEANED_DATASET_FILE_PATH, 2310) == get_document_by_line(CLEANED_DATASET_FILE_PATH, 40)

## Cleaning and Preprocessing

In this part, the data is cleaned and processed using coreference resolution: every pronoun or other reference to something mentioned elsewhere in the text is replaced with what it refers to. The next 2 cells should be run only when you want to preprocess the data, which takes a lot of time.

In [None]:
# Install the coreference-resolution dependencies. The `!` lines are IPython
# shell commands and are only run when not working locally.
if not WORKING_LOCALLY:
    !pip install spacy-transformers
    !python3 -m pip install coreferee==1.3.*
    !python3 -m coreferee install en
    !python -m spacy download en_core_web_lg
    !python -m spacy download en_core_web_trf
    # hide the installation logs
    clear_output(wait=False)

In [None]:
import re

# e-mail-like tokens: word characters with optional dotted parts, an '@',
# then a domain made of at least two dot-separated parts
regex_mail = re.compile(r'\w+(?:\.\w+)*?@\w+(?:\.\w+)+')
# links starting with a scheme (http, https, ftp, file) or with "www"
# modified from https://stackoverflow.com/a/163684/5587393
regex_link = re.compile(r'(?:\b(?:(?:https?|ftp|file)://|www))[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#%=~_|]')
# a sentence must contain at least 4 words of at least 2 letters each
# (implemented as a lookahead anchored at the start of the sentence)
regex_at_least_4_words_in_sentence = re.compile(r"^(?=.*?(?:[,:;()'\"]?[a-zA-Z']{2,}[,:;()'\"]?(?: |-|$)(?:[^a-zA-Z]*?|[a-zA-Z]? ?)){4,})")
# a string like "first.Second" could be misinterpreted by the tokenizer as a single token;
# this regex matches a period that is not followed by whitespace or end of string,
# so that the text can be turned into "first. Second"
regex_distance_between_period_and_following_word = re.compile(r'\.(?!\s|$)')
# runs of 2 or more whitespace characters, to be compressed into one space
regex_multiple_spaces = re.compile(r'\s{2,}')
# interrupted words usually have a "- " at the end before the new line, 'inter- rupted' -> 'interrupted'
# NOTE: must be applied after whitespace compression
regex_interrupted_word = re.compile(r'([a-zA-Z])- ')
# runs of 2+ characters that are not letters, digits, whitespace, periods or
# parentheses (optionally followed by spaces and a number), and page numbers
# enclosed between two identical characters like = or -, for example "-12-"
regex_consecutive_meaningless_chars = re.compile(r'[^\.a-zA-Z0-9\s()]{2,} *(?:\d+)?|(?P<prepage>[^a-zA-Z\s\d\.])\d+(?P=prepage)')
# paragraph ids made of dot-separated numbers, '1.2.3' -> ''
regex_dot_separated_digits = re.compile(r'(?:\d+\.)+\d+')
# meaningless chars right after a sentence start, '. (- start' -> '. start'
regex_clean_start = re.compile(r'\.(\s?)[^a-zA-Z\s]+')
# missing apostrophes: a word followed by an isolated "t" or "s", "can t" -> "can't"
regex_missing_apices = re.compile(r"\b([a-zA-Z]+) (t|s)\b")

def clean_from_short_sentences(text: str) -> str:
    '''Drop every period-delimited sentence that does not satisfy
    `regex_at_least_4_words_in_sentence`; the surviving sentences are joined
    back together with a period.'''
    kept_sentences = []
    for candidate in text.split('.'):
        if regex_at_least_4_words_in_sentence.match(candidate) is None:
            continue
        kept_sentences.append(candidate)
    return '.'.join(kept_sentences)

def clean_text(text: str) -> str:
    '''Run the whole cleaning pipeline on `text` and return the result.
    The order of the steps matters.'''
    # --- removals: mails, links, paragraph ids, junk characters ---
    text = regex_mail.sub('', text)
    text = regex_link.sub('', text)
    text = regex_dot_separated_digits.sub('', text)
    text = regex_consecutive_meaningless_chars.sub('', text)
    text = regex_clean_start.sub(r'.\1', text)
    # every removal step must come before this line, so that the spaces
    # left behind are compressed here
    text = regex_multiple_spaces.sub(' ', text)
    # --- repairs, which rely on whitespace being already compressed ---
    text = regex_interrupted_word.sub(r'\1', text)
    text = regex_missing_apices.sub(r"\1'\2", text)
    text = clean_from_short_sentences(text)
    return regex_distance_between_period_and_following_word.sub('. ', text)

# Smoke test of the cleaning pipeline: the sample contains an e-mail, a page
# number, an interrupted word, repeated spaces, a too-short sentence, a
# missing apostrophe and a link.
test_text = 'this is a test (me@gmail.it) -12- that wi-  ll be   cleaned. with 2 5 6 not valid. two sentences can t be good http://or.not.'
cleaned_text = clean_text(test_text)
print(cleaned_text)
assert cleaned_text == 'this is a test () that will be cleaned. two sentences can\'t be good '

this is a test () that will be cleaned. two sentences can't be good 


In [None]:
from typing import List
import pandas as pd
import coreferee

def remove_columns_prefix(df: pd.DataFrame) -> None:
    '''Rename, in place, every column of `df` to the part of its name that
    follows the last dot (e.g. 'info.id' -> 'id'). Names without a dot are
    left unchanged.'''
    def _last_component(column_name):
        return column_name.rpartition('.')[2]

    df.rename(columns=_last_component, inplace=True)

def _get_new_token_from_resolve(token: spacy.tokens.Token, 
                                chains: coreferee.data_model.ChainHolder) -> str:
    '''Return the text that replaces `token` in the coreference-resolved
    document.

    When `chains.resolve(token)` returns None the original token text (with
    its trailing whitespace) is returned unchanged. Otherwise the texts of
    the resolved tokens are returned, each followed by a space and joined
    with 'and '.
    '''
    resolved_tokens = chains.resolve(token)
    if resolved_tokens is None:
        return token.text_with_ws

    return 'and '.join(res_token.text_with_ws + ' ' for res_token in resolved_tokens)

def _process_doc_for_coref(doc: spacy.tokens.Doc) -> str:
    '''Rebuild the text of `doc`, replacing each token with the text returned
    by `_get_new_token_from_resolve` for the coreference chains of `doc`.'''
    chains = doc._.coref_chains
    return ''.join(_get_new_token_from_resolve(token, chains) for token in doc)

def preprocess_texts(texts: List[str]) -> List[str]:
    '''Clean every text, resolve its coreferences and compress the repeated
    whitespace left by the replacement. One output string is returned per
    input text, in the same order.'''
    nlp = spacy.load('en_core_web_trf')
    nlp.add_pipe("coreferee")

    cleaned_texts = list(map(clean_text, texts))

    resolved_texts = []
    for doc in nlp.pipe(cleaned_texts):
        resolved_text = _process_doc_for_coref(doc)
        resolved_texts.append(regex_multiple_spaces.sub(' ', resolved_text))
    return resolved_texts
        
# Manual check of the preprocessing on a single rulebook of the raw dataset
# (the commented text below is a small alternative sample).
text = get_document_by_line(DATASET_FILE_PATH, 103)
# text = '''Although he was very busy with his work, the magical Peter had had enough of it. 
#     He and his wife decided they needed a holiday. 
#     this couple travelled to Spain because it loves the country very much.'''
preprocess_texts([text])

In [None]:
import pandas as pd
from typing import Generator, Tuple
import ast

# Safety switch: set to True only when the (slow) cleaning must really run.
START_CLEANING = False
# number of rulebooks read and preprocessed per iteration
CHUNK_SIZE = 20
# used below as the exclusive upper bound of the skipped-rows range
DATASET_ROWS = 381

# stops the execution of this cell unless the switch above is enabled
assert START_CLEANING == True # make sure you do not start preprocessing again

def clean_data_row(row, docs_dict: Generator[Tuple[int, str], None, None]):
    '''Replace the rulebook of `row` with the next preprocessed one.

    `docs_dict` yields (id, preprocessed rulebook) pairs; the id of the pair
    consumed here must match `row['id']`, otherwise an AssertionError is
    raised. The same (mutated) row is returned.
    '''
    boardgame_id, cleaned_rulebook = next(docs_dict)
    assert boardgame_id == row['id']
    row['rulebook'] = cleaned_rulebook
    return row

# Google Drive only exists on Colab: `drive` is imported only when not
# working locally, so every call on it must be guarded.
if not WORKING_LOCALLY:
    drive.mount('/content/drive')

# names assigned to the columns of the raw dataset (its header line is skipped)
column_names = ['rulebook', 'info.id', 'info.name', 'info.averageweight', 'info.playingtime', 'info.family']

# The raw dataset is processed CHUNK_SIZE rows at a time; `skip_rows` is the
# number of leading file lines (header included) already handled.
for skip_rows in range(1, DATASET_ROWS, CHUNK_SIZE):
    is_first_chunk = skip_rows == 1
    # ast.literal_eval converts the family column string into a python array
    df_dataset = pd.read_csv(DATASET_FILE_PATH, converters={ 'info.family': ast.literal_eval },
                            names=column_names, header=None,
                            nrows=CHUNK_SIZE, skiprows=skip_rows)
    remove_columns_prefix(df_dataset)
    logger.info(f"processing boardgames from {df_dataset.loc[0, 'id']} to {df_dataset.loc[df_dataset.index[-1], 'id']}")
    # pairs each id with its preprocessed rulebook; consumed row by row below
    docs_dict = zip(df_dataset['id'].values, preprocess_texts(df_dataset['rulebook'].values))

    df_cleaned_dataset = df_dataset.apply(lambda x: clean_data_row(x, docs_dict),
                                        axis='columns')

    # the first chunk creates the file and writes the header, the following
    # chunks are appended to it
    df_cleaned_dataset.to_csv(CLEANED_DATASET_FILE_PATH, 
                            header=is_first_chunk, index=False, 
                            mode='w' if is_first_chunk else 'a')

if not WORKING_LOCALLY:
    # unmount and mount again the drive once the cleaned dataset is written
    drive.flush_and_unmount()
    drive.mount('/content/drive')
df_cleaned_dataset.head()

## Luck metrics
These metrics are extracted using rule-based matching and dependency matching. Luck is one of the criteria that determine the board game (bg) weight. In this case, the sources of luck considered are:

- Dice rolling
- Drawing
- Shuffling
- Words like *random* or *randomly*

In [None]:
from spacy import displacy
from collections import namedtuple
from spacy.matcher import Matcher, DependencyMatcher

# TODO: flip a coin, flip sth like a coin
# One counter per source of luck looked for in a rulebook.
LuckMetrics = namedtuple(
    'LuckMetrics',
    'dice_based drawing_based shuffling_based random_based',
)

def get_luck_metrics(doc: spacy.tokens.Doc) -> LuckMetrics:
    '''Count the occurrences of each source of luck in `doc`.

    - random: tokens whose lemma is "random" or "randomly"
    - shuffle: tokens whose lemma is "shuffle"
    - drawing: the verb "draw" with a direct child noun "card"
    - dice: one of the verbs "use", "throw", "roll" with a direct child that
      is either the noun "die"/"dice" or a digit-only direct object

    The four raw match counts are returned as a `LuckMetrics`.
    '''
    vocab = doc.vocab

    # ---------- random ----------
    random_matcher = Matcher(vocab)
    random_matcher.add("random", [[{"LEMMA": {"IN": ["random", "randomly"]}}]])

    # ---------- shuffle ----------
    shuffle_matcher = Matcher(vocab)
    shuffle_matcher.add("shuffle", [[{"LEMMA": "shuffle"}]])

    # ---------- card drawing ----------
    draw_verb = {
        "RIGHT_ID": "drawing",
        "RIGHT_ATTRS": {"LEMMA": "draw", "POS": "VERB"},
    }
    drawn_card = {
        "LEFT_ID": "drawing",
        "REL_OP": ">",
        "RIGHT_ID": "card",
        "RIGHT_ATTRS": {
            "LEMMA": "card",
            "POS": "NOUN",
            "DEP": {"IN": ['dobj', 'nsubjpass', 'compound']},
        },
    }
    drawing_matcher = DependencyMatcher(vocab)
    drawing_matcher.add("drawing", [[draw_verb, drawn_card]])

    # ---------- dice rolling ----------
    def rolling_verb():
        # a fresh dict per pattern: the anchor is the same in both of them
        return {
            "RIGHT_ID": "rolling",
            "RIGHT_ATTRS": {"LEMMA": {"IN": ["use", "throw", "roll"]}, "POS": "VERB"},
        }

    rolled_die = {
        "LEFT_ID": "rolling",
        "REL_OP": ">",
        "RIGHT_ID": "dice_or_die",
        "RIGHT_ATTRS": {
            "LEMMA": {"IN": ["die", "dice"]},
            "POS": "NOUN",
            "DEP": {"IN": ['nsubj', 'dobj', 'nsubjpass', 'compound']},
        },
    }
    rolled_number = {
        "LEFT_ID": "rolling",
        "REL_OP": ">",
        "RIGHT_ID": "number",
        "RIGHT_ATTRS": {
            "IS_DIGIT": True,
            "DEP": {"IN": ['dobj']},
        },
    }
    dice_matcher = DependencyMatcher(vocab)
    dice_matcher.add("diceroll", [
        [rolling_verb(), rolled_die],
        [rolling_verb(), rolled_number],
    ])

    dice_count = len(dice_matcher(doc))
    draw_count = len(drawing_matcher(doc))
    shuffle_count = len(shuffle_matcher(doc))
    random_count = len(random_matcher(doc))

    # TODO: needs normalization? (divide by rulebook length or tokens)

    return LuckMetrics(dice_based=dice_count,
                       drawing_based=draw_count,
                       shuffling_based=shuffle_count,
                       random_based=random_count)

# Try the luck metrics on one rulebook of the cleaned dataset.
text = get_document_by_line(CLEANED_DATASET_FILE_PATH, 130)

nlp = spacy.load('en_core_web_sm')
doc = nlp(clean_text(text))
# number of tokens and number of characters of the parsed text
print(len(doc), len(doc.text))
print(get_luck_metrics(doc))

# displacy.render(doc, style='dep', jupyter=True)

2607 12624
LuckMetrics(dice_based=12, drawing_based=4, shuffling_based=3, random_based=0)


In [54]:
# text = '''you can only take this because it can be outrageous. 
#     you can't take it. you could not also choose. you may never be sure of the result. 
#     you can decide the next thing. he has no other choice but to stop, another option is winning.'''

# Scratch cell: inspect how a short sentence is tagged and parsed.
text = '''In some cases Good and Goods could have different lemma'''

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
# `token.pos` is the integer id of the part of speech (`token.pos_` would be
# its string form), hence the list of numbers in the output
print([token.pos for token in doc])
print(len(doc), len(doc.text))

displacy.render(doc, style='dep', jupyter=True)

[85, 90, 92, 84, 89, 96, 87, 100, 84, 92]
10 55


## Amount of choices metrics
These metrics are extracted using rule-based matching and dependency matching. The number of choices available to each player is one of the criteria that determine the board game (bg) weight. In this case, I am considering:

- *can/could/may/decide/...*, with some exceptions:
    - negatives are not considered choices, like *cannot draw* or *don't choose*
    - *can* + *choose* and similar ones increase the *amount of choices* metrics by 1
- *choice/option*, except when there is a leading *no*.

In [None]:
from spacy import displacy
from collections import namedtuple
from spacy.matcher import Matcher, DependencyMatcher

def get_choices_amount_metric(doc: spacy.tokens.Doc) -> int:
    '''Count the tokens of `doc` that express a choice left to the player.

    The result is the sum of two counts:

    - verbs/auxiliaries with lemma can, could, may, decide, select, choose
      or opt, except the ones anchoring one of the exception patterns
      (restricted by not/only/never, or a modal attached to a decision verb,
      so that "can choose" counts once);
    - nouns with lemma choice or option, except the ones having "no" as
      determiner.
    '''
    vocab = doc.vocab

    # ---- node builders: a fresh dict is created for every pattern ----
    def modal_anchor():
        return {
            "RIGHT_ID": "can_could_may",
            "RIGHT_ATTRS": {"LEMMA": {"IN": ["can", "could", "may"]}, "POS": "AUX"},
        }

    def decision_verb(modal_id=None):
        node = {
            "RIGHT_ID": "decision_verb",
            "RIGHT_ATTRS": {
                "LEMMA": {"IN": ["decide", "select", "choose", "opt"]},
                "POS": "VERB",
            },
        }
        if modal_id is not None:
            # the modal is a direct dependent of the decision verb
            node["LEFT_ID"] = modal_id
            node["REL_OP"] = "<"
        return node

    def restricting_child(parent_id, node_id):
        return {
            "LEFT_ID": parent_id,
            "REL_OP": ">",
            "RIGHT_ID": node_id,
            "RIGHT_ATTRS": {
                "LEMMA": {"IN": ["not", "only", "never"]},
                "DEP": {"IN": ["advmod", "neg"]},
            },
        }

    # --------------  can/could/may/choose/select/... --------------
    # every candidate verb, identified by the index of its token
    verb_matcher = Matcher(vocab)
    verb_matcher.add('can_could_may', [[{
        "LEMMA": {"IN": ["can", "could", "may", "decide", "select", "choose", "opt"]},
        "POS": {"IN": ["AUX", "VERB"]},
    }]])
    verb_tokens = {start for _, start, _ in verb_matcher(doc)}

    verb_exceptions_matcher = DependencyMatcher(vocab)
    verb_exceptions_matcher.add('can_could_may_exceptions', [
        # ❌ can not/only/never verb
        [
            modal_anchor(),
            {
                "LEFT_ID": "can_could_may",
                "REL_OP": "<",
                "RIGHT_ID": "generic_verb",
                "RIGHT_ATTRS": {"POS": {"IN": ["AUX", "VERB"]}},
            },
            restricting_child("generic_verb", "neg_or_only"),
        ],
        # ❌ not/only/never choose
        [
            decision_verb(),
            restricting_child("decision_verb", "negation"),
        ],
        # ❌ can + choose are counted as 1. can token is left out
        [
            modal_anchor(),
            decision_verb(modal_id="can_could_may"),
        ],
    ])
    # the first token id of a match belongs to the anchor node of its pattern
    excluded_verb_tokens = {token_ids[0] for _, token_ids in verb_exceptions_matcher(doc)}

    # -------------- choice and option --------------
    noun_matcher = Matcher(vocab)
    noun_matcher.add('choice_option', [[{
        "LEMMA": {"IN": ["choice", "option"]},
        "POS": "NOUN",
    }]])
    noun_tokens = {start for _, start, _ in noun_matcher(doc)}

    noun_exceptions_matcher = DependencyMatcher(vocab)
    noun_exceptions_matcher.add('choice_option_exceptions', [
        [
            {
                "RIGHT_ID": "choice",
                "RIGHT_ATTRS": {"LEMMA": {"IN": ["choice", "option"]}, "POS": "NOUN"},
            },
            {
                "LEFT_ID": "choice",
                "REL_OP": ">",
                "RIGHT_ID": "prefix_no",
                "RIGHT_ATTRS": {"LEMMA": "no", "POS": "DET", "DEP": "det"},
            },
        ],
    ])
    excluded_noun_tokens = {token_ids[0] for _, token_ids in noun_exceptions_matcher(doc)}

    counted_verbs = verb_tokens - excluded_verb_tokens
    counted_nouns = noun_tokens - excluded_noun_tokens
    return len(counted_verbs) + len(counted_nouns)

# Try the choices metric on one rulebook of the cleaned dataset
# (the commented text below is a small hand-written alternative).
text = get_document_by_line(CLEANED_DATASET_FILE_PATH, 130)
# text = '''you can only take this because it can be outrageous. 
#     you can't take it. you can not also choose. you can never be sure of the result. 
#     you can decide the next thing, or you choose the target. another choice is to win. 
#     but there is no right option.'''

nlp = spacy.load('en_core_web_sm')
doc = nlp(clean_text(text))
# token index paired with its lemma, to locate the matched tokens by hand
print([(i, token.lemma_) for i, token in enumerate(doc)])
print(len(doc), len(doc.text))
print(get_choices_amount_metric(doc))

# displacy.render(doc, style='dep', jupyter=True)

[(0, 'page'), (1, '1'), (2, 'of'), (3, '4'), (4, '"'), (5, 'wizardology'), (6, '"'), (7, 'GAME'), (8, 'play'), (9, 'objective'), (10, ':'), (11, 'to'), (12, 'become'), (13, 'a'), (14, 'Master'), (15, 'Wizard'), (16, 'by'), (17, 'collect'), (18, '4'), (19, 'talisman'), (20, '('), (21, 'a'), (22, 'familiar'), (23, ','), (24, 'a'), (25, 'wizard'), (26, "'s"), (27, 'hat'), (28, ','), (29, 'a'), (30, 'staff'), (31, ','), (32, 'and'), (33, 'an'), (34, 'amulet'), (35, ')'), (36, 'and'), (37, 'free'), (38, 'Merlin'), (39, "'s"), (40, 'spirit'), (41, 'from'), (42, 'the'), (43, 'old'), (44, 'oak'), (45, 'tree'), (46, '.'), (47, 'place'), (48, 'the'), (49, 'maze'), (50, 'board'), (51, 'in'), (52, 'the'), (53, 'center'), (54, 'of'), (55, 'the'), (56, 'table'), (57, '.'), (58, 'place'), (59, 'a'), (60, 'Spirit'), (61, 'Chamber'), (62, 'at'), (63, 'each'), (64, 'of'), (65, 'the'), (66, '4'), (67, 'open'), (68, 'outer'), (69, 'doorway'), (70, 'of'), (71, 'the'), (72, 'game'), (73, 'board'), (74, '.')

In [30]:
# Scratch cell: inspect lemma, head and dependency label of every token of a
# lower-cased sample sentence.
text = '''If at Low or High altitude, place a Low or High altitude marker on the Leader (Players will need to make Players own Low & High altitude markers for this).'''

# text = get_document_by_line(CLEANED_DATASET_FILE_PATH, 130)

nlp = spacy.load('en_core_web_sm')
doc = nlp(text.lower())
print([(token.lemma_, token.head, token.dep_) for token in doc])
print(len(doc), len(doc.text))

displacy.render(doc, style='dep', jupyter=True)

[('if', at, 'mark'), ('at', place, 'prep'), ('low', altitude, 'amod'), ('or', low, 'cc'), ('high', low, 'conj'), ('altitude', at, 'pobj'), (',', place, 'punct'), ('place', need, 'advcl'), ('a', marker, 'det'), ('low', altitude, 'amod'), ('or', low, 'cc'), ('high', low, 'conj'), ('altitude', marker, 'compound'), ('marker', place, 'dobj'), ('on', marker, 'prep'), ('the', leader, 'det'), ('leader', on, 'pobj'), ('(', need, 'punct'), ('player', need, 'nsubj'), ('will', need, 'aux'), ('need', need, 'ROOT'), ('to', make, 'aux'), ('make', need, 'xcomp'), ('player', own, 'nsubj'), ('own', make, 'ccomp'), ('low', markers, 'amod'), ('&', low, 'cc'), ('high', low, 'conj'), ('altitude', low, 'conj'), ('marker', own, 'dobj'), ('for', own, 'prep'), ('this', for, 'pobj'), (')', need, 'punct'), ('.', need, 'punct')]
34 155


In [None]:
# Scratch cell: show the default stop words of the model, then the text and
# the named entities found in one rulebook of the cleaned dataset.
nlp = spacy.load('en_core_web_sm')
print(nlp.Defaults.stop_words)
doc = nlp(get_document_by_line(CLEANED_DATASET_FILE_PATH, 130))
display(doc.text)

for ent in doc.ents:
    print(ent.text)


{'besides', 'them', 'whose', 'n’t', 'not', 'serious', 'could', 'have', 'might', 'whenever', 'after', 'both', 'together', 'may', 'sixty', 'used', 'back', '’ve', 'these', 'get', 'too', 'such', 'and', 'to', '‘s', 'where', 'does', 'about', 'whereafter', 'therein', 'thence', 'down', 'latterly', 'any', 'with', 'seeming', 'between', 'four', 'themselves', 'often', 'ourselves', 'so', 'almost', 'however', 'my', 'front', 'n‘t', 'herself', 'beside', 'its', 'their', 'were', 'make', 'now', 'others', 'be', 'top', 'regarding', 'over', 'seemed', 'i', 'third', 'what', 'herein', 'using', 'thru', 'already', 'ca', 'can', 'few', 'being', 're', 'anything', 'least', 'everything', 'which', 'put', 'below', 'never', 'just', 'would', 'whither', '’m', 'hence', 'when', 'first', 'per', 'do', 'before', 'our', 'move', '‘m', 'else', 'through', 'afterwards', 'nobody', 'whole', 'yourself', 'against', 'empty', 'nine', 'towards', 'well', 'behind', 'therefore', 'formerly', 'two', 'nevertheless', 'hereupon', 'him', 'perhaps'

'Page 1 of 4 "WIZARDOLOGY" GAME PLAY OBJECTIVE: To become a Master Wizard by collecting 4 talismans (a familiar, a wizard\'s hat, a staff, and an amulet) and free Merlin\'s spirit from the old oak tree. Place the maze board in the center of the table. Place a Spirit Chamber at each of the 4 open outer doorways of the game board. Place the familiar die and the familiars in the Water Spirit Chamber. Place the Dragon Medallion and the wizard hats in the Air Spirit Chamber. Place the cup with the wizard staffs inside cup in the Fire Spirit Chamber. Place the magic wand, levitation magnets and amulets in the Earth Spirit Chamber. Each player selects a wizard whose path wizard or player wishes to follow and places the wizard figure in the center of the maze. Shuffle the Magical Item Cards, Crystal Ball Cards, and Phoenix Feather Cards. Keep each deck separate and put Cards and Cards and Cards to the side. Place the Spells and Potions book and dice off to the side . PLAYING THE GAME: Your goa

1
4
Merlin
4
the Water Spirit Chamber
the Dragon Medallion
the Air Spirit Chamber
the Fire Spirit Chamber
the Earth Spirit Chamber
Crystal Ball Cards
Phoenix Feather Cards
Cards and Cards and Cards
Spells
Potions
4
first
first
4
Spirit Chambers
the Spirit Chambers
5
only 2
2
3
4
5
the "Open Sesame
1
one
Magic Item
Item Cards
Animal Magic Guide Book
the Air Spirit Chamber
the Fire Spirit Chamber Ring of Power: Key
the Earth Spirit Chamber
4
Chamber
Spirit
Item Card
Spirit Chamber
Item Card
the Spirit Chamber
genie
Fairy Flag
the Spirit Chamber
5
Spirit
Kee
one
the Spirit Chamber
All Play
Item Card
Vivienne
Circle
Elf Charms
Wind Knot
a quarter
Spirit
Spirit
False Prophecy
Phoenix Feathers
one
Phoenix
3
Seven
Open Sesame" Card
Broomstick Card
4
Spirit Chambers
Magic Carpet
the Spirit Chambers
at least one
Magic Carpet Card
Merlin
3
Spells
One
Spells
The Alchemy Lab: Rolling
Spells
the Alchemy Lab
Spells
only 3
Secret Path
Secret Path
Secret Path
Crystal Ball
20
Spirit
Gladde
First
the Do

In [None]:
# from multi_rake import Rake
# from summa import keywords
import yake

# Cleaned rulebook used to try out the keyword extractors defined below.
nlp = spacy.load('en_core_web_sm')
text = clean_text(get_document_by_line(CLEANED_DATASET_FILE_PATH, 155))

# def use_rake(text: str):
#     rake = Rake()
#     keywords = rake.apply(text)
#     return keywords[:30]

def use_yake(text: str):
    '''Extract keywords from `text` with YAKE: the extractor is asked for the
    top 20 keywords and only those scoring below 0.1 are returned, as a list
    of strings.'''
    extractor = yake.KeywordExtractor(top=20)

    selected_keywords = []
    for keyword, score in extractor.extract_keywords(text):
        if score < 0.1:
            selected_keywords.append(keyword)
    return selected_keywords

# def use_TextRank(text: str):
#     TR_keywords = keywords.keywords(text, scores=True)
#     return TR_keywords

# display(use_rake(text))
# display(use_TextRank(text))
use_yake(text)

['property',
 'resources',
 'resource',
 'card',
 'develop',
 'deed',
 'Ace',
 'resource tokens',
 'player',
 'district',
 'Grand Duke',
 'tokens',
 'collect resources',
 'collect',
 'cards',
 'Duke',
 'properties',
 'matching',
 'Court',
 'turn']

In [5]:
from spacy.language import Language
from spacy.lang.en import EnglishLemmatizer
from spacy.vocab import Vocab
from typing import Optional, Any, Callable, List

# class TokenWithLowerText:
#     def __init__(self, wrapped_token: spacy.tokens.Token):
#         self.__wrapped_token = wrapped_token
#         self.text = wrapped_token.text.lower()
#         self.pos_ = 'NOUN' if self.__wrapped_token.pos_ == 'PROPN' else self.__wrapped_token.pos_
#         # 92 is pos of NOUN , 96 is pos of PROPN
#         self.pos = 92 if self.__wrapped_token.pos == 96 else self.__wrapped_token.pos
#         self.orth = self.__wrapped_token.vocab.strings[self.text]
        

#     def __getattr__(self, name):
#         # print(f"{self.__wrapped_token.text}.{name} = {getattr(self.__wrapped_token, name)}")
#         return getattr(self.__wrapped_token, name)

# class LowerLemmatizer(EnglishLemmatizer):
#     def __init__(self,
#                  vocab: Vocab,
#                  lookup_tables: Any,
#                  model: Optional[Any],
#                  name: str,
#                  *,
#                  mode: str = "lookup",
#                  scorer: Optional[Callable]):
#         super().__init__(vocab, model, name, mode=mode, overwrite=True, scorer=scorer)
#         self.initialize(lookups=lemmatizer_lookup_tables)

#     def is_base_form(self, token: spacy.tokens.Token) -> bool:
#         return True if token.pos_ == 'PROPN' else super().is_base_form(token)

#     def rule_lemmatize(self, token: spacy.tokens.Token) -> List[str]:
#         return super().rule_lemmatize(TokenWithLowerText(token))

#     def lookup_lemmatize(self, token: spacy.tokens.Token) -> List[str]:
#         return super().lookup_lemmatize(TokenWithLowerText(token))

# if not Language.has_factory('lower_lemmatizer'):
#     nlp = spacy.load('en_core_web_sm')
#     lemmatizer_lookup_tables = nlp.get_pipe("lemmatizer").lookups
#     assert lemmatizer_lookup_tables is not None

#     @Language.factory(
#         "lower_lemmatizer",
#         assigns=["token.lemma"],
#         default_config={
#             "model": None,
#             "mode": "lookup",
#             "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
#         },
#         default_score_weights={"lemma_acc": 1.0},
#     )
#     def make_lower_lemmatizer(nlp: Language,
#                               model: Optional[Any],
#                               name: str,
#                               mode: str,
#                               scorer: Optional[Callable]
#                               ):
#         return LowerLemmatizer(
#             nlp.vocab, lemmatizer_lookup_tables, 
#             model, name, mode=mode, scorer=scorer
#         )

# text = """Axis Empires: Dai Senso! Living Rules (February 2014) 61 Design Note: Although these two countries can't be conquered, countries can be subdued through the Axis Forced Settlement Conditional Event () or an Allied Surrender Conditional Event ( and )."""
# nlp = spacy.load('en_core_web_sm')
# # nlp.replace_pipe('lemmatizer', 'lower_lemmatizer', config={"mode": "rule"})
# doc = nlp(text)
# print([token.lemma_ for token in doc])
#['sometimes', ',', 'good', 'and', 'good', 'be', 'different', 'lemma', ',', 'not', 'this', 'conditional', 'time', '.', 'the', 'same', 'go', 'for', 'rule', 'and', 'rule']
#['sometimes', ',', 'goods', 'and', 'good', 'are', 'different', 'lemma', ',', 'not', 'this', 'conditional', 'time', '.', 'the', 'same', 'go', 'for', 'rule', 'and', 'rule']

['Axis', 'Empires', ':', 'Dai', 'Senso', '!', 'live', 'rule', '(', 'February', '2014', ')', '61', 'design', 'note', ':', 'although', 'these', 'two', 'country', 'can', 'not', 'be', 'conquer', ',', 'country', 'can', 'be', 'subdue', 'through', 'the', 'Axis', 'Forced', 'Settlement', 'Conditional', 'Event', '(', ')', 'or', 'an', 'Allied', 'Surrender', 'Conditional', 'event', '(', 'and', ')', '.']
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7efc03b1f590>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7efc03b1f440>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7efc03b50e50>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7efc02c59730>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7efc02c5df50>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7efc02d0ff50>)]


In [27]:
import re
from collections import Counter, defaultdict
from typing import List, Set, Dict
from nltk.util import ngrams
# import yake
import numpy as np
from string import punctuation

# matches a word boundary (not referenced by the code visible in this cell)
regex_word_within_boundaries = re.compile(r'\b')
# minimum number of occurrences a noun needs to be considered a component
MIN_TOKEN_TO_BE_CONSIDERED_COMPONENT = 4
# NOTE(review): not referenced by the code visible in this cell
MAX_COMPONENTS = 100

# Lemmas that are never considered boardgame components: generic rulebook
# vocabulary plus the default stop words of the model.
# NOTE: relies on the global `nlp` created by a previously executed cell.
ignored_words = {
        'amount', 'beginning', 'board', 'book', 'bottom', 'case', 'choice', 
        'clarification', 'clockwise', 'condition', 'cost', 'design', 'effect', 
        'end', 'example', 'face', 'front', 'game', 'left', 'middle', 'note', 'number', 
        'opponent', 'option', 'order', 'overview', 'page', 'play', 'player', 
        'purpose', 'reference', 'result', 'right', 'rule', 'rulebook', 
        'section', 'set', 'setup', 'side', 'summary', 'start', 'step', 'thing',
        'type', 'tie', 'time', 'top', 'total', 'use', 'value', 'version', 'way'
        }.union(nlp.Defaults.stop_words)

def find_most_common_nouns(doc: spacy.tokens.Doc) -> Dict[str, List[spacy.tokens.Token]]:
    '''Group the noun occurrences of `doc` by lower-cased lemma.

    Only tokens of at least 3 characters, tagged NOUN or PROPN and acting as
    subject or object (nsubj, dobj, nsubjpass, pobj) are collected. No
    frequency filter is applied here: every qualifying lemma is returned.
    '''
    noun_tags = ('NOUN', 'PROPN')
    subject_or_object_deps = ('nsubj', 'dobj', 'nsubjpass', 'pobj')

    occurrences_by_lemma = defaultdict(list)
    for candidate in doc:
        if len(candidate) < 3:
            continue
        if candidate.pos_ not in noun_tags:
            continue
        if candidate.dep_ not in subject_or_object_deps:
            continue
        occurrences_by_lemma[candidate.lemma_.lower()].append(candidate)

    return occurrences_by_lemma

def _is_token_part_of_bigram(token: spacy.tokens.Token, 
                             unigram_token: spacy.tokens.Token) -> bool:
    '''Tell whether `token` forms a compound with `unigram_token`: it must be
    a NOUN/PROPN with the `compound` dependency whose head is
    `unigram_token`.'''
    # membership must be tested with `in`: an identity check (`is`) against a
    # freshly built list is always False
    return token.dep_ == 'compound' and token.pos_ in ['NOUN', 'PROPN'] and \
        token.head.i == unigram_token.i

def _find_most_relevant_ngram(doc: spacy.tokens.Doc,
                              unigrams: Dict[str, List[spacy.tokens.Token]],
                              n_grams: int) -> Dict[str, Set[str]]:
    '''Collect, for every unigram, the lemmas of the tokens that immediately
    precede one of its occurrences and form a bigram with it.

    Args:
        doc: the document the unigram tokens belong to.
        unigrams: unigram name -> occurrences of that unigram in `doc`.
        n_grams: currently unused, only bigrams are looked for.

    Returns:
        unigram name -> set of lemmas of the preceding compound tokens;
        unigrams without any associated bigram are not present.
    '''
    bigram_associated_dict = defaultdict(set)
    for name, tokens in unigrams.items():
        for token in tokens:
            # the first token of the document has no predecessor
            if token.i == 0:
                continue
            previous_token = doc[token.i - 1]
            if _is_token_part_of_bigram(previous_token, token):
                bigram_associated_dict[name].add(previous_token.lemma_)

    # the mapping used to be built and then discarded
    return bigram_associated_dict



def _find_most_relevant_unigram_nouns(doc: spacy.tokens.Doc, nlp: spacy.Language) -> Dict[str, List[int]]:
    '''select the noun lemmas of `doc` that are likely to be board game components.

    starting from the result of `find_most_common_nouns`, a lemma is kept when:
    - it is not listed in `ignored_words`
    - it has at least `MIN_TOKEN_TO_BE_CONSIDERED_COMPONENT` retained occurrences
    - at least one occurrence is a subject (active or passive) or a direct object

    the returned dict maps each kept lemma to its list of occurrences, as produced
    by `find_most_common_nouns`. `nlp` is accepted but not used.'''
    core_dependencies = ('nsubj', 'nsubjpass', 'dobj')
    selected_components = {}

    for lemma, occurrences in find_most_common_nouns(doc).items():
        if lemma in ignored_words:
            continue
        if len(occurrences) < MIN_TOKEN_TO_BE_CONSIDERED_COMPONENT:
            continue
        if not any(occurrence.dep_ in core_dependencies for occurrence in occurrences):
            continue
        selected_components[lemma] = occurrences

    return selected_components

def get_bg_components(text: str) -> Dict[str, List[int]]:
    '''extract the candidate board game components mentioned in a rulebook.

    the text is lowercased and parsed with the `en_core_web_sm` spaCy model (loaded
    on every call), then filtered through `_find_most_relevant_unigram_nouns`.

    returns the keys view of the selected components, i.e. only the lemma names,
    despite the return annotation.'''
    pipeline = spacy.load('en_core_web_sm')
    parsed_rulebook = pipeline(text.lower())

    # a second pass based on keyword analysis, to be intersected with these
    # candidates, was experimented with and is currently disabled
    return _find_most_relevant_unigram_nouns(parsed_rulebook, pipeline).keys()

def get_doc_variance(doc: spacy.tokens.Doc, components_dict: Dict[str, List[spacy.tokens.Token]]) -> float:
    '''measure how much the components interleave in the text.

    a high value could mean that the rules involve many components at once and are
    therefore more complex. for each component the variance (`np.var`) of the
    positions (`token.i`) of its occurrences is computed and weighted by the
    relative frequency of the component, that is its number of occurrences divided
    by the occurrences of all the components. the weighted variances are summed
    and the result is normalized by the number of tokens in `doc`.

    `components_dict` maps each component name to the list of its token
    occurrences. components without occurrences are skipped.

    returns 0.0 when `doc` is empty or when there are no occurrences at all,
    instead of dividing by zero.'''
    doc_len = len(doc)
    tokens_count = sum(len(tokens) for tokens in components_dict.values())
    if doc_len == 0 or tokens_count == 0:
        return 0.0

    # empty lists are skipped: np.var([]) is NaN and would poison the whole sum
    weighted_variance = sum(
        (len(tokens) / tokens_count) * np.var([token.i for token in tokens])
        for tokens in components_dict.values() if tokens)
    return float(weighted_variance / doc_len)


# Manual check: extract the components from the rulebook stored at line 149
# (header included) of the cleaned dataset, then show them and their count.
# `text` and `components` stay available as globals for the following cells.
text = get_document_by_line(CLEANED_DATASET_FILE_PATH, 149)
components = get_bg_components(text)
display(components)
print(len(components))
# Component lists kept for comparison (presumably outputs of earlier runs on another rulebook):
#dict_keys(['dice', 'good', 'monument', 'disaster', 'point', 'sheet', 'peg', 'star', 'turn', 'food', 'city', 'development', 'excess', 'goods', 'wood', 'box', 'score'])
#dict_keys(['dice', 'good', 'monument', 'disaster', 'point', 'sheet', 'peg', 'star', 'turn', 'food', 'city', 'development', 'excess', 'wood', 'worker', 'box', 'score'])
# NOTE(review): disabled call; no global `doc` is visible here and `components` is a
# keys view, while get_doc_variance reads `.values()` - adapt before re-enabling.
# print(get_doc_variance(doc, components))

dict_keys(['circus', 'bomber', 'altitude', 'aircraft', 'gunner', 'leader', 'roll', 'mission', 'pilot', 'repair', 'card', 'table', 'attack', 'change', 'horsepower', 'turn', 'limit', 'hand', 'performance', 'place', 'wingman', 'campaign', 'gun', 'burst', 'hit', 'enemy', 'propaganda', 'maneuver', 'position', 'counter', 'damage', 'target', 'deck', 'dogfight', 'fighter', 'die', 'season', 'week', 'flight', 'dice', 'patrol', 'area', 'point', 'skill', 'fire', 'formation', 'balloon', 'rocket', 'trench', 'kill', 'pfalz'])

51


In [22]:
s1 = {'sequence', 'hex', 'port', 'procedure', 'phase', 'control', 'combat', 'china', 'britain', 'russia', 'theater', 'axi', 'note', 'box', 'item', 'piece', 'turn', 'marker', 'event', 'instruction', 'card', 'die', 'count', 'aid', 'word', 'action', 'list', 'country', 'unit', 'faction', 'exception', 'symbol', 'map', 'term', 'road', 'base', 'line', 'japan', 'arrow', 'hexside', 'range', 'blitz', 'border', 'key', 'indochina', 'kiangsu', 'hopeh', 'capital', 'troop', 'restriction', 'dependent', 'priority', 'nagasaki', 'ally', 'counter', 'stripe', 'drm', 'roll', 'square', 'war', 'germany', 'pool', 'location', 'enemy', 'india', 'tokyo', 'land', 'city', 'korea', 'guadalcanal', 'zone', 'snow', 'area', 'bay', 'statu', 'basis', 'indie', 'batavia', 'check', 'activity', 'singapore', 'situation', 'display', 'following', 'track', 'deck', 'force', 'year', 'army', 'factor', 'information', 'garrison', 'ability', 'segment', 'infantry', 'armor', 'icon', 'placement', 'beachhead', 'supply', 'support', 'failure', 'france', 'space', 'state', 'requirement', 'esv', 'place', 'sun', 'crusade', 'victory', 'surrender', 'point', 'level', 'vp', 'score', 'table', 'replacement', 'hand', 'reserve', 'treaty', 'uscl', 'szechwan', 'selection', 'posture', 'government', 'branch', 'program', 'navy', 'mongolia', 'pre', 'production', 'limit', 'adjustment', 'source', 'manchukuo', 'region', 'minor', 'ezoc', 'nationality', 'cloudy', 'philippine', 'elimination', 'convoy', 'defense', 'advance', 'direction', 'mud', 'landing', 'policy', 'manila', 'fleet', 'path', 'strike', 'distinction', 'saigon', 'island', 'defender', 'shift', 'detachment', 'mode', 'australia', 'combination', 'breakdown', 'fortres', 'fort', 'conversion', 'allowance', 'heilungkiang', 'movement', 'overrun', 'strait', 'w5311', 'capacity', 'storm', 'passage', 'airdrop', 'attacker', 'attack', 'exploitation', 'column', 'retreat', 'los', 'loss', 'stack', 'weather', 'stalingrad', 'distance', 'empire', 'alignment', 'activation', 'reduction', 'reminder', 
'zoc', 'burma', 'peace', 'ceylon', 'kansu', 'truce', 'incident', 'neutral', 'acceptance', 'negotiation', 'lease', 'occupation', 'effort', 'politic'}
s2 = {'table', 'sequence', 'adjustment', 'hex', 'port', 'map', 'procedure', 'phase', 'control', 'combat', 'victory', 'france', 'russia', 'event', 'reactivation', 'china', 'britain', 'theater', 'day', 'axis', 'note', 'box', 'item', 'piece', 'turn', 'marker', 'instruction', 'card', 'die', 'count', 'aid', 'word', 'action', 'list', 'country', 'unit', 'faction', 'exception', 'symbol', 'term', 'city', 'base', 'line', 'road', 'japan', 'arrow', 'hexside', 'blitz', 'range', 'border', 'key', 'chinese', 'indochina', 'hopeh', 'troop', 'restriction', 'dependent', 'priority', 'nagasaki', 'ally', 'counter', 'stripe', 'delay', 'drm', 'roll', 'square', 'success', 'war', 'germany', 'pool', 'location', 'enemy', 'india', 'tokyo', 'land', 'korea', 'zone', 'snow', 'area', 'bay', 'status', 'basis', 'indie', 'batavia', 'check', 'kong', 'activity', 'fleet', 'singapore', 'situation', 'display', 'following', 'track', 'deck', 'force', 'year', 'army', 'factor', 'information', 'garrison', 'ability', 'segment', 'infantry', 'armor', 'fort', 'icon', 'placement', 'beachhead', 'supply', 'support', 'failure', 'space', 'state', 'requirement', 'point', 'esv', 'place', 'sun', 'crusade', 'surrender', 'level', 'vps', 'score', 'mandate', 'replacement', 'hand', 'reserve', 'influence', 'treaty', 'uscl', 'kiangsu', 'szechwan', 'selection', 'posture', 'government', 'branch', 'program', 'navy', 'mongolia', 'production', 'limit', 'empire', 'source', 'region', 'minor', 'ezoc', 'nationality', 'philippine', 'elimination', 'convoy', 'defense', 'advance', 'direction', 'mud', 'landing', 'policy', 'manila', 'path', 'strike', 'distinction', 'saigon', 'island', 'defender', 'shift', 'detachment', 'mode', 'australia', 'combination', 'breakdown', 'conversion', 'allowance', 'heilungkiang', 'movement', 'overrun', 'terrain', 'strait', 'w5311', 'capacity', 'storm', 'passage', 'capital', 'airdrop', 'attacker', 'attack', 'exploitation', 'column', 'retreat', 'loss', 'dr3', 'stack', 'weather', 'distance', 'alignment', 'activation', 
'reduction', 'reminder', 'zoc', 'kamchatka', 'rain', 'peace', 'neutrality', 'increase', 'marker(s', 'ceylon', 'kansu', 'truce', 'incident', 'acceptance', 'negotiation', 'settlement', 'lease', 'loc', 'occupation', 'effort', 'politic'}
# Show the raw rulebook text stored at line 149 (header included) of the cleaned dataset.
display(get_document_by_line(CLEANED_DATASET_FILE_PATH, 149))
# Elements of `comp2` that are missing from `components`.
# NOTE(review): `comp2` is not defined in any visible cell (it presumably comes from
# leftover notebook state) and `s1`/`s2` above are not used here - confirm the intent.
set(comp2).difference(set(components))

"FC Sol 1 Flying Circus: Solitaire & Campaign These unofficial rules have been put together by me for use with DVG Games Flying Circus. rules include some house rules for some of the things that should really have been included in the game's rules (but which were only included in the Bombers and Campaigns expansion which is currently unavailable in the UK). I ve also included my own solitaire rules and a solitaire campaign. CONTENTS Page House Rules 1 Altitude 2 2 Aircraft with Gunners 3 3 Fokker Dr. Triplane 3 4 Rule Clarifications Leaders vs Wingmen 4 Who Leaders can Target 4 Vertical Rolls 4 5 Solitaire Rules Enemy Wingmen 5 Enemy Leaders 5 Targeting Enemy Wingmen 6 Targeting Enemy Leaders 7 Enemy Leaders with Gunners 7 Enemy Fokker Dr. 7 Enemy Vertical Rolls 7 6 Solitaire Campaign Set-up 8 Recon Escort Mission 8 Bomber Escort Mission 9 Balloon Busting Mission 10 Trench Attack Mission 12 Patrol Mission 12 Getting Back Safely 13 Shot Down Pilots 13 Gaining Experience 13 Aircraft Repa

{'action',
 'bonus',
 'building',
 'card',
 'circle',
 'city',
 'connection',
 'hall',
 'house',
 'icon',
 'lane',
 'pirate',
 'population',
 'prison',
 'token'}

In [None]:
import itertools
from collections import Counter
from typing import List

def find_n_most_common_nouns(n, docs: List[spacy.tokens.Doc]) -> List[str]:
    '''rank the noun lemmas by the number of documents they appear in.

    each document contributes at most once per lemma, because the lemmas returned
    by `find_most_common_nouns` are deduplicated per document before counting.

    returns the `n` most frequent entries as `(lemma, document_count)` pairs, in
    the order given by `Counter.most_common` (note that the return annotation only
    mentions the lemma).'''
    document_frequency = Counter()
    for doc in docs:
        document_frequency.update(set(find_most_common_nouns(doc)))
    return document_frequency.most_common(n)
    

# Load the small English spaCy pipeline; `nlp` is a global reused by other cells.
nlp = spacy.load('en_core_web_sm')
# Read 100 rows of the cleaned dataset, passing 200 as `skiprows` to get_df_with_docs.
df_dataset = get_df_with_docs(CLEANED_DATASET_FILE_PATH, 100, 200)
# Parse every rulebook of the sample with spaCy.
docs = nlp.pipe(df_dataset['rulebook'].values)

# Show the 80 noun lemmas shared by the largest number of rulebooks
# (useful to spot generic words that are candidates for `ignored_words`).
find_n_most_common_nouns(80, docs)

[('player', 99),
 ('game', 97),
 ('turn', 90),
 ('end', 83),
 ('time', 83),
 ('number', 83),
 ('card', 81),
 ('order', 81),
 ('point', 73),
 ('action', 69),
 ('rule', 69),
 ('side', 64),
 ('deck', 64),
 ('table', 64),
 ('hand', 63),
 ('space', 63),
 ('front', 62),
 ('value', 61),
 ('place', 61),
 ('example', 59),
 ('pile', 56),
 ('play', 56),
 ('top', 56),
 ('type', 55),
 ('effect', 53),
 ('round', 53),
 ('marker', 52),
 ('phase', 51),
 ('board', 51),
 ('way', 50),
 ('case', 50),
 ('choice', 48),
 ('area', 47),
 ('one', 46),
 ('color', 45),
 ('box', 45),
 ('track', 44),
 ('tie', 44),
 ('right', 44),
 ('total', 42),
 ('cards', 42),
 ('face', 42),
 ('players', 41),
 ('cost', 41),
 ('step', 40),
 ('rest', 40),
 ('bottom', 40),
 ('opponent', 40),
 ('start', 38),
 ('ability', 37),
 ('symbol', 37),
 ('beginning', 37),
 ('die', 36),
 ('page', 36),
 ('result', 36),
 ('set', 35),
 ('movement', 35),
 ('condition', 34),
 ('setup', 34),
 ('token', 33),
 ('stack', 33),
 ('amount', 33),
 ('part', 33

In [None]:
from typing import Tuple
import pandas as pd
import ast
import os
from IPython.display import clear_output

def get_rules_features(id: int, doc: spacy.tokens.Doc) -> Tuple[int, float]:
    '''compute the rule-related features of a board game rulebook.

    `id` is the board game identifier (only used for logging) and `doc` is the
    parsed rulebook. the extracted components are currently only printed.

    returns the pair `(rule_count, avg_rule_len)`; both values are placeholders
    set to 0 until the rule extraction is implemented.'''
    logger.info(f'processing board game {id}')
    # get_bg_components expects the raw rulebook string (it lowercases and parses
    # the text itself), so the text of the parsed document is passed, not the Doc
    bg_components = get_bg_components(doc.text)
    print(bg_components)

    # TODO: replace the placeholder with the real features, e.g.
    # rules = get_rules(text)
    # rule_count = len(rules)
    # return rule_count, len(text) / rule_count
    return 0, 0

def apply_for_rulebook_features(row, docs_dict):
    '''compute the rulebook features of a dataset row, to be used with `DataFrame.apply`.

    `docs_dict` is an iterator of `(id, doc)` pairs that must yield the documents in
    the same order as the rows: the next pair is consumed and its id is checked
    against `row.id`.

    returns a `pd.Series` with the entries `rule_count` and `avg_rule_len`.'''
    doc_entry = next(docs_dict)
    doc_id = doc_entry[0]
    # the documents and the rows must advance in lockstep
    assert doc_id == row.id

    rules_features = get_rules_features(row.id, doc_entry[1])
    return pd.Series(rules_features, index=['rule_count', 'avg_rule_len'])

# Destination of the processed dataset: local path or Google Drive path,
# depending on where the notebook is running.
PROCESSED_DATASET_FILE_PATH = 'data/processed_dataset.csv' if WORKING_LOCALLY \
    else '/content/drive/My Drive/Projects/IRBoardGameComplexity/processed_dataset.csv'

# ast.literal_eval converts the string stored in the family column into a Python list.
# Only the first data row is read (nrows=1).
df_dataset = pd.read_csv(CLEANED_DATASET_FILE_PATH, converters={ 'family': ast.literal_eval }, nrows=1)
# NOTE(review): remove_columns_prefix and clean_text are defined outside this cell;
# the former presumably renames the columns in place - confirm.
remove_columns_prefix(df_dataset)
# Despite its name this is a one-shot iterator of (id, parsed rulebook) pairs:
# it is consumed pair by pair, in row order, by apply_for_rulebook_features.
docs_dict = zip(df_dataset['id'].values, 
                nlp.pipe(map(clean_text, df_dataset['rulebook'].values)))

# Compute the rulebook features row by row and attach them to the selected columns.
df_rules_features = df_dataset.apply(lambda x: apply_for_rulebook_features(x, docs_dict),
                                     axis='columns')
df_features = df_dataset[['averageweight', 'playingtime', 'family']].join(df_rules_features)

# One-hot encoding of the "family" field: each list is joined with '|' and
# str.get_dummies creates one indicator column per distinct value.
# from https://stackoverflow.com/questions/71401193/one-hot-encoding-in-python-for-array-values-in-a-dataframe
df_features = df_features.join(df_features.pop('family').apply('|'.join).str.get_dummies())
df_features.head()

# Persisting the result is currently disabled:
# df_features.to_csv(PROCESSED_DATASET_FILE_PATH, header=True, index=False, mode='w')    
# if not WORKING_LOCALLY:
#     drive.flush_and_unmount()