<a href="https://colab.research.google.com/github/marco-luzzara/boardgame-complexity-predictor/blob/master/src/extract_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from IPython.display import clear_output
import os
# Treat the variable as a real on/off switch: unset, empty, or an explicit
# negative ('0', 'false', 'no', 'off', any letter case) means "not local".
# A plain bool(os.getenv(...)) would turn WORKING_LOCALLY=0 into True,
# because every non-empty string is truthy.
WORKING_LOCALLY = os.getenv('WORKING_LOCALLY', '').strip().lower() not in ('', '0', 'false', 'no', 'off')

# Pick the dataset location and, when not running locally, prepare the Colab
# runtime (mount Google Drive, install packages, download spaCy models).
if WORKING_LOCALLY:
    DATASET_FILE_PATH = 'data/dataset.csv'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    DATASET_FILE_PATH = '/content/drive/My Drive/Projects/IRBoardGameComplexity/dataset.csv'
    # NOTE(review): only coreferee is version-pinned; the other installs float.
    # NOTE(review): later cells import `multi_rake` and `summa` and load
    # 'en_core_web_sm', none of which is installed/downloaded here, while
    # 'en_core_web_lg' is downloaded but not loaded in the visible notebook.
    !pip install spacy-transformers
    !python3 -m pip install coreferee==1.3.*
    !python3 -m coreferee install en
    !python -m spacy download en_core_web_lg
    !python -m spacy download en_core_web_trf
    !pip install git+https://github.com/LIAAD/yake
    !pip install rake-nltk
    # hide the (long) installation logs
    clear_output(wait=False)


In [2]:
from IPython.display import clear_output
import spacy
from spacy import displacy

## +++++++++++ with fastcoref
# from fastcoref import spacy_component
# nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "ner", "textcat"])
# nlp.add_pipe("fastcoref")
#              #config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref'})

# # to remove tqdm progress bar: https://stackoverflow.com/questions/37091673/silence-tqdms-output-while-running-tests-or-running-the-code-via-cron
# from tqdm.auto import tqdm
# from functools import partialmethod
# tqdm.__init__ = partialmethod(tqdm.__init__, disable=True, ncols=0, nrows=0, gui=False, bar_format='', leave=False)

## +++++++++++ with coreferee
import coreferee
# Notebook-wide pipeline: transformer English model plus the coreferee
# coreference component.
# NOTE(review): later cells rebind `nlp` to 'en_core_web_sm', and
# preprocess_texts builds its own copy of this same pipeline.
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe("coreferee")

# hide the output produced while loading the model
clear_output(wait=False)

In [3]:
import logging

# Dedicated logger for this notebook. Existing handlers are cleared first so
# that re-running the cell does not stack one more StreamHandler each time.
logger = logging.getLogger('bgg_predict')
logger.handlers.clear()
handler = logging.StreamHandler()
formatter = logging.Formatter(
        '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
# Records would otherwise also travel up to the root logger; when the host
# installs a root handler (the saved output shows an extra
# "DEBUG:bgg_predict:test" line) every message is printed twice.
logger.propagate = False

logger.debug('test')

2022-11-23 20:43:11,700 bgg_predict  DEBUG    test
DEBUG:bgg_predict:test


In [18]:
import re

# e-mail addresses, e.g. 'me@gmail.it'
regex_mail = re.compile(r'\w+(?:\.\w+)*?@\w+(?:\.\w+)+')
# URLs starting with http(s)/ftp/file or 'www'
# modified from https://stackoverflow.com/a/163684/5587393
regex_link = re.compile(r'(?:\b(?:(?:https?|ftp|file)://|www))[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#%=~_|]')
# a sentence is kept only if it contains at least 4 words of at least 2 letters each
regex_at_least_4_words_in_sentence = re.compile(r"^(?=.*?(?:[,:;()'\"]?[a-zA-Z']{2,}[,:;()'\"]?(?: |-|$)(?:[^a-zA-Z]*?|[a-zA-Z]? ?)){4,})")
# a string like "first.Second" could be misinterpreted by the tokenizer as a single token;
# this matches a period that is not followed by whitespace or end of text, so that
# the substitution turns it into "first. Second"
regex_distance_between_period_and_following_word = re.compile(r'\.(?!\s|$)')
# runs of two or more whitespace characters (compressed to a single space by clean_text)
regex_multiple_spaces = re.compile(r'\s{2,}')
# words broken at a line end usually carry a "- " before the continuation, 'inter- rupted' -> 'interrupted'
# NOTE: must be applied after whitespace compression
regex_interrupted_word = re.compile(r'([a-zA-Z])- ')
# runs of meaningless characters, and page numbers enclosed in the same symbol
# on both sides (such as = or -), for example "-12-"
regex_consecutive_meaningless_chars = re.compile(r'[^\.a-zA-Z0-9\s()]{2,} *(?:\d+)?|(?P<prepage>[^a-zA-Z\s\d\.])\d+(?P=prepage)')
# paragraph ids made of dot-separated digits, '1.2.3' -> ''
regex_dot_separated_digits = re.compile(r'(?:\d+\.)+\d+')
# meaningless characters right after a sentence start, '. (- start' -> '. start'
regex_clean_start = re.compile(r'\.(\s?)[^a-zA-Z\s]+')
# contractions that lost their apostrophe, 'can t' -> "can't"
regex_missing_apices = re.compile(r"\b([a-zA-Z]+) (t|s)\b")

def clean_from_short_sentences(text: str) -> str:
    '''Keep only the '.'-separated pieces of `text` accepted by
    `regex_at_least_4_words_in_sentence`.

    The surviving pieces are joined back with '.' in their original order.
    '''
    kept_sentences = []
    for candidate in text.split('.'):
        if regex_at_least_4_words_in_sentence.match(candidate) is None:
            continue
        kept_sentences.append(candidate)
    return '.'.join(kept_sentences)

def clean_text(text: str) -> str:
    '''Run the whole cleaning pipeline on a raw rulebook text.

    The order of the steps matters: removals come first, then whitespace is
    compressed, then broken words and contractions are repaired, then short
    sentences are dropped and finally a space is enforced after every period.
    '''
    ordered_substitutions = (
        (regex_mail, ''),
        (regex_link, ''),
        (regex_dot_separated_digits, ''),
        (regex_consecutive_meaningless_chars, ''),
        (regex_clean_start, r'.\1'),
        # every step that removes something must stay above this one, so that
        # the spaces it leaves behind are compressed by regex_multiple_spaces
        (regex_multiple_spaces, ' '),
        (regex_interrupted_word, r'\1'),
        (regex_missing_apices, r"\1'\2"),
    )
    for pattern, replacement in ordered_substitutions:
        text = pattern.sub(replacement, text)

    text = clean_from_short_sentences(text)
    return regex_distance_between_period_and_following_word.sub('. ', text)

# Smoke test for clean_text: the sample contains an e-mail address, a page
# number ("-12-"), a word broken by "- ", repeated spaces, a sentence with
# too few words, a contraction without apostrophe and a link.
test_text = 'this is a test (me@gmail.it) -12- that wi-  ll be   cleaned. with 2 5 6 not valid. two sentences can t be good http://or.not.'
cleaned_text = clean_text(test_text)
print(cleaned_text)
# the expected value ends with a space and has no final period
assert cleaned_text == 'this is a test () that will be cleaned. two sentences can\'t be good '

this is a test () that will be cleaned. two sentences can't be good 


In [19]:
from typing import List
import pandas as pd
import coreferee

def remove_columns_prefix(df: pd.DataFrame) -> None:
    '''Rename the columns of `df` in place, keeping only the part after the
    last '.' (e.g. 'info.id' -> 'id'); names without a dot are left as they are.
    '''
    def _last_dotted_segment(column_name):
        return column_name.rpartition('.')[2]

    df.rename(columns=_last_dotted_segment, inplace=True)

def _get_new_token_from_resolve(token: spacy.tokens.Token, 
                                chains: coreferee.data_model.ChainHolder) -> spacy.tokens.Token:
    resolved_token = chains.resolve(token)
    return token.text_with_ws if resolved_token is None \
                              else ' and '.join([res_token.text_with_ws for res_token in resolved_token])   

def _process_doc_for_coref(doc: spacy.tokens.Doc) -> str:
    '''Rebuild the text of `doc`, replacing every token that belongs to a
    coreference chain with the text of what it refers to.

    The chains are read from `doc._.coref_chains`; each token is rewritten by
    `_get_new_token_from_resolve` and the pieces are concatenated in order.
    '''
    chains = doc._.coref_chains
    return ''.join(_get_new_token_from_resolve(token, chains) for token in doc)

def preprocess_texts(texts: List[str]) -> List[str]:
    '''Clean every text and resolve its coreferences.

    A fresh 'en_core_web_trf' pipeline with the coreferee component is loaded
    on each call; the texts go through `clean_text`, then through the pipeline,
    and each resulting document is rewritten by `_process_doc_for_coref`.
    The output list follows the order of `texts`.
    '''
    coref_pipeline = spacy.load('en_core_web_trf')
    coref_pipeline.add_pipe("coreferee")

    cleaned_texts = list(map(clean_text, texts))

    resolved_texts = []
    for doc in coref_pipeline.pipe(cleaned_texts):
        resolved_texts.append(_process_doc_for_coref(doc))
    return resolved_texts
        
# text = get_document_by_line(DATASET_FILE_PATH, 40)
text = '''Although he was very busy with his work, the magical Peter had had enough of it. 
    He and his wife decided they needed a holiday. 
    this couple travelled to Spain because it loves the country very much.'''
preprocess_texts([text])

['Although Peter was very busy with Peter work, the magical Peter had had enough of work. Peter and Peter wife decided Peter  and wife needed a holiday. this couple travelled to Spain because couple loves the Spain very much']

In [None]:
import pandas as pd
from typing import Generator, Tuple
import ast

# Destination of the cleaned dataset: a relative path when working locally,
# the mounted Google Drive folder otherwise.
CLEANED_DATASET_FILE_PATH = 'data/cleaned_dataset.csv' if WORKING_LOCALLY \
    else '/content/drive/My Drive/Projects/IRBoardGameComplexity/cleaned_dataset.csv'

def clean_data_row(row, docs_dict: Generator[Tuple[int, str], None, None]):
    '''Replace the 'rulebook' of `row` with the next preprocessed text.

    `docs_dict` yields (id, text) pairs in the same order as the rows; one
    pair is consumed per call and its id must equal `row['id']`, otherwise
    the assertion fails. The (mutated) row is returned.
    '''
    next_entry = next(docs_dict)
    expected_id = next_entry[0]
    assert expected_id == row['id']
    row['rulebook'] = next_entry[1]
    return row

# ast.literal_eval converts the family column string into a python array
# with pd.read_csv(DATASET_FILE_PATH, chunksize=5, converters={ 'info.family': ast.literal_eval }) as reader:
#     for df in reader:
# Load the whole raw dataset; 'info.family' is stored as the string form of a
# list and is parsed back with ast.literal_eval.
df_dataset = pd.read_csv(DATASET_FILE_PATH, converters={ 'info.family': ast.literal_eval })
remove_columns_prefix(df_dataset)
# zip gives a single-pass iterator of (id, preprocessed rulebook) pairs;
# clean_data_row pulls one pair per row, so rows must be visited in order.
docs_dict = zip(df_dataset['id'].values, preprocess_texts(df_dataset['rulebook'].values))

df_cleaned_dataset = df_dataset.apply(lambda x: clean_data_row(x, docs_dict),
                                     axis='columns')

# overwrite any previous export
df_cleaned_dataset.to_csv(CLEANED_DATASET_FILE_PATH, header=True, index=False, mode='w')
if not WORKING_LOCALLY:
    # `drive` is only imported by the setup cell when not working locally
    drive.flush_and_unmount()

df_cleaned_dataset.head()

In [6]:
import pandas as pd
import ast

def get_df_with_docs(file_path: str, nrows=None, skiprows=1) -> pd.DataFrame:
    '''Read at most `nrows` rows of the dataset, skipping file lines
    1 .. `skiprows` - 1 (line 0, the header, is always kept), and return them
    with the column prefixes removed.'''
    # the range starts at 1 so that the header line is never skipped
    lines_to_skip = range(1, skiprows)
    df_docs = pd.read_csv(file_path,
                          converters={ 'info.family': ast.literal_eval },
                          nrows=nrows,
                          skiprows=lines_to_skip)
    remove_columns_prefix(df_docs)
    return df_docs

def get_document_by_line(file_path: str, line: int) -> str:
    ''' Return the rulebook found at the given 1-based file `line`
    (the header counts as a line too). '''
    # skipping from line 1 keeps the header row https://stackoverflow.com/a/27325729/5587393
    single_row_df = get_df_with_docs(file_path, nrows=1, skiprows=line - 1)
    rulebooks = single_row_df['rulebook']
    return rulebooks.iloc[0]

def get_document_by_id(file_path: str, id: int) -> str:
    '''Return the rulebook of the first row of the raw dataset whose 'info.id' equals `id`.

    The file is scanned chunk by chunk, so it never has to be loaded entirely.

    Args:
        file_path: path of the raw CSV (columns still carry the 'info.' prefix).
        id: board game id to look for.

    Raises:
        ValueError: when no row has the requested id (a bare StopIteration
            used to escape from the reader in that case).
    '''
    # Only these two columns are needed, so the list-valued family column is
    # not parsed at all (the old converter was keyed on 'family', while the
    # other readers of this file use 'info.family').
    with pd.read_csv(file_path, chunksize=256, usecols=['info.id', 'rulebook']) as reader:
        for chunk in reader:
            matching_rulebooks = chunk.loc[chunk['info.id'] == id, 'rulebook']
            if not matching_rulebooks.empty:
                return matching_rulebooks.iloc[0]

    raise ValueError(f'no board game with id {id} found in {file_path}')

# Sanity check: the lookup by id and the lookup by file line must return the
# same rulebook (the id 2310 / line 40 pair is specific to this dataset file).
assert get_document_by_id(DATASET_FILE_PATH, 2310) == get_document_by_line(DATASET_FILE_PATH, 40)

Unnamed: 0,rulebook,id,name,averageweight,playingtime,family
0,Corrections or constructive criticisms? Em...,10,Elfenland,2.1579,60,[familygames]
1,RULES OF THE GAME FULL METAL PLANET You are ...,20,Full Metal Planète,3.1452,90,[strategygames]


In [None]:
from spacy import displacy
from collections import namedtuple
from spacy.matcher import Matcher, DependencyMatcher

# Result of get_luck_metrics: one match count per kind of luck-related wording
# (dice rolling, card drawing, shuffling, the words random/randomly).
LuckMetrics = namedtuple('LuckMetrics', ['dice_based', 'drawing_based', 'shuffling_based', 'random_based'])

def get_luck_metrics(doc: spacy.tokens.Doc) -> LuckMetrics:
    '''Count the luck-related constructions found in `doc`.

    Four matchers are run over the document and the raw number of matches of
    each one is returned as a LuckMetrics:
      * dice_based: a use/throw/roll verb governing "die"/"dice", or governing
        a digit token as direct object;
      * drawing_based: the verb "draw" governing the noun "card";
      * shuffling_based: the verb "shuffle";
      * random_based: the lemmas "random"/"randomly".
    '''
    vocab = doc.vocab

    # ---------- random ----------
    random_words = Matcher(vocab)
    random_words.add("random", [
        [{"LEMMA": {"IN": ["random", "randomly"]}}],
    ])

    # ---------- shuffle ----------
    shuffle_verbs = Matcher(vocab)
    shuffle_verbs.add("shuffle", [
        [{"LEMMA": "shuffle", "POS": "VERB"}],
    ])

    # ---------- card drawing ----------
    draw_verb = {"RIGHT_ID": "drawing",
                 "RIGHT_ATTRS": {"LEMMA": "draw", "POS": "VERB"}}
    drawn_card = {"LEFT_ID": "drawing", "REL_OP": ">", "RIGHT_ID": "card",
                  "RIGHT_ATTRS": {"LEMMA": "card",
                                  "POS": "NOUN",
                                  "DEP": {"IN": ['dobj', 'nsubjpass', 'compound']}}}
    card_drawing = DependencyMatcher(vocab)
    card_drawing.add("drawing", [[draw_verb, drawn_card]])

    # ---------- dice rolling ----------
    def rolling_verb():
        # a fresh dict for every pattern, so no pattern shares state with another
        return {"RIGHT_ID": "rolling",
                "RIGHT_ATTRS": {"LEMMA": {"IN": ["use", "throw", "roll"]}, "POS": "VERB"}}

    rolled_die = {"LEFT_ID": "rolling", "REL_OP": ">", "RIGHT_ID": "dice_or_die",
                  "RIGHT_ATTRS": {"LEMMA": {"IN": ["die", "dice"]},
                                  "POS": "NOUN",
                                  "DEP": {"IN": ['nsubj', 'dobj', 'nsubjpass', 'compound']}}}
    rolled_number = {"LEFT_ID": "rolling", "REL_OP": ">", "RIGHT_ID": "number",
                     "RIGHT_ATTRS": {"IS_DIGIT": True,
                                     "DEP": {"IN": ['dobj']}}}
    dice_rolling = DependencyMatcher(vocab)
    dice_rolling.add("diceroll", [
        [rolling_verb(), rolled_die],
        [rolling_verb(), rolled_number],
    ])

    # TODO: needs normalization? (divide by rulebook length or tokens)

    return LuckMetrics(dice_based=len(dice_rolling(doc)),
                       drawing_based=len(card_drawing(doc)),
                       shuffling_based=len(shuffle_verbs(doc)),
                       random_based=len(random_words(doc)))

# Demo: luck metrics of the rulebook stored at file line 130.
text = get_document_by_line(DATASET_FILE_PATH, 130)

# NOTE: this rebinds the notebook-wide `nlp` to the small English model
nlp = spacy.load('en_core_web_sm')
doc = nlp(clean_text(text))
# number of tokens and number of characters of the cleaned rulebook
print(len(doc), len(doc.text))
print(get_luck_metrics(doc))

# displacy.render(doc, style='dep', jupyter=True)

2603 12372
LuckMetrics(dice_based=12, drawing_based=4, shuffling_based=3, random_based=0)


In [None]:
text = '''you can only take this because it can be outrageous. 
    you can't take it. you could not also choose. you may never be sure of the result. 
    you can decide the next thing. he has no other choice but to stop, another option is winning.'''

nlp = spacy.load('en_core_web_sm')
doc = nlp(clean_text(text))
print([token.lemma_ for token in doc])
print(len(doc), len(doc.text))

displacy.render(doc, style='dep', jupyter=True)

['you', 'can', 'only', 'take', 'this', 'because', 'it', 'can', 'be', 'outrageous', '.', 'you', 'can', 'not', 'take', 'it', '.', 'you', 'could', 'not', 'also', 'choose', '.', 'you', 'may', 'never', 'be', 'sure', 'of', 'the', 'result', '.', 'you', 'can', 'decide', 'the', 'next', 'thing', '.', 'he', 'have', 'no', 'other', 'choice', 'but', 'to', 'stop', ',', 'another', 'option', 'be', 'win']
52 228


In [None]:
from spacy import displacy
from collections import namedtuple
from spacy.matcher import Matcher, DependencyMatcher

def get_choices_amount_metric(doc: spacy.tokens.Doc) -> int:
    '''Count the wordings in `doc` that give the player a choice.

    Two groups of candidates are collected as sets of token indices, and from
    each group the indices caught by an "exception" matcher are removed:
      * modal/decision verbs (can, could, may, decide, select, choose, opt),
        minus those that are negated or restricted (not/only/never) and minus
        a modal that merely accompanies a decision verb (so that
        "can choose" counts once);
      * the nouns "choice"/"option", minus those introduced by the determiner "no".

    Returns the number of candidates left in the two groups, summed.
    '''
    # --------------  can/could/may/choose/select/... --------------
    # every modal or decision verb is a candidate
    can_could_may_matcher = Matcher(doc.vocab)
    can_could_may_patterns = [
        [{
            "LEMMA": { "IN": ["can", "could", "may", "decide", "select", "choose", "opt"]},
            "POS": { "IN": ["AUX", "VERB"]}
        }]
    ]
    can_could_may_matcher.add('can_could_may', can_could_may_patterns)
    # match[1] is the start index of the match (Matcher results are
    # (match_id, start, end) per the spaCy API); patterns are one token long
    can_could_may_matches = { match[1] for match in can_could_may_matcher(doc) }

    # candidates to discard. In every pattern below the FIRST node is the
    # token that gets discarded, because only match[1][0] is collected.
    can_could_may_exceptions_matcher = DependencyMatcher(doc.vocab)
    can_could_may_exceptions_patterns = [
        [
            # excluded: can not/only/never verb
            # the modal depends on a verb ("<") that also governs (">")
            # a not/only/never modifier
            {
                "RIGHT_ID": "can_could_may",
                "RIGHT_ATTRS": {
                    "LEMMA": { "IN": ["can", "could", "may"]},
                    "POS": "AUX"
                }
            },
            {
                "LEFT_ID": "can_could_may",
                "REL_OP": "<",
                "RIGHT_ID": "generic_verb",
                "RIGHT_ATTRS": {
                    "POS": { "IN": ["AUX", "VERB"] }
                }
            },
            {
                "LEFT_ID": "generic_verb",
                "REL_OP": ">",
                "RIGHT_ID": "neg_or_only",
                "RIGHT_ATTRS": {
                    "LEMMA": { "IN": ["not", "only", "never"]},
                    "DEP": { "IN": ["advmod", "neg"] }
                }
            }
        ],
        [
            # excluded: not/only/never choose
            # the decision verb itself governs a not/only/never modifier
            {
                "RIGHT_ID": "decision_verb",
                "RIGHT_ATTRS": {
                    "LEMMA": { "IN": ["decide", "select", "choose", "opt"]},
                    "POS": "VERB"
                }
            },
            {
                "LEFT_ID": "decision_verb",
                "REL_OP": ">",
                "RIGHT_ID": "negation",
                "RIGHT_ATTRS": {
                    "LEMMA": { "IN": ["not", "only", "never"]},
                    "DEP": { "IN": ["advmod", "neg"] }
                }
            }
        ],
        [
            # excluded: the modal of "can + choose", so the pair counts as 1
            # (the decision verb stays among the candidates)
            {
                "RIGHT_ID": "can_could_may",
                "RIGHT_ATTRS": {
                    "LEMMA": { "IN": ["can", "could", "may"]},
                    "POS": "AUX"
                }
            },
            {
                "LEFT_ID": "can_could_may",
                "REL_OP": "<",
                "RIGHT_ID": "decision_verb",
                "RIGHT_ATTRS": {
                    "LEMMA": { "IN": ["decide", "select", "choose", "opt"]},
                    "POS": "VERB"
                }
            }
        ]
    ]
    can_could_may_exceptions_matcher.add('can_could_may_exceptions', can_could_may_exceptions_patterns)
    # match[1] is the list of matched token indices, in pattern-node order
    # (DependencyMatcher result format per the spaCy API); [0] is the first node
    can_could_may_exceptions_matches = { match[1][0] for match in can_could_may_exceptions_matcher(doc) }

    # -------------- choice and option --------------
    # every noun "choice"/"option" is a candidate
    choice_option_matcher = Matcher(doc.vocab)
    choice_option_patterns = [
        [{
            "LEMMA": { "IN": ["choice", "option"]},
            "POS": "NOUN"
        }]
    ]
    choice_option_matcher.add('choice_option', choice_option_patterns)
    choice_option_matches = { match[1] for match in choice_option_matcher(doc) }

    # discarded: "no choice" / "no option" (the noun governs the determiner "no")
    choice_option_exceptions_matcher = DependencyMatcher(doc.vocab)
    choice_option_exceptions_patterns = [
        [
            {
                "RIGHT_ID": "choice",
                "RIGHT_ATTRS": {
                    "LEMMA": { "IN": ["choice", "option"]},
                    "POS": "NOUN"
                }
            },
            {
                "LEFT_ID": "choice",
                "REL_OP": ">",
                "RIGHT_ID": "prefix_no",
                "RIGHT_ATTRS": {
                    "LEMMA": "no",
                    "POS": "DET",
                    "DEP": "det"
                }
            }
        ]
    ]
    choice_option_exceptions_matcher.add('choice_option_exceptions', choice_option_exceptions_patterns)
    choice_option_exceptions_matches = { match[1][0] for match in choice_option_exceptions_matcher(doc) }

    # candidates that survived the exclusions, both groups together
    return len(can_could_may_matches.difference(can_could_may_exceptions_matches)) + \
           len(choice_option_matches.difference(choice_option_exceptions_matches))

# text = get_document_by_line(DATASET_FILE_PATH, 130)
text = '''you can only take this because it can be outrageous. 
    you can't take it. you can not also choose. you can never be sure of the result. 
    you can decide the next thing, or you choose the target. another choice is to win. 
    but there is no right option.'''

# Demo on the hand-written sample `text` defined just above.
nlp = spacy.load('en_core_web_sm')
doc = nlp(clean_text(text))
# (token index, lemma) pairs, handy to read the indices the matchers return
print([(i, token.lemma_) for i, token in enumerate(doc)])
print(len(doc), len(doc.text))
print(get_choices_amount_metric(doc))

# displacy.render(doc, style='dep', jupyter=True)

[(0, 'you'), (1, 'can'), (2, 'only'), (3, 'take'), (4, 'this'), (5, 'because'), (6, 'it'), (7, 'can'), (8, 'be'), (9, 'outrageous'), (10, '.'), (11, 'you'), (12, 'can'), (13, 'not'), (14, 'take'), (15, 'it'), (16, '.'), (17, 'you'), (18, 'can'), (19, 'not'), (20, 'also'), (21, 'choose'), (22, '.'), (23, 'you'), (24, 'can'), (25, 'never'), (26, 'be'), (27, 'sure'), (28, 'of'), (29, 'the'), (30, 'result'), (31, '.'), (32, 'you'), (33, 'can'), (34, 'decide'), (35, 'the'), (36, 'next'), (37, 'thing'), (38, ','), (39, 'or'), (40, 'you'), (41, 'choose'), (42, 'the'), (43, 'target'), (44, '.'), (45, 'another'), (46, 'choice'), (47, 'be'), (48, 'to'), (49, 'win'), (50, '.'), (51, 'but'), (52, 'there'), (53, 'be'), (54, 'no'), (55, 'right'), (56, 'option')]
57 245
4


In [None]:
doc = nlp(clean_text(get_document_by_line(DATASET_FILE_PATH, 155)))
doc.text
# print(doc[12].dep_)
#doc.text.find('Tinners Trail Player')

In [22]:
from multi_rake import Rake
from summa import keywords
import yake

# NOTE(review): `nlp` is rebound here but the keyword extractors defined in
# this cell work on the plain string `text`, not on a spaCy document.
nlp = spacy.load('en_core_web_sm')
# cleaned rulebook stored at file line 155, used to compare the extractors
text = clean_text(get_document_by_line(DATASET_FILE_PATH, 155))

def use_rake(text: str):
    '''Extract keywords from `text` with RAKE and return the first 30
    entries of the result.'''
    ranked_phrases = Rake().apply(text)
    return ranked_phrases[:30]

def use_yake(text: str):
    '''Extract keywords from `text` with YAKE (top=20) and return the first
    element of every result entry whose second element is below 0.1.'''
    extractor = yake.KeywordExtractor(top=20)
    selected_keywords = []
    for keyword_info in extractor.extract_keywords(text):
        if keyword_info[1] < 0.1:
            selected_keywords.append(keyword_info[0])
    return selected_keywords

def use_TextRank(text: str):
    '''Extract keywords from `text` with summa's TextRank implementation,
    asking for the scores too.'''
    return keywords.keywords(text, scores=True)

# Show the three extractors side by side on the same text; the last
# expression is rendered as the cell output.
display(use_rake(text))
display(use_TextRank(text))
use_yake(text)

[('make multiple trades', 9.0),
 ('resolving die rolls', 7.5),
 ('grand duke dies', 7.4),
 ('deed ultimately adds', 7.105263157894737),
 ('resource tokens equal', 6.882352941176471),
 ('immediately resolve taxation', 6.766666666666666),
 ('resource token matching', 6.582352941176471),
 ('high rank cards', 6.5),
 ('remaining resources wins', 6.5),
 ('higher total wins', 6.333333333333334),
 ('immediately previous property', 6.333333333333333),
 ('additional property cards', 6.333333333333333),
 ('extended deck courts', 6.166666666666667),
 ('number rolled collects', 5.75),
 ('developed court card', 4.904761904761905),
 ('extended decktet', 4.5),
 ('die rolls', 4.5),
 ('cards remaining', 4.5),
 ('grand duke', 4.4),
 ('resolve taxation', 4.1),
 ('civic development', 4.0),
 ('common merchant', 4.0),
 ('noble title', 4.0),
 ('tokens suggestions', 4.0),
 ('six-sided die', 4.0),
 ('setup separate', 4.0),
 ('crown cards', 4.0),
 ('higher die', 4.0),
 ('fully developed', 4.0),
 ('subsequent car

[('resources', 0.4417596299435171),
 ('resource tokens', 0.29996468943364707),
 ('development', 0.29448786503185653),
 ('develop', 0.29448786503185653),
 ('developed', 0.29448786503185653),
 ('developing', 0.29448786503185653),
 ('property', 0.27289832052503754),
 ('properties', 0.27289832052503754),
 ('cards', 0.2263382310563111),
 ('card', 0.2263382310563111),
 ('players', 0.1851843289810696),
 ('player', 0.1851843289810696),
 ('token', 0.1581697489237771),
 ('deeds', 0.15214773284272246),
 ('deed', 0.15214773284272246),
 ('deeded', 0.15214773284272246),
 ('die', 0.14029181040318847),
 ('dies', 0.14029181040318847),
 ('roll', 0.13760610474450682),
 ('rolls', 0.13760610474450682),
 ('rolled', 0.13760610474450682),
 ('dice', 0.13232216747821235),
 ('ace', 0.11713298995261101),
 ('aces', 0.11713298995261101),
 ('total', 0.11572784280803868),
 ('totals', 0.11572784280803868),
 ('victory', 0.11449303391910352),
 ('costs', 0.11195097394654079),
 ('cost', 0.11195097394654079),
 ('district',

['property',
 'resources',
 'resource',
 'card',
 'develop',
 'resource tokens',
 'Ace',
 'deed',
 'district',
 'player',
 'Grand Duke',
 'collect resources',
 'tokens',
 'collect',
 'matching',
 'cards',
 'Court',
 'turn',
 'suit',
 'developed']

In [27]:
import itertools
from collections import Counter, defaultdict
from typing import List, Dict

def find_most_common_nouns(doc: spacy.tokens.Doc) -> Dict[str, List[spacy.tokens.Token]]:
    '''Group the noun tokens of `doc` by lower-cased lemma.

    A token is kept when it is at least 3 characters long, is tagged NOUN or
    PROPN and has one of the dependencies nsubj, dobj, nsubjpass, pobj or
    compound. The result maps each lemma to the list of its tokens, in
    document order (no ranking is applied here).
    '''
    accepted_pos = ['NOUN', 'PROPN']
    accepted_deps = ['nsubj', 'dobj', 'nsubjpass', 'pobj', 'compound']

    def _is_candidate(token):
        return len(token) >= 3 and token.pos_ in accepted_pos and token.dep_ in accepted_deps

    tokens_by_lemma = defaultdict(list)
    for candidate in filter(_is_candidate, doc):
        tokens_by_lemma[candidate.lemma_.lower()].append(candidate)

    return tokens_by_lemma

def find_n_most_common_nouns(n, docs: List[spacy.tokens.Doc]) -> 'List[Tuple[str, int]]':
    '''Return the `n` noun lemmas that appear in the largest number of documents.

    Args:
        n: how many entries to return.
        docs: the documents to inspect (any iterable works; it is read once).

    Returns:
        (lemma, number of documents containing it) pairs, most widespread first.
        Ties keep the order in which the lemmas were first met, so the result
        no longer depends on set iteration order.
    '''
    document_frequency = Counter()
    for doc in docs:
        # the keys of the mapping are unique, so a lemma counts once per document
        document_frequency.update(find_most_common_nouns(doc).keys())
    return document_frequency.most_common(n)
    

nlp = spacy.load('en_core_web_sm')
# 10 rows of the dataset, after skipping file lines 1..49
df_dataset = get_df_with_docs(DATASET_FILE_PATH, 10, 50)
# nlp.pipe is fed lazily by map(); the resulting `docs` is consumed once below
docs = nlp.pipe(map(clean_text, df_dataset['rulebook'].values))

find_n_most_common_nouns(10, docs)

[('game', 10),
 ('turn', 10),
 ('player', 10),
 ('point', 9),
 ('number', 9),
 ('play', 9),
 ('hand', 8),
 ('end', 8),
 ('side', 8),
 ('way', 8)]

In [None]:
import re
from collections import Counter
from typing import List, Set, Dict
from rake_nltk import Rake
from nltk.util import ngrams
import yake
import numpy as np
from string import punctuation

# NOTE(review): not referenced anywhere in the visible part of the notebook
regex_word_within_boundaries = re.compile(r'\b')
# minimum number of occurrences a noun needs to be treated as a game component
MIN_TOKEN_TO_BE_CONSIDERED_COMPONENT = 4
# NOTE(review): not referenced anywhere in the visible part of the notebook
MAX_COMPONENTS = 100

def _get_ngrams_components(doc: spacy.tokens.Doc, \
                           components: Dict[str, List[spacy.tokens.Token]],
                           n_grams: int):
    '''Placeholder, not implemented yet: the body is empty and the function
    returns None whatever it is given.'''
    pass


def _get_bg_components_by_deps_inspection(doc: spacy.tokens.Doc) -> Dict[str, List[spacy.tokens.Token]]:
    '''Select the nouns of `doc` that look like board game components.

    Starting from the nouns grouped by `find_most_common_nouns`, a lemma is
    kept when it is not generic rulebook vocabulary and occurs at least
    MIN_TOKEN_TO_BE_CONSIDERED_COMPONENT times.

    Returns:
        lemma -> tokens of that lemma, for the lemmas that were kept.
    '''
    # generic words that show up in any rulebook; a frozenset gives constant
    # time membership tests and makes repeated entries harmless
    words_to_leave_out = frozenset([
        'beginning', 'board', 'book', 'case', 'clarification', 'design',
        'effect', 'end', 'example', 'game', 'number',
        'overview', 'order', 'play', 'player', 'purpose', 'reference',
        'result', 'rule', 'rulebook', 'section', 'set', 'setup', 'side', 'summary',
        'start', 'step', 'thing', 'type', 'time', 'total', 'use', 'value', 'version', 'way'])

    return {lemma: tokens
            for lemma, tokens in find_most_common_nouns(doc).items()
            if lemma not in words_to_leave_out
            and len(tokens) >= MIN_TOKEN_TO_BE_CONSIDERED_COMPONENT}

# def _get_lemmas_given_keywords_group(group: str, doc: spacy.tokens.Doc) -> List[str]:
#     kw_match = re.search(r'\b' + group + '\\b', doc.text)
#     if kw_match is None:
#         return []

#     group_span = doc.char_span(kw_match.start(0), kw_match.end(0))
#     return [token.lemma_.lower() for token in group_span]

# def _get_bg_components_by_keyword_analysis(doc: spacy.tokens.Doc, max_keywords: int) -> List[str]:
#     kw_extractor = yake.KeywordExtractor(top=max_keywords)
#     keywords_info = kw_extractor.extract_keywords(doc.text)
#     keyword_groups = [keyword_info[0] for keyword_info in keywords_info if keyword_info[1] < 0.1]

#     return [lemma for keyword_group in keyword_groups 
#             for lemma in _get_lemmas_given_keywords_group(keyword_group, doc)]

def get_bg_components(doc: spacy.tokens.Doc) -> Dict[str, List[spacy.tokens.Token]]:
    '''Return the candidate board game components of `doc`.

    Currently this is exactly the result of the dependency-based inspection:
    lemma -> tokens of that lemma. The keyword-based (YAKE) refinement that
    was meant to be intersected with it is disabled.
    '''
    return _get_bg_components_by_deps_inspection(doc)

def get_doc_variance(doc: 'spacy.tokens.Doc',
                     components_dict: 'Dict[str, List[spacy.tokens.Token]]') -> float:
    '''Measure how much the components interleave in the text.

    Widely interleaved components could mean that the rules involve many
    components at once and are therefore more complex. For every component,
    `np.var` is computed on the positions (`token.i`) of its tokens and
    weighted by the share of component tokens that belong to it; the weighted
    variances are summed and the total is divided by the number of tokens of
    the document.

    Args:
        doc: the analysed document (only its length is used).
        components_dict: component lemma -> tokens of that component.

    Returns:
        The normalized variance, or 0.0 when the document is empty or no
        component token is available (instead of dividing by zero).
    '''
    doc_len = len(doc)
    tokens_count = sum(len(tokens) for tokens in components_dict.values())
    if doc_len == 0 or tokens_count == 0:
        return 0.0

    weighted_variance = sum(
        (len(tokens) / tokens_count) * np.var([token.i for token in tokens])
        for tokens in components_dict.values())
    return weighted_variance / doc_len

# Demo: components and variance of the rulebook stored at file line 138.
nlp = spacy.load('en_core_web_sm')
doc = nlp(clean_text(get_document_by_line(DATASET_FILE_PATH, 138)))
# number of characters of the cleaned rulebook
print(len(doc.text))
components = get_bg_components(doc)
print(components)
print(get_doc_variance(doc, components))

In [None]:
from typing import Tuple
import pandas as pd
import ast
import os
from IPython.display import clear_output

def get_rules_features(id: int, doc: spacy.tokens.Doc) -> Tuple[int, float]:
    '''Work-in-progress feature extraction for one rulebook.

    Logs the board game `id`, computes the components of `doc` and prints
    them, then returns the placeholder pair (0, 0); the real
    (rule_count, avg_rule_len) computation is still commented out below.
    '''
    logger.info(f'processing board game {id}')
    # NOTE(review): computed but not used yet
    rulebook_len = len(doc)
    bg_components = get_bg_components(doc)
    # debugging aid while the feature is being designed
    print(bg_components)

    # placeholder values, see the docstring
    return 0, 0
    # rules = get_rules(text)
    # rule_count = len(rules)
    # return rule_count, len(text) / rule_count

def apply_for_rulebook_features(row, docs_dict):
    '''Compute the rulebook features of `row` as a pandas Series.

    `docs_dict` yields (id, document) pairs in row order; one pair is
    consumed per call and its id must equal `row.id`. The Series is indexed
    by 'rule_count' and 'avg_rule_len'.
    '''
    doc_info = next(docs_dict)
    expected_id = doc_info[0]
    assert expected_id == row.id

    features = get_rules_features(row.id, doc_info[1])
    return pd.Series(features, index=['rule_count', 'avg_rule_len'])

# Destination of the feature table (local relative path or mounted Google
# Drive folder). NOTE(review): only used by the export that is currently
# commented out at the end of this cell.
PROCESSED_DATASET_FILE_PATH = 'data/processed_dataset.csv' if WORKING_LOCALLY \
    else '/content/drive/My Drive/Projects/IRBoardGameComplexity/processed_dataset.csv'

# ast.literal_eval converts the family column string into a python array
# with pd.read_csv(DATASET_FILE_PATH, chunksize=5, converters={ 'info.family': ast.literal_eval }) as reader:
#     for df in reader:
# NOTE(review): nrows=1 restricts the run to the first data row
df_dataset = pd.read_csv(DATASET_FILE_PATH, converters={ 'info.family': ast.literal_eval }, nrows=1)
remove_columns_prefix(df_dataset)
# single-pass iterator of (id, parsed document) pairs, consumed row by row by
# apply_for_rulebook_features; `nlp` is whatever pipeline was bound last
docs_dict = zip(df_dataset['id'].values,
                nlp.pipe(map(clean_text, df_dataset['rulebook'].values)))

df_rules_features = df_dataset.apply(lambda x: apply_for_rulebook_features(x, docs_dict),
                                     axis='columns')
df_features = df_dataset[['averageweight', 'playingtime', 'family']].join(df_rules_features)

# one-hot encoding of the "family" field: the list is joined with '|' and
# expanded into one indicator column per family
# from https://stackoverflow.com/questions/71401193/one-hot-encoding-in-python-for-array-values-in-a-dataframe
df_features = df_features.join(df_features.pop('family').apply('|'.join).str.get_dummies())
df_features.head()

# df_features.to_csv(PROCESSED_DATASET_FILE_PATH, header=True, index=False, mode='w')    
# if not WORKING_LOCALLY:
#     drive.flush_and_unmount()