# This notebook is the start of the "trial and error approach"

References:

- https://github.com/xurxodiz/cardwalker/tree/master/oracle
- https://laterna--magica.blogspot.com/2011/10/oracle-parser.html

In [None]:
import json
import pandas as pd
import re
from collections import defaultdict

In [None]:
# Load the full MTGJSON dump. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open('./AllSets.json', 'rb') as f:
    sets = json.load(f)

In [None]:
for k, v in sorted(sets.items()):
    print(k, v['name'])

In [None]:
cards_usaga = sets['USG']['cards']

In [None]:
cards_all=[]
for k, sett in sets.items():
    if (k in ['UGL', 'UST', 'UNH']) or (len(k)>3): # Ignore Unglued, Unstable and promotional things
        continue
    for card in sett['cards']:
        card['set'] = k
    cards_all.extend(sett['cards'])    

# Params

In [None]:
ASPAS_TEXT = "ASPAS_TEXT"

In [None]:
mains_col_names = ['name', 'manaCost', 'text_preworked', 'type', 'power', 'toughness',
                   'types', 'supertypes', 'subtypes']

In [None]:
import os
from sqlalchemy import create_engine

# The connection string is read from the MTG_DATABASE_URL environment variable
# so real credentials never have to be stored in the notebook; the previous
# local-development URL is kept as the fallback to stay backward compatible.
engine = create_engine(
    os.environ.get('MTG_DATABASE_URL',
                   'postgresql+psycopg2://mtg:mtg@localhost:5432/mtg'))

# Fail fast when the database is unreachable, and give the probe connection
# back to the pool instead of leaving it checked out.
with engine.connect():
    pass

# Create dataframe of cards

In [None]:
# Load deck list
filename = './decks/Benalia-knights-rotation-proof.txt'
# One deck entry per line: "<amount> <card name>". The line terminator is
# optional so the last entry is kept even when the file does not end with a
# newline (the previous pattern required a trailing '\n' and dropped it).
deck_regex = r'^(?P<amount>\d+) (?P<card_name>.*?)\r?\n?$'
with open(filename, 'r') as f:
    txt = f.readlines()

# deck_list -> [(amount, card_name), (amount, card_name), ...]
deck_list = []
for x in txt:
    deck_list.extend(re.findall(deck_regex, x))

# Flat list with one entry per physical copy of each card in the deck.
cards_in_deck_names_list = []
for amount, card in deck_list:
    cards_in_deck_names_list.extend([card] * int(amount))

In [None]:
#cards = cards_usaga
cards = cards_all
cards_df = pd.DataFrame.from_dict(cards)

# Keep only the card in the deck, and as many copy of it as necessary
cards_df = cards_df.drop_duplicates(subset=['name'])
cards_df = cards_df.merge(
    pd.DataFrame(cards_in_deck_names_list), how='right', left_on=['name'], right_on=[0])
cards_df['card_id_in_deck'] = cards_df.index
cards_df = cards_df.set_index('card_id_in_deck')

##  Are there different cards with the same name?

No, there are not. Some cards that share a name have slightly different wording across printings, but they are the same card.

In [None]:
# Lets learn about duplicated card names
test = pd.DataFrame.from_dict(cards_all).set_index('id')

In [None]:
print(test.shape)
a=test[test['name'].duplicated()]#['name'].unique().shape
print(a.shape)
b = test[
    (test['name'].duplicated())&(test['text'].duplicated())]
print(b.shape)
c = test[
    (test['name'].duplicated(keep=False))&(~test['text'].duplicated(keep=False))]
for idx, row in c.sort_values(by='name')[['name', 'text']].iterrows():
    print(row['name'], '---', row['text'], '\n')

# DF version: Let's start by trying to extract static abilities from cards

In [None]:
#cards_df = pd.DataFrame.from_dict(cards_usaga)
cards_df = pd.DataFrame.from_dict(cards_all)

In [None]:
# Filter out new types
#set(cards_df.types.apply(lambda x: tuple(set(x))))
ignore_types = ('Conspiracy', 'Eaturecray', 'Phenomenon', 'Plane', 'Planeswalker', 'Scheme', 'Vanguard')

In [None]:
cards_df = cards_df[cards_df.types.apply(lambda x: not set(x).intersection(ignore_types))]

In [None]:
#cards_df.head(4).transpose()

## Questions and preprocessing

### Do parentheses contain useful info or only explanations of abilities/effects?

It seems the text in parentheses is always an explanation (so it carries no useful info for discerning possible targets, zones affected, etc.)

In [None]:
pattern_parenthesis = r'\((.*?)\)'
test = """('Flying',), ('Trample',), ('Paper',), ('First strike',),
       ('Phasing',), ('Haste',), ('Flash',), ('Island',), ('Defender',),
       ('Blue',), ('Reach',), ('Devour X',), ('Vigilance',),
       ('Double strike',), ('Indestructible',), ('Artifacts',),
       ('Deathtouch',), ('Lifelink',), ('Menace',), ('Werewolf',),
       ('Leviathans',), ('While voting',), ('Flying', 'Demon'),
       ('Islandwalk',), ('Hexproof',), ('Plains',), ('Instant',),
       ('Swamp',), ('Mountain',), ('Forest',), ('Dinosaur',),
       ('Dinosaur Knight',), ('Leviathan',), ('Simultaneously',),
       ('Rat',), ('During combat',), ('Investigate',),
       ('Minotaur Pirate',), ('Each noncreature',), ('Vampire',),
       ('Pyrogenius',), ('Swampwalk',), ('Bolster X',), ('Timebender',),
       ('Bold Pyromancer',), ('Scry X',), ('Desertwalk',), ('Prowess',),
       ('Martial Paragon',), ('Death Wielder',), ('Equipment',),
       ('Valiant Protector',)"""
a = re.findall(pattern_parenthesis, test)
a

In [None]:
pattern_parenthesis = r'\((.*?)\)'
cards_df['in_parentheses'] = cards_df['text'].apply(lambda x: tuple(re.findall(pattern_parenthesis, str(x))))
set(cards_df['in_parentheses'])

In [None]:
#This example is not explaning an ability, but it is explaning something (an effect)
st = 'If two or more creatures are tied for greatest power, target any one of them.'
cards_df[cards_df['text'].str.contains(st).fillna(False)]['text'].values

### Remove anything between parenthesis and replace name by SELF

In [None]:
# Replace name by SELF and remove anything between parethesis
pattern_parenthesis = r' ?\(.*?\)'
def prework_text(card):
    """Return the card's rules text normalised for parsing.

    The card's own name is replaced by the placeholder ``SELF`` and every
    parenthesised reminder text (with one optional leading space) is removed.

    card: mapping / DataFrame row providing the keys 'text' and 'name'.
    Returns a string; a card with missing text (NaN/None) yields '' instead
    of the literal string 'nan' that ``str(NaN)`` would produce.
    """
    raw_text = card['text']
    if pd.isnull(raw_text):
        return ''
    t = str(raw_text).replace(card['name'], 'SELF')
    return re.sub(pattern_parenthesis, '', t)
    
cards_df['text_preworked'] = cards_df.apply(prework_text, axis=1)
#cards_df['text_preworked']

In [None]:
import copy
lands = [('Plains', '{W}'), ('Swamp', '{B}'), ('Island', '{U}'), ('Mountain', '{R}'), ('Forest', '{G}')]
for land_name, sym in lands:
    cards_df.loc[(cards_df[cards_df['name']==land_name]).index, 'text_preworked'] = '{T}: Add ' + sym +'.'

In [None]:
sep = "ª"
if cards_df['text_preworked'].str.contains(sep).any():
    raise Exception("Bad separator symbol. It is contained in some text.")

In [None]:
# replace card names by their ids - DOES NOT WORK: It replaces stuff that are not really card names
# for example, just run name_id_dict.get('When')
# Repalcing When in the cards will obviously replace a lot of stuff we don't want to
name_id_dict = {c['name']: c['id'] for c in cards_all}
#name_id_dict
temp = sep.join(cards_df['text_preworked'])
for i, (name, id_) in enumerate(name_id_dict.items()):
    if not i%1000: print(i)
    temp = temp.replace(name, id_)
cards_df['text_preworked_name_id_replaced'] = temp.split(sep)
#cards_df['text_preworked']

In [None]:
# Sanity check: no opening parenthesis may survive the preprocessing step.
# The pattern is a raw string because '\(' is an invalid escape sequence in a
# normal string literal (a warning today, an error in future Python versions).
assert cards_df[cards_df['text_preworked'].str.contains(r'\(').fillna(False)]['text_preworked'].empty

# Domain specific vocabulary

Let's build some domain-specific vocabulary for MTG. For example, let's list supertypes, types and subtypes, collect all card names, and this kind of thing.

In [None]:
# Create set of cards names
cards_names = set(cards_df.name.unique())

In [None]:
# Create set of supertypes
array_of_supertypes_tuples = cards_df['supertypes'].dropna().apply(tuple).unique()
cards_supertypes = tuple()
for tup in array_of_supertypes_tuples:
    cards_supertypes += tup
    
cards_supertypes = set(cards_supertypes)
cards_supertypes

In [None]:
# Create set of types
array_of_types_tuples = cards_df['types'].dropna().apply(tuple).unique()
cards_types = tuple()
for tup in array_of_types_tuples:
    cards_types += tup
    
cards_types = set(cards_types)
cards_types

In [None]:
# Create set of types
array_of_subtypes_tuples = cards_df['subtypes'].dropna().apply(tuple).unique()
cards_subtypes = tuple()
for tup in array_of_subtypes_tuples:
    cards_subtypes += tup
    
cards_subtypes = set(cards_subtypes)
#cards_subtypes

In [None]:
#cards_df.head(10).transpose()

In [None]:
import requests
import pickle
# Download the Comprehensive Rules text. A timeout is set because requests
# waits forever by default when the server stops responding.
r = requests.get('http://media.wizards.com/2018/downloads/MagicCompRules%2020180713.txt',
                 timeout=30)
# raise_for_status() is a no-op on success, so no explicit status check is needed.
r.raise_for_status()
comprules = r.text

In [None]:
with open('rules.txt', 'r', encoding='latin-1') as f:
    # read() returns the text unchanged; joining readlines() with '\n'
    # doubled every line break because each line already ends with one.
    comprules = f.read()

In [None]:
kw_abilities_pat = r'702\.\d+\. ([A-Za-z ]+)'
abilities = re.findall(kw_abilities_pat, comprules)
abilities.pop(0) # Its just the rulings 
abilities.sort()
#abilities

## How can we detect an abilities sentence?

We should:
- Split sentences in a card by '\n' (=card_sentences_list)
- Split each element in card_sentences_list by ', ' (=split_candidate_sentences)
- Search for the pattern r'^ability' in each item of split_candidate_sentences
- If the pattern is found for every item, then split_candidate_sentences is an abilities sentence

We can, at the same time, detect activated-ability sentences and "rest" sentences (those that are neither ability sentences nor activated-ability sentences).
- Split sentences in a card by '\n' (=card_sentences_list)
- Those sentences which contain ':' are activated abilities

Sentences which are not in any case above are "rest" sentences.

In [None]:
# Alternation of '^<ability>\b' for every known keyword ability name.
ability_start_pattern = r'|'.join('^' + ab + r'\b' for ab in abilities)
#print(ability_start_pattern)
def is_ability_sentence(sentence):
    """Tell whether ``sentence`` is made only of keyword abilities.

    The sentence is split on ', ' and counts as an ability sentence when every
    piece starts with one of the names in ``abilities``. Sentences listed in
    ``exceptions`` are never ability sentences, even though they start with a
    keyword.

    Returns True or False.
    """
    exceptions = ['Cycling abilities you activate cost up to {2} less to activate.']
    if sentence in exceptions:
        return False
    return all(re.search(ability_start_pattern, piece) is not None
               for piece in sentence.split(', '))

In [None]:
df = cards_df
df['split_sentences'] = df['text_preworked'].apply(lambda x: x.split('\n')) # list of sentences
df['split_sentences_is_ability'] = df['split_sentences'].apply(lambda x: [is_ability_sentence(y) for y in x])

df[df['split_sentences_is_ability'].apply(lambda x: True in x)][
    ['split_sentences', 'split_sentences_is_ability']].iloc[1]['split_sentences']

### Detect all possible different abilities texts

In [None]:
import itertools
ability_sentences = list(itertools.chain.from_iterable(df['split_sentences']))
ability_sentences_is_ability = list(itertools.chain.from_iterable(df['split_sentences_is_ability']))
abilities_full_set = []
for a, b in zip(ability_sentences, ability_sentences_is_ability):
    if b: abilities_full_set.append(a)
abilities_full_set = set(abilities_full_set)
len(abilities_full_set)

## Let's detect all paragraph types (and keep each ability as a separate paragraph)

In [None]:
import uuid

In [None]:
import collections
import collections.abc
def splitDataFrameList(df,target_column,separator=None):
    '''
    https://gist.github.com/jlln/338b4b0b55bd6984f883
    df = dataframe to split,
    target_column = the column containing the iterables (e.g. lists) to expand
    separator = kept for interface compatibility; it is not used because the
                values are expected to be already split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    A row whose target value is not iterable is kept once with NaN in the
    target column. Note that strings are iterable, so a plain string is
    expanded character by character, and an empty iterable drops the row.
    '''
    new_rows = []
    # Explicit iteration instead of df.apply(): the callback exists only for
    # its side effect, and older pandas versions could call it twice on the
    # first row, duplicating that row's output.
    for _, row in df.iterrows():
        split_row = row[target_column]
        # collections.Iterable was removed in Python 3.10; the ABC lives in
        # collections.abc.
        if isinstance(split_row, collections.abc.Iterable):
            for s in split_row:
                new_row = row.to_dict()
                new_row[target_column] = s
                new_rows.append(new_row)
        else:
            new_row = row.to_dict()
            # float('nan') replaces pd.np.nan (the pd.np alias was removed in pandas 2.0).
            new_row[target_column] = float('nan')
            new_rows.append(new_row)
    new_df = pd.DataFrame(new_rows)
    return new_df

In [None]:
df = pd.DataFrame({'name':['a','b','c'], 'c':['a','b','c'], 'b':['a','b','c'], "items":[['a1','a2','a3'],['b1','b2','b3'],['c1','c2','c3']]})
display(df)
test = splitDataFrameList(df, target_column="items")
display(test)

In [None]:
def get_paragraph_type(paragraph):
    """Classify a paragraph as 'ability', 'activated' or 'rest'.

    Keyword-ability sentences take precedence; any other paragraph containing
    a colon is treated as an activated ability; everything else is 'rest'.
    """
    if is_ability_sentence(paragraph):
        return 'ability'
    return 'activated' if ':' in paragraph else 'rest'

def split_abilities_and_keep_the_rest(df_row):
    '''Returns a list of abilities or a list of one element, which is not ability

    df_row: mapping / DataFrame row with the keys 'paragraph_type' and 'paragraph'.
    Ability paragraphs are split on ', ' -- the same separator the ability
    detection uses -- so the resulting items carry no leading space and still
    match patterns anchored at the start of the text.
    '''
    if df_row['paragraph_type'] == 'ability':
        return df_row['paragraph'].split(', ')
    
    return [df_row['paragraph']]

def get_aspas(text):
    """Return the content of the first double-quoted span in ``text``.

    ("aspas" is Portuguese for quotation marks.) Returns NaN when ``text`` is
    null or contains no non-empty quoted span. Only the first quoted span is
    returned, without the quote characters.
    """
    # float('nan') replaces pd.np.nan: the pd.np alias was removed in pandas 2.0.
    if pd.isnull(text):
        return float('nan')

    quoted = re.search(r'\"(.+?)\"', text)
    if quoted is None:
        return float('nan')

    return quoted.group(1)
        
    
def get_paragraphs_and_types_df(card_row):
    """Build the per-paragraph DataFrame for one card.

    card_row: a row of the cards DataFrame; its 'text_preworked' value is split
    on newlines into paragraphs and its ``name`` (the row label) becomes 'card_id'.

    Returned columns: 'paragraph', the ASPAS_TEXT column (first quoted text of
    the paragraph, or NaN), 'paragraph_type', 'card_id' and 'paragraph_order'.
    Ability paragraphs are expanded into one row per ability.
    """
    def mask_quoted(row):
        # Swap the quoted text for the placeholder; the surrounding quote
        # characters themselves stay in the paragraph.
        quoted = row[ASPAS_TEXT]
        if pd.isnull(quoted):
            return row['paragraph']
        return row['paragraph'].replace(quoted, ASPAS_TEXT)

    # Get initial paragraphs
    paragraphs = pd.DataFrame()
    paragraphs['paragraph'] = card_row['text_preworked'].split('\n')
    paragraphs[ASPAS_TEXT] = paragraphs['paragraph'].apply(get_aspas)
    # TODO CONTINUE FROM HERE CORRECT THIS
    paragraphs['paragraph'] = paragraphs.apply(mask_quoted, axis=1)

    paragraphs['paragraph_type'] = paragraphs['paragraph'].apply(get_paragraph_type)

    # Split the abilities paragraphs into multiple rows
    paragraphs['paragraph'] = paragraphs.apply(split_abilities_and_keep_the_rest, axis=1)
    exploded = splitDataFrameList(paragraphs, 'paragraph')

    exploded['card_id'] = card_row.name
    exploded['paragraph_order'] = range(exploded.shape[0])
    return exploded

In [None]:
cards_df['df_paragraphs'] = cards_df.apply(get_paragraphs_and_types_df, axis=1)

In [None]:
cards_df[['text_preworked','df_paragraphs']].iloc[21]['df_paragraphs']

In [None]:
cards_df_paragraphs = pd.concat(cards_df['df_paragraphs'].values)
cards_df_paragraphs.head(3)

In [None]:
temp = cards_df_paragraphs[~pd.isnull(cards_df_paragraphs[ASPAS_TEXT])]
temp

In [None]:
# Check if we get a different paragraph order for each ability
# (the paragraph builder stores the classification in 'paragraph_type';
# there is no 'type' column, so filtering on it raised a KeyError)
cards_df_paragraphs[cards_df_paragraphs['paragraph_type']=='ability'].sort_values(by=['card_id', 'paragraph_order'])

In [None]:
# Show cards with triggered abilities
#cards_df[cards_df['df_sentences'].apply(lambda x: 'activated' in x['type'].values)]

## Lets use the same approach and separate paragraphs in abilities-complements, costs-effects and keep the rest as is

In [None]:
ability_and_complement_regex = r'(' + ability_start_pattern +')' + r'(.*)'
#ability_and_complement_regex

In [None]:
def get_pop_and_complements_df(paragraph_row):
    """Split one paragraph into its parts ("pops") and return them as a DataFrame.

    paragraph_row: mapping / DataFrame row with the keys 'paragraph',
    'paragraph_type', 'card_id' and 'paragraph_order'.

    * 'ability'   -> two rows: the keyword ability and its complement
                     (whatever follows the keyword, possibly '').
    * 'activated' -> one 'activation_cost' row per comma-separated cost plus
                     one 'activated_effect' row.
    * otherwise   -> a single 'effect' row with the paragraph unchanged.

    Returned columns: 'pop', 'pop_type', 'pop_order', 'card_id',
    'paragraph_order'.

    Raises ValueError when a paragraph typed 'ability' is null or does not
    start with a known keyword ability (this used to drop into the debugger
    and then fail on an unbound variable).
    """
    paragraph = paragraph_row['paragraph']
    paragraph_type = paragraph_row['paragraph_type']

    if paragraph_type == 'ability':
        match = None
        if not pd.isnull(paragraph):
            # The pattern is anchored with '^', so surrounding whitespace left
            # over from splitting a list of abilities must be removed first.
            # The regex is only looked up in this branch, where it is needed.
            match = re.search(ability_and_complement_regex, paragraph.strip())
        if match is None:
            raise ValueError(
                'Paragraph typed as ability does not start with a known '
                'keyword ability: {0!r}'.format(paragraph))
        pops = [match.group(1).strip(), match.group(2).strip()]
        pop_types = ['ability', 'ability_complement']

    elif paragraph_type == 'activated':
        # Break the costs in individual ones. Only the first colon separates
        # costs from effect; the effect text may contain further colons.
        costs, effect = paragraph.split(':', 1)

        # Costs whose own wording contains a comma must not be split on it.
        exceptions = ['Pay half your life, rounded up']
        if costs in exceptions:
            costs = costs.replace(',','')

        cost_list = costs.split(',')
        pops = cost_list + [effect]
        pop_types = ['activation_cost'] * len(cost_list) + ['activated_effect']

    else:
        # Keep the rest as rest or effect
        pops = [paragraph]
        pop_types = ['effect']

    res = pd.DataFrame()
    res['pop'] = pops
    res['pop_type'] = pop_types
    res['pop_order'] = range(res['pop'].shape[0])
    res['card_id'] = paragraph_row['card_id']
    res['paragraph_order'] = paragraph_row['paragraph_order']
    return res

In [None]:
cards_df_paragraphs['pop'] = cards_df_paragraphs.apply(get_pop_and_complements_df, axis=1)

In [None]:
cards_df_paragraphs.iloc[3]['pop']

In [None]:
cards_df_pops = pd.concat(cards_df_paragraphs['pop'].values, sort=True)
#cards_df_pops['pop_hash'] = cards_df_pops['pop'].apply(lambda x: uuid.uuid4().hex)
cards_df_pops.sort_values(by=['card_id','paragraph_order','pop_order']).head(3)

In [None]:
# Pick one random activated-ability paragraph and show the pops built from it.
# NOTE(review): no visible cell creates a 'type' or a 'paragraph_hash' column
# (the paragraph builder names its column 'paragraph_type' and never adds a
# hash), so this lookup presumably relies on an earlier version of the
# pipeline -- confirm before re-running.
activated_ability_paragraph_hash = cards_df_paragraphs[cards_df_paragraphs['type']=='activated'].sample(1)['paragraph_hash'].iloc[0]
cards_df_pops[cards_df_pops['paragraph_hash']==activated_ability_paragraph_hash]

In [None]:
investigate = '08038de1ded341a1b63f792d29b8dad8'
cards_df_pops[cards_df_pops['paragraph_hash']==investigate]

In [None]:
#investigate = cards_df_pops[cards_df_pops['pop']=='Creatures you control have "{T}'].iloc[0]['card_id']
investigate = '7011018896f7a9a24b7f9dff722a7e990c43922b'
cards_df_pops[cards_df_pops['card_id']==investigate]

In [None]:
#investigate = cards_df_pops[cards_df_pops['pop']=='Creatures you control have "{T}'].iloc[0]['card_id']
investigate = 'ade9880f3121cdf8db57c3f4ba0375c843ec14c0'
cards_df_pops[cards_df_pops['card_id']==investigate]

In [None]:
cards_df_pops[cards_df_pops['pop']=='Pay half your life']

In [None]:
cards_df_pops[cards_df_pops['pop_type']=='activation_cost']['pop'].dropna().unique()

In [None]:
# Count how many abilities, activated abilities and effects there are
cards_df_pops['cont'] = 1

index = ['pop_type']
values = ['cont']

# The string alias 'sum' replaces pd.np.sum: the pd.np alias was removed in
# pandas 2.0, and pandas recommends string aliases for built-in aggregations.
pivot_pop = cards_df_pops.pivot_table(index=index, values=values, aggfunc='sum')
pivot_pop

In [None]:
# Show cards with triggered abilities
#cards_df[cards_df['df_sentences'].apply(lambda x: 'activated' in x['type'].values)]

## Lets use the same approach and separate conditions-"result effect"

In [None]:
condition_regex = r'((?:if |whenever |when |only |unless ).*?[,.])'
#condition_regex

In [None]:
step_condition_regex = r'(at the (?:beginning |end )of.*?[,.])'
#step_condition_regex

In [None]:
def get_condition(text):
    """Return every condition clause found in ``text``, or None.

    A condition clause is whatever ``condition_regex`` matches
    (case-insensitively). None is returned both for null input and when no
    clause is found.
    """
    if pd.isnull(text):
        return None

    return re.findall(condition_regex, text, flags=re.IGNORECASE) or None

def clean_effect_from_condition(row):
    """Return row['pop'] with every detected condition clause removed.

    row: mapping / DataFrame row with the keys 'pop' (the text) and
    'condition' (a list of clause strings, or None / empty when there is none).

    Each clause is removed on its own. Joining all clauses into one string
    first only worked when there was a single clause, because several clauses
    are normally not adjacent in the text.
    """
    clean_effect = row['pop']

    if (not row['condition']):
        return clean_effect

    for condition in row['condition']:
        clean_effect = clean_effect.replace(condition, '')
    return clean_effect
    
cards_df_pops['condition'] = cards_df_pops['pop'].apply(get_condition)
cards_df_pops['effect_wo_condition'] = cards_df_pops.apply(clean_effect_from_condition, axis=1)

In [None]:
def get_conditions_and_effects_df(pop_row, original_cols=[]):
    """Split one pop into condition parts, step-condition parts and the remainder.

    pop_row: mapping / DataFrame row whose 'pop' value is the text to analyse.
    original_cols: names of pop_row entries to copy onto every returned row
                   (the default list is only read, never modified).

    Returns a DataFrame with one row per part and the columns 'part_order'
    (0-based, following the position of the part in the text), 'part' (text
    stripped of ',', '.' and spaces), 'part_type' ('condition',
    'step_condition' or 'wo_conditions') plus the copied original columns.
    """
    text = pop_row['pop']

    # Conditions ("if ...", "when ...") and step / time conditions
    # ("at the beginning of ...") found in the text.
    reg_cond = re.findall(condition_regex, text, flags=re.IGNORECASE)
    reg_step_cond = re.findall(step_condition_regex, text, flags=re.IGNORECASE)

    # Whatever is left once every condition has been removed.
    text_wo_conditions = text
    for cond in reg_cond + reg_step_cond:
        text_wo_conditions = text_wo_conditions.replace(cond, '')
    text_wo_conditions = text_wo_conditions.strip(',. ')

    def as_record(part, part_type):
        # part_order starts out as the character offset of the part in the
        # text (-1 when it cannot be found as-is) and is renumbered below.
        return {'part_order': text.find(part), 'part': part.strip(',. '), 'part_type': part_type}

    temp = [as_record(part, 'condition') for part in reg_cond]
    temp += [as_record(part, 'step_condition') for part in reg_step_cond]
    temp.append(as_record(text_wo_conditions, 'wo_conditions'))

    # Reset order to start from zero
    res = pd.DataFrame(temp).sort_values(by=['part_order'])
    res = res.reset_index(drop=True)
    res['part_order'] = res.index

    for col in original_cols:
        # Repeat the value once per row explicitly. Assigning it directly
        # breaks for list-valued cells (e.g. a list of conditions): pandas
        # reads the list as the column's values and raises on the length
        # mismatch.
        res[col] = [pop_row[col]] * len(res)

    return res

cards_df_pops['pop_parts'] = cards_df_pops.apply(get_conditions_and_effects_df,
                                                           args=(cards_df_pops.columns,),
                                                           axis=1)
cards_df_pop_parts = pd.concat(cards_df_pops['pop_parts'].values)

In [None]:
#cards_df_pop_parts

## Detect named cards cited inside cards text

For later: define a way to get the card names cited in other cards' text. The same approach used for SELF should suffice:
1. Detect the names (done below)
2. Replace the names with a place holder. CARD_NAME_1, CARD_NAME_2 (for each card name in a cards text).
3. Create columns CARD_NAME_1, CARD_NAME_2, etc. in dataframe, holding the actual name in the cell value
4. Create entity detector for CARD_NAME_1, CARD_NAME_2,...
5. Manually add edge between CARD_NAME_1 and its actual value (the actual card name)

In [None]:
# Alternation of every known card name.
# - re.escape: names are literal text, so regex metacharacters in a name
#   (e.g. '.') must not be interpreted. As a side effect the pattern source
#   contains escaped names (a space becomes '\ '), not the raw names.
# - longest names first: regex alternation takes the first alternative that
#   matches, so a name that is a prefix of a longer one must come after it;
#   sorting also makes the pattern independent of set iteration order.
named_card_pattern = r'(' + r'|'.join(
    re.escape(n) for n in sorted(cards_names, key=lambda n: (-len(n), n))) + r')'
named_card_regex = r' named ' + named_card_pattern + '((?: or )' + named_card_pattern + ')?' + r'.*?'
#named_card_regex

### Tests

In [None]:
test_text = 'Add {G} for every card named Path of Peace in all graveyards.'
test = re.findall(named_card_regex, test_text)
test

In [None]:
# Cards whose text cites at least one other card by name. The regex now runs
# once per text, and float('nan') replaces pd.np.nan (the pd.np alias was
# removed in pandas 2.0) as the "no match" marker that dropna() removes.
a = cards_df['text_preworked'].apply(
    lambda x: re.findall(named_card_regex, x) or float('nan')
).dropna()

In [None]:
'Zhang Fei, Fierce Warrior' in named_card_regex

In [None]:
cards_df.loc[a.index[0]]['text_preworked']

In [None]:
a.iloc[0]

In [None]:
test_text = 'SELF gets +2/+2 as long as you control a permanent named Guan Yu, Sainted Warrior or a permanent named Zhang Fei, Fierce Warrior in the battlefield.'
test = re.findall(named_card_regex, test_text)
test

In [None]:
cards_df.loc['ef0fe275d7e5625b20f4c5cd7fc34301df0bea6d']['text_preworked']

In [None]:
a['ef0fe275d7e5625b20f4c5cd7fc34301df0bea6d']

## Save / Load (this process took some time)

In [None]:
filename = './cards_df_pop_parts.pkl'

In [None]:
# Save
cards_df_pop_parts.to_pickle(filename)

In [None]:
# Load
cards_df_pop_parts = pd.read_pickle(filename)

In [None]:
# Check that the reloaded frame equals the one kept in memory.
# NOTE(review): cards_df_pop_parts2 is not defined in any visible cell (the
# load cell assigns to cards_df_pop_parts), so this raises NameError on a
# fresh kernel -- confirm which variable was meant.
(cards_df_pop_parts==cards_df_pop_parts2).all().all()

In [None]:
cards_df_pop_parts[cards_df_pop_parts['part_type']=='step_condition']['part'].unique()

In [None]:
# Lets avoid creating a pop node
cards_df_pop_parts['part_type_full'] = cards_df_pop_parts['pop_type'] + '-' + cards_df_pop_parts['part_type']

## Investigating abilities

Now, how do we work with abilities that are followed by a cost?

In [None]:
def detect_abilities_sentence(sentlist):
    """Return True when at least one sentence is purely a list of abilities.

    A sentence qualifies when every ', '-separated item is exactly one of the
    names in ``abilities`` (full match, unlike the prefix match used by
    ``is_ability_sentence``).
    """
    return any(set(sent.split(', ')).issubset(set(abilities)) for sent in sentlist)
t = df['split_sentences'].apply(detect_abilities_sentence)
df[t][df['text'].str.contains('umulative upkeep').fillna(False)]['text_preworked']

In [None]:
df[df['text'].str.contains('umulative upkeep').fillna(False)]['text_preworked'].loc[30808]

### Deal with cumulative upkeep

It seems that, when followed by a mana cost, the cumulative upkeep COST may be followed by a comma or by a newline (\n). But when the cumulative upkeep text is longer, it seems to end with a newline every time.

In [None]:
# Check that these things are always the same
cumulative_upkeep_pattern1 = r'(?:, )?cumulative upkeep—.*?[.]'
cumulative_upkeep_pattern2 = r'(?:, )?cumulative upkeep—.*?[.\n]'
print(cumulative_upkeep_pattern1)
def get_cumup1(xstr):
    """Return all case-insensitive matches of cumulative_upkeep_pattern1 in
    ``str(xstr)`` as a list, or NaN when there is none."""
    res = re.findall(cumulative_upkeep_pattern1, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') replaces pd.np.nan: the pd.np alias was removed in pandas 2.0.
    return float('nan')
def get_cumup2(xstr):
    """Return all case-insensitive matches of cumulative_upkeep_pattern2 in
    ``str(xstr)`` as a list, or NaN when there is none."""
    res = re.findall(cumulative_upkeep_pattern2, str(xstr), re.IGNORECASE)
    if res:
        return res
    return float('nan')
df['cumup1'] = df['text_preworked'].apply(get_cumup1).fillna(False)
df['cumup2'] = df['text_preworked'].apply(get_cumup2).fillna(False)
diff = df['cumup1']==df['cumup2']
df[~diff][['cumup1', 'cumup2', 'text_preworked']]
assert diff.all()

In [None]:
# Check that this never matches anything
cumulative_upkeep_pattern1 = r'(?:, )?cumulative upkeep—.*?,'
print(cumulative_upkeep_pattern1)
def get_cumup1(xstr):
    """Return all case-insensitive matches of cumulative_upkeep_pattern1 (a
    dash-style cumulative upkeep cost ending at a comma) in ``str(xstr)`` as a
    list, or NaN when there is none."""
    res = re.findall(cumulative_upkeep_pattern1, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') replaces pd.np.nan: the pd.np alias was removed in pandas 2.0.
    return float('nan')
df['cumup1'] = df['text_preworked'].apply(get_cumup1)
df['cumup1'].dropna()
assert df['cumup1'].dropna().empty

In [None]:
# Check that this never matches anything
cumulative_upkeep_pattern1 = r', cumulative upkeep—'
print(cumulative_upkeep_pattern1)
def get_cumup1(xstr):
    """Return all case-insensitive matches of cumulative_upkeep_pattern1 (a
    dash-style cumulative upkeep that follows another ability after a comma)
    in ``str(xstr)`` as a list, or NaN when there is none."""
    res = re.findall(cumulative_upkeep_pattern1, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') replaces pd.np.nan: the pd.np alias was removed in pandas 2.0.
    return float('nan')
df['cumup1'] = df['text_preworked'].apply(get_cumup1)
df['cumup1'].dropna()
assert df['cumup1'].dropna().empty

In [None]:
#re.search('test', 'TeSt', re.IGNORECASE)
#re.match('test', 'TeSt', re.IGNORECASE)
#re.sub('test', 'xxxx', 'Testing', flags=re.IGNORECASE)
# Non capturing group https://stackoverflow.com/questions/2703029/why-regular-expressions-non-capturing-group-is-not-working

#cumulative_upkeep_pattern = r' ?cumulative upkeep[ |—].*?[.|,|\n]'
# Cost written as mana symbols, e.g. " {1}{U}".
type1_cost = r' (\{[A-Z0-9]+\})+'
# Cost written as text after an em dash, up to the next period or newline.
# (Inside a character class '|' is a literal character, not an alternation,
# so it was dropped from the class.)
type2_cost = r'—.*?[.\n]'
cumulative_upkeep_pattern = r'(?:, )?(cumulative upkeep)({0}|{1})'.format(type1_cost, type2_cost)
print(cumulative_upkeep_pattern)
def get_cumup(xstr):
    """Return the cumulative upkeep occurrences found in ``str(xstr)``.

    Each match is a tuple (keyword, full cost, last mana symbol of the cost or
    '' for a text cost). Matching is case-insensitive. Returns NaN when
    nothing matches.
    """
    res = re.findall(cumulative_upkeep_pattern, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') replaces pd.np.nan: the pd.np alias was removed in pandas 2.0.
    return float('nan')
df['cumup'] = df['text_preworked'].apply(get_cumup)
posit = 28118
display(df[['cumup', 'text_preworked']].dropna())#.loc[posit]['text_preworked'])
#display(df[['cumup', 'text_preworked']].dropna()#.loc[posit]['cumup'])

In [None]:
# check what is not contained (GREAT: the only card should not be considered anyway)
cumup_all = df[df['text_preworked'].str.contains('umulative up')]
cumup_detected = df[['cumup', 'text_preworked']].dropna()
cumup_all[~cumup_all.index.isin(cumup_detected.index)]['text_preworked'].iloc[0]

### Extend procedure to other abilities

Check what 'Enchant' ability can enchant

In [None]:
# Get everythin that can follow Enchant
def get_whats_enchanted(xstr):
    """Return every "Enchant <something>" clause of ``str(xstr)`` as a tuple.

    A clause runs from 'Enchant ' (case-sensitive) up to and including the
    next period or newline, or up to the end of the text. Returns NaN when
    there is no such clause.

    The previous character class '[.|\\n|$]' treated '|' and '$' as literal
    characters, so a clause that ended the text without a period or newline
    was never found.
    """
    res = re.findall(r'Enchant .*?(?:[.\n]|$)', str(xstr))#, re.IGNORECASE)
    if res:
        return tuple(res)
    # float('nan') replaces pd.np.nan: the pd.np alias was removed in pandas 2.0.
    return float('nan')
df['enchant_something'] = df['text_preworked'].apply(get_whats_enchanted)
df['enchant_something'].dropna().drop_duplicates()
enchant_abilities = set([x[0].strip('\n') for x in df['enchant_something'].dropna().drop_duplicates()])
#enchant_abilities

Regex below can detect any abilities with costs.

In [None]:
# Cost written as mana symbols, e.g. " {1}{U}".
type1_cost = r' (\{[A-Z0-9]+\})+'
# Cost written as text after an em dash, up to the next period or newline.
type2_cost = r'—.*?[.\n]'
# Plain numeric cost followed by a comma or newline.
# (Inside a character class '|' is a literal character, not an alternation,
# so it was dropped from the classes above.)
type3_cost = r' \d+[,\n]'
abilities_lower = '|'.join(abilities).lower()
cumulative_upkeep_pattern = r'(?:, )?({abi})({cost1}|{cost2}|{cost3})'.format(
    cost1=type1_cost, cost2=type2_cost, cost3=type3_cost, abi=abilities_lower)
print(cumulative_upkeep_pattern)
def get_cumup(xstr):
    """Return every "<ability><cost>" occurrence found in ``str(xstr)``.

    Despite the name this covers all names in ``abilities``, not only
    cumulative upkeep. Matching is case-insensitive. Returns the list of
    match tuples, or NaN when nothing matches.
    """
    res = re.findall(cumulative_upkeep_pattern, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') replaces pd.np.nan: the pd.np alias was removed in pandas 2.0.
    return float('nan')
df['cost_abilities'] = df['text_preworked'].apply(get_cumup)
posit = 227
display(df[['cost_abilities', 'text_preworked']].dropna())#.loc[posit]['text_preworked'])
#display(df[['cumup', 'text_preworked']].dropna()#.loc[posit]['cumup'])

In [None]:
# Detect other things following abilities
# Anything that follows an ability name, up to the next period or newline.
# (Inside a character class '|' is a literal character, not an alternation,
# so it was dropped from the class.)
abilities_follower = r' .*?[.\n]'
# NOTE: despite the variable name, the ability names are not lower-cased here.
abilities_lower = '|'.join(abilities)
ability_w_follower = r'({abi})({fol})'.format(fol=abilities_follower, abi=abilities_lower)
print(ability_w_follower)
def get_cumup(xstr):
    """Return every (ability, following text) pair found in ``str(xstr)`` as a
    tuple of tuples (case-sensitive), or NaN when nothing matches."""
    res = re.findall(ability_w_follower, str(xstr))
    if res:
        return tuple(res)
    # float('nan') replaces pd.np.nan: the pd.np alias was removed in pandas 2.0.
    return float('nan')
df['ability_w_follower'] = df['text_preworked'].apply(get_cumup)
posit = 1100
display(df[['ability_w_follower', 'text_preworked']].dropna())#.loc[posit]['text_preworked'])

detected_cost_abi = df['cost_abilities'].dropna()
df[~df.index.isin(detected_cost_abi.index)]['ability_w_follower'].dropna().drop_duplicates()

## What can be in place of X in +X/+X (or actually +|-X/+|-X)

Besides number, only X or Y will appear.

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-][^\d]/[+-][^\d])'):
    '''Return the list of all matches of pat in text_str (empty list when nothing matches).

    The default pattern finds power/toughness modifiers whose two values are
    not digits.'''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
res = set(chain(*(t.values)))
res

Which numbers may it contain?

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-]\d+/)|(/[+-]\d+)'):
    '''Return the list of all matches of pat in text_str (empty list when nothing matches).

    With the default two-group pattern every match is a (power part,
    toughness part) tuple in which the part that did not match is an empty string.'''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
res = set(chain(*(t.values)))
res

In [None]:
from itertools import chain
# Power / toughness halves of a modifier such as +1/+1 or -X/-X.
# The class previously read 'XxYx', listing 'x' twice and leaving out 'y'.
pincre_pat=r'([+-][\dXxYy]+/)'
rincre_pat=r'(/[+-][\dXxYy]+)'
def get_increases(text_str, pat=r'([+-]\d+/)|(/[+-]\d+)'):
    '''Return the list of all matches of pat in text_str (empty list when nothing matches).'''
    res = re.findall(pat, text_str)
    return res
pincre = cards_df['text_preworked'].apply(get_increases, args=(pincre_pat,))
pincre_res = set(chain(*(pincre.values)))
rincre = cards_df['text_preworked'].apply(get_increases, args=(rincre_pat,))
rincre_res = set(chain(*(rincre.values)))
print(pincre_res, rincre_res)

There is no +\*/+\*

In [None]:
# Raw strings: '\-', '\*' and '\+' are invalid escape sequences in a normal
# string literal. Only the last expression of a cell is shown automatically,
# so the first result is displayed explicitly.
display(cards_df[cards_df['text_preworked'].str.contains(r'\-\*')])
cards_df[cards_df['text_preworked'].str.contains(r'\+\*')]

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-][*])'):
    '''Return the list of all matches of pat in text_str (empty list when nothing matches).

    The default pattern finds a sign directly followed by an asterisk.'''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
res = set(chain(*(t.values)))
res

## Detecting special symbols

In [None]:
import itertools

In [None]:
# Every {...} symbol appearing in any pop. The regex now runs once per text,
# and float('nan') replaces pd.np.nan (the pd.np alias was removed in pandas
# 2.0) as the "no symbol" marker that dropna() removes below.
patt = r'\{.*?\}'
t = cards_df_pop_parts['pop'].apply(lambda x: re.findall(patt, str(x)) or float('nan'))
symbols_set=set(itertools.chain.from_iterable(t.dropna()))
#symbols_set

In [None]:
weird_symbols = []
worth_ignoring = ['{hr}','{½}','{∞}'] # Unglued or similar
worth_ignoring.append('{CHAOS}')
symbols_explanation = {
    '{S}': {'explanation': 'Snow mana', 'example_card': 'Glacial Plating'},
    '{R/P}': {'explanation': 'can be paid with either {R} or 2 life', 'example_card': 'Rage Extractor'},
    '{Q}': {'explanation': '{Q} is the untap symbol', 'example_card': 'Order of Whiteclay'},
    '{E}': {'explanation': 'Energy counter', 'example_card': 'Consulate Surveillance'},
    '{C}': {'explanation': 'Colorless mana', 'example_card': 'Skarrg, the Rage Pits'},
    '{CHAOS}': {'explanation': 'It is only in Plane cards and for a specific kind of game',
                'example_card': 'Glimmervoid Basin'},
}
weird_cards = []
for item in weird_symbols:
    weird = cards_df_sentences[cards_df_sentences['sentences'].str.contains(item)]
    weird_cards.append(cards_df[cards_df['id'].isin(weird['card_id'])])
if weird_symbols:
    weird_cards = pd.concat(weird_cards)
    weird_cards[mains_col_names]

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-][\d+XxYy]{1,4}/[+-][\d+XxYy]{1,4})'):
    '''Return the list of all matches of pat in text_str (empty list when nothing matches).

    The default pattern finds complete power/toughness modifiers such as
    +1/+1 or -X/-X. The character class previously read 'XxYx', listing 'x'
    twice and leaving out 'y'.'''
    res = re.findall(pat, text_str)
    return res
t = cards_df_pop_parts['part'].apply(get_increases)
pr_increase_symbols = set(chain(*(t.values)))
#pr_increase_symbols

# Spacy

In [None]:
#https://stackoverflow.com/questions/51766157/how-to-force-a-pos-tag-in-spacy-before-after-tagger/51776803#51776803
from spacy.symbols import ORTH, POS, NOUN, VERB

nlp.tokenizer.add_special_case('{G}', [{ORTH: '{G}', POS: NOUN}])
nlp.tokenizer.add_special_case('{T}', [{ORTH: '{T}', POS: VERB}])
for symb in pr_increase_symbols:
    nlp.tokenizer.add_special_case(symb, [{ORTH: symb, POS: NOUN}])

doc = nlp('{T}: This {G} is a noun. Target creature gets +1/+1')

for token in doc:
    print('{:10}{:10}'.format(token.text, token.pos_))

In [None]:
# Interpret {something} as NOUN (but tap and untap as verb)
#https://stackoverflow.com/questions/51766157/how-to-force-a-pos-tag-in-spacy-before-after-tagger/51776803#51776803
from spacy.symbols import ORTH, POS, NOUN, VERB, LOWER,LEMMA, TAG, NounType_com, nn, VerbForm_inf
import spacy
from spacy import displacy
 
doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Noun phrases
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)

In [None]:
# Dependency parser
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
 
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

In [None]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


In [None]:
#test_sentence = cards_df[cards_df['static_abilities']==('Phasing',)].text.values[0]
test_sentence ='\nWhenever SELF attacks, it gets +1/+1.' #test_sentence +'\nWhenever SELF attacks, it gets +1/+1.'
test_sentence

In [None]:
doc = nlp(test_sentence)
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


# Spacy applied

In [None]:
from spacy.symbols import ORTH, POS, NOUN, VERB, LOWER,LEMMA, TAG, nn#, VerbForm_inf,NounType_com,
import spacy
from spacy import displacy

In [None]:
#MODEL = 'en_core_web_lg'
MODEL = 'en_core_web_sm'

In [None]:
from spacy.tokens import Token

def get_token_sent(token):
    '''Return the ``sent`` of the one-token slice of ``token.doc`` that holds ``token``.

    The slice ``doc[i:i+1]`` is taken first and its ``sent`` attribute is
    what gets returned.
    '''
    position = token.i
    return token.doc[position:position + 1].sent

# Expose get_token_sent as the custom token attribute `token._.sent`.
# force=True is tried first so that re-running the cell replaces an existing
# registration instead of failing.
try:
    Token.set_extension('sent', getter=get_token_sent, force=True)
except Exception:
    # NOTE(review): presumably a fallback for spaCy versions whose set_extension
    # does not accept `force` -- confirm; any other failure is retried here too.
    Token.set_extension('sent', getter=get_token_sent)

In [None]:
#MODEL = r'C:\Users\cs294662\Downloads\programas\spacy\data\en_core_web_md-2.0.0\en_core_web_md\en_core_web_md-2.0.0'
#MODEL = r'C:\Users\cs294662\Downloads\programas\spacy\data\en_coref_lg-3.0.0\en_coref_lg\en_coref_lg-3.0.0'
nlp = spacy.load(MODEL)

In [None]:
a=nlp('a')
b=nlp('a')
a==b

## Set custom tags for special cases

In [None]:
#symbols_explanation

In [None]:
symbols_set_valid = symbols_set.difference(set(worth_ignoring))
symbols_set_valid

In [None]:
# Split the valid symbols into "action" symbols ({T}, {Q}), tagged as verbs,
# and every other symbol, tagged as a noun.
symbols_set_action = set()
symbols_set_mana = set()
for sym in symbols_set_valid:
    if sym in ['{T}', '{Q}']:
        symbols_set_action.add(sym)
        nlp.tokenizer.add_special_case(sym, [{ORTH: sym, POS: VERB, TAG: 'VB'}])
        continue
    symbols_set_mana.add(sym)
    nlp.tokenizer.add_special_case(sym, [{ORTH: sym, POS: NOUN}])

# Power/toughness increase and decrease symbols are registered as nouns too
for sym in pr_increase_symbols:
    nlp.tokenizer.add_special_case(sym, [{ORTH: sym, POS: NOUN}])

In [None]:
# https://stackoverflow.com/questions/44594759/spacy-adding-special-case-tokenization-rules-by-regular-expression-or-pattern
#cost_pattern = r'{[\dWGBURTX]}'
#cost_pattern = re.compile(r'{[\dWGBURTX]}')
# add special case rule
#special_case = [{ORTH: cost_pattern, LEMMA: 'COST', POS: 'NOUN'}]
#nlp.tokenizer.add_special_case(cost_pattern, special_case)

In [None]:
should_be_verbs = ['attacks', 'block', 'blocks', 'cast', 'control','controls', 'deal','deals', 'dies', 'enchant', 'flip', 'gain', 'gains', 'pay', 'return', 'sacrifice', 'shares', 'tap', 'untap']
#for token in should_be_verbs:
#    nlp.tokenizer.add_special_case(token, [{ORTH: token, POS: VERB}])
#    nlp.tokenizer.add_special_case(token.title(), [{ORTH: token.title(), POS: VERB}])

In [None]:
test_phrase = '{G}: Two target creatures get +1/-1'
#test_phrase = 'Target creature has flying'
doc = nlp(test_phrase)
displacy.render(doc, style='dep', jupyter=True)

## Create custom entity matcher

In [None]:
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Span

class EntityPhraseMatcher(object):
    '''Pipeline component that turns PhraseMatcher hits into the doc's entities.

    Adapted from
    https://stackoverflow.com/questions/49097804/spacy-entity-from-phrasematcher-only
    '''

    name = 'entity_phrase_matcher'

    def __init__(self, nlp, terms, label):
        '''Parse every string of ``terms`` with ``nlp`` and register them all under ``label``.'''
        term_docs = list(map(nlp, terms))
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *term_docs)

    def __call__(self, doc):
        '''Replace ``doc.ents`` with one Span per match and return the same ``doc``.'''
        doc.ents = [
            Span(doc, start, end, label=match_label)
            for match_label, start, end in self.matcher(doc)
        ]
        return doc
    
class EntityMatcher(object):
    '''Pipeline component that turns token-pattern Matcher hits into the doc's entities.'''

    name = 'entity_matcher'

    def __init__(self, nlp, dict_label_terms):
        '''Register the patterns of every label.

        ``dict_label_terms`` should be a dictionary in the format
        ``{label (str): patterns (list)}``; each label is added to the
        matcher with all of its patterns.
        '''
        self.matcher = Matcher(nlp.vocab)
        for label in dict_label_terms:
            self.matcher.add(label, None, *dict_label_terms[label])

    def __call__(self, doc):
        '''Replace ``doc.ents`` with one Span per match and return the same ``doc``.'''
        doc.ents = [
            Span(doc, start, end, label=match_label)
            for match_label, start, end in self.matcher(doc)
        ]
        return doc

In [None]:
# Vocabulary of game concepts that should be recognised as entities
zones = ['graveyard', 'play', 'library', 'hand', 'battlefield', 'exile', 'stack']
players = ['opponent', 'you', 'controller', 'owner', 'player']
steps = ['upkeep', 'draw step', 'end step', 'cleanup step', 'main phase', 'main phases']

# Same vocabulary grouped by kind. The three word lists are stored as separate
# copies; the card type collections are stored by reference.
entities = {
    'zones': list(zones),
    'players': list(players),
    'steps': list(steps),
    'types': cards_types,
    'subtypes': cards_subtypes,
    'supertypes': cards_supertypes,
}

In [None]:
from collections import OrderedDict
import hashlib
class HashableDict(OrderedDict):
    '''Ordered dict that can be hashed and can produce a SHA-256 id of its content.'''

    def __hash__(self):
        '''Hash of the (key, value) pairs taken in sorted order.'''
        ordered_pairs = sorted(self.items())
        return hash(tuple(ordered_pairs))

    def hexdigext(self):
        '''Return the SHA-256 hex digest of ``str(key) + str(value)`` for every item.

        Items are fed in insertion order, so two dicts with the same content
        but a different insertion order produce different digests.
        '''
        digest = hashlib.sha256()
        for key, value in self.items():
            digest.update((str(key) + str(value)).encode())
        return digest.hexdigest()

In [None]:
from collections import defaultdict
# Start from a clean pipeline so this cell can be re-run: drop the model's own
# 'ner' component and any previously registered copy of our matcher.
if 'ner' in nlp.pipe_names:
    nlp.remove_pipe('ner')
if 'entity_matcher' in nlp.pipe_names:
    nlp.remove_pipe('entity_matcher')

dict_label_terms = defaultdict(list)  # entity key -> list of Matcher patterns
entity_to_kind_map = {}  # entity key -> kind ('TYPE', 'ZONE', ...)
entity_key_to_hash_map = {}  # entity key: entity node hash (node_id)


def _token_specs(term, attr='LOWER', suffix='', pos=None):
    '''Build one Matcher pattern: a token spec per whitespace-separated word of ``term``.

    ``attr`` is the token attribute to compare, ``suffix`` is appended to every
    word (used for naive plurals) and ``pos``, when given, adds a POS constraint.
    '''
    specs = []
    for word in term.split():
        spec = {attr: word + suffix}
        if pos is not None:
            spec['POS'] = pos
        specs.append(spec)
    return specs


def _register_entity(kind, name, patterns):
    '''Record ``patterns`` under the key ``"<kind>: <name>"`` in the three lookup tables.'''
    key = kind + ': ' + name
    dict_label_terms[key].extend(patterns)
    entity_to_kind_map[key] = kind
    entity_key_to_hash_map[key] = HashableDict({'entity': key}).hexdigext()
    return key


# Card types, subtypes and supertypes: match the word itself and a naive "+s" plural.
for typ in cards_types:
    name = typ.lower()
    _register_entity('TYPE', name, [_token_specs(name), _token_specs(name, suffix='s')])
# TODO define plural for subtypes and types, like elves
cards_subtypes.add('elves')
for typ in cards_subtypes:
    name = typ.lower()
    _register_entity('SUBTYPE', name, [_token_specs(name), _token_specs(name, suffix='s')])
for typ in cards_supertypes:
    name = typ.lower()
    _register_entity('SUPERTYPE', name, [_token_specs(name), _token_specs(name, suffix='s')])

# Colors ('white' used to be listed twice, which registered a duplicate pattern)
for color in ['white', 'black', 'blue', 'red', 'green', 'colorless', 'multicolored', 'multicolor']:
    _register_entity('COLOR', color, [_token_specs(color)])

# Keyword abilities: exact (lower-cased) wording only
for abi in abilities:
    name = abi.lower()
    _register_entity('ABILITY', name, [_token_specs(name)])

# Zones only count when tagged as nouns; singular and naive plural
for zone in zones:
    name = zone.lower()
    _register_entity('ZONE', name, [_token_specs(name, pos=NOUN),
                                    _token_specs(name, suffix='s', pos=NOUN)])

# Players may be tagged either as pronouns ("you") or as nouns ("opponent")
for player in players:
    name = player.lower()
    _register_entity('PLAYER', name, [_token_specs(name, pos=spacy.symbols.PRON),
                                      _token_specs(name, pos=spacy.symbols.NOUN)])

for step in steps:
    name = step.lower()
    _register_entity('STEP', name, [_token_specs(name)])

# Mana symbols are matched on their exact text. Numeric and X costs all map
# to the single '{generic}' entity.
for sym in symbols_set_mana:
    bare = sym.strip('{}')
    name = '{generic}' if (bare.isdigit() or bare.upper() == 'X') else sym.lower()
    _register_entity('MANA', name, [_token_specs(sym, attr='ORTH')])

# Action symbols (tap / untap), also matched on their exact text
for sym in symbols_set_action:
    _register_entity('ACTION', sym.lower(), [_token_specs(sym, attr='ORTH')])

entity_matcher = EntityMatcher(nlp, dict_label_terms)
# Put the matcher before 'ner' only if that component is actually present.
# It was removed above, so normally the matcher is appended at the end.
if 'ner' in nlp.pipe_names:
    nlp.add_pipe(entity_matcher, before='ner')
else:
    nlp.add_pipe(entity_matcher)

print(nlp.pipe_names)  # see all components in the pipeline

## Select subset of cards to work with

In [None]:
# Work on full cards_df_pop_parts (cards_df was probably filtered right at the beginning)
cards_df_for_graph = cards_df_pop_parts.copy()
cards_df_for_graph.loc[:, 'part_doc'] = cards_df_for_graph['part'].apply(lambda x: nlp(x.strip('.,')))

In [None]:
# Work on sample for now
# Hand-picked cards used while developing the graph.
# 'Brilliant Halo' used to be written with a leading space, so `isin` never matched it.
sample_cards = ['Tolarian Academy', 'Pacifism', 'Pariah', 'Congregate', 'Priest of Titania',
                'Disenchant', 'Brilliant Halo', 'Thran Quarry', 'Path of Peace', 'Arcane Laboratory',
                'Plains', 'Mountain', 'Forest', 'Swamp', 'Island']
# Index values of the first row found for each sampled card name
sample_ids = (cards_df[cards_df['name'].isin(sample_cards)]
              .drop_duplicates(subset=['name'])
              .index.unique()
             )
# Keep only the parts of the sampled cards and parse each part with spaCy
# (leading and trailing '.' and ',' are stripped before parsing).
cards_df_for_graph = cards_df_pop_parts[cards_df_pop_parts['card_id'].isin(sample_ids)].copy()
cards_df_for_graph.loc[:, 'part_doc'] = cards_df_for_graph['part'].apply(lambda x: nlp(x.strip('.,')))

In [None]:
test_phrase = "{G}: At your end step, you can put target creatures from an opponent's graveyard into play under your control"
#test_phrase = 'At your end step, target creature has flying'
doc = nlp(test_phrase)

In [None]:
displacy.render(doc, style='ent', jupyter=True)
displacy.render(doc, style='dep', jupyter=True)

In [None]:
list(doc.ents)+list(doc.noun_chunks)

## Extend pop with spacy dep_

### Objective: card_id <- pop <- part <- root <- (children) <- entities

In [None]:
def get_all_children(root, parts, extra_keys):
    '''Record every descendant of ``root`` in ``parts``, walking depth-first.

    Each descendant is stored under its dependency label; underscores are
    appended until the key is unused, so repeated labels do not overwrite each
    other. For a descendant that carries an entity type, ``TYPE_<key>`` holds
    that type and ``<key>-CHILD_<n>`` holds ``{dep_: token}`` for each of its
    direct children.

    ``parts`` is modified in place. ``extra_keys`` is passed through untouched.
    Returns the ``(parts, extra_keys)`` pair.
    '''
    for child in root.children:
        key = child.dep_
        while key in parts:
            key += '_'
        parts[key] = child
        entity_label = child.ent_type_
        if entity_label:
            parts['TYPE_' + key] = entity_label
            for position, grandchild in enumerate(child.children):
                parts[key + '-' + 'CHILD_' + str(position)] = {grandchild.dep_: grandchild}
        get_all_children(child, parts, extra_keys)
    return parts, extra_keys
        
def make_df_from_doc(doc):
    '''Build a DataFrame with one row per ROOT token found in the sentences of ``doc``.

    Each row holds the root under ``'root'`` plus everything
    ``get_all_children`` collects below it. When the root itself carries an
    entity type, ``'TYPE_root'`` and one ``'root-CHILD_<n>'`` entry per direct
    child are added as well.
    '''
    records = []

    for sentence in doc.sents:
        extra_keys = {}
        for token in sentence:
            # Only ROOT tokens start a record; every other token is reached
            # through the recursive walk below its root.
            if token.dep_ != 'ROOT':
                continue
            record = defaultdict(list)
            record['root'] = token
            if token.ent_type_:
                record['TYPE_' + 'root'] = token.ent_type_
                for position, child in enumerate(token.children):
                    record['root' + '-' + 'CHILD_' + str(position)] = {child.dep_: child}
            record, extra_keys = get_all_children(token, record, extra_keys)
            records.append(record)

    return pd.DataFrame.from_records(records)

In [None]:
test_sents = ['Target creature gets +1/+1 until end of turn. You may gain 4 life', 'You may put target creatures from graveyard into play']
doc = nlp('\n'.join(test_sents))
df = make_df_from_doc(doc)
df

In [None]:
cards_df_pop_parts.columns

In [None]:
pop_defining_cols = ['card_id', 'paragraph_order', 'pop_order', 'pop_type', 'pop']
part_defining_cols = ['card_id', 'paragraph_order', 'pop_order', 'part_order', 'part_type_full', 'pop', 'part']

In [None]:
# Function to parse docs to list of dicts
def parse_doc_to_list_of_dicts(df_row, original_cols=(), doc_col='part_doc'):
    '''Explode the parsed doc stored in one row into a token-level DataFrame.

    Every token of ``df_row[doc_col]`` becomes one output row holding the ids
    (``HashableDict.hexdigext`` digests) of the token node, of its syntactic
    head, of its entity and of the part it belongs to.

    Parameters
    ----------
    df_row : pandas.Series or mapping
        One row. It must provide ``doc_col``, every column listed in the
        module-level ``part_defining_cols`` and every column of ``original_cols``.
    original_cols : iterable of str
        Columns of ``df_row`` copied unchanged onto every output row.
    doc_col : str
        Name of the column that holds the parsed doc.

    Returns
    -------
    pandas.DataFrame
        Columns ``token_node``, ``token_node_text``, ``token_node_pos``,
        ``token_node_tag``, ``token_head_node``, ``token_head_dep``,
        ``entity_node``, ``entity_node_ent_type``, ``entity_node_entity``,
        ``entity_node_desc``, ``token_to_entity_edge``, ``part_node``, followed
        by ``original_cols``. For root tokens ``token_head_node`` is replaced by
        the part node id and ``label`` is set to the part text.
    '''
    doc = df_row[doc_col]
    # Plain float NaN: the same value as numpy.nan, without relying on the
    # `pd.np` alias, which recent pandas versions no longer provide.
    missing = float('nan')

    # Node id of the part described by this row; shared by all of its tokens.
    part_dic = HashableDict()
    for col in part_defining_cols:
        part_dic[col] = df_row[col]
    part_node_id = part_dic.hexdigext()

    token_node = []  # Source node
    token_node_text = []
    token_node_pos = []
    token_node_tag = []
    token_head_dep = []  # Label of the edge from a token to its head
    part_node = []

    token_to_entity_edge = []
    entity_node = []  # Target entity nodes (target of token_node)
    entity_node_ent_type = []
    entity_node_entity = []
    entity_node_desc = []

    # token -> its node dict, so that heads can be resolved once every token is known
    token_dic_of = {}

    for t in doc:
        # Token node. The insertion order of the keys matters because the
        # digest is built from the items in that order.
        token_dic = HashableDict()
        for col in ['card_id', 'paragraph_order', 'part_order', 'pop_order', 'part_type_full']:
            token_dic[col] = df_row[col]
        token_dic['text'] = t.text.lower()
        token_dic['pos'] = t.pos_.lower()
        token_dic['tag'] = t.tag_.lower()
        token_dic['i'] = t.i

        token_node_text.append(token_dic['text'])
        token_node_pos.append(token_dic['pos'])
        token_node_tag.append(token_dic['tag'])

        # Entity node. It depends only on the entity label, so the same entity
        # gets the same id in every processed card.
        ent_dic = HashableDict()
        if t.ent_type_:
            ent = t.ent_type_
            ent_dic['entity'] = ent
            # Labels look like 'KIND: description'; split on the first separator only
            typ, desc = ent.split(': ', 1)
            entity_node_entity.append(ent)
            entity_node_ent_type.append(typ)
            entity_node_desc.append(desc)
        else:
            entity_node_ent_type.append(missing)
            entity_node_entity.append(missing)
            entity_node_desc.append(missing)

        token_node.append(token_dic.hexdigext())
        token_head_dep.append(t.dep_.lower())
        # Tokens without an entity still get the digest of an empty dict here
        entity_node.append(ent_dic.hexdigext())
        token_to_entity_edge.append(t.ent_iob_.lower())
        part_node.append(part_node_id)

        token_dic_of[t] = token_dic

    # Now, set the head of a token as its target node
    token_head_node = [token_dic_of[t.head].hexdigext() for t in doc]

    # Create dataframe (column by column, in this exact order)
    res = pd.DataFrame()
    res['token_node'] = token_node
    res['token_node_text'] = token_node_text
    res['token_node_pos'] = token_node_pos
    res['token_node_tag'] = token_node_tag
    res['token_head_node'] = token_head_node
    res['token_head_dep'] = token_head_dep

    # Entity
    res['entity_node'] = entity_node
    res['entity_node_ent_type'] = entity_node_ent_type
    res['entity_node_entity'] = entity_node_entity
    res['entity_node_desc'] = entity_node_desc

    res['token_to_entity_edge'] = token_to_entity_edge
    res['part_node'] = part_node

    for col in original_cols:
        res[col] = df_row[col]

    # A root token is its own head: point it to the part node instead and
    # label it with the part text.
    res = res.reset_index(drop=True)
    try:
        res.loc[res[res['token_head_dep'] == 'root'].index, 'token_head_node'] = res['part_node']
        res.loc[res[res['token_head_dep'] == 'root'].index, 'label'] = part_dic['part']
    except TypeError:
        # Sometimes res['token_head_dep'] is empty and cannot be compared to 'root'
        pass

    return res

In [None]:
# Parse for graph
cards_df_for_graph = cards_df_for_graph.apply(parse_doc_to_list_of_dicts, args=(cards_df_pop_parts.columns,), axis=1)
cards_df_for_graph = pd.concat(cards_df_for_graph.values, sort=False).reset_index(drop=True)
#cards_df_for_graph.describe().transpose()

# Build a graph for the cards

## Helper functions

In [None]:
#G = nx.petersen_graph()
shapes = ['box', 'polygon', 'ellipse', 'oval', 'circle', 'egg', 'triangle', 'exagon', 'star']
colors = ['blue', 'black', 'red', '#db8625', 'green', 'gray', 'cyan', '#ed125b']
styles = ['filled', 'rounded', 'rounded, filled', 'dashed', 'dotted, bold']

entities_colors = {
    'PLAYER': '#FF6E6E',
    'ZONE': '#F5D300',
    'ACTION': '#1ADA00',
    'MANA': '#00DA84',
    'SUBTYPE': '#0DE5E5',
    'TYPE': '#0513F0',
    'SUPERTYPE': '#8D0BCA',
    'ABILITY': '#cc3300',
    'COLOR': '#666633',
    'STEP': '#E0E0F8'
}

def draw_graph(G, filename='test.png'):
    '''Render ``G`` to a PNG file through pydot and return it as an IPython image.

    Styling applied before writing:
    - every node is labelled with its ``label`` attribute (``'none'`` if absent);
    - nodes with an ``entity_node_ent_type`` attribute become filled hexagons
      coloured from ``entities_colors``;
    - nodes whose ``type`` attribute is ``'"card"'`` become filled stars;
    - every edge is labelled with its ``label`` attribute (``'NO-LABEL'`` if absent).

    Parameters
    ----------
    G : graph accepted by ``nx.drawing.nx_pydot.to_pydot``
    filename : str
        Path of the PNG file to write.

    Returns
    -------
    IPython.display.Image pointing at ``filename``.
    '''
    pdot = nx.drawing.nx_pydot.to_pydot(G)

    for node in pdot.get_nodes():
        attrs = node.get_attributes()
        node.set_label(str(attrs.get('label', 'none')))

        # A missing attribute comes back as None, which pd.isnull treats as
        # null. This avoids the `pd.np.nan` alias missing from recent pandas.
        entity_node_ent_type = attrs.get('entity_node_ent_type')
        if not pd.isnull(entity_node_ent_type):
            # Attribute values are stored wrapped in double quotes; strip them for the lookup
            color = entities_colors[entity_node_ent_type.strip('"')]
            node.set_fillcolor(color)
            node.set_color(color)
            node.set_shape('hexagon')
            node.set_style('filled')

        if attrs.get('type', None) == '"card"':
            node.set_fillcolor('#999966')
            node.set_shape('star')
            node.set_style('filled')

    for edge in pdot.get_edges():
        edge.set_label(edge.get_attributes().get('label', 'NO-LABEL'))

    png_path = filename
    pdot.write_png(png_path)

    from IPython.display import Image
    return Image(png_path)

## Build graph

In [None]:
import networkx as nx

In [None]:
cards_df_for_graph.columns

In [None]:
cdfg=cards_df_for_graph
cdfg = cards_df_for_graph.merge(cards_df, left_on=['card_id'], right_index=True)
#cdfg

In [None]:
# Link each card to the entities named by its own attributes
# (supertypes, types, subtypes, colors and mana cost symbols).
nodes_card_df = cdfg[['card_id', 'supertypes', 'types', 'subtypes', 'colors', 'manaCost']].copy()

# Generate one frame per attribute column with card_id and the entity node id
# referring to the card's type, color, supertype, etc.
res = []
for col in nodes_card_df:
    if col == 'card_id':
        continue
    if col == 'manaCost':
        # Turn the mana cost string into a list of '{...}' symbols; numeric and
        # X symbols are replaced by '{generic}'. Null costs are left untouched.
        nodes_card_df[col] = nodes_card_df[col].apply(
            lambda x: ['{generic}'
                       if (y.strip('{}').isdigit() or y.strip('{}').upper()=='X')
                       else y
                       for y in re.findall(r'{.*?}', x)
                      ]
            if not pd.isnull(x) else x)
        # Rename so that col.upper()[:-1] below yields 'MANA', like the other
        # plural column names.
        nodes_card_df = nodes_card_df.rename(columns={'manaCost':'manas'})
        col = 'manas'
    temp = nodes_card_df[['card_id', col]].copy().dropna()
    # NOTE(review): splitDataFrameList is defined elsewhere; presumably it yields
    # one row per element of the list-valued column -- confirm.
    temp = splitDataFrameList(temp, col)
    # Column name in upper case minus its last character, e.g. 'TYPES' -> 'TYPE'
    temp['entity_node_ent_type'] = col.upper()[:-1]
    # Build the '<KIND>: <value>' name that can be mapped to its hexdigext id
    temp['entity_node_entity'] = temp.apply(lambda x: ': '.join([x['entity_node_ent_type'], x[col].lower()]), axis=1)
    # Raises KeyError for any name missing from entity_key_to_hash_map
    temp['entity_node'] = temp['entity_node_entity'].apply(lambda x: entity_key_to_hash_map[x])
    temp = temp.rename(columns={col: 'entity_node_desc'})
    # One link per (card, entity) pair
    temp = temp.drop_duplicates(subset=['card_id', 'entity_node'])
    res.append(temp)
    
# From here on `res` is a single DataFrame instead of a list of frames
res = pd.concat(res, sort=True)
res['edge_type'] = 'entity_to_card'

res.sample(5)

In [None]:
cdfg = pd.concat([cdfg, res], sort=False)

In [None]:
# Try different approach: build node by node

# Objective: card_id <- part <- root <- (children) <- entities # pop was avoided

# Token to head
# Manipulate df to generate two others: nodes and edges
# Edges relate (token to head, head to part, part to pop, pop to card, and token to entity)
# attention: head is also a token
# and set node and edge attributes (both dfs should contain the attributes)

# NODES ########################################## 
# Per node kind: the node DataFrame, the cdfg columns it is built from,
# and the columns that become node attributes.
nodes = {}
nodes_cols = {}
nodes_attr = {}

# Token nodes: identified by token_node (renamed to node_id); rows without an id are dropped
nodes_cols['token'] = ['token_node', 'token_node_text', 'token_node_pos', 'token_node_tag',
       'token_head_node', 'token_head_dep',
       'part_order', 'part_type', 'card_id', 'paragraph_order', 
       'pop_order', 'pop_type']
nodes['token'] = (cdfg[nodes_cols['token']]
                  .rename(columns={'token_node':'node_id'})
                  .dropna(subset=['node_id'])
                 )
nodes['token']['type'] = 'token'
# Label shown on the graph: '<text>-<pos>-<tag>'
nodes['token']['label'] = nodes['token'].apply(lambda x:
                                              '-'.join([x['token_node_text'],
                                                        x['token_node_pos'],
                                                        x['token_node_tag']]), axis=1)
# Every column except the id is treated as a node attribute
nodes_attr['token'] = [x for x in nodes['token'].columns if x not in ['node_id']]

# Entity nodes: only rows with an entity type are kept
nodes_cols['entity'] = ['entity_node', 'entity_node_entity','entity_node_ent_type', 'entity_node_desc']
nodes['entity'] = (cdfg[nodes_cols['entity']]
                   .dropna(subset=['entity_node_ent_type'])
                   .rename(columns={'entity_node':'node_id'})
                  )
nodes['entity']['type'] = 'entity'
# Joining a single-element list: the label is the entity name itself
nodes['entity']['label'] = nodes['entity'].apply(lambda x:
                                              '-'.join([x['entity_node_entity'],
                                                        ]), axis=1)
nodes_attr['entity'] = [x for x in nodes['entity'].columns if x not in ['node_id']]

# Part nodes: identified by part_node (renamed to node_id)
nodes_cols['part'] = ['part_node',
                      'part', 'part_order', 'part_type',
                      'card_id',
                      'paragraph_order',
                      'pop_order', 'pop_type']
nodes['part'] = (cdfg[nodes_cols['part']]
                 .rename(columns={'part_node':'node_id'})
                 .dropna(subset=['node_id'])
                 )
nodes['part']['type'] = 'part'
# The label is the part text itself
nodes['part']['label'] = nodes['part'].apply(lambda x:
                                              '-'.join([x['part']]), axis=1)
nodes_attr['part'] = [x for x in nodes['part'].columns if x not in ['node_id']]

# Pop nodes (avoided)
# nodes_cols['pop'] = ['pop_node',
#                       'card_id',
#                       'paragraph_order',
#                       'pop', 'pop_order', 'pop_type']
# nodes['pop'] = (cdfg[nodes_cols['pop']]
#                 .rename(columns={'pop_node':'node_id'})
#                 .dropna(subset=['node_id'])
#                  )
# nodes['pop']['type'] = 'pop'
# nodes['pop']['label'] = nodes['pop'].apply(lambda x:
#                                               '-'.join([x['pop']]), axis=1)
# nodes_attr['pop'] = [x for x in nodes['pop'].columns if x not in ['node_id']]

# Card nodes
# Card nodes: card_id plus the main descriptive card columns
nodes_cols['card'] =  ['card_id'] + mains_col_names
nodes['card'] = cdfg[nodes_cols['card']]
# 'name' is renamed to 'card_name' for the node table
nodes['card'] = nodes['card'].rename(columns={'name':'card_name'})
# A card node is identified by its card_id
nodes['card']['node_id'] = nodes['card']['card_id']
# Drop rows that lack either the id or the card name
nodes['card'] = nodes['card'].dropna(subset=['node_id', 'card_name'], how='any')                 
nodes['card']['type'] = 'card'
# The label is the card name itself
nodes['card']['label'] = nodes['card'].apply(lambda x:
                                              '-'.join([x['card_name']]), axis=1)
nodes_attr['card'] = [x for x in nodes['card'].columns if x not in ['node_id']]

# EDGES #########################################
card_as_start = True # Sets card as source and pop, part, token, entity as targets
edges = {} # k->type, v-> dataframe
edges_cols = {} # list of cdfg columns used to build each edge frame
edges_attr = {} # list of columns that become edge attributes

# Token edges to head (and head to part)
edges_cols['token_to_head_part'] = ['token_node', 'token_head_node', 'token_head_dep',
                       'part_order', 'part_type', 'card_id', 'paragraph_order', 
                       'pop_order', 'pop_type']
# Default direction: token -> head. With card_as_start it is reversed to head -> token.
renamer = {'token_node':'source', 'token_head_node':'target'}
if card_as_start:
    renamer = {'token_head_node':'source', 'token_node':'target'}
edges['token_to_head_part'] = (cdfg[edges_cols['token_to_head_part']]
                               .rename(columns=renamer)
                               .dropna(subset=['source', 'target'], how='any')
                              )
edges['token_to_head_part']['type'] = 'token_to_head_part'
# Edge label: the dependency relation in upper case
edges['token_to_head_part']['label'] = edges['token_to_head_part'].apply(lambda x:
                                              '-'.join([x['token_head_dep'],
                                                       ]).upper(), axis=1)
edges_attr['token_to_head_part'] = [x for x in edges['token_to_head_part'].columns
                                    if x not in ['source', 'target']]

# Entity edges to Token 
edges_cols['entity_to_token'] = ['token_node', 'entity_node']
# Default direction: entity -> token. With card_as_start it is reversed to token -> entity.
renamer = {'entity_node':'source', 'token_node':'target'}
if card_as_start:
    renamer = {'token_node':'source', 'entity_node':'target'}
edges['entity_to_token'] = (cdfg[edges_cols['entity_to_token']]
                            .dropna()
                            .rename(columns=renamer)
                           )
edges['entity_to_token']['type'] = 'entity_to_token'
edges['entity_to_token']['relation'] = 'is_class_of'
# Edge label: the relation name in upper case
edges['entity_to_token']['label'] = edges['entity_to_token'].apply(lambda x:
                                              '-'.join([x['relation'],
                                                       ]).upper(), axis=1)
edges_attr['entity_to_token'] = [x for x in edges['entity_to_token'].columns
                                    if x not in ['source', 'target']]

# Entity edges to cards
edges_cols['entity_to_card'] = ['card_id', 'entity_node']
renamer = {'entity_node':'source', 'card_id':'target'}
if card_as_start:
    pass # Ignore in this case, because we want it reversed: card as target
#     renamer = {'card_id':'source', 'entity_node':'target'}
# Only the rows flagged with edge_type == 'entity_to_card' are used here
edges['entity_to_card'] = (cdfg[cdfg['edge_type']=='entity_to_card'][edges_cols['entity_to_card']]
                            .dropna()
                            .rename(columns=renamer)
                           )
edges['entity_to_card']['type'] = 'entity_to_card'
edges['entity_to_card']['relation'] = 'is_contained_in'
# Edge label: the relation name in upper case
edges['entity_to_card']['label'] = edges['entity_to_card'].apply(lambda x:
                                              '-'.join([x['relation'],
                                                       ]).upper(), axis=1)
edges_attr['entity_to_card'] = [x for x in edges['entity_to_card'].columns
                                    if x not in ['source', 'target']]

# Part and pop edges (avoided)
# edges_cols['part_to_pop'] = ['part_node', 'pop_node',
#                        'part_order', 'part_type',
#                        'card_id', 'paragraph_order', 
#                        'pop_order', 'pop_type']
# renamer = {'part_node':'source', 'pop_node':'target'}
# if card_as_start:
#     renamer = {'pop_node':'source', 'part_node':'target'}
# edges['part_to_pop'] = (cdfg[edges_cols['part_to_pop']]
#                         .rename(columns=renamer)
#                         .dropna(subset=['source', 'target'], how='any')
#                         )
# edges['part_to_pop']['type'] = 'part_to_pop'
# edges['part_to_pop']['label'] = edges['part_to_pop'].apply(lambda x:
#                                               '-'.join([str(x['part_order']),
#                                                         x['part_type'],
#                                                        ]).upper(), axis=1)
# edges_attr['part_to_pop'] = [x for x in edges['part_to_pop'].columns
#                                     if x not in ['source', 'target']]

# Pop to card edges (avoided)
# edges_cols['pop_to_card'] = ['card_id', 'pop_node',
#                        'paragraph_order', 
#                        'pop_order', 'pop_type']
# renamer = {'pop_node':'source', 'card_id':'target'}
# if card_as_start:
#     renamer = {'card_id':'source', 'pop_node':'target'}
# edges['pop_to_card'] = (cdfg[edges_cols['pop_to_card']]
#                         .rename(columns=renamer)
#                         .dropna(subset=['source', 'target'], how='any')
#                         )
# edges['pop_to_card']['type'] = 'pop_to_card'
# edges['pop_to_card']['label'] = edges['pop_to_card'].apply(lambda x:
#                                               '-'.join([str(x['paragraph_order']),
#                                                         str(x['pop_order']),
#                                                         x['pop_type'],
#                                                        ]).upper(), axis=1)
# edges_attr['pop_to_card'] = [x for x in edges['pop_to_card'].columns
#                                     if x not in ['source', 'target']]

# Part and card edges (in use: parts are attached directly to their card,
# because the intermediate pop level is skipped)
edges_cols['part_to_card'] = ['part_node', 'card_id',
                       'part_order', 'part_type_full',
                       'paragraph_order', 
                       'pop_order', 'pop_type', 'part_type']
# Default direction: part -> card. The source column must be 'part_node'.
# 'pop_node' is not among the selected columns, so the old mapping would have
# left no 'source' column for the dropna below.
renamer = {'part_node':'source', 'card_id':'target'}
if card_as_start:
    # Reversed direction: card -> part
    renamer = {'card_id':'source', 'part_node':'target'}
edges['part_to_card'] = (cdfg[edges_cols['part_to_card']]
                        .rename(columns=renamer)
                        .dropna(subset=['source', 'target'], how='any')
                        )
edges['part_to_card']['type'] = 'part_to_card'
# Edge label: '<paragraph>-<pop>-<part>-<part_type_full>' in upper case
edges['part_to_card']['label'] = edges['part_to_card'].apply(lambda x:
                                              '-'.join([str(int(x['paragraph_order'])),
                                                        str(int(x['pop_order'])),
                                                        str(int(x['part_order'])),
                                                        x['part_type_full'],
                                                       ]).upper(), axis=1)
edges_attr['part_to_card'] = [x for x in edges['part_to_card'].columns
                                    if x not in ['source', 'target']]

# Assemble the final node table: one row per node_id, dropping rows in which
# every column other than node_id and label is empty.
nodes_df = (
    pd.concat(nodes.values(), sort=True)
    .drop_duplicates(subset=['node_id'])
    .pipe(lambda df: df.dropna(
        subset=[c for c in df.columns if c not in ['node_id', 'label']],
        how='all'))
)
# Assemble the final edge table: one row per (source, target) pair, keeping
# only the edges whose two ends are both known nodes.
edges_df = (
    pd.concat(edges.values(), sort=True)
    .drop_duplicates(subset=['source', 'target'])
    .loc[lambda df: (df['source'].isin(nodes_df['node_id']))
                    & (df['target'].isin(nodes_df['node_id']))]
)

In [None]:
def eliminate_and_wrap_in_quotes(text):
    """Return ``str(text)`` with all double quotes removed, wrapped in one pair of double quotes."""
    without_quotes = str(text).replace('"', '')
    return '"{0}"'.format(without_quotes)

In [None]:
# Create nodes and edges from dataframe
graphs = []
# EDGES
# One directed graph per edge type (each type has its own attribute columns);
# the per-type graphs are composed into G afterwards.
source = 'source'
target = 'target'
for k in edges_df['type'].unique():
    print(k)

    edge_attr = edges_attr[k]
    edges_of_type = edges_df[edges_df['type']==k]
    graph_of_type = nx.from_pandas_edgelist(edges_of_type,
                                            source=source,
                                            target=target,
                                            edge_attr=edge_attr,
                                            create_using=nx.DiGraph())
    graphs.append(graph_of_type)

G = nx.compose_all(graphs)

# NODES (set attributes)
for k in nodes_df['type'].unique():
    print(k)
    node_col = 'node_id'
    for node_attr in nodes_attr[k]:
        # Rows that have both the attribute and a node id
        temp = nodes_df[[node_attr, node_col]].dropna()

        # Eliminate and wrap in quotes
        temp[node_attr] = temp[node_attr].apply(eliminate_and_wrap_in_quotes)
        attr_by_node = pd.Series(temp[node_attr].values, index=temp[node_col].values).copy().to_dict()
        nx.set_node_attributes(G, attr_by_node, name=node_attr)

In [None]:
# Export to image
# Passes the composed graph G and the output file name to draw_graph, which is
# defined outside this section of the notebook.
draw_graph(G, 'Gtest.png')

In [None]:
# Build paths between each pair of cards
card_nodes = [x for x,y in G.nodes(data=True) if y['type']=='"card"']
# Entities do not have an attribute card_id, so they are collected by type.
# G is not modified in the loops below, so this list is computed only once.
entity_nodes = [x for x,y in G.nodes(data=True) if y.get('type', None) == '"entity"']
temp = []
for i, s in enumerate(card_nodes):
    if not i%10: print(i)
    # All simple paths becomes huge as it find path through many cards
    # Lets try to create a different graph with only the nodes and edges with start card id, and the last node
    # (G.nodes replaces the G.node alias, which newer networkx releases removed.)
    start_card_id = G.nodes[s]['card_id']
    start_card_nodes = [x for x,y in G.nodes(data=True) if y.get('card_id', None) == start_card_id]
    for e in card_nodes:
        # Compare node ids by value; identity (`is`) is not reliable for equal ids.
        if s == e:
            continue
        interesting_subgraph = G.subgraph(start_card_nodes+entity_nodes+[e])
        #temp_name = str('./paths_between_pairs/interesting_subgraph_{0}.png'.format(i))
        #draw_graph(interesting_subgraph, temp_name)
        for path in nx.all_simple_paths(interesting_subgraph, s, e):
            temp.append(G.subgraph(path))
print('Avoid saving {0} images'.format(len(temp)))
for i, g in enumerate(temp):
    #if not i%10: print('{0}/{1}'.format(i, len(temp)))
    #temp_name = str('./paths_between_pairs/temp_{0}.png'.format(i))
    #draw_graph(g, temp_name)
    #display(draw_graph(g, temp_name))
    pass

### Simplify link between cards

Let's contract each path into a single link that carries the full path as an attribute and a simple label describing it.

ATTENTION: This approach does not seem too useful

In [None]:
# Build card -> root -> other_card (not too useful)
# For every simple path between two different card nodes of G, H receives the
# two cards plus the path's root token (token_head_dep == 'root'), linked as
# start card -> root node -> end card.
import copy
start_nodes = [x for x,y in G.nodes(data=True) if y['type']=='"card"']
end_nodes = [x for x,y in G.nodes(data=True) if y['type']=='"card"']
H = nx.DiGraph()
for s in start_nodes:
    for e in end_nodes:
        # Compare node ids by value; identity (`is`) is not reliable for equal ids.
        if s == e:
            continue
        for path in nx.all_simple_paths(G, s, e):
            H.add_nodes_from([(path[0], G.nodes[path[0]])])
            H.add_nodes_from([(path[-1], G.nodes[path[-1]])])
            # Build edge attributes and than add it (after the loop)
            added_node = None
            edge1_label = ''
            edge2_label = ''
            for i, (a, b) in enumerate(zip(path[:-1], path[1:])):
                if not i:
                    # Label of the first hop leaving the start card
                    edge1_label += G.edges[a,b].get('label','')
                if G.nodes[a].get('token_head_dep','').strip('"') =='root': #build root node
                    added_node = copy.deepcopy(a)
                    # Work on a copy of the attributes: updating G.nodes[a] in
                    # place would store the path list on G itself.
                    added_node_attr = dict(G.nodes[a], full_original_path=path)
                    H.add_nodes_from([(added_node, added_node_attr)])
                    edge2_label += G.nodes[b].get('token_head_dep','none').strip('"')

            # A path without a root token has nothing to link through; adding
            # an edge to a None node would raise inside networkx.
            if added_node is None:
                continue
            H.add_edge(path[0], added_node, label=edge1_label)
            H.add_edge(added_node, path[-1], label=edge2_label)
H_temp_name = 'H.png'
display(draw_graph(H, H_temp_name))

### Get paths between cards and join them, contracting same card nodes

In [None]:
# Card to card paths (degree here may be interesting)
# H accumulates, as a disjoint union, every simple path found between two
# different card nodes.
import copy
H = nx.DiGraph()
card_nodes = [x for x,y in G.nodes(data=True) if y['type']=='"card"']
# Entities do not have an attribute card_id, so they are collected by type.
# G is not modified in the loops below, so this list is computed only once.
entity_nodes = [x for x,y in G.nodes(data=True) if y.get('type', None) == '"entity"']
temp = []
for i, s in enumerate(card_nodes):
    if not i%10: print(i)
    # All simple paths becomes huge as it find path through many cards
    # Lets try to create a different graph with only the nodes and edges with start card id, and the last node
    # (G.nodes replaces the G.node alias, which newer networkx releases removed.)
    start_card_id = G.nodes[s]['card_id']
    start_card_nodes = [x for x,y in G.nodes(data=True) if y.get('card_id', None) == start_card_id]
    for e in card_nodes:
        # Compare node ids by value; identity (`is`) is not reliable for equal ids.
        if s == e:
            continue

        interesting_subgraph = G.subgraph(start_card_nodes+entity_nodes+[e])
        #temp_name = str('./paths_between_pairs/interesting_subgraph_{0}.png'.format(i))
        #draw_graph(interesting_subgraph, temp_name)
        for path in nx.all_simple_paths(interesting_subgraph, s, e):
            # The rename prefixes keep the nodes of each new path distinct
            # from the nodes already in H.
            H = nx.union(H, G.subgraph(path), rename=('H-', 'path-'))

# Contract all card nodes, so all edges begin and end at a card
# Comment this chunk and you will get all disjoint paths between cards
card_names = set([y['card_name'] for x,y in H.nodes(data=True) if y['type'].strip('"')=='card'])
# Index the nodes of H by their card_name once ('' when the attribute is absent)
nodes_by_card_name = defaultdict(list)
for node_id, node_data in H.nodes(data=True):
    nodes_by_card_name[node_data.get('card_name','')].append(node_id)
groups_of_same_nodes = []
print("Start grouping")
for i, card_name in enumerate(card_names):
    if not i%10: print('{0}/{1}'.format(i, len(card_names)))
    temp = nodes_by_card_name[card_name]
    # Only names carried by several nodes need contracting
    if len(temp)>1:
        groups_of_same_nodes.append(temp)
print("Start contraction")
for i, group in enumerate(groups_of_same_nodes):
    if not i%10: print('{0}/{1}'.format(i, len(groups_of_same_nodes)))
    kept_node = group[0]
    for node in group[1:]:
        H = nx.contracted_nodes(H, kept_node, node)

In [None]:
# THIS TAKES FOREVER
# Print to file
print("Start writing image")
# Plain file name: the former 'H.png'.format(i) had no placeholder, so it
# produced the same string while depending on a leftover loop variable.
H_temp_name = 'H.png'
display(draw_graph(H, H_temp_name))

### Export

In [None]:
# Export to cytoscape format
print('Export graphml G')
nx.write_graphml(G, 'G_test.graphml')
print('Export graphml H')
# Remove attributes of type dict
for _, node_data in H.nodes(data=True):
    if node_data.get("contraction", None):
        node_data.pop("contraction")
nx.write_graphml(H, 'H_test.graphml')

In [None]:
# To nodes and edges table in postgresql
# The nodes table gets an extra 'id' column mirroring node_id; both tables are
# replaced if they already exist.
nodes_df['id'] = nodes_df['node_id']
for table_name, frame in (('nodes', nodes_df), ('edges', edges_df)):
    frame.to_sql(table_name, engine, index=False, if_exists='replace')

# Compute metrics

## Count edges and their type related to nodes which are entity instances

In [None]:
# Rows of nodes_df whose type is 'entity'; the first one is kept as a test id.
is_entity = nodes_df['type']=='entity'
entity_nodes = nodes_df[is_entity]
first_entity = entity_nodes.iloc[0]
test_ent_id = first_entity['node_id']
print(test_ent_id)
entity_nodes

In [None]:
# For every entity node, collect the data of all edges entering and leaving
# each of its neighbours, tagged with the direction ('in'/'out') and with the
# entity's label. `res` ends up as one long frame with a constant 'cont'
# column that can be summed.
res = []
# Label lookup built once; G is not modified below.
node_labels = nx.get_node_attributes(G, 'label')
for ent_node in entity_nodes['node_id']:
    # Fresh lists for every entity: when they were shared across iterations,
    # each entity's frame also contained the edges of all earlier entities,
    # which inflated the counts.
    in_edges = []
    out_edges = []
    for node in G[ent_node]: # node is a neighbour of the entity
        for in_ed in G.in_edges([node], data=True):
            s, t, d = in_ed #source, target, data
            in_edges.append(d)
        for out_ed in G.out_edges([node], data=True):
            s, t, d = out_ed #source, target, data
            out_edges.append(d)

    in_ = pd.DataFrame(in_edges)
    in_['edge_type'] = 'in'
    in_['ent_node'] = node_labels[ent_node]
    out_ = pd.DataFrame(out_edges)
    out_['edge_type'] = 'out'
    out_['ent_node'] = node_labels[ent_node]
    res.append(pd.concat([in_, out_]).copy())

res = pd.concat(res)
res['cont'] = 1
res

In [None]:
# Sum the 'cont' marker per (entity, edge label, pop_type), one column per edge direction.
# The aggregation is named by string because the pd.np alias is deprecated.
res.pivot_table(values=['cont'], index=['ent_node', 'label', 'pop_type'], columns=['edge_type'], aggfunc='sum')

# Investigating

In [None]:
# Pick the sentence at position `sent` among those containing the literal
# text "{W}" and parse it.
sent = 120#12352#12350#1205
# Raw string: '\{' is not a valid escape sequence in a normal string literal.
test_phrase = cards_df_sentences[cards_df_sentences['sentences'].str.contains(r'\{W}')]['sentences'].iloc[sent]

#test_phrase = 'Tap something: get more'
doc = nlp(test_phrase)

In [None]:
# Print every sentence of doc, each followed by a 'Change' line so the
# sentence boundaries are visible in the output.
for s in doc.sents:
    print(s)
    print('Change')

In [None]:
# Show doc with displaCy's 'ent' (entity) style inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Show doc with displaCy's 'dep' (dependency) style inline in the notebook.
displacy.render(doc, style='dep', jupyter=True)

In [None]:
# Print each token of doc with its tag_ and pos_ attributes.
for t in doc:
    print(t, t.tag_, t.pos_)

In [None]:
# One record per token yielded by each sentence's subtree, collected into a DataFrame.
sents = []
for sent in doc.sents:
    print(sent)
    for tr in sent.subtree:
        sents.append({
            'word': tr,
            'ancestors': list(tr.ancestors),
            'children': list(tr.children),
            'cluster': tr.cluster,
            'conjuncts': list(tr.conjuncts),
            'dep': tr.dep_,
            'ent_type': tr.ent_type_,
            'head': tr.head,
            'lemma': tr.lemma_,
            'tag': tr.tag_,
        })
df = pd.DataFrame(sents)
df

## Detect verbs in each sentence of a card (mainly non-abilities ones)

In [None]:
def get_main_nlp_feats(row):
    """Build a per-token feature table for one card row.

    Reads the parsed document from ``row['doc']`` and emits one record for
    every token yielded by each sentence's ``subtree``: the sentence, the
    token and its text, its ancestors/children/conjuncts (as lists), cluster,
    dependency label, entity type, head, lemma, pos and tag.

    Returns a DataFrame of those records with an extra ``card_id`` column set
    to ``row['id']``.
    """
    def describe(sentence, token):
        # Record for a single token of the given sentence
        return {
            'sent': sentence,
            'text': token.text,
            'word': token,
            'ancestors': list(token.ancestors),
            'children': list(token.children),
            'cluster': token.cluster,
            'conjuncts': list(token.conjuncts),
            'dep': token.dep_,
            'ent_type': token.ent_type_,
            'head': token.head,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
        }

    parsed = row['doc']
    records = [describe(sentence, token)
               for sentence in parsed.sents
               for token in sentence.subtree]
    feats = pd.DataFrame(records)
    feats['card_id'] = row['id']
    return feats

In [None]:
def get_doc(text_str):
    """Call the notebook-level ``nlp`` object on ``text_str`` and return its result.

    NOTE(review): ``nlp`` is not defined in this section; presumably a loaded
    spaCy pipeline, so the result would be a spaCy Doc. Confirm.
    """
    return nlp(text_str)

In [None]:
# Parse a random sample of cards and extract per-token features for each one.
# The sample size is capped at the number of available rows: asking
# DataFrame.sample for more rows than exist (without replacement) raises.
n_sample = min(10000, len(cards_df))
cards_df_sample = cards_df.sample(n_sample).copy()
print('creating docs')
cards_df_sample['doc'] = cards_df_sample['text_preworked'].apply(get_doc)
print('getting docs feats')
# Each cell of 'nlp_feats' holds the DataFrame returned by get_main_nlp_feats
cards_df_sample['nlp_feats'] = cards_df_sample.apply(get_main_nlp_feats, axis=1)

In [None]:
# Concatenate the per-card feature frames stored in 'nlp_feats' into one
# frame with a fresh index (sent_feats)
sent_feats = pd.concat(cards_df_sample['nlp_feats'].values,sort=True, ignore_index=True)

In [None]:
# Counting and showing ROOT verbs
# Distinct lemmas of tokens that are a sentence ROOT and tagged as VERB, sorted in place
is_root_verb = (sent_feats['dep']=="ROOT") & (sent_feats['pos']=='VERB')
count_verbs = sent_feats.loc[is_root_verb, 'lemma'].unique()
count_verbs.sort()
print(count_verbs.shape, count_verbs)

In [None]:
# Counting and showing ROOT nouns
# Distinct lemmas of tokens that are a sentence ROOT and tagged as NOUN, sorted in place
is_root_noun = (sent_feats['dep']=="ROOT") & (sent_feats['pos']=='NOUN')
count_nouns = sent_feats.loc[is_root_noun, 'lemma'].unique()
count_nouns.sort()
print(count_nouns.shape, count_nouns)

In [None]:
# Look up spaCy's description of the "CD" label.
spacy.explain("CD")

In [None]:
# Pick one occurrence of the token 'deals' (position 120 among the matches),
# print it with its sentence, then summarise its direct children.
is_deals = sent_feats['word'].map(lambda tok: tok.text=='deals')
t = sent_feats[is_deals]['word'].iloc[120]
print(t)
# NOTE(review): `t._` is spaCy's custom-extension namespace, so this relies on
# a 'sent' extension registered elsewhere; confirm.
print(t._.sent)
details = {
    c: {'pos':c.pos_, 'tag':c.tag_, 'lemma':c.lemma_, 'dep_':c.dep_}
    for c in t.children
}
print(details)
displacy.render(t.doc, style='dep', jupyter=True)

In [None]:
# Print every noun chunk of the document that token `t` belongs to
# (`t` is assigned in an earlier cell).
for nounc in t.doc.noun_chunks:
    print(nounc)

In [None]:
def get_children_and_attributes(token):
    """Describe the direct children of ``token``.

    Returns a dict keyed by each object in ``token.children`` whose value is
    ``{'pos', 'tag', 'lemma', 'dep_'}`` read from the child's ``pos_``,
    ``tag_``, ``lemma_`` and ``dep_`` attributes (the same summary the
    'deals' inspection cell builds by hand). A token without children yields
    an empty dict.

    The previous version was an unfinished stub that built an empty dict and
    implicitly returned None.
    """
    return {
        child: {'pos': child.pos_, 'tag': child.tag_,
                'lemma': child.lemma_, 'dep_': child.dep_}
        for child in token.children
    }
# Unique lemmas of ROOT tokens tagged VERB (same expression as in the
# "Counting and showing ROOT verbs" cell; here the result is left unsorted).
count_verbs = sent_feats[(sent_feats['dep']=="ROOT")&(sent_feats['pos']=='VERB')]['lemma'].unique()

In [None]:
# Show roots
# ROOT tokens with their lemma, children and sentence; children are turned
# into a tuple so that rows can be de-duplicated on (lemma, children).
is_root = sent_feats['dep']=="ROOT"
temp = sent_feats.loc[is_root, ['lemma', 'children', 'sent']].copy()
temp['children'] = temp['children'].map(lambda kids: tuple(set(kids)))
#temp['lemma'] = temp['lemma'].apply(lambda x: x.text)
temp.drop_duplicates(subset=['lemma', 'children'])

## Try to match types and set as entity
https://stackoverflow.com/questions/49097804/spacy-entity-from-phrasematcher-only

In [None]:
# Test sentences: the phrase picked earlier, two hand-written Sliver
# sentences, and the first five card texts that mention 'colorless'.
test_sents = [
    test_phrase,
    'If a Sliver deals combat damage to a player, its controller may create a +1/+1 colorless Sliver creature token.',
    'Whenever a Sliver deals combat damage to a player, its controller may create a +1/+1 colorless Sliver creature token.',
]
mentions_colorless = cards_df['text'].str.contains('colorless').fillna(False)
colorless = '\n'.join(cards_df[mentions_colorless]['text'].iloc[:5])
test_sents.append(colorless)

In [None]:
# Parse all test sentences as one newline-separated text and show it with
# displaCy's 'ent' style.
doc = nlp('\n'.join(test_sents))
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Show doc with displaCy's 'dep' style, passing explicit rendering options
# (compact and collapse_punct both switched off).
options = {'compact': False,
          'collapse_punct': False}
displacy.render(doc, style='dep', jupyter=True, options=options)

In [None]:
# One record per token yielded by each sentence's subtree, collected into a
# DataFrame (kept in df, not displayed).
sents = []
for sent in doc.sents:
    print(sent)
    for tr in sent.subtree:
        sents.append({
            'word': tr,
            'ancestors': list(tr.ancestors),
            'children': list(tr.children),
            'cluster': tr.cluster,
            'conjuncts': list(tr.conjuncts),
            'dep': tr.dep_,
            'ent_type': tr.ent_type_,
            'head': tr.head,
            'lemma': tr.lemma_,
            'tag': tr.tag_,
        })
df = pd.DataFrame(sents)
#df

In [None]:
test_sents = ['Two target creatures get +1/+1 each until end of turn']
doc = nlp('\n'.join(test_sents))

# One row per token: text, pos, dependency label, entity type and head text
df = pd.DataFrame.from_dict([
    {
        'token.text': token.text,
        'token.pos_': token.pos_,
        'token.dep_': token.dep_,
        'token.ent': token.ent_type_,
        'token.head.text': token.head.text,
    }
    for token in doc
])
df

In [None]:
test_sents = ['Target creature gets +1/+1 until end of turn', '+1/+1 gets target creature until end of turn']
doc = nlp('\n'.join(test_sents))

# One row per noun chunk: its text plus the text, tag, dependency label and
# head text of the chunk's root token
df = pd.DataFrame.from_dict([
    {
        'chunk.text': chunk.text,
        'chunk.root.text': chunk.root.text,
        'chunk.root.tag': chunk.root.tag_,
        'chunk.root.dep_': chunk.root.dep_,
        'chunk.root.head.text': chunk.root.head.text,
    }
    for chunk in doc.noun_chunks
])
df

In [None]:
test_sents = ['Target creature gets +1/+1 until end of turn', '+1/+1 gets target creature until end of turn']
doc = nlp('\n'.join(test_sents))

# One row per token whose pos_ is NOUN: text, tag, dependency label, head text
df = pd.DataFrame.from_dict([
    {
        'chunk.text': chunk.text,
        'chunk.tag': chunk.tag_,
        'chunk.dep_': chunk.dep_,
        'chunk.head.text': chunk.head.text,
    }
    for chunk in doc
    if chunk.pos_=='NOUN'
])
df

In [None]:
test_sents = ['Target creature gets +1/+1 until end of turn', 'You may put target creature from graveyard to play']
doc = nlp('\n'.join(test_sents))

parts = defaultdict(list)
sents = []

# For every sentence, bucket its noun chunks by the dependency label of the
# chunk root (subject / direct object / prepositional object) and record,
# under '<label>_root', the text of the token that root attaches to.
tracked_deps = ('nsubj', 'dobj', 'pobj')
for sent in doc.sents:
    for chunk in sent.noun_chunks:
        dep_label = chunk.root.dep_
        if dep_label in tracked_deps:
            parts[dep_label].append(chunk.text)
            parts[dep_label+'_root'].append(chunk.root.head.text)

    sents.append(parts)
    parts = defaultdict(list)

sents
#df = pd.DataFrame.from_dict(parts)
#df

In [None]:
test_sents = ['Target creature gets +1/+1 until end of turn', 'You may put target creatures from graveyard into play']
doc = nlp('\n'.join(test_sents))

parts = defaultdict(list)
sents = []
# token -> key under which that token was stored in `parts`
extra_keys = {}

def get_all_children(root, parts):
    '''Recursively store every descendant of root in parts.

    Each child is stored under its dependency label; underscores are appended
    while the key is already taken, so repeated labels do not overwrite each
    other. For every stored key, 'key_of_head_of_<key>' records the key of the
    child's head. The module-level extra_keys mapping (token -> key) is
    updated as a side effect and must already contain root.

    Returns the same parts mapping that was passed in.
    '''
    # A single pass over the direct children; deeper levels are reached by
    # the recursive call.
    for child in root.children:
        key = child.dep_
        while key in parts.keys():
            key += '_'
        parts[key] = child
        extra_keys[child] = key
        parts['key_of_head_of_'+key] = extra_keys[child.head]
        get_all_children(child, parts)
    return parts


for sent in doc.sents:
    # Look for the ROOT among this sentence's own tokens. Scanning the whole
    # doc for every sentence appended each ROOT tree once per sentence.
    for root in sent:
        if root.dep_ == 'ROOT':
            parts['root'] = root
            extra_keys[root] = 'root'
            parts = get_all_children(root, parts)
            sents.append(parts)
            parts = defaultdict(list)

sents
#df = pd.DataFrame.from_dict(parts)
#df

In [None]:
# Print each token of doc with its pos_ attribute.
for token in doc:
    print(token, token.pos_)

## Networkx and pygraphviz

In [None]:
# Styling demo: convert a Petersen graph to pydot, give every node and edge a
# random look, write the result to a PNG and display it.
# NOTE(review): this rebinds the notebook-level name G.
import random
import networkx as nx
G = nx.petersen_graph()
pdot = nx.drawing.nx_pydot.to_pydot(G)

# 'hexagon' replaces the misspelt 'exagon', which is not a Graphviz shape name.
shapes = ['box', 'polygon', 'ellipse', 'oval', 'circle', 'egg', 'triangle', 'hexagon', 'star', ]
colors = ['blue', 'black', 'red', '#db8625', 'green', 'gray', 'cyan', '#ed125b']
styles = ['filled', 'rounded', 'rounded, filled', 'dashed', 'dotted, bold']

for i, node in enumerate(pdot.get_nodes()):
    node.set_label("n%d" % i)
    node.set_shape(random.choice(shapes))
    node.set_fontcolor(random.choice(colors))
    node.set_fillcolor(random.choice(colors))
    node.set_style(random.choice(styles))
    node.set_color(random.choice(colors))

for i, edge in enumerate(pdot.get_edges()):
    edge.set_label("e%d" % i)
    edge.set_fontcolor(random.choice(colors))
    edge.set_style(random.choice(styles))
    edge.set_color(random.choice(colors))

png_path = "test.png"
pdot.write_png(png_path)

from IPython.display import Image
Image(png_path)

# Should we train a model for POSTAGGING?

Not sure. Many verbs interpreted sometimes as nouns are also sometimes interpreted as verbs.

In [None]:
# Parse the preworked text of up to 200 randomly chosen cards as one document.
# The sample size is capped at the number of available rows: asking
# DataFrame.sample for more rows than exist (without replacement) raises.
n_sample = min(200, len(cards_df))
sents = '\n'.join([x for x in cards_df.sample(n_sample)['text_preworked']])
doc = nlp(sents)

In [None]:
# Sorted, de-duplicated lower-cased forms of every token tagged NOUN
nouns = sorted({token.lower_ for token in doc if token.pos_ == 'NOUN'})
nouns
# Nouns that should be verbs:
# 'attacks', 'block', 'blocks', 'cast', 'control','controls', 'deal','deals', 'dies', 'enchant', 'flip', 'gain', 'gains', 'pay', 'return', 'sacrifice', 'shares', 'tap', 'untap'

# Nouns that COULD be verbs:
# 'counter(S)','exile'

In [None]:
# Sorted, de-duplicated lower-cased forms of every token tagged VERB
verbs = sorted({token.lower_ for token in doc if token.pos_ == 'VERB'})
verbs

## Get predictions in a format easy to correct and feed back as training data

Check here https://spacy.io/usage/training#training-simple-style.

It should be easy to train a model, as long as we have a few things in place.

Build tables like:
card | sentence | token0 | token1 | ... | tokenN
card | sentence | tag0 | tag1 | ... | tagN
card | sentence | deps0 | deps1 | ... | depsN
card | sentence | head0 | head1 | ... | headN

In [None]:
# List the columns available in cards_df.
cards_df.columns

In [None]:
# Build four wide tables (tokens, tags, dependency labels, head indices) with
# one row per card sentence and one zero-padded column per token position.
from copy import deepcopy
tokens = []
tags = []
deps = []
head_ids = []
card_counter=0
# The sample size is capped at the number of available rows: asking
# DataFrame.sample for more rows than exist (without replacement) raises.
n_sample = min(200, len(cards_df))
for idx, card in cards_df.sample(n_sample).iterrows():
    card_counter+=1
    if not card_counter%40: print(card_counter)
    # Each line of the preworked text is parsed on its own
    for sentence in card['text_preworked'].split('\n'):
        doc = nlp(sentence)
        basics = {
                'card': card['name'],
                'sentence': sentence,
            }
        # Every table row starts from its own copy of the identifying columns
        toks, tag, dep, head = deepcopy(basics), deepcopy(basics), deepcopy(basics), deepcopy(basics)
        for i, tok in enumerate(doc):
            toks.update({'{0:04d}'.format(i): tok.text})
            tag.update({'{0:04d}'.format(i): tok.tag_})
            dep.update({'{0:04d}'.format(i): tok.dep_})
            head.update({'{0:04d}'.format(i): tok.head.i})
        tokens.append(toks)
        tags.append(tag)
        deps.append(dep)
        head_ids.append(head)

df_tokens = pd.DataFrame(tokens)
df_tags = pd.DataFrame(tags)
df_deps = pd.DataFrame(deps)
df_head_ids = pd.DataFrame(head_ids)

display(df_tokens.head(2), df_tags.head(2), df_deps.head(2), df_head_ids.head(2))

# NLTK testing

In [None]:
# Request the 'all' collection from the NLTK downloader.
# NOTE(review): nltk is not imported in this section; confirm it is imported earlier.
nltk.download('all')

In [None]:
# https://www.nltk.org/book/ch10.html section 5.2
# Create a DiscourseTester over two sentences and show its readings.
dt = nltk.DiscourseTester(['A student dances', 'Every student is a person'])
dt.readings()


In [None]:
# Add a sentence to the discourse with the consistency check switched on.
dt.add_sentence('No person dances', consistchk=True)

In [None]:
# Retract the sentence added in the previous cell, with verbose output.
dt.retract_sentence('No person dances', verbose=True)

In [None]:
# Add a sentence to the discourse with the informativity check switched on.
dt.add_sentence('A person dances', informchk=True)

In [None]:
from nltk.tag import RegexpTagger
# (pattern, tag) pairs for the small vocabulary of the two test sentences
tag_patterns = [
    ('^(chases|runs)$', 'VB'),
    ('^(a)$', 'ex_quant'),
    ('^(every)$', 'univ_quant'),
    ('^(dog|boy)$', 'NN'),
    ('^(He)$', 'PRP'),
]
tagger = RegexpTagger(tag_patterns)
# Reading command whose dependency parser is a MaltParser using that tagger
malt_parser = nltk.MaltParser(tagger=tagger)
rc = nltk.DrtGlueReadingCommand(depparser=malt_parser)
dt = nltk.DiscourseTester(['Every dog chases a boy', 'He runs'], rc)
dt.readings()

# Spacy learning

In [None]:
test_sentence = "Next week I'll   be in Madrid. Maybe."
doc = nlp(test_sentence)
# One tab-separated line per token
for token in doc:
    fields = (
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_,
    )
    print('\t'.join(str(field) for field in fields))

# Sentences found in the text
for sent in doc.sents:
    print(sent)

# (text, tag) pairs for all tokens
print([(token.text, token.tag_) for token in doc])

# Entities with their labels
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
# NOTE(review): displacy is already used by cells above this import; confirm
# it is also imported earlier so the notebook runs top to bottom.
from spacy import displacy
 
# Parse a sample sentence and show it with displaCy's 'ent' style.
doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Noun phrases
# Print each noun chunk with its label and the text of its root token.
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)

In [None]:
# Dependency parser
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
 
# One line per token: "token/tag <--dep-- head/head_tag"
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

In [None]:
# Show the same sentence with displaCy's 'dep' style, passing distance=90 as a rendering option.
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


In [None]:
# Parse test_sentence (assigned in an earlier cell) and show it with
# displaCy's 'dep' style, passing distance=90 as a rendering option.
doc = nlp(test_sentence)
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


In [None]:
# Load the 'en_core_web_lg' model into nlp and print the vector stored in its
# vocabulary for 'banana'.
# NOTE(review): this rebinds nlp, which cells above already use; confirm that
# an earlier cell loads a pipeline too.
nlp = spacy.load('en_core_web_lg')
print(nlp.vocab['banana'].vector)