# This notebook is the start of the "trial and error approach"

References:

- https://github.com/xurxodiz/cardwalker/tree/master/oracle
- https://laterna--magica.blogspot.com/2011/10/oracle-parser.html

In [None]:
import json
import nltk
import pandas as pd
import re
from collections import defaultdict

In [None]:
# Parse the card database; the `with` block guarantees the file handle is
# closed as soon as the JSON has been read.
with open('./AllSets.json', 'rb') as sets_file:
    sets = json.load(sets_file)

In [None]:
for k, v in sorted(sets.items()):
    print(k, v['name'])

In [None]:
cards_usaga = sets['USG']['cards']

In [None]:
cards_usaga

In [None]:
# Flatten the cards of every kept set into one list. Unglued ('UGL'),
# Unstable ('UST') and any set whose code is longer than three characters
# (promotional things) are ignored.
cards_all = [
    card
    for set_code, set_data in sets.items()
    if set_code not in ('UGL', 'UST') and len(set_code) <= 3
    for card in set_data['cards']
]

# Let's start by trying to extract static abilities from cards

In [None]:
cards_df = pd.DataFrame.from_dict(cards_usaga)

In [None]:
texts = [card['text'].replace(card['name'], 'SELF') for card in cards_usaga if 'text' in card.keys()]

In [None]:
# Candidate (regex, tag) pairs for spotting keyword abilities: one or two words
# at the very start of the text, or one or two words following ", ".
# NOTE(review): inside a character class `$` and `|` are literal characters, so
# `[$|\n|,]` matches one of `$`, `|`, newline or comma -- it does not anchor at
# end-of-string. The top-level `| \(` alternative also matches " (" on its own,
# in which case the capture group is None.
patterns = [
    (r'^([A-Za-z]+ ?[A-Za-z]+)[$|\n|,]| \(', 'STATICABILITY'),
    (r', ([A-Za-z]+ ?[A-Za-z]+)[$|\n||,]| \(', 'STATICABILITY'),
]

In [None]:
#regexp_tagger = nltk.RegexpTagger(patterns)

In [None]:
res = defaultdict(list)
r=None
for text in texts:
#     if r: break
    for pat, tag in patterns:
        r = re.search(pat, text)
        if r:
            res[text].append((r.groups(), tag))
#             break
pretty = pd.DataFrame.from_dict(res, orient='index')
pretty

# DF version: Let's start by trying to extract static abilities from cards

In [None]:
#cards_df = pd.DataFrame.from_dict(cards_usaga)
cards_df = pd.DataFrame.from_dict(cards_all)

In [None]:
#cards_df.head(4).transpose()

## Questions

### Do parentheses contain useful info or only explanations of abilities/effects?

Seems like it's always an explanation (so, no useful info to discern possible targets, zones affected, etc.)

In [None]:
pattern_parenthesis = r'\((.*?)\)'
test = """('Flying',), ('Trample',), ('Paper',), ('First strike',),
       ('Phasing',), ('Haste',), ('Flash',), ('Island',), ('Defender',),
       ('Blue',), ('Reach',), ('Devour X',), ('Vigilance',),
       ('Double strike',), ('Indestructible',), ('Artifacts',),
       ('Deathtouch',), ('Lifelink',), ('Menace',), ('Werewolf',),
       ('Leviathans',), ('While voting',), ('Flying', 'Demon'),
       ('Islandwalk',), ('Hexproof',), ('Plains',), ('Instant',),
       ('Swamp',), ('Mountain',), ('Forest',), ('Dinosaur',),
       ('Dinosaur Knight',), ('Leviathan',), ('Simultaneously',),
       ('Rat',), ('During combat',), ('Investigate',),
       ('Minotaur Pirate',), ('Each noncreature',), ('Vampire',),
       ('Pyrogenius',), ('Swampwalk',), ('Bolster X',), ('Timebender',),
       ('Bold Pyromancer',), ('Scry X',), ('Desertwalk',), ('Prowess',),
       ('Martial Paragon',), ('Death Wielder',), ('Equipment',),
       ('Valiant Protector',)"""
a = re.findall(pattern_parenthesis, test)
a

In [None]:
pattern_parenthesis = r'\((.*?)\)'
cards_df['in_parentheses'] = cards_df['text'].apply(lambda x: tuple(re.findall(pattern_parenthesis, str(x))))
set(cards_df['in_parentheses'])

In [None]:
# This example is not explaining an ability, but it is explaining something (an effect).
st = 'If two or more creatures are tied for greatest power, target any one of them.'
# `st` is a literal sentence, so match it verbatim (regex=False) instead of
# letting its '.' characters act as regex wildcards; na=False treats cards
# without text as non-matching.
cards_df[cards_df['text'].str.contains(st, regex=False, na=False)]['text'].values

### Remove anything between parentheses and replace name by SELF

In [None]:
# Replace each card's own name by SELF and remove anything between parentheses.
# A card without rules text has a non-string (missing) 'text' value; map it to
# an empty string, because str() of a missing value would otherwise inject the
# literal word 'nan' into the text that is parsed later on.
pattern_parenthesis = r' ?\(.*?\)'
cards_df['text_preworked'] = cards_df.apply(
    lambda x: x['text'].replace(x['name'], 'SELF') if isinstance(x['text'], str) else '',
    axis=1)
cards_df['text_preworked'] = cards_df['text_preworked'].apply(lambda x: re.sub(pattern_parenthesis, '', x))

In [None]:
# Sanity check: list any text that still contains an opening parenthesis.
# The raw string avoids the invalid '\(' escape sequence, and na=False makes
# missing values count as non-matching without a separate fillna step.
cards_df[cards_df['text_preworked'].str.contains(r'\(', na=False)]['text_preworked']

# Domain specific vocabulary

Let's build some domain-specific vocabulary for MTG. For example, let's list supertypes, types and subtypes, collect all card names, this kind of thing.

In [None]:
# Create set of cards names
cards_names = set(cards_df.name.unique())

In [None]:
# Collect every distinct supertype found in the 'supertypes' column.
# Rows are converted to tuples so that .unique() can hash and deduplicate them
# before the individual values are flattened into one set.
array_of_supertypes_tuples = cards_df['supertypes'].dropna().apply(tuple).unique()
cards_supertypes = {
    supertype
    for combination in array_of_supertypes_tuples
    for supertype in combination
}
cards_supertypes

In [None]:
# Collect every distinct card type found in the 'types' column.
# Rows are converted to tuples so that .unique() can hash and deduplicate them
# before the individual values are flattened into one set.
array_of_types_tuples = cards_df['types'].dropna().apply(tuple).unique()
cards_types = {
    card_type
    for combination in array_of_types_tuples
    for card_type in combination
}
cards_types

In [None]:
# Collect every distinct subtype found in the 'subtypes' column.
# Rows are converted to tuples so that .unique() can hash and deduplicate them
# before the individual values are flattened into one set.
array_of_subtypes_tuples = cards_df['subtypes'].dropna().apply(tuple).unique()
cards_subtypes = {
    subtype
    for combination in array_of_subtypes_tuples
    for subtype in combination
}
#cards_subtypes

In [None]:
#cards_df.head(10).transpose()

In [None]:
import requests

# Download the Comprehensive Rules text and pull out the headings of
# section 702 as the list of keyword abilities.
# The timeout keeps the kernel from hanging forever on an unresponsive server.
r = requests.get('http://media.wizards.com/2018/downloads/MagicCompRules%2020180713.txt', timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
comprules = r.text
kw_abilities_pat = r'702\.\d+\. ([A-Za-z ]+)'
abilities = re.findall(kw_abilities_pat, comprules)
abilities.pop(0) # The first match is just the rulings text, not an ability
abilities.sort()
#abilities

## How can we detect an abilities sentence?

In [None]:
df = cards_df
df['split_sentences'] = df['text_preworked'].apply(lambda x: x.split('\n'))
df['split_sentences']

Now, how to work with abilities followed by a cost?

In [None]:
def detect_abilities_sentence(sentlist):
    '''Return True if any sentence in `sentlist` is made only of keyword abilities.

    A sentence qualifies when every ', '-separated piece of it is an entry of
    the module-level `abilities` list (exact, case-sensitive comparison).
    Returns False for an empty `sentlist`.
    '''
    # Build the lookup set once per call rather than once per sentence.
    known_abilities = set(abilities)
    return any(set(sent.split(', ')).issubset(known_abilities) for sent in sentlist)
# Cards that have a keyword-only sentence AND mention cumulative upkeep.
# The two masks are combined into a single boolean index: chaining df[t][mask]
# applies a full-length mask to an already filtered frame, which makes pandas
# reindex the mask and emit a warning.
t = df['split_sentences'].apply(detect_abilities_sentence)
df[t & df['text'].str.contains('umulative upkeep', na=False)]['text_preworked']

In [None]:
df[df['text'].str.contains('umulative upkeep').fillna(False)]['text_preworked'].loc[30808]

### Deal with cumulative upkeep

Seems like, if followed by a mana cost, the cumulative upkeep COST may be followed by , (comma) or \n (newline). But if the text for cumulative upkeep is longer, it seems to end with \n every time.

In [None]:
# Check that these things are always the same
cumulative_upkeep_pattern1 = r'(?:, )?cumulative upkeep—.*?[.]'
cumulative_upkeep_pattern2 = r'(?:, )?cumulative upkeep—.*?[.\n]'
print(cumulative_upkeep_pattern1)
def get_cumup1(xstr):
    '''Return every match of `cumulative_upkeep_pattern1` in `xstr`, or NaN if none.

    The input is coerced with str() and matching is case-insensitive. NaN
    (instead of an empty list) marks "no match" so the resulting column works
    with fillna()/dropna().
    '''
    res = re.findall(cumulative_upkeep_pattern1, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') is the value the removed `pd.np.nan` alias used to provide.
    return float('nan')
def get_cumup2(xstr):
    '''Return every match of `cumulative_upkeep_pattern2` in `xstr`, or NaN if none.

    The input is coerced with str() and matching is case-insensitive. NaN
    (instead of an empty list) marks "no match" so the resulting column works
    with fillna()/dropna().
    '''
    res = re.findall(cumulative_upkeep_pattern2, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') is the value the removed `pd.np.nan` alias used to provide.
    return float('nan')
df['cumup1'] = df['text_preworked'].apply(get_cumup1).fillna(False)
df['cumup2'] = df['text_preworked'].apply(get_cumup2).fillna(False)
diff = df['cumup1']==df['cumup2']
df[~diff][['cumup1', 'cumup2', 'text_preworked']]
assert diff.all()

In [None]:
# Check that this never matches anything
cumulative_upkeep_pattern1 = r'(?:, )?cumulative upkeep—.*?,'
print(cumulative_upkeep_pattern1)
def get_cumup1(xstr):
    '''Return every match of `cumulative_upkeep_pattern1` in `xstr`, or NaN if none.

    The input is coerced with str() and matching is case-insensitive. NaN marks
    "no match" so that dropna() on the resulting column keeps only the hits.
    '''
    res = re.findall(cumulative_upkeep_pattern1, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') is the value the removed `pd.np.nan` alias used to provide.
    return float('nan')
df['cumup1'] = df['text_preworked'].apply(get_cumup1)
df['cumup1'].dropna()
assert df['cumup1'].dropna().empty

In [None]:
# Check that this never matches anything
cumulative_upkeep_pattern1 = r', cumulative upkeep—'
print(cumulative_upkeep_pattern1)
def get_cumup1(xstr):
    '''Return every match of `cumulative_upkeep_pattern1` in `xstr`, or NaN if none.

    The input is coerced with str() and matching is case-insensitive. NaN marks
    "no match" so that dropna() on the resulting column keeps only the hits.
    '''
    res = re.findall(cumulative_upkeep_pattern1, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') is the value the removed `pd.np.nan` alias used to provide.
    return float('nan')
df['cumup1'] = df['text_preworked'].apply(get_cumup1)
df['cumup1'].dropna()
assert df['cumup1'].dropna().empty

In [None]:
#re.search('test', 'TeSt', re.IGNORECASE)
#re.match('test', 'TeSt', re.IGNORECASE)
#re.sub('test', 'xxxx', 'Testing', flags=re.IGNORECASE)
# Non capturing group https://stackoverflow.com/questions/2703029/why-regular-expressions-non-capturing-group-is-not-working

#cumulative_upkeep_pattern = r' ?cumulative upkeep[ |—].*?[.|,|\n]'
type1_cost = r' (\{[A-Z0-9]+\})+'
type2_cost = r'—.*?[.|\n]'
cumulative_upkeep_pattern = r'(?:, )?(cumulative upkeep)({0}|{1})'.format(type1_cost, type2_cost)
print(cumulative_upkeep_pattern)
def get_cumup(xstr):
    '''Return every match of `cumulative_upkeep_pattern` in `xstr`, or NaN if none.

    The input is coerced with str() and matching is case-insensitive. Because
    the pattern contains capture groups, re.findall yields one tuple of groups
    per match. NaN marks "no match" so dropna() keeps only the hits.
    '''
    res = re.findall(cumulative_upkeep_pattern, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') is the value the removed `pd.np.nan` alias used to provide.
    return float('nan')
df['cumup'] = df['text_preworked'].apply(get_cumup)
posit = 28118
display(df[['cumup', 'text_preworked']].dropna())#.loc[posit]['text_preworked'])
#display(df[['cumup', 'text_preworked']].dropna()#.loc[posit]['cumup'])

In [None]:
# check what is not contained (GREAT: the only card should not be considered anyway)
cumup_all = df[df['text_preworked'].str.contains('umulative up')]
cumup_detected = df[['cumup', 'text_preworked']].dropna()
cumup_all[~cumup_all.index.isin(cumup_detected.index)]['text_preworked'].iloc[0]

### Extend procedure to other abilities

Check what 'Enchant' ability can enchant

In [None]:
# Get everything that can follow Enchant
def get_whats_enchanted(xstr):
    '''Return a tuple with every "Enchant ..." clause found in `xstr`, or NaN if none.

    A clause runs from "Enchant " up to and including the first period or
    newline, or to the end of the text when neither follows. Matching is
    case-sensitive.
    '''
    # `$` must sit outside the character class: inside `[...]` it is a literal
    # dollar sign, so a clause ending the text would never be matched.
    res = re.findall(r'Enchant .*?(?:[.\n]|$)', str(xstr))#, re.IGNORECASE)
    if res:
        return tuple(res)
    # float('nan') is the value the removed `pd.np.nan` alias used to provide.
    return float('nan')
df['enchant_something'] = df['text_preworked'].apply(get_whats_enchanted)
df['enchant_something'].dropna().drop_duplicates()
enchant_abilities = set([x[0].strip('\n') for x in df['enchant_something'].dropna().drop_duplicates()])
#enchant_abilities

Regex below can detect any abilities with costs.

In [None]:
abilities

In [None]:
type1_cost = r' (\{[A-Z0-9]+\})+'
type2_cost = r'—.*?[.|\n]'
type3_cost = r' \d+[,|\n]'
abilities_lower = '|'.join(abilities).lower()
cumulative_upkeep_pattern = r'(?:, )?({abi})({cost1}|{cost2}|{cost3})'.format(
    cost1=type1_cost, cost2=type2_cost, cost3=type3_cost, abi=abilities_lower)
print(cumulative_upkeep_pattern)
def get_cumup(xstr):
    '''Return every match of `cumulative_upkeep_pattern` in `xstr`, or NaN if none.

    In this cell the pattern is built from all keyword abilities followed by a
    cost. The input is coerced with str() and matching is case-insensitive;
    re.findall yields one tuple of captured groups per match. NaN marks
    "no match" so dropna() keeps only the hits.
    '''
    res = re.findall(cumulative_upkeep_pattern, str(xstr), re.IGNORECASE)
    if res:
        return res
    # float('nan') is the value the removed `pd.np.nan` alias used to provide.
    return float('nan')
df['cost_abilities'] = df['text_preworked'].apply(get_cumup)
posit = 227
display(df[['cost_abilities', 'text_preworked']].dropna())#.loc[posit]['text_preworked'])
#display(df[['cumup', 'text_preworked']].dropna()#.loc[posit]['cumup'])

In [None]:
# Detect other things following abilities
abilities_follower = r' .*?[.|\n]'
abilities_lower = '|'.join(abilities)
ability_w_follower = r'({abi})({fol})'.format(fol=abilities_follower, abi=abilities_lower)
print(ability_w_follower)
def get_cumup(xstr):
    '''Return a tuple with every match of `ability_w_follower` in `xstr`, or NaN if none.

    The input is coerced with str(); matching is case-sensitive. Each match is
    a tuple of the pattern's captured groups. NaN marks "no match" so dropna()
    keeps only the hits.
    '''
    res = re.findall(ability_w_follower, str(xstr))
    if res:
        return tuple(res)
    # float('nan') is the value the removed `pd.np.nan` alias used to provide.
    return float('nan')
df['ability_w_follower'] = df['text_preworked'].apply(get_cumup)
posit = 1100
display(df[['ability_w_follower', 'text_preworked']].dropna())#.loc[posit]['text_preworked'])

detected_cost_abi = df['cost_abilities'].dropna()
df[~df.index.isin(detected_cost_abi.index)]['ability_w_follower'].dropna().drop_duplicates()

## What can be in place of X in +X/+X (or actually +|-X/+|-X)

Besides numbers, only X or Y will appear.

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-][^\d]/[+-][^\d])'):
    '''Find every occurrence of `pat` in `text_str`.

    The default pattern targets signed "a/b" modifiers whose two values are
    single non-digit characters. Returns the list built by re.findall, which
    is empty when nothing matches.
    '''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
res = set(chain(*(t.values)))
res

Which numbers may it contain?

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-]\d+/)|(/[+-]\d+)'):
    '''Find every occurrence of `pat` in `text_str`.

    The default pattern has two alternative groups (a signed number before the
    slash, or a signed number after it), so re.findall yields 2-tuples in which
    the group that did not take part is an empty string. Returns an empty list
    when nothing matches.
    '''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
res = set(chain(*(t.values)))
res

In [None]:
from itertools import chain
# Left-hand (before the slash) and right-hand (after the slash) modifiers.
# The value may be digits or the variables X / Y in either case; the classes
# list 'y' explicitly so a lowercase y is matched as well.
pincre_pat=r'([+-][\dXxYy]+/)'
rincre_pat=r'(/[+-][\dXxYy]+)'
def get_increases(text_str, pat=r'([+-]\d+/)|(/[+-]\d+)'):
    '''Find every occurrence of `pat` in `text_str`.

    Returns the list built by re.findall: plain strings for a single-group
    pattern, tuples for a multi-group one (such as the default), and an empty
    list when nothing matches.
    '''
    return re.findall(pat, text_str)
pincre = cards_df['text_preworked'].apply(get_increases, args=(pincre_pat,))
pincre_res = set(chain(*(pincre.values)))
rincre = cards_df['text_preworked'].apply(get_increases, args=(rincre_pat,))
rincre_res = set(chain(*(rincre.values)))
print(pincre_res, rincre_res)

There is no +\*/+\*

In [None]:
# Look for starred modifiers ("-*" and "+*") in the cleaned text.
# Raw strings avoid invalid escape sequences in the patterns. display() is
# needed for the first lookup because a notebook cell only renders its last
# bare expression.
display(cards_df[cards_df['text_preworked'].str.contains(r'\-\*')])
cards_df[cards_df['text_preworked'].str.contains(r'\+\*')]

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-][*])'):
    '''Find every occurrence of `pat` in `text_str`.

    The default pattern matches a sign immediately followed by an asterisk.
    Returns the list built by re.findall, which is empty when nothing matches.
    '''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
res = set(chain(*(t.values)))
res

## Detecting special symbols

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-][\dXxYy]{1,4}/[+-][\dXxYy]{1,4})'):
    '''Find every occurrence of `pat` in `text_str`.

    The default pattern matches a complete signed modifier such as +1/+1,
    -X/-0 or +y/+y: each side is a sign followed by one to four characters
    drawn from digits, X/x and Y/y. Returns the list built by re.findall,
    which is empty when nothing matches.
    '''
    # Inside a character class `\d+` means "a digit or a literal plus sign", so
    # the class is written without the `+`; 'y' is listed so that a lowercase
    # y is accepted like a lowercase x.
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
pr_increase_symbols = set(chain(*(t.values)))
#pr_increase_symbols

In [None]:
#https://stackoverflow.com/questions/51766157/how-to-force-a-pos-tag-in-spacy-before-after-tagger/51776803#51776803
import spacy
from spacy.symbols import ORTH, POS, NOUN, VERB

# Load the pipeline in this cell: every other spacy.load() call of the notebook
# sits in a later cell, so on a fresh kernel `nlp` would be undefined here.
nlp = spacy.load('en_core_web_sm')

# Keep the mana symbol, the tap symbol and every +X/+Y modifier as single
# tokens, each with a forced part-of-speech tag.
nlp.tokenizer.add_special_case('{G}', [{ORTH: '{G}', POS: NOUN}])
nlp.tokenizer.add_special_case('{T}', [{ORTH: '{T}', POS: VERB}])
for symb in pr_increase_symbols:
    nlp.tokenizer.add_special_case(symb, [{ORTH: symb, POS: NOUN}])

doc = nlp('{T}: This {G} is a noun. Target creature gets +1/+1')

# Show the resulting tokenisation together with the assigned POS tags.
for token in doc:
    print('{:10}{:10}'.format(token.text, token.pos_))

# Spacy

In [None]:
#test_sentence = cards_df[cards_df['static_abilities']==('Phasing',)].text.values[0]
test_sentence ='\nWhenever SELF attacks, it gets +1/+1.' #test_sentence +'\nWhenever SELF attacks, it gets +1/+1.'
test_sentence

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp('Hello World!')
for token in doc:
    print('"' + token.text + '"')

In [None]:
doc = nlp('Hello     World!')
for token in doc:
    print('"' + token.text + '"', token.idx)

In [None]:
doc = nlp("Next week I'll   be in Madrid.")
for token in doc:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))

In [None]:
doc = nlp(test_sentence)
for token in doc:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))

In [None]:
# Sentence detection
doc = nlp("These are apples. These are oranges.")
 
for sent in doc.sents:
    print(sent)

In [None]:
# Sentence detection
doc = nlp(test_sentence)
 
for sent in doc.sents:
    print(sent)

In [None]:
# POS tagging
doc = nlp(test_sentence)
print([(token.text, token.tag_) for token in doc])

In [None]:
# NER named entity recognition
doc = nlp("Next week I'll be in Madrid.")
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
from nltk.chunk import conlltags2tree
 
doc = nlp("Next week I'll be in Madrid.")
iob_tagged = [
    (
        token.text, 
        token.tag_, 
        "{0}-{1}".format(token.ent_iob_, token.ent_type_) if token.ent_iob_ != 'O' else token.ent_iob_
    ) for token in doc
]
 
print(iob_tagged)
 
# In case you like the nltk.Tree format
print(conlltags2tree(iob_tagged))
 

In [None]:
from spacy import displacy
 
doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Noun phrases
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)

In [None]:
# Dependency parser
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
 
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

In [None]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


In [None]:
doc = nlp(test_sentence)
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


In [None]:
nlp = spacy.load('en_core_web_lg')
print(nlp.vocab['banana'].vector)

# Spacy again

In [None]:
import spacy
from spacy import displacy
import re
from spacy.symbols import ORTH, LEMMA, POS, TAG

In [None]:
from spacy.tokens import Token

def get_token_sent(token):
    '''Return the sentence that contains `token`.

    The sentence is read from the `.sent` attribute of the one-token slice of
    `token.doc` that starts at the token's own index.
    '''
    position = token.i
    return token.doc[position:position + 1].sent

Token.set_extension('sent', getter=get_token_sent)

In [None]:
#MODEL = r'C:\Users\cs294662\Downloads\programas\spacy\data\en_core_web_md-2.0.0\en_core_web_md\en_core_web_md-2.0.0'
#MODEL = r'C:\Users\cs294662\Downloads\programas\spacy\data\en_coref_lg-3.0.0\en_coref_lg\en_coref_lg-3.0.0'
MODEL = 'en_core_web_sm'
nlp = spacy.load(MODEL)

In [None]:
# https://stackoverflow.com/questions/44594759/spacy-adding-special-case-tokenization-rules-by-regular-expression-or-pattern
cost_pattern = r'{[\dWGBURTX]}'
#cost_pattern = re.compile(r'{[\dWGBURTX]}')

In [None]:
# add special case rule
#special_case = [{ORTH: cost_pattern, LEMMA: 'COST', POS: 'NOUN'}]
#nlp.tokenizer.add_special_case(cost_pattern, special_case)

In [None]:
test_phrase = cards_df.iloc[1201]['split_sentences'][1]
#test_phrase = 'Target creature has flying'
doc = nlp(test_phrase)

In [None]:
doc = nlp(test_phrase)
# Character (start, end) offsets of every match of `cost_pattern` and of every
# ':' found in the phrase.
indexes = [m.span() for m in re.finditer(cost_pattern, test_phrase, flags=re.IGNORECASE)] +\
          [m.span() for m in re.finditer(r':', test_phrase, flags=re.IGNORECASE)]
# Merge each of those character ranges into a single token of `doc`.
# NOTE(review): Doc.merge appears to be deprecated in newer spaCy releases in
# favour of Doc.retokenize() -- confirm against the installed spaCy version.
for start,end in indexes:
    doc.merge(start_idx=start,end_idx=end)

In [None]:
for s in doc.sents:
    print(s)
    print('Change')

In [None]:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
displacy.render(doc, style='dep', jupyter=True)

In [None]:
sents = []
for sent in doc.sents:
    print(sent)
    for tr in sent.subtree:
        sentd = {
            'word': tr,
            'ancestors': [x for x in tr.ancestors],
            'children': [x for x in tr.children],
            'cluster': tr.cluster,
            'conjuncts': [x for x in tr.conjuncts],
            'dep': tr.dep_,
            'ent_type': tr.ent_type_,
            'head': tr.head,
            'lemma': tr.lemma_,
            'tag':tr.tag_
        }
        sents.append(sentd)
        #print(sentd)
        #print('\n')
df = pd.DataFrame(sents)
df

## Detect verbs in each sentence of a card (mainly non-abilities ones)

In [None]:
def get_main_nlp_feats(row):
    '''Build a DataFrame with one record per token of a parsed card text.

    `row` must provide 'doc' (whose `.sents` are iterated, each sentence being
    walked through its `.subtree`) and 'id'. Every record stores the sentence,
    the token itself and its text, ancestors, children, cluster, conjuncts,
    dependency label, entity type, head, lemma, POS and tag. The card id is
    added to every record as the 'card_id' column.
    '''
    records = [
        {
            'sent': sent,
            'text': tok.text,
            'word': tok,
            'ancestors': list(tok.ancestors),
            'children': list(tok.children),
            'cluster': tok.cluster,
            'conjuncts': list(tok.conjuncts),
            'dep': tok.dep_,
            'ent_type': tok.ent_type_,
            'head': tok.head,
            'lemma': tok.lemma_,
            'pos': tok.pos_,
            'tag': tok.tag_,
        }
        for sent in row['doc'].sents
        for tok in sent.subtree
    ]
    feats = pd.DataFrame(records)
    feats['card_id'] = row['id']
    return feats

In [None]:
def get_doc(text_str):
    return nlp(text_str)

In [None]:
# Work on a sample of at most 10000 cards. The size is capped at the number of
# available rows (DataFrame.sample raises when n exceeds the population) and
# the seed is fixed so that a re-run parses the same cards.
cards_df_sample = cards_df.sample(min(10000, len(cards_df)), random_state=42).copy()
print('creating docs')
cards_df_sample['doc'] = cards_df_sample['text_preworked'].apply(get_doc)
print('getting docs feats')
# Each row gets its own per-token feature DataFrame.
cards_df_sample['nlp_feats'] = cards_df_sample.apply(get_main_nlp_feats, axis=1)

In [None]:
# Concatanate sent_feats
sent_feats = pd.concat(cards_df_sample['nlp_feats'].values,sort=True, ignore_index=True)

In [None]:
# Counting and showing ROOT verbs
count_verbs = sent_feats[(sent_feats['dep']=="ROOT")&(sent_feats['pos']=='VERB')]['lemma'].unique()
count_verbs.sort()
print(count_verbs.shape, count_verbs)

In [None]:
# Counting and showing ROOT nouns
count_nouns = sent_feats[(sent_feats['dep']=="ROOT")&(sent_feats['pos']=='NOUN')]['lemma'].unique()
count_nouns.sort()
print(count_nouns.shape, count_nouns)

In [None]:
a = tuple(set([1,2]))
b= tuple(set([2,1]))
a==b

In [None]:
spacy.explain("CD")

In [None]:
t = sent_feats[sent_feats['word'].apply(lambda x: x.text=='deals')]['word'].iloc[120]
details={}
print(t)
print(t._.sent)
for c in t.children:
    details[c] = {'pos':c.pos_, 'tag':c.tag_, 'lemma':c.lemma_, 'dep_':c.dep_}
print(details)
displacy.render(t.doc, style='dep', jupyter=True)

In [None]:
def get_children_and_attributes(token):
    '''Map each child of `token` to its main linguistic attributes.

    Returns a dict keyed by child token. Each value is a dict holding the
    child's `pos_`, `tag_`, `lemma_` and `dep_` under the keys 'pos', 'tag',
    'lemma' and 'dep_'. A token without children yields an empty dict.
    '''
    details = {}
    for t in token.children:
        details[t] = {'pos': t.pos_, 'tag': t.tag_, 'lemma': t.lemma_, 'dep_': t.dep_}
    return details

count_verbs = sent_feats[(sent_feats['dep']=="ROOT")&(sent_feats['pos']=='VERB')]['lemma'].unique()

In [None]:
# Show roots
temp = sent_feats[(sent_feats['dep']=="ROOT")][['lemma', 'children', 'sent']].copy()
temp['children'] = temp['children'].apply(lambda x: tuple(set(x)))
#temp['lemma'] = temp['lemma'].apply(lambda x: x.text)
temp.drop_duplicates(subset=['lemma', 'children'])

## Try to match types and set as entity
https://stackoverflow.com/questions/49097804/spacy-entity-from-phrasematcher-only

In [None]:
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Span

class EntityPhraseMatcher(object):
    '''Pipeline component that marks every occurrence of the given terms as an entity.'''
    name = 'entity_phrase_matcher'

    def __init__(self, nlp, terms, label):
        '''Run each term through `nlp` and register the results as patterns under `label`.'''
        term_docs = [nlp(term) for term in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *term_docs)

    def __call__(self, doc):
        '''Replace `doc.ents` with one Span per matcher hit, then return `doc`.'''
        doc.ents = [
            Span(doc, start, end, label=label)
            for label, start, end in self.matcher(doc)
        ]
        return doc
    
class EntityMatcher(object):
    '''Pipeline component that marks every token-pattern match as an entity.'''
    name = 'entity_matcher'

    def __init__(self, nlp, dict_label_terms):
        '''Register the patterns of every label.

        dict_label_terms should be a dictionary in the format
        {label(str): patterns(list)}'''
        self.matcher = Matcher(nlp.vocab)
        for label in dict_label_terms:
            self.matcher.add(label, None, *dict_label_terms[label])

    def __call__(self, doc):
        '''Replace `doc.ents` with one Span per matcher hit, then return `doc`.'''
        doc.ents = [
            Span(doc, start, end, label=label)
            for label, start, end in self.matcher(doc)
        ]
        return doc

In [None]:
from collections import defaultdict
#nlp.remove_pipe('ner')
#nlp.remove_pipe('entity_matcher')
#nlp.remove_pipe('ent_type_matcher')
#nlp.remove_pipe('ent_subtype_matcher')
#nlp.remove_pipe('ent_supertype_matcher')

dict_label_terms = defaultdict(list)

for lem in ['if', 'whenever', 'when', 'only']:
    condition_matcher = [{'LEMMA': lem}, {'IS_PUNCT': False, 'OP': '*'}, {'IS_PUNCT': True}]
    dict_label_terms['CONDITION'].append(condition_matcher)

for typ in cards_types:
    dict_label_terms['TYPE'].append([{'LOWER': t} for t in typ.lower().split()])
for typ in cards_subtypes:
    dict_label_terms['SUBTYPE'].append([{'LOWER': t} for t in typ.lower().split()])
for typ in cards_supertypes:
    dict_label_terms['SUPERTYPE'].append([{'LOWER': t} for t in typ.lower().split()])
# Colour words: all five colours plus the colourless / multicolour terms.
for typ in ['white','blue','black','red','green','colorless', 'multicolored', 'multicolor']:
    dict_label_terms['COLOR'].append([{'LOWER': t} for t in typ.lower().split()])
# One {'LOWER': word} spec per word of the ability name. LOWER is compared with
# the lower-cased token text, so each spec must hold a single lower-cased word
# (not the whole, capitalised ability name).
for abi in abilities:
    dict_label_terms['ABILITY'].append([{'LOWER': t} for t in abi.lower().split()])

entity_matcher = EntityMatcher(nlp, dict_label_terms)
nlp.add_pipe(entity_matcher)

print(nlp.pipe_names)  # see all components in the pipeline

In [None]:
test_sents = []
test_sents.append(test_phrase)
test_sents.append('If a Sliver deals combat damage to a player, its controller may create a +1/+1 colorless Sliver creature token.')
test_sents.append('Whenever a Sliver deals combat damage to a player, its controller may create a +1/+1 colorless Sliver creature token.')
colorless = '\n'.join([x for x in cards_df[cards_df['text'].str.contains('colorless').fillna(False)]['text'].iloc[:5]])
test_sents.append(colorless)

In [None]:
doc = nlp('\n'.join(test_sents))
displacy.render(doc, style='ent', jupyter=True)

In [None]:
options = {'compact': False,
          'collapse_punct': False}
displacy.render(doc, style='dep', jupyter=True, options=options)

In [None]:
sents = []
for sent in doc.sents:
    print(sent)
    for tr in sent.subtree:
        sentd = {
            'word': tr,
            'ancestors': [x for x in tr.ancestors],
            'children': [x for x in tr.children],
            'cluster': tr.cluster,
            'conjuncts': [x for x in tr.conjuncts],
            'dep': tr.dep_,
            'ent_type': tr.ent_type_,
            'head': tr.head,
            'lemma': tr.lemma_,
            'tag':tr.tag_
        }
        sents.append(sentd)
        #print(sentd)
        #print('\n')
df = pd.DataFrame(sents)
#df

In [None]:
df[df['word'].apply(lambda x: x.lower_ in ['whenever', 'if', 'only', 'as'])]

# Should we train a model for POSTAGGING?

Not sure. Many verbs interpreted sometimes as nouns are also sometimes interpreted as verbs.

In [None]:
sents = '\n'.join([x for x in cards_df.sample(200)['text_preworked']])
doc = nlp(sents)

In [None]:
nouns = []
for token in doc:
    if token.pos_ == 'NOUN' and token.lower_ not in nouns:
        nouns.append(token.lower_)
nouns.sort()
nouns
# Nouns that should be verbs:
# 'attacks', 'block', 'blocks', 'cast', 'control','controls', 'deal','deals', 'dies', 'enchant', 'flip', 'gain', 'gains', 'pay', 'return', 'sacrifice', 'shares', 'tap', 'untap'

# Nouns that COULD be verbs:
# 'counter(S)','exile'

In [None]:
verbs = []
for token in doc:
    if token.pos_ == 'VERB' and token.lower_ not in verbs:
        verbs.append(token.lower_)
verbs.sort()
verbs

## Get predictions in a format easy to correct and feed back as training data

Check here https://spacy.io/usage/training#training-simple-style.

It should be easy to train a model, as long as we have a few things in place

Build tables like:
card | sentence | token0 | token1 | ... | tokenN
card | sentence | tag0 | tag1 | ... | tagN
card | sentence | deps0 | deps1 | ... | depsN
card | sentence | head0 | head1 | ... | headN

In [None]:
cards_df.columns

In [None]:
from copy import deepcopy

# One record per (card, sentence) for each annotation layer: token texts, tags,
# dependency labels and head indices. Token positions become zero-padded column
# names ('0000', '0001', ...).
tokens, tags, deps, head_ids = [], [], [], []
for card_counter, (idx, card) in enumerate(cards_df.sample(200).iterrows(), start=1):
    # Progress report every 40 cards.
    if card_counter % 40 == 0:
        print(card_counter)
    for sentence in card['text_preworked'].split('\n'):
        doc = nlp(sentence)
        basics = {
            'card': card['name'],
            'sentence': sentence,
        }
        # Each layer starts from its own copy of the identifying columns.
        toks, tag, dep, head = (deepcopy(basics) for _ in range(4))
        for i, tok in enumerate(doc):
            column = '{0:04d}'.format(i)
            toks[column] = tok.text
            tag[column] = tok.tag_
            dep[column] = tok.dep_
            head[column] = tok.head.i
        tokens.append(toks)
        tags.append(tag)
        deps.append(dep)
        head_ids.append(head)

df_tokens = pd.DataFrame(tokens)
df_tags = pd.DataFrame(tags)
df_deps = pd.DataFrame(deps)
df_head_ids = pd.DataFrame(head_ids)

display(df_tokens.head(2), df_tags.head(2), df_deps.head(2), df_head_ids.head(2))

# NLTK testing

In [None]:
nltk.download('all')

In [None]:
# https://www.nltk.org/book/ch10.html section 5.2
dt = nltk.DiscourseTester(['A student dances', 'Every student is a person'])
dt.readings()


In [None]:
dt.add_sentence('No person dances', consistchk=True)

In [None]:
dt.retract_sentence('No person dances', verbose=True)

In [None]:
dt.add_sentence('A person dances', informchk=True)

In [None]:
from nltk.tag import RegexpTagger
tagger = RegexpTagger(
    [('^(chases|runs)$', 'VB'),
     ('^(a)$', 'ex_quant'),
     ('^(every)$', 'univ_quant'),
     ('^(dog|boy)$', 'NN'),
     ('^(He)$', 'PRP')
])
rc = nltk.DrtGlueReadingCommand(depparser=nltk.MaltParser(tagger=tagger))
dt = nltk.DiscourseTester(['Every dog chases a boy', 'He runs'], rc)
dt.readings()