# This notebook is the start of the "trial and error approach"

References:

- https://github.com/xurxodiz/cardwalker/tree/master/oracle
- https://laterna--magica.blogspot.com/2011/10/oracle-parser.html

In [None]:
import json
import nltk
import pandas as pd
import re
from collections import defaultdict

In [None]:
# Load the MTGJSON dump; the context manager closes the file handle as soon
# as parsing is done instead of leaving it open until garbage collection.
with open('./AllSets.json', 'rb') as sets_file:
    sets = json.load(sets_file)

In [None]:
for k, v in sorted(sets.items()):
    print(k, v['name'])

In [None]:
cards_usaga = sets['USG']['cards']

In [None]:
cards_usaga

In [None]:
# Flatten the cards of every kept set into one list, stamping each card dict
# (in place) with the code of the set it was found in.
cards_all = []
for k, sett in sets.items():
    is_ignored_code = k in ['UGL', 'UST', 'UNH']  # Ignore Unglued, Unstable and similar
    is_long_code = len(k) > 3                     # promotional things
    if not (is_ignored_code or is_long_code):
        for card in sett['cards']:
            card['set'] = k
        cards_all.extend(sett['cards'])

# Params

In [None]:
mains_col_names = ['name', 'manaCost', 'text_preworked', 'type', 'power', 'toughness']

# Let's start by trying to extract static abilities from cards

In [None]:
cards_df = pd.DataFrame.from_dict(cards_usaga).set_index('id')

In [None]:
texts = [card['text'].replace(card['name'], 'SELF') for card in cards_usaga if 'text' in card.keys()]

In [None]:
# Candidate regexes for static abilities, each paired with the tag to assign.
# NOTE(review): inside a character class `$` and `|` are literal characters, so
# `[$|\n|,]` matches "$", "|", a newline or "," -- it does not anchor at end of text.
# NOTE(review): the top-level `| \(` alternative matches a bare " (" without
# capturing anything, so `groups()` can come back as `(None,)` -- confirm intent.
patterns = [
    (r'^([A-Za-z]+ ?[A-Za-z]+)[$|\n|,]| \(', 'STATICABILITY'),
    (r', ([A-Za-z]+ ?[A-Za-z]+)[$|\n||,]| \(', 'STATICABILITY'),
]

In [None]:
#regexp_tagger = nltk.RegexpTagger(patterns)

In [None]:
res = defaultdict(list)
r=None
for text in texts:
#     if r: break
    for pat, tag in patterns:
        r = re.search(pat, text)
        if r:
            res[text].append((r.groups(), tag))
#             break
pretty = pd.DataFrame.from_dict(res, orient='index')
pretty

# DF version: Let's start by trying to extract static abilities from cards

In [None]:
#cards_df = pd.DataFrame.from_dict(cards_usaga)
cards_df = pd.DataFrame.from_dict(cards_all)

In [None]:
# Filter out new types
#set(cards_df.types.apply(lambda x: tuple(set(x))))
ignore_types = ('Conspiracy', 'Eaturecray', 'Phenomenon', 'Plane', 'Planeswalker', 'Scheme', 'Vanguard')

In [None]:
cards_df = cards_df[cards_df.types.apply(lambda x: not set(x).intersection(ignore_types))]

In [None]:
#cards_df.head(4).transpose()

## Questions

### Do parentheses contain useful info or only explanations of abilities/effects?

It seems the text in parentheses is always an explanation (so it carries no useful info to discern possible targets, zones affected, etc.)

In [None]:
pattern_parenthesis = r'\((.*?)\)'
test = """('Flying',), ('Trample',), ('Paper',), ('First strike',),
       ('Phasing',), ('Haste',), ('Flash',), ('Island',), ('Defender',),
       ('Blue',), ('Reach',), ('Devour X',), ('Vigilance',),
       ('Double strike',), ('Indestructible',), ('Artifacts',),
       ('Deathtouch',), ('Lifelink',), ('Menace',), ('Werewolf',),
       ('Leviathans',), ('While voting',), ('Flying', 'Demon'),
       ('Islandwalk',), ('Hexproof',), ('Plains',), ('Instant',),
       ('Swamp',), ('Mountain',), ('Forest',), ('Dinosaur',),
       ('Dinosaur Knight',), ('Leviathan',), ('Simultaneously',),
       ('Rat',), ('During combat',), ('Investigate',),
       ('Minotaur Pirate',), ('Each noncreature',), ('Vampire',),
       ('Pyrogenius',), ('Swampwalk',), ('Bolster X',), ('Timebender',),
       ('Bold Pyromancer',), ('Scry X',), ('Desertwalk',), ('Prowess',),
       ('Martial Paragon',), ('Death Wielder',), ('Equipment',),
       ('Valiant Protector',)"""
a = re.findall(pattern_parenthesis, test)
a

In [None]:
pattern_parenthesis = r'\((.*?)\)'
cards_df['in_parentheses'] = cards_df['text'].apply(lambda x: tuple(re.findall(pattern_parenthesis, str(x))))
set(cards_df['in_parentheses'])

In [None]:
#This example is not explaning an ability, but it is explaning something (an effect)
st = 'If two or more creatures are tied for greatest power, target any one of them.'
cards_df[cards_df['text'].str.contains(st).fillna(False)]['text'].values

### Remove anything between parentheses and replace name by SELF

In [None]:
# Replace name by SELF and remove anything between parentheses
pattern_parenthesis = r' ?\(.*?\)'
def prework_text(card):
    '''Return the card text with the card's own name masked and reminder text removed.

    `card` is any mapping/row exposing 'text' and 'name'. The text is passed
    through `str()` first, so a missing (NaN) text becomes the string 'nan'.
    Every occurrence of the name is replaced by 'SELF', then each
    parenthesised fragment (plus one optional leading space) is dropped.
    '''
    text_with_self = str(card['text']).replace(card['name'], 'SELF')
    return re.sub(pattern_parenthesis, '', text_with_self)
    
cards_df['text_preworked'] = cards_df.apply(prework_text, axis=1)
#cards_df['text_preworked']

In [None]:
sep = "ª"
if cards_df['text_preworked'].str.contains(sep).any():
    raise Exception("Bad separator symbol. It is contained in some text.")

In [None]:
# replace card names by their ids - DOES NOT WORK: It replaces stuff that are not really card names
# for example, just run name_id_dict.get('When')
# Repalcing When in the cards will obviously replace a lot of stuff we don't want to
name_id_dict = {c['name']: c['id'] for c in cards_all}
#name_id_dict
temp = sep.join(cards_df['text_preworked'])
for i, (name, id_) in enumerate(name_id_dict.items()):
    if not i%1000: print(i)
    temp = temp.replace(name, id_)
cards_df['text_preworked_name_id_replaced'] = temp.split(sep)
#cards_df['text_preworked']

In [None]:
# Sanity check: no opening parenthesis may survive the preprocessing step.
# The pattern is a raw string: '\(' in a normal string is an invalid escape sequence.
assert not cards_df['text_preworked'].str.contains(r'\(', na=False).any()

# Domain specific vocabulary

Let's build some domain-specific vocabulary for MTG. For example, let's list supertypes, types and subtypes, collect all card names, and this kind of thing.

In [None]:
# Create set of cards names
cards_names = set(cards_df.name.unique())

In [None]:
# Create set of supertypes
array_of_supertypes_tuples = cards_df['supertypes'].dropna().apply(tuple).unique()
# Flatten the distinct supertype tuples into one set of supertype names
cards_supertypes = {supertype for tup in array_of_supertypes_tuples for supertype in tup}
cards_supertypes

In [None]:
# Create set of types
array_of_types_tuples = cards_df['types'].dropna().apply(tuple).unique()
# Flatten the distinct type tuples into one set of type names
cards_types = {card_type for tup in array_of_types_tuples for card_type in tup}
cards_types

In [None]:
# Create set of subtypes
array_of_subtypes_tuples = cards_df['subtypes'].dropna().apply(tuple).unique()
# Flatten the distinct subtype tuples into one set of subtype names
cards_subtypes = {subtype for tup in array_of_subtypes_tuples for subtype in tup}
#cards_subtypes

In [None]:
#cards_df.head(10).transpose()

In [None]:
import requests
import pickle
r = requests.get('http://media.wizards.com/2018/downloads/MagicCompRules%2020180713.txt')
if not r.status_code == 200:
    r.raise_for_status()
comprules = r.text

In [None]:
# Read the locally saved Comprehensive Rules in one go. Joining `readlines()`
# with '\n' doubled every line break, because each line keeps its own '\n'.
with open('rules.txt', 'r', encoding='latin-1') as f:
    comprules = f.read()

In [None]:
kw_abilities_pat = r'702\.\d+\. ([A-Za-z ]+)'
abilities = re.findall(kw_abilities_pat, comprules)
abilities.pop(0) # Its just the rulings 
abilities.sort()
#abilities

## How can we detect an abilities sentence?

We should:
- Split sentences in a card by '\n' (=card_sentences_list)
- Split each element in card_sentences_list by ', ' (=split_candidate_sentences)
- Search for the pattern r'^ability' in each item of split_candidate_sentences
- If the pattern is found for every item, then split_candidate_sentences is an abilities sentence

We can, at the same time, detect activated abilities sentences and "rest" sentences (which are neither keyword abilities nor activated abilities).
- Split sentences in a card by '\n' (=card_sentences_list)
- Those sentences which contain ':' are activated abilities

Sentences which are not in any case above are "rest" sentences.

In [None]:
ability_start_pattern = r'|'.join(['^'+ab+r'\b' for ab in abilities])
print(ability_start_pattern)
def is_ability_sentence(sentence):
    '''Tell whether `sentence` is made only of keyword abilities.

    The sentence is cut on ', ' and every resulting chunk must match
    `ability_start_pattern` (i.e. start with a known keyword ability).
    Sentences listed as known false positives are rejected up front.
    Returns True or False.
    '''
    known_false_positives = ['Cycling abilities you activate cost up to {2} less to activate.']
    if sentence in known_false_positives:
        return False
    # A single non-matching chunk is enough to disqualify the whole sentence.
    return all(re.search(ability_start_pattern, chunk) is not None
               for chunk in sentence.split(', '))

In [None]:
df = cards_df
df['split_sentences'] = df['text_preworked'].apply(lambda x: x.split('\n')) # list of sentences
df['split_sentences_is_ability'] = df['split_sentences'].apply(lambda x: [is_ability_sentence(y) for y in x])

df[df['split_sentences_is_ability'].apply(lambda x: True in x)][
    ['split_sentences', 'split_sentences_is_ability']].iloc[1]['split_sentences']

### Detect all possible different abilities texts

In [None]:
import itertools
# Flatten the per-card lists so that sentence i lines up with flag i
ability_sentences = list(itertools.chain.from_iterable(df['split_sentences']))
ability_sentences_is_ability = list(itertools.chain.from_iterable(df['split_sentences_is_ability']))
# Keep each distinct sentence that was flagged as an abilities sentence
abilities_full_set = {
    sentence
    for sentence, flagged in zip(ability_sentences, ability_sentences_is_ability)
    if flagged
}
len(abilities_full_set)

## Let's detect all paragraph types (and keep each ability as a separate paragraph)

In [None]:
import uuid

In [None]:
def splitDataFrameList(df,target_column,separator=None):
    '''
    Explode a list-valued column into one row per element.
    Adapted from https://gist.github.com/jlln/338b4b0b55bd6984f883

    df = dataframe to split,
    target_column = the column whose values are iterables of elements
    separator = kept for signature compatibility; it is not used (the values
                are iterated as they are, no string splitting takes place)
    returns: a new dataframe in which every element of the target column sits
    on its own row; the values of the other columns are repeated on each of
    the rows produced from the same input row.
    '''
    exploded_records = []

    def collect_row(row):
        # One output record per element, keeping the original column order.
        exploded_records.extend(
            {**row.to_dict(), target_column: element}
            for element in row[target_column]
        )

    df.apply(collect_row, axis=1)
    return pd.DataFrame(exploded_records)

In [None]:
# Toy example showing what splitDataFrameList does. It uses its own variable
# names: rebinding `df` here would replace the cards frame that later cells
# still expect `df` to refer to, breaking a top-to-bottom run.
split_demo_df = pd.DataFrame({'name':['a','b','c'], 'c':['a','b','c'], 'b':['a','b','c'], "items":[['a1','a2','a3'],['b1','b2','b3'],['c1','c2','c3']]})
display(split_demo_df)
split_demo_result = splitDataFrameList(split_demo_df, target_column="items")
display(split_demo_result)

In [None]:
def get_paragraph_type(paragraph):
    '''Classify a paragraph as 'ability', 'activated' or 'rest'.

    'ability' when `is_ability_sentence` accepts it; otherwise 'activated'
    when it contains a colon; otherwise 'rest'.
    '''
    if is_ability_sentence(paragraph):
        return 'ability'
    return 'activated' if ':' in paragraph else 'rest'

def split_abilities_and_keep_the_rest(df_row):
    '''Return the list of abilities of an 'ability' row, else a one-element list.

    `df_row` is a mapping/row exposing 'type' and 'paragraph'.
    The paragraph is cut on ', ' -- the same separator `is_ability_sentence`
    uses to decide that a paragraph is a list of abilities -- so the pieces do
    not carry a leading space (which would defeat '^'-anchored ability regexes).
    Rows of any other type are returned untouched, wrapped in a list.
    '''
    if df_row['type'] == 'ability':
        return df_row['paragraph'].split(', ')

    return [df_row['paragraph']]

def get_aspas(text):
    '''Return the first double-quoted fragment of `text`, without the quotes.

    ("aspas" is Portuguese for quotation marks.)
    Returns NaN when `text` is null or holds no quoted fragment. NaN is built
    with `float('nan')` because the `pd.np` alias no longer exists in
    pandas 2.0 and later.
    '''
    if pd.isnull(text):
        return float('nan')

    quoted = re.search(r'\"(.+?)\"', text)
    if quoted is None:
        return float('nan')

    return quoted.group(1)
        
    
def get_paragraphs_and_types_df(card_row):
    '''Build the paragraph table of one card.

    `card_row` is a row of the cards frame; its 'text_preworked' is cut on
    newlines into paragraphs. For each paragraph the first quoted fragment is
    stored in 'ASPAS_TEXT' and masked in the paragraph by the literal
    'ASPAS_TEXT'; the paragraph is then typed with `get_paragraph_type`, and
    'ability' paragraphs are exploded into one row per ability.

    Returns a DataFrame with the columns paragraph, ASPAS_TEXT, type, card_id
    (the row label of `card_row`), paragraph_order and paragraph_hash (a fresh
    random uuid4 hex per row, so it differs on every run).
    '''
    paragraphs = pd.DataFrame()

    # Get initial paragraphs
    paragraphs['paragraph'] = card_row['text_preworked'].split('\n')
    paragraphs['ASPAS_TEXT'] = paragraphs['paragraph'].apply(get_aspas)

    # TODO CONTINUE FROM HERE CORRECT THIS
    def mask_quoted_text(row):
        if pd.isnull(row['ASPAS_TEXT']):
            return row['paragraph']
        return row['paragraph'].replace(row['ASPAS_TEXT'], 'ASPAS_TEXT')

    paragraphs['paragraph'] = paragraphs.apply(mask_quoted_text, axis=1)
    paragraphs['type'] = paragraphs['paragraph'].apply(get_paragraph_type)

    # Split the abilities paragraphs into multiple rows
    paragraphs['paragraph'] = paragraphs.apply(split_abilities_and_keep_the_rest, axis=1)
    paragraphs = splitDataFrameList(paragraphs, 'paragraph')

    row_count = paragraphs.shape[0]
    paragraphs['card_id'] = card_row.name
    paragraphs['paragraph_order'] = range(row_count)
    paragraphs['paragraph_hash'] = [uuid.uuid4().hex for _ in range(row_count)]
    return paragraphs

In [None]:
cards_df['df_paragraphs'] = cards_df.apply(get_paragraphs_and_types_df, axis=1)

In [None]:
cards_df[['text_preworked','df_paragraphs']].iloc[21]['df_paragraphs']

In [None]:
cards_df_paragraphs = pd.concat(cards_df['df_paragraphs'].values)
cards_df_paragraphs.head(3)

In [None]:
temp = cards_df_paragraphs[~pd.isnull(cards_df_paragraphs['ASPAS_TEXT'])]
temp

In [None]:
# Check if we get a different paragraph order for each ability
cards_df_paragraphs[cards_df_paragraphs['type']=='ability'].sort_values(by=['card_id', 'paragraph_order'])

In [None]:
# Show cards with triggered abilities
#cards_df[cards_df['df_sentences'].apply(lambda x: 'activated' in x['type'].values)]

## Lets use the same approach and separate paragraphs in abilities-complements, costs-effects and keep the rest as is

In [None]:
ability_and_complement_regex = r'(' + ability_start_pattern +')' + r'(.*)'
ability_and_complement_regex

In [None]:
def get_pop_and_complements_df(paragraph_row):
    '''Break one paragraph into its parts ("pops") and return them as a DataFrame.

    `paragraph_row` is a mapping/row exposing 'type', 'paragraph', 'card_id',
    'paragraph_order' and 'paragraph_hash'.

    - type 'ability': two rows, the ability name and its complement (the rest
      of the text), both stripped, as captured by `ability_and_complement_regex`.
    - type 'activated': the text before the first ':' is cut on ',' into one
      'activation_cost' row each; the text after it is one 'activated_effect'
      row. Only the first colon separates costs from effect, so an effect
      that itself contains a colon no longer raises.
    - anything else: a single 'effect' row holding the paragraph unchanged.

    Returns a DataFrame with the columns pop, pop_type, pop_order, card_id,
    paragraph_order and paragraph_hash.

    Raises ValueError when an 'ability' paragraph is null or does not match
    the ability regex (the original code dropped into the debugger here and
    then failed with a NameError).
    '''
    paragraph = paragraph_row['paragraph']
    paragraph_type = paragraph_row['type']

    if paragraph_type == 'ability':
        found = None
        if not pd.isnull(paragraph):
            found = re.search(ability_and_complement_regex, paragraph)
        if found is None:
            raise ValueError(
                'Ability paragraph does not match the ability regex: {!r}'.format(paragraph))
        pops = [found.group(1).strip(), found.group(2).strip()]
        pop_types = ['ability', 'ability_complement']

    elif paragraph_type == 'activated':
        # Break the costs in individual ones
        costs, effect = paragraph.split(':', 1)

        # Costs whose own wording contains a comma must not be cut in two
        exceptions = ['Pay half your life, rounded up']
        if costs in exceptions:
            costs = costs.replace(',','')

        individual_costs = costs.split(',')
        pops = individual_costs + [effect]
        pop_types = ['activation_cost'] * len(individual_costs) + ['activated_effect']

    else:
        # Keep the rest as rest or effect
        pops = [paragraph]
        pop_types = ['effect']

    res = pd.DataFrame({'pop': pops, 'pop_type': pop_types})
    res['pop_order'] = range(len(pops))
    res['card_id'] = paragraph_row['card_id']
    res['paragraph_order'] = paragraph_row['paragraph_order']
    res['paragraph_hash'] = paragraph_row['paragraph_hash']
    return res

In [None]:
cards_df_paragraphs['pop'] = cards_df_paragraphs.apply(get_pop_and_complements_df, axis=1)

In [None]:
cards_df_paragraphs.iloc[3]['pop']

In [None]:
cards_df_pops = pd.concat(cards_df_paragraphs['pop'].values, sort=True)
cards_df_pops['pop_hash'] = cards_df_pops['pop'].apply(lambda x: uuid.uuid4().hex)
cards_df_pops.sort_values(by=['card_id','paragraph_order','pop_order']).head(3)

In [None]:
activated_ability_paragraph_hash = cards_df_paragraphs[cards_df_paragraphs['type']=='activated'].sample(1)['paragraph_hash'].iloc[0]
cards_df_pops[cards_df_pops['paragraph_hash']==activated_ability_paragraph_hash]

In [None]:
investigate = '08038de1ded341a1b63f792d29b8dad8'
cards_df_pops[cards_df_pops['paragraph_hash']==investigate]

In [None]:
#investigate = cards_df_pops[cards_df_pops['pop']=='Creatures you control have "{T}'].iloc[0]['card_id']
investigate = '7011018896f7a9a24b7f9dff722a7e990c43922b'
cards_df_pops[cards_df_pops['card_id']==investigate]

In [None]:
#investigate = cards_df_pops[cards_df_pops['pop']=='Creatures you control have "{T}'].iloc[0]['card_id']
investigate = 'ade9880f3121cdf8db57c3f4ba0375c843ec14c0'
cards_df_pops[cards_df_pops['card_id']==investigate]

In [None]:
cards_df_pops[cards_df_pops['pop']=='Pay half your life']

In [None]:
cards_df_pops[cards_df_pops['pop_type']=='activation_cost']['pop'].dropna().unique()

In [None]:
# Count how many abilities, activated abilities and effects there are
cards_df_pops['cont'] = 1

index = ['pop_type']
values = ['cont']

# 'sum' replaces `pd.np.sum`: the `pd.np` alias no longer exists in pandas 2.0 and later
pivot_pop = cards_df_pops.pivot_table(index=index, values=values, aggfunc='sum')
pivot_pop

In [None]:
# Show cards with triggered abilities
#cards_df[cards_df['df_sentences'].apply(lambda x: 'activated' in x['type'].values)]

## Let's use the same approach and separate conditions from the "result effect"

In [None]:
condition_regex = r'((?:if |whenever |when |only ).*?[,.])'
condition_regex

In [None]:
def get_condition(text):
    '''Return the list of condition clauses found in `text`, or None.

    Clauses are the case-insensitive matches of `condition_regex`.
    None is returned for a null `text` and when nothing matches.
    '''
    if pd.isnull(text):
        return None

    clauses = re.findall(condition_regex, text, flags=re.IGNORECASE)
    return clauses or None

def clean_effect_from_condition(row):
    '''Return row['pop'] with every clause listed in row['condition'] removed.

    `row` is a mapping/row exposing 'pop' (the text) and 'condition' (a list
    of clauses, or None/empty when there is none, in which case the text is
    returned unchanged).

    Each clause is removed on its own. Removing the concatenation of all
    clauses, as was done before, only works when the clauses sit next to each
    other in the text, so texts with several separate conditions were left
    untouched.
    '''
    clean_effect = row['pop']
    conditions = row['condition']

    if not conditions:
        return clean_effect

    for condition in conditions:
        clean_effect = clean_effect.replace(condition, '')
    return clean_effect
    
cards_df_pops['condition'] = cards_df_pops['pop'].apply(get_condition)
cards_df_pops['effect_wo_condition'] = cards_df_pops.apply(clean_effect_from_condition, axis=1)

In [None]:
idx=5
display(cards_df_pops[~pd.isnull(cards_df_pops['condition'])])
display(cards_df_pops[~pd.isnull(cards_df_pops['condition'])].iloc[idx]['pop'])
cards_df_pops[~pd.isnull(cards_df_pops['condition'])].iloc[idx]['condition']

## Investigating abilities

Now, how do we work with abilities followed by a cost?

In [None]:
def detect_abilities_sentence(sentlist):
    '''True when at least one sentence of `sentlist` consists only of entries of `abilities`.

    A sentence qualifies when every ', '-separated piece of it is, verbatim,
    one of the names in the module-level `abilities` list.
    '''
    return any(
        set(sent.split(', ')).issubset(set(abilities))
        for sent in sentlist
    )
t = df['split_sentences'].apply(detect_abilities_sentence)
# Combine both masks before indexing: filtering with `t` first and then with a
# mask built on the full frame makes pandas reindex the second mask (with a warning).
has_cumulative_upkeep = df['text'].str.contains('umulative upkeep', na=False)
df.loc[t & has_cumulative_upkeep, 'text_preworked']

In [None]:
df[df['text'].str.contains('umulative upkeep').fillna(False)]['text_preworked'].loc[30808]

### Deal with cumulative upkeep

It seems that, when it is followed by a mana cost, the cumulative upkeep COST may be followed by ',' (comma) or '\n' (newline). But if the text for cumulative upkeep is longer, it seems to end with '\n' every time.

In [None]:
# Check that these things are always the same
cumulative_upkeep_pattern1 = r'(?:, )?cumulative upkeep—.*?[.]'
cumulative_upkeep_pattern2 = r'(?:, )?cumulative upkeep—.*?[.\n]'
print(cumulative_upkeep_pattern1)
def get_cumup1(xstr):
    '''Case-insensitive matches of `cumulative_upkeep_pattern1` in str(xstr), or NaN if none.'''
    res = re.findall(cumulative_upkeep_pattern1, str(xstr), re.IGNORECASE)
    # float('nan') instead of pd.np.nan: the pd.np alias no longer exists in pandas 2.0 and later
    return res if res else float('nan')
def get_cumup2(xstr):
    '''Case-insensitive matches of `cumulative_upkeep_pattern2` in str(xstr), or NaN if none.'''
    res = re.findall(cumulative_upkeep_pattern2, str(xstr), re.IGNORECASE)
    return res if res else float('nan')
df['cumup1'] = df['text_preworked'].apply(get_cumup1).fillna(False)
df['cumup2'] = df['text_preworked'].apply(get_cumup2).fillna(False)
diff = df['cumup1']==df['cumup2']
df[~diff][['cumup1', 'cumup2', 'text_preworked']]
assert diff.all()

In [None]:
# Check that this never matches anything
cumulative_upkeep_pattern1 = r'(?:, )?cumulative upkeep—.*?,'
print(cumulative_upkeep_pattern1)
def get_cumup1(xstr):
    '''Case-insensitive matches of `cumulative_upkeep_pattern1` in str(xstr), or NaN if none.'''
    res = re.findall(cumulative_upkeep_pattern1, str(xstr), re.IGNORECASE)
    # float('nan') instead of pd.np.nan: the pd.np alias no longer exists in pandas 2.0 and later
    return res if res else float('nan')
df['cumup1'] = df['text_preworked'].apply(get_cumup1)
df['cumup1'].dropna()
assert df['cumup1'].dropna().empty

In [None]:
# Check that this never matches anything
cumulative_upkeep_pattern1 = r', cumulative upkeep—'
print(cumulative_upkeep_pattern1)
def get_cumup1(xstr):
    '''Case-insensitive matches of `cumulative_upkeep_pattern1` in str(xstr), or NaN if none.'''
    res = re.findall(cumulative_upkeep_pattern1, str(xstr), re.IGNORECASE)
    # float('nan') instead of pd.np.nan: the pd.np alias no longer exists in pandas 2.0 and later
    return res if res else float('nan')
df['cumup1'] = df['text_preworked'].apply(get_cumup1)
df['cumup1'].dropna()
assert df['cumup1'].dropna().empty

In [None]:
#re.search('test', 'TeSt', re.IGNORECASE)
#re.match('test', 'TeSt', re.IGNORECASE)
#re.sub('test', 'xxxx', 'Testing', flags=re.IGNORECASE)
# Non capturing group https://stackoverflow.com/questions/2703029/why-regular-expressions-non-capturing-group-is-not-working

#cumulative_upkeep_pattern = r' ?cumulative upkeep[ |—].*?[.|,|\n]'
# A cost is either a run of mana symbols (" {1}{U}") or free text introduced by a dash.
type1_cost = r' (\{[A-Z0-9]+\})+'
type2_cost = r'—.*?[.|\n]'
cumulative_upkeep_pattern = r'(?:, )?(cumulative upkeep)({0}|{1})'.format(type1_cost, type2_cost)
print(cumulative_upkeep_pattern)
def get_cumup(xstr):
    '''Case-insensitive matches of `cumulative_upkeep_pattern` in str(xstr), or NaN if none.

    Each match is a tuple (ability, cost, last mana symbol of the cost); the
    last item is '' when the cost is the dash-introduced free-text form.
    '''
    res = re.findall(cumulative_upkeep_pattern, str(xstr), re.IGNORECASE)
    # float('nan') instead of pd.np.nan: the pd.np alias no longer exists in pandas 2.0 and later
    return res if res else float('nan')
df['cumup'] = df['text_preworked'].apply(get_cumup)
posit = 28118
display(df[['cumup', 'text_preworked']].dropna())#.loc[posit]['text_preworked'])
#display(df[['cumup', 'text_preworked']].dropna()#.loc[posit]['cumup'])

In [None]:
# check what is not contained (GREAT: the only card should not be considered anyway)
cumup_all = df[df['text_preworked'].str.contains('umulative up')]
cumup_detected = df[['cumup', 'text_preworked']].dropna()
cumup_all[~cumup_all.index.isin(cumup_detected.index)]['text_preworked'].iloc[0]

### Extend procedure to other abilities

Check what 'Enchant' ability can enchant

In [None]:
# Get everything that can follow Enchant
def get_whats_enchanted(xstr):
    '''Return a tuple with every "Enchant ..." clause of str(xstr), or NaN if there is none.

    A clause runs from "Enchant " up to and including the next "." or newline,
    or up to the end of the text. The end-of-text case needs `$` outside the
    character class: written inside `[...]` it only matched a literal dollar
    sign, so a clause closing the text was missed.
    '''
    res = re.findall(r'Enchant .*?(?:[.\n]|$)', str(xstr))
    if res:
        return tuple(res)
    # float('nan') instead of pd.np.nan: the pd.np alias no longer exists in pandas 2.0 and later
    return float('nan')
df['enchant_something'] = df['text_preworked'].apply(get_whats_enchanted)
df['enchant_something'].dropna().drop_duplicates()
enchant_abilities = set([x[0].strip('\n') for x in df['enchant_something'].dropna().drop_duplicates()])
#enchant_abilities

Regex below can detect any abilities with costs.

In [None]:
# Three cost shapes: mana symbols (" {1}{U}"), dash-introduced free text, or a plain number.
type1_cost = r' (\{[A-Z0-9]+\})+'
type2_cost = r'—.*?[.|\n]'
type3_cost = r' \d+[,|\n]'
abilities_lower = '|'.join(abilities).lower()
cumulative_upkeep_pattern = r'(?:, )?({abi})({cost1}|{cost2}|{cost3})'.format(
    cost1=type1_cost, cost2=type2_cost, cost3=type3_cost, abi=abilities_lower)
print(cumulative_upkeep_pattern)
def get_cumup(xstr):
    '''Case-insensitive matches of `cumulative_upkeep_pattern` (any ability followed by a cost) in str(xstr), or NaN if none.'''
    res = re.findall(cumulative_upkeep_pattern, str(xstr), re.IGNORECASE)
    # float('nan') instead of pd.np.nan: the pd.np alias no longer exists in pandas 2.0 and later
    return res if res else float('nan')
df['cost_abilities'] = df['text_preworked'].apply(get_cumup)
posit = 227
display(df[['cost_abilities', 'text_preworked']].dropna())#.loc[posit]['text_preworked'])
#display(df[['cumup', 'text_preworked']].dropna()#.loc[posit]['cumup'])

In [None]:
# Detect other things following abilities
abilities_follower = r' .*?[.|\n]'
abilities_lower = '|'.join(abilities)
ability_w_follower = r'({abi})({fol})'.format(fol=abilities_follower, abi=abilities_lower)
print(ability_w_follower)
def get_cumup(xstr):
    '''Return a tuple with the (ability, follower) matches of `ability_w_follower` in str(xstr), or NaN if none.'''
    res = re.findall(ability_w_follower, str(xstr))
    if res:
        return tuple(res)
    # float('nan') instead of pd.np.nan: the pd.np alias no longer exists in pandas 2.0 and later
    return float('nan')
df['ability_w_follower'] = df['text_preworked'].apply(get_cumup)
posit = 1100
display(df[['ability_w_follower', 'text_preworked']].dropna())#.loc[posit]['text_preworked'])

detected_cost_abi = df['cost_abilities'].dropna()
df[~df.index.isin(detected_cost_abi.index)]['ability_w_follower'].dropna().drop_duplicates()

## What can be in place of X in +X/+X (or actually +|-X/+|-X)

Besides number, only X or Y will appear.

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-][^\d]/[+-][^\d])'):
    '''Return all non-overlapping matches of `pat` in `text_str` (an empty list when there is none).'''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
res = set(chain(*(t.values)))
res

Which numbers may it contain?

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-]\d+/)|(/[+-]\d+)'):
    '''Return all non-overlapping matches of `pat` in `text_str` (an empty list when there is none).

    With the default two-group pattern every match is a 2-tuple in which the
    group that did not take part is the empty string.
    '''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
res = set(chain(*(t.values)))
res

In [None]:
from itertools import chain
# Left-hand part ("+N/") and right-hand part ("/+N") of a +N/+N modifier
pincre_pat=r'([+-][\dXxYx]+/)'
rincre_pat=r'(/[+-][\dXxYx]+)'
def get_increases(text_str, pat=r'([+-]\d+/)|(/[+-]\d+)'):
    '''Return all non-overlapping matches of `pat` in `text_str` (an empty list when there is none).'''
    return re.findall(pat, text_str)
pincre = cards_df['text_preworked'].apply(get_increases, args=(pincre_pat,))
pincre_res = set(chain(*(pincre.values)))
rincre = cards_df['text_preworked'].apply(get_increases, args=(rincre_pat,))
rincre_res = set(chain(*(rincre.values)))
print(pincre_res, rincre_res)

There is no +\*/+\*

In [None]:
# Raw strings: '\-' and '\+' are invalid escape sequences in a normal string literal.
# Both frames are displayed explicitly; as bare expressions only the last one would show.
display(cards_df[cards_df['text_preworked'].str.contains(r'\-\*')])
display(cards_df[cards_df['text_preworked'].str.contains(r'\+\*')])

In [None]:
from itertools import chain
def get_increases(text_str, pat=r'([+-][*])'):
    '''Return all non-overlapping matches of `pat` in `text_str` (an empty list when there is none).'''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
res = set(chain(*(t.values)))
res

## Detecting special symbols

In [164]:
import itertools

In [166]:
# Every {...} symbol that appears in any pop
patt = r'\{.*?\}'
# List of symbols per pop, NaN where there is none. `findall` now runs once per
# pop, and float('nan') replaces pd.np.nan (the pd.np alias no longer exists in
# pandas 2.0 and later).
t = cards_df_pops['pop'].apply(lambda x: re.findall(patt, str(x)) or float('nan'))
symbols_set=set(itertools.chain.from_iterable(t.dropna()))
#symbols_set

In [167]:
# Symbols still to be looked up; the list is empty, so the loop below does nothing.
weird_symbols = []
worth_ignoring = ['{hr}','{½}','{∞}'] # Unglued or similar
worth_ignoring.append('{CHAOS}')
# Meaning of the less common symbols, each with one card on which it appears.
symbols_explanation = {
    '{S}': {'explanation': 'Snow mana', 'example_card': 'Glacial Plating'},
    '{R/P}': {'explanation': 'can be paid with either {R} or 2 life', 'example_card': 'Rage Extractor'},
    '{Q}': {'explanation': '{Q} is the untap symbol', 'example_card': 'Order of Whiteclay'},
    '{E}': {'explanation': 'Energy counter', 'example_card': 'Consulate Surveillance'},
    '{C}': {'explanation': 'Colorless mana', 'example_card': 'Skarrg, the Rage Pits'},
    '{CHAOS}': {'explanation': 'It is only in Plane cards and for a specific kind of game',
                'example_card': 'Glimmervoid Basin'},
}
weird_cards = []
# NOTE(review): `cards_df_sentences` is not defined anywhere in the visible part of the
# notebook; this loop only avoids a NameError because `weird_symbols` is empty.
for item in weird_symbols:
    weird = cards_df_sentences[cards_df_sentences['sentences'].str.contains(item)]
    weird_cards.append(cards_df[cards_df['id'].isin(weird['card_id'])])
if weird_symbols:
    weird_cards = pd.concat(weird_cards)
    # A bare expression inside an `if` block is not displayed by the notebook.
    weird_cards[mains_col_names]

In [168]:
from itertools import chain
def get_increases(text_str, pat=r'([+-][\d+XxYx]{1,4}/[+-][\d+XxYx]{1,4})'):
    '''Return all non-overlapping matches of `pat` in `text_str` (an empty list when there is none).

    The default pattern captures whole power/toughness modifiers such as
    "+1/+1" or "-X/-X".
    '''
    return re.findall(pat, text_str)
t = cards_df['text_preworked'].apply(get_increases)
pr_increase_symbols = set(chain(*(t.values)))
#pr_increase_symbols

# Spacy

In [None]:
#https://stackoverflow.com/questions/51766157/how-to-force-a-pos-tag-in-spacy-before-after-tagger/51776803#51776803
import spacy
from spacy.symbols import ORTH, POS, NOUN, VERB

# No earlier cell creates `nlp` (it is only loaded in the "Spacy applied"
# section further down), so on a fresh kernel this cell raised a NameError.
# Load the small English model here to make the demo self-contained.
nlp = spacy.load('en_core_web_sm')

# Force the part of speech of the MTG symbols: mana symbols behave like nouns,
# the tap symbol like a verb, and +N/+N modifiers like nouns.
nlp.tokenizer.add_special_case('{G}', [{ORTH: '{G}', POS: NOUN}])
nlp.tokenizer.add_special_case('{T}', [{ORTH: '{T}', POS: VERB}])
for symb in pr_increase_symbols:
    nlp.tokenizer.add_special_case(symb, [{ORTH: symb, POS: NOUN}])

doc = nlp('{T}: This {G} is a noun. Target creature gets +1/+1')

for token in doc:
    print('{:10}{:10}'.format(token.text, token.pos_))

In [None]:
# Interpret {something} as NOUN (but tap and untap as verb)
#https://stackoverflow.com/questions/51766157/how-to-force-a-pos-tag-in-spacy-before-after-tagger/51776803#51776803
from spacy.symbols import ORTH, POS, NOUN, VERB, LOWER,LEMMA, TAG, NounType_com, nn, VerbForm_inf
import spacy
from spacy import displacy
 
doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Noun phrases
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)

In [None]:
# Dependency parser
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
 
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

In [None]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


In [None]:
#test_sentence = cards_df[cards_df['static_abilities']==('Phasing',)].text.values[0]
test_sentence ='\nWhenever SELF attacks, it gets +1/+1.' #test_sentence +'\nWhenever SELF attacks, it gets +1/+1.'
test_sentence

In [None]:
doc = nlp(test_sentence)
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


# Spacy applied

In [183]:
from spacy.symbols import ORTH, POS, NOUN, VERB, LOWER,LEMMA, TAG, nn#, VerbForm_inf,NounType_com,
import spacy
from spacy import displacy

In [184]:
#MODEL = 'en_core_web_lg'
MODEL = 'en_core_web_sm'

In [185]:
from spacy.tokens import Token

def get_token_sent(token):
    '''Return the `.sent` of the one-token slice of the parent doc that holds `token`.'''
    position = token.i
    return token.doc[position:position + 1].sent

Token.set_extension('sent', getter=get_token_sent, force=True)

In [186]:
#MODEL = r'C:\Users\cs294662\Downloads\programas\spacy\data\en_core_web_md-2.0.0\en_core_web_md\en_core_web_md-2.0.0'
#MODEL = r'C:\Users\cs294662\Downloads\programas\spacy\data\en_coref_lg-3.0.0\en_coref_lg\en_coref_lg-3.0.0'
nlp = spacy.load(MODEL)

## Set custom tags for special cases

In [201]:
#symbols_explanation

In [188]:
symbols_set_valid = symbols_set.difference(set(worth_ignoring))

In [189]:
# Add {SYMBOL} to NOUN recognizer
for sym in symbols_set_valid:
    if not sym in ['{T}', '{Q}']:
        nlp.tokenizer.add_special_case(sym, [{ORTH: sym, POS: NOUN, TAG:nn}])
    else:
        nlp.tokenizer.add_special_case(sym, [{ORTH: sym, POS: VERB, TAG:'VB'}])

In [196]:
# Add power and toughness in/decresing symbols to NOUN recognizer
for sym in pr_increase_symbols:
    nlp.tokenizer.add_special_case(sym, [{ORTH: sym, POS: NOUN, TAG:nn}])

In [190]:
# https://stackoverflow.com/questions/44594759/spacy-adding-special-case-tokenization-rules-by-regular-expression-or-pattern
#cost_pattern = r'{[\dWGBURTX]}'
#cost_pattern = re.compile(r'{[\dWGBURTX]}')
# add special case rule
#special_case = [{ORTH: cost_pattern, LEMMA: 'COST', POS: 'NOUN'}]
#nlp.tokenizer.add_special_case(cost_pattern, special_case)

In [None]:
should_be_verbs = ['attacks', 'block', 'blocks', 'cast', 'control','controls', 'deal','deals', 'dies', 'enchant', 'flip', 'gain', 'gains', 'pay', 'return', 'sacrifice', 'shares', 'tap', 'untap']
#for token in should_be_verbs:
#    nlp.tokenizer.add_special_case(token, [{ORTH: token, POS: VERB}])
#    nlp.tokenizer.add_special_case(token.title(), [{ORTH: token.title(), POS: VERB}])

In [197]:
test_phrase = '{G}: Target creature gets +1/-1'
#test_phrase = 'Target creature has flying'
doc = nlp(test_phrase)
displacy.render(doc, style='dep', jupyter=True)

## Create custom entity matcher

In [213]:
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Span

class EntityPhraseMatcher(object):
    '''Pipeline component that sets doc.ents from PhraseMatcher hits for a list of terms.

    Based on https://stackoverflow.com/questions/49097804/spacy-entity-from-phrasematcher-only
    '''

    name = 'entity_phrase_matcher'

    def __init__(self, nlp, terms, label):
        '''Register every term of `terms` (each run through `nlp`) under `label`.'''
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *[nlp(term) for term in terms])

    def __call__(self, doc):
        '''Replace doc.ents with one Span per matcher hit and return the doc.'''
        doc.ents = [
            Span(doc, start, end, label=label)
            for label, start, end in self.matcher(doc)
        ]
        return doc
    
class EntityMatcher(object):
    '''Pipeline component that sets doc.ents from token-pattern Matcher hits.'''

    name = 'entity_matcher'

    def __init__(self, nlp, dict_label_terms):
        '''dict_label_terms should be a dictionary in the format
        {label(str): patterns(list)}'''
        self.matcher = Matcher(nlp.vocab)
        for label in dict_label_terms:
            self.matcher.add(label, None, *dict_label_terms[label])

    def __call__(self, doc):
        '''Replace doc.ents with one Span per matcher hit and return the doc.'''
        doc.ents = [
            Span(doc, start, end, label=label)
            for label, start, end in self.matcher(doc)
        ]
        return doc

In [221]:
zones = ['graveyard', 'play', 'library', 'hand', 'battlefield', 'exile', 'stack']
players = ['opponent', 'you']

In [222]:
from collections import defaultdict
#nlp.remove_pipe('ner')
# Drop a previously registered matcher so the cell can be re-run. The removal is
# guarded because remove_pipe fails when the component is not in the pipeline
# (for example on a fresh kernel).
if 'entity_matcher' in nlp.pipe_names:
    nlp.remove_pipe('entity_matcher')
#nlp.remove_pipe('ent_type_matcher')
#nlp.remove_pipe('ent_subtype_matcher')
#nlp.remove_pipe('ent_supertype_matcher')

# label -> list of token patterns; each pattern is a list with one dict per token.
dict_label_terms = defaultdict(list)

for typ in cards_types:
    dict_label_terms['TYPE'].append([{'LOWER': t} for t in typ.lower().split()])
for typ in cards_subtypes:
    dict_label_terms['SUBTYPE'].append([{'LOWER': t} for t in typ.lower().split()])
for typ in cards_supertypes:
    dict_label_terms['SUPERTYPE'].append([{'LOWER': t} for t in typ.lower().split()])
# 'green' replaces the second, duplicated 'white' entry of the original list.
for typ in ['white','black','blue','green','red','colorless', 'multicolored', 'multicolor']:
    dict_label_terms['COLOR'].append([{'LOWER': t} for t in typ.lower().split()])
for abi in abilities:
    # One token spec per lower-cased word, so multi-word and capitalised
    # ability names can match (LOWER compares against lower-cased token text).
    dict_label_terms['ABILITY'].append([{'LOWER': t} for t in abi.lower().split()])
for zone in zones:
    # Single-token pattern; it no longer depends on the leftover `abi` loop variable.
    dict_label_terms['ZONE'].append([{'LOWER': zone, 'POS': spacy.symbols.NOUN}])
for player in players:
    # A player word may be tagged either as a pronoun ("you") or as a noun ("opponent").
    dict_label_terms['PLAYER'].append([{'LOWER': player, 'POS': spacy.symbols.PRON}])
    dict_label_terms['PLAYER'].append([{'LOWER': player, 'POS': spacy.symbols.NOUN}])

entity_matcher = EntityMatcher(nlp, dict_label_terms)
nlp.add_pipe(entity_matcher)

print(nlp.pipe_names)  # see all components in the pipeline

['tagger', 'parser', 'entity_matcher']


In [223]:
# Run a longer sample phrase through the pipeline, then show both the entity
# highlighting and the dependency tree for it.
test_phrase = "{G}: You can put target creature from an opponent's graveyard into play under your control"
#test_phrase = 'Target creature has flying'
doc = nlp(test_phrase)
displacy.render(doc, style='ent', jupyter=True)
displacy.render(doc, style='dep', jupyter=True)

In [None]:
#nlp2 = spacy.load(MODEL)

# Investigating

In [None]:
# Position (within the filtered sentences) of the sentence to inspect.
sent = 120#12352#12350#1205
# The pattern is a raw string: '\{' is not a valid Python escape sequence, so the
# non-raw literal triggers an invalid-escape warning. The regex itself is unchanged
# (a literal "{W}").
test_phrase = cards_df_sentences[cards_df_sentences['sentences'].str.contains(r'\{W}')]['sentences'].iloc[sent]

#test_phrase = 'Tap something: get more'
doc = nlp(test_phrase)

In [None]:
# Print every sentence of the doc, each followed by a 'Change' marker line so the
# sentence boundaries are easy to see.
for s in doc.sents:
    print(s)
    print('Change')

In [None]:
# Highlight the entities set on the current doc.
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Draw the dependency tree of the current doc.
displacy.render(doc, style='dep', jupyter=True)

In [None]:
# List each token with its fine-grained tag (tag_) and coarse part of speech (pos_).
for t in doc:
    print(t, t.tag_, t.pos_)

In [None]:
# Flatten the current doc into one record per token (walking each sentence's
# subtree) and show the records as a DataFrame.
sents = []
for sent in doc.sents:
    print(sent)
    for tr in sent.subtree:
        sentd = {
            'word': tr,
            'ancestors': list(tr.ancestors),
            'children': list(tr.children),
            'cluster': tr.cluster,
            'conjuncts': list(tr.conjuncts),
            'dep': tr.dep_,
            'ent_type': tr.ent_type_,
            'head': tr.head,
            'lemma': tr.lemma_,
            'tag': tr.tag_,
        }
        sents += [sentd]
df = pd.DataFrame(sents)
df

## Detect verbs in each sentence of a card (mainly non-abilities ones)

In [None]:
def get_main_nlp_feats(row):
    '''Build a per-token feature table for one card row.

    row must provide row['doc'] (iterated through .sents, then each sentence's
    .subtree) and row['id']. Returns a DataFrame with one line per visited token
    holding the token itself, its sentence, text, tree relations and tags, plus
    a 'card_id' column filled with row['id'].
    '''
    records = [
        {
            'sent': sentence,
            'text': token.text,
            'word': token,
            'ancestors': list(token.ancestors),
            'children': list(token.children),
            'cluster': token.cluster,
            'conjuncts': list(token.conjuncts),
            'dep': token.dep_,
            'ent_type': token.ent_type_,
            'head': token.head,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
        }
        for sentence in row['doc'].sents
        for token in sentence.subtree
    ]
    feats = pd.DataFrame(records)
    feats['card_id'] = row['id']
    return feats

In [None]:
def get_doc(text_str):
    '''Run the notebook-level `nlp` pipeline on text_str and return its result.'''
    parsed = nlp(text_str)
    return parsed

In [None]:
# Work on a random subset of cards. The size is capped at the frame length because
# DataFrame.sample raises when asked for more rows than exist (without replacement),
# and the seed is fixed so a re-run picks the same cards.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42
cards_df_sample = cards_df.sample(min(SAMPLE_SIZE, len(cards_df)), random_state=SAMPLE_SEED).copy()
print('creating docs')
# One parsed doc per card, built from the 'text_preworked' column.
cards_df_sample['doc'] = cards_df_sample['text_preworked'].apply(get_doc)
print('getting docs feats')
# One per-token feature DataFrame per card (see get_main_nlp_feats).
cards_df_sample['nlp_feats'] = cards_df_sample.apply(get_main_nlp_feats, axis=1)

In [None]:
# Concatenate the per-card feature frames into one long token table (sent_feats),
# with a fresh integer index and alphabetically sorted columns.
sent_feats = pd.concat(cards_df_sample['nlp_feats'].values,sort=True, ignore_index=True)

In [None]:
# Counting and showing ROOT verbs: distinct lemmas of tokens whose dependency is
# ROOT and whose part of speech is VERB, sorted in place, printed with their count.
count_verbs = sent_feats[(sent_feats['dep']=="ROOT")&(sent_feats['pos']=='VERB')]['lemma'].unique()
count_verbs.sort()
print(count_verbs.shape, count_verbs)

In [None]:
# Counting and showing ROOT nouns: distinct lemmas of tokens whose dependency is
# ROOT and whose part of speech is NOUN, sorted in place, printed with their count.
count_nouns = sent_feats[(sent_feats['dep']=="ROOT")&(sent_feats['pos']=='NOUN')]['lemma'].unique()
count_nouns.sort()
print(count_nouns.shape, count_nouns)

In [None]:
# Look up spaCy's description of the label "CD".
spacy.explain("CD")

In [None]:
# Pick one occurrence of the token 'deals' from sent_feats and inspect its children.
# NOTE(review): position 120 is hard-coded; .iloc raises IndexError when the sample
# holds fewer than 121 'deals' tokens.
t = sent_feats[sent_feats['word'].apply(lambda x: x.text=='deals')]['word'].iloc[120]
details={}
print(t)
# NOTE(review): `t._.sent` reads a custom extension attribute named 'sent'; its
# registration is not visible in this part of the notebook - confirm it exists.
print(t._.sent)
# Map each child token to its pos / tag / lemma / dependency label.
for c in t.children:
    details[c] = {'pos':c.pos_, 'tag':c.tag_, 'lemma':c.lemma_, 'dep_':c.dep_}
print(details)
displacy.render(t.doc, style='dep', jupyter=True)

In [None]:
# Print the noun chunks of the doc that the token `t` (picked in the previous cell) belongs to.
for nounc in t.doc.noun_chunks:
    print(nounc)

In [None]:
def get_children_and_attributes(token):
    '''Unfinished stub: creates an empty dict, never fills it and returns None.

    TODO: the commented-out loop suggests it was meant to walk token.children.
    '''
    details = {}
    #for t in token.children
# Recompute the distinct ROOT-verb lemmas (same filter as the "ROOT verbs" cell,
# but without sorting).
count_verbs = sent_feats[(sent_feats['dep']=="ROOT")&(sent_feats['pos']=='VERB')]['lemma'].unique()

In [None]:
# Show roots: one row per distinct (lemma, children) combination among ROOT tokens.
temp = sent_feats[(sent_feats['dep']=="ROOT")][['lemma', 'children', 'sent']].copy()
# Lists are not hashable, so the children are turned into tuples before
# drop_duplicates; going through set() also removes repeated children.
temp['children'] = temp['children'].apply(lambda x: tuple(set(x)))
#temp['lemma'] = temp['lemma'].apply(lambda x: x.text)
temp.drop_duplicates(subset=['lemma', 'children'])

## Try to match types and set as entity
https://stackoverflow.com/questions/49097804/spacy-entity-from-phrasematcher-only

In [None]:
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Span

# Same class as the one defined earlier in this notebook; running this cell
# simply rebinds the name.
class EntityPhraseMatcher(object):
    '''Callable that marks PhraseMatcher hits as the entities of a doc.

    Adapted from
    https://stackoverflow.com/questions/49097804/spacy-entity-from-phrasematcher-only
    '''

    name = 'entity_phrase_matcher'

    def __init__(self, nlp, terms, label):
        '''Parse every term with `nlp` and register the results under `label`.'''
        term_docs = list(map(nlp, terms))
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *term_docs)

    def __call__(self, doc):
        '''Replace doc.ents with one Span per match and return the same doc.'''
        doc.ents = [
            Span(doc, start, end, label=label)
            for label, start, end in self.matcher(doc)
        ]
        return doc
    
# Same class as the one defined earlier in this notebook; running this cell
# simply rebinds the name.
class EntityMatcher(object):
    '''Callable that marks token-pattern Matcher hits as the entities of a doc.'''

    name = 'entity_matcher'

    def __init__(self, nlp, dict_label_terms):
        '''Register every pattern list under its label.

        dict_label_terms should be a dictionary in the format
        {label(str): patterns(list)}'''
        self.matcher = Matcher(nlp.vocab)
        for label in dict_label_terms:
            self.matcher.add(label, None, *dict_label_terms[label])

    def __call__(self, doc):
        '''Replace doc.ents with one Span per match and return the same doc.'''
        doc.ents = [
            Span(doc, start, end, label=label)
            for label, start, end in self.matcher(doc)
        ]
        return doc

In [None]:
from collections import defaultdict
#nlp.remove_pipe('ner')
# add_pipe below fails if a component named 'entity_matcher' is already in the
# pipeline (an earlier cell registers one), so remove any existing instance
# first. The check keeps the cell safe on a fresh kernel too.
if 'entity_matcher' in nlp.pipe_names:
    nlp.remove_pipe('entity_matcher')
#nlp.remove_pipe('ent_type_matcher')
#nlp.remove_pipe('ent_subtype_matcher')
#nlp.remove_pipe('ent_supertype_matcher')

# label -> list of token patterns; each pattern is a list with one dict per token.
dict_label_terms = defaultdict(list)

for typ in cards_types:
    dict_label_terms['TYPE'].append([{'LOWER': t} for t in typ.lower().split()])
for typ in cards_subtypes:
    dict_label_terms['SUBTYPE'].append([{'LOWER': t} for t in typ.lower().split()])
for typ in cards_supertypes:
    dict_label_terms['SUPERTYPE'].append([{'LOWER': t} for t in typ.lower().split()])
# 'green' replaces the second, duplicated 'white' entry of the original list.
for typ in ['white','black','blue','green','red','colorless', 'multicolored', 'multicolor']:
    dict_label_terms['COLOR'].append([{'LOWER': t} for t in typ.lower().split()])
for abi in abilities:
    # One token spec per lower-cased word, so multi-word and capitalised
    # ability names can match (LOWER compares against lower-cased token text).
    dict_label_terms['ABILITY'].append([{'LOWER': t} for t in abi.lower().split()])

entity_matcher = EntityMatcher(nlp, dict_label_terms)
nlp.add_pipe(entity_matcher)

print(nlp.pipe_names)  # see all components in the pipeline

In [None]:
# Sample texts for the matcher: the current test phrase, two Sliver sentences that
# differ only in their first word, and the first five card texts containing 'colorless'.
test_sents = [
    test_phrase,
    'If a Sliver deals combat damage to a player, its controller may create a +1/+1 colorless Sliver creature token.',
    'Whenever a Sliver deals combat damage to a player, its controller may create a +1/+1 colorless Sliver creature token.',
]
colorless = '\n'.join(cards_df[cards_df['text'].str.contains('colorless').fillna(False)]['text'].iloc[:5])
test_sents.append(colorless)

In [None]:
# Parse all sample texts as a single newline-joined document and highlight its entities.
doc = nlp('\n'.join(test_sents))
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Draw the dependency tree with the 'compact' and 'collapse_punct' display options turned off.
options = {'compact': False,
          'collapse_punct': False}
displacy.render(doc, style='dep', jupyter=True, options=options)

In [None]:
# Flatten the current doc into one record per token (walking each sentence's
# subtree) and keep the records in `df` for the filter in the next cell.
sents = []
for sent in doc.sents:
    print(sent)
    for tr in sent.subtree:
        sentd = {
            'word': tr,
            'ancestors': list(tr.ancestors),
            'children': list(tr.children),
            'cluster': tr.cluster,
            'conjuncts': list(tr.conjuncts),
            'dep': tr.dep_,
            'ent_type': tr.ent_type_,
            'head': tr.head,
            'lemma': tr.lemma_,
            'tag': tr.tag_,
        }
        sents += [sentd]
df = pd.DataFrame(sents)
#df

In [None]:
# Show only the rows whose token, lower-cased, is one of: whenever, if, only, as.
df[df['word'].apply(lambda x: x.lower_ in ['whenever', 'if', 'only', 'as'])]

# Should we train a model for POSTAGGING?

Not sure. Many verbs that are sometimes tagged as nouns are, at other times, tagged correctly as verbs.

In [None]:
# Join the 'text_preworked' of 200 randomly sampled cards into one text and parse it.
# Note: this rebinds `sents` (a list in earlier cells) to a string, and `doc` to the new parse.
sents = '\n'.join([x for x in cards_df.sample(200)['text_preworked']])
doc = nlp(sents)

In [None]:
# Distinct lower-cased forms of every token tagged NOUN, in sorted order.
nouns = set()
for token in doc:
    if token.pos_ == 'NOUN':
        nouns.add(token.lower_)
nouns = sorted(nouns)
nouns
# Nouns that should be verbs:
# 'attacks', 'block', 'blocks', 'cast', 'control','controls', 'deal','deals', 'dies', 'enchant', 'flip', 'gain', 'gains', 'pay', 'return', 'sacrifice', 'shares', 'tap', 'untap'

# Nouns that COULD be verbs:
# 'counter(S)','exile'

In [None]:
# Distinct lower-cased forms of every token tagged VERB, in sorted order.
verbs = set()
for token in doc:
    if token.pos_ == 'VERB':
        verbs.add(token.lower_)
verbs = sorted(verbs)
verbs

## Get predictions in a format that is easy to correct and feed back as training data

Check here https://spacy.io/usage/training#training-simple-style.

It should be easy to train a model, as long as we have a few things in place.

Build tables like:
card | sentence | token0 | token1 | ... | tokenN
card | sentence | tag0 | tag1 | ... | tagN
card | sentence | deps0 | deps1 | ... | depsN
card | sentence | head0 | head1 | ... | headN

In [None]:
cards_df.columns

In [None]:
from copy import deepcopy
# Build four wide tables (token text, tag, dependency label, head index) with one
# row per card sentence and one zero-padded column per token position.
tokens, tags, deps, head_ids = [], [], [], []
card_counter = 0
for idx, card in cards_df.sample(200).iterrows():
    card_counter += 1
    if card_counter % 40 == 0:
        print(card_counter)  # progress marker every 40 cards
    for sentence in card['text_preworked'].split('\n'):
        doc = nlp(sentence)
        basics = {
            'card': card['name'],
            'sentence': sentence,
        }
        # Four independent copies of the shared columns, one per annotation layer.
        toks, tag, dep, head = (dict(basics) for _ in range(4))
        for i, tok in enumerate(doc):
            col = '{0:04d}'.format(i)
            toks[col] = tok.text
            tag[col] = tok.tag_
            dep[col] = tok.dep_
            head[col] = tok.head.i
        tokens.append(toks)
        tags.append(tag)
        deps.append(dep)
        head_ids.append(head)

df_tokens = pd.DataFrame(tokens)
df_tags = pd.DataFrame(tags)
df_deps = pd.DataFrame(deps)
df_head_ids = pd.DataFrame(head_ids)

display(df_tokens.head(2), df_tags.head(2), df_deps.head(2), df_head_ids.head(2))

# NLTK testing

In [None]:
# Fetch NLTK data using the 'all' identifier.
# NOTE(review): this presumably downloads every NLTK package - confirm that is needed.
nltk.download('all')

In [None]:
# https://www.nltk.org/book/ch10.html section 5.2
# Build a discourse from two sentences and call readings() on it.
dt = nltk.DiscourseTester(['A student dances', 'Every student is a person'])
dt.readings()


In [None]:
# Add a sentence to the discourse with the consistchk option enabled.
dt.add_sentence('No person dances', consistchk=True)

In [None]:
# Take the sentence added in the previous cell back out of the discourse (verbose output).
dt.retract_sentence('No person dances', verbose=True)

In [None]:
# Add a sentence to the discourse with the informchk option enabled.
dt.add_sentence('A person dances', informchk=True)

In [None]:
from nltk.tag import RegexpTagger
# Regexp tagger that maps a handful of exact words to tags. It is handed to
# MaltParser, which backs the DrtGlueReadingCommand used by the DiscourseTester below.
tagger = RegexpTagger(
    [('^(chases|runs)$', 'VB'),
     ('^(a)$', 'ex_quant'),
     ('^(every)$', 'univ_quant'),
     ('^(dog|boy)$', 'NN'),
     ('^(He)$', 'PRP')
])
rc = nltk.DrtGlueReadingCommand(depparser=nltk.MaltParser(tagger=tagger))
# Rebinds `dt` to a new two-sentence discourse that uses the reading command above.
dt = nltk.DiscourseTester(['Every dog chases a boy', 'He runs'], rc)
dt.readings()

# Spacy learning

In [None]:
# Tour of basic spaCy annotations on a short text that includes a run of extra spaces.
test_sentence = "Next week I'll   be in Madrid. Maybe."
doc = nlp(test_sentence)
# One tab-separated line per token: text, character offset, lemma, punctuation flag,
# whitespace flag, shape, coarse POS, fine-grained tag.
for token in doc:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))
    
# Sentence segmentation.
for sent in doc.sents:
    print(sent)
    
# (text, tag) pairs for every token.
print([(token.text, token.tag_) for token in doc])

# Entities set on the doc, with their labels.
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
from spacy import displacy
 
# Entity highlighting on a general-English sentence containing numbers, a time and a percentage.
doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Noun phrases: print each noun chunk with its label and the text of its root token.
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)

In [None]:
# Dependency parser: print every token as text/tag <--dep-- head text/head tag.
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
 
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

In [None]:
# Draw the dependency tree of the same sentence, passing a 'distance' of 90 to the renderer.
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


In [None]:
# Same dependency rendering for test_sentence (defined in the first cell of this section).
doc = nlp(test_sentence)
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})


In [None]:
# Rebind `nlp` to a freshly loaded 'en_core_web_lg' pipeline and print the vector of 'banana'.
# NOTE(review): cells run after this one use the new `nlp` object; presumably it does
# not carry the entity_matcher component added earlier - confirm before reusing it.
nlp = spacy.load('en_core_web_lg')
print(nlp.vocab['banana'].vector)