<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Test" data-toc-modified-id="Test-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Test</a></span></li><li><span><a href="#Displacy" data-toc-modified-id="Displacy-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Displacy</a></span></li></ul></div>

In [1]:
from __future__ import unicode_literals

from collections import defaultdict
import srsly

from spacy.errors import Errors
from spacy.compat import basestring_
from spacy.util import ensure_path
from spacy.tokens import Span
from spacy.matcher import Matcher, PhraseMatcher

from spacy import displacy

class EntityMatcher(object):
    name = "entity_matcher"

    def __init__(self, nlp,**cfg):
        self.nlp = nlp
        self.overwrite = cfg.get("overwrite_ents", False)
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
        self.matcher = Matcher(nlp.vocab)
        self.phrase_matcher = PhraseMatcher(nlp.vocab)

        patterns = cfg.get("patterns")
        if patterns is not None:
            self.add_patterns(patterns)
            
            
    def __len__(self):
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
        return n_token_patterns + n_phrase_patterns
    
    def __contains__(self, label):
        """Whether a label is present in the patterns."""
        return label in self.token_patterns or label in self.phrase_patterns

    def __call__(self, doc):
        """Find matches in document and add them as entities.

        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.

        DOCS: https://spacy.io/api/entityruler#call
        """
        
        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
        
        matches = set(
            [(m_id, start, end) for m_id, start, end in matches if start != end]
        )
        get_sort_key = lambda m: (m[2] - m[1], m[1])
        matches = sorted(matches, key=get_sort_key, reverse=False)
        
        
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            
            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
                continue
            
            if start not in seen_tokens and end - 1 not in seen_tokens:

                new_entities.append(Span(doc, start, end, label=match_id))

                entities = [e for e in entities if not (e.start < end and e.end > start)]
                seen_tokens.update(range(start, end))
                
        doc.ents = entities + new_entities

        return doc

    @property
    def labels(self):
        """All labels present in the match patterns.

        RETURNS (set): The string labels.

        DOCS: https://spacy.io/api/entityruler#labels
        """
        all_labels = set(self.token_patterns.keys())
        all_labels.update(self.phrase_patterns.keys())
        return tuple(all_labels)

    @property
    def patterns(self):
        """Get all patterns that were added to the entity ruler.

        RETURNS (list): The original patterns, one dictionary per pattern.

        DOCS: https://spacy.io/api/entityruler#patterns
        """
        all_patterns = []
        for label, patterns in self.token_patterns.items():
            for pattern in patterns:
                all_patterns.append({"label": label, "pattern": pattern})
        for label, patterns in self.phrase_patterns.items():
            for pattern in patterns:
                all_patterns.append({"label": label, "pattern": pattern.text})
        return all_patterns

    def add_patterns(self, patterns):
        """Add patterns to the entitiy ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}

        patterns (list): The patterns to add.

        DOCS: https://spacy.io/api/entityruler#add_patterns
        """
        for entry in patterns:
            label = entry["label"]
            pattern = entry["pattern"]
            on_match = entry['on_match']
            
            if on_match == 'None':
                on_matcher = None
            else:

                print(label)
                def on_matcher(matcher, doc, id, matches):
                    match_id, start, end = matches[-1]
                    print('This is the on match[0]:')
                    print(on_match[0])
                    for callback in on_match:
                        print('This is the callback:')
                        print(callback)
                        if 'TRUNCR' in callback.keys():
                            end = end - callback['TRUNCR']
                            matches[id] = (match_id, start, end)

                    
                        if 'TRUNCL' in callback.keys():
                            start = start + callback['TRUNCL']
                            matches[id] = (match_id, start, end)
                            
#                    matches.append(('sub-CLUE',start - callback['TRUNCL'],start))
                    print(matches)
                        

            if isinstance(pattern, basestring_):
                self.phrase_patterns[label].append(self.nlp(pattern))
            elif isinstance(pattern, list):
                self.token_patterns[label].append((on_matcher,pattern))

            else:
                raise ValueError(Errors.E097.format(pattern=pattern))
                


        for label, match in self.token_patterns.items():
            for on_matcher,pattern in match:
                self.matcher.add(label, on_matcher, pattern)
        for label, patterns in self.phrase_patterns.items():
            self.phrase_matcher.add(label, None, *patterns)

    def from_bytes(self, patterns_bytes, **kwargs):
        """Load the entity ruler from a bytestring.

        patterns_bytes (bytes): The bytestring to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_bytes
        """
        patterns = srsly.msgpack_loads(patterns_bytes)
        self.add_patterns(patterns)
        return self

    def to_bytes(self, **kwargs):
        """Serialize the entity ruler patterns to a bytestring.

        RETURNS (bytes): The serialized patterns.

        DOCS: https://spacy.io/api/entityruler#to_bytes
        """
        return srsly.msgpack_dumps(self.patterns)

    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        patterns = srsly.read_jsonl(path)
        self.add_patterns(patterns)
        return self

    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        srsly.write_jsonl(path, self.patterns)

        
def render_doc(doc, entity = False):
    '''This function render the text of the documents with the entity signed'''
    
    out = displacy.parse_ents(doc)
    print(out)
    adder = 0
    
    for i,ent in enumerate(out['ents']):
                
        #do not add the first entity match
        out['ents'][i]['start'] += i*adder
        out['ents'][i]['end'] += i*adder
        
        if entity == False:
            adder = 4
            if ent['label'] == 'sub-CLUE':
                out['text'] = out['text'][:ent['start']] + '_*' + out['text'][ent['start']:ent['end']] + '*_' + out['text'][ent['end']:]
            else:
                out['text'] = out['text'][:ent['start']] + '**' + out['text'][ent['start']:ent['end']] + '**' + out['text'][ent['end']:]
            
            
        else:
            adder = len(ent['label']) + 6
            if ent['label'] == 'sub-CLUE':
                out['text'] = out['text'][:ent['start']] + '_*' + out['text'][ent['start']:ent['end']] + '*_{' + ent['label'] + '}' + out['text'][ent['end']:]
            else:
                out['text'] = out['text'][:ent['start']] + '**' + out['text'][ent['start']:ent['end']] + '**{' + ent['label'] + '}' + out['text'][ent['end']:]
            
        
    return out['text']

### Test

In [55]:
import spacy
from spacy.lang.en import English

nlp = English()
nlp = spacy.load("en_core_web_sm")


entity_matcher = EntityMatcher(nlp).from_disk("/Users/nicolamelluso/Data_Science/RED/RED_Toolkit/redtoolkit/textanalysis/extractors/resources/soft_skill.jsonl")
nlp.add_pipe(entity_matcher)

doc = nlp('Decision Making is an important soft skill as the problem solving.')
doc = nlp('Positive relationship is the most important soft skill of the world')
doc.ents
render_doc(doc)

CLUE
{'text': 'Positive relationship is the most important soft skill of the world', 'ents': [], 'title': None, 'settings': {'lang': 'en', 'direction': 'ltr'}}


  "__main__", mod_spec)


'Positive relationship is the most important soft skill of the world'

In [4]:
! open /Users/nicolamelluso/Data_Science/RED/RED_Toolkit/redtoolkit/textanalysis/extractors/resources/

In [26]:
doc.ents

(Decision Making, problem solving, be empathetic)

### Displacy

Add in the json the choise to have the _clue_ and modify the class in order to manage the clue properly

In [46]:
from spacy import displacy
from IPython.core.display import display, HTML

colors = {"CLUE": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["CLUE",'SOFT SKILL','sub-CLUE'], "colors": colors}

#doc = nlp('Decision Making is an important soft skill as the problem solving. The ability to be empathetic and the user rotates the tool.')
#doc2 = nlp('Making decision is a soft skill')

html = displacy.render([doc,doc2], style="ent")#, options=options)
display(HTML(html))

Html_file= open("filename.md","w")
Html_file.write(html)
Html_file.close()


In [54]:
[(token.text, token.pos_,token.dep_) for token in doc]

[('Positive', 'ADJ', 'amod'),
 ('relationship', 'NOUN', 'nsubj'),
 ('is', 'VERB', 'ROOT'),
 ('the', 'DET', 'det'),
 ('most', 'ADV', 'advmod'),
 ('important', 'ADJ', 'amod'),
 ('soft', 'ADJ', 'amod'),
 ('skill', 'NOUN', 'attr'),
 ('of', 'ADP', 'prep'),
 ('the', 'DET', 'det'),
 ('world', 'NOUN', 'pobj')]

In [103]:
import pypandoc

with open('somefile.md', 'w') as f:
    f.write ('col \n')
    f.write ('-------- \n')
    f.write(render_doc(doc))
output = pypandoc.convert_file('somefile.md', 'docx', outputfile="somefile.docx")
    


{'text': 'Decision Making is an important soft skill as the problem solving. The ability to be empathetic. Data science could be defined as a science', 'ents': [{'start': 0, 'end': 15, 'label': 'ORG'}, {'start': 50, 'end': 65, 'label': 'SOFT SKILL'}, {'start': 71, 'end': 81, 'label': 'sub-CLUE'}, {'start': 82, 'end': 95, 'label': 'CLUE'}], 'title': None, 'settings': {'lang': 'en', 'direction': 'ltr'}}


In [62]:
import json
jsonl_content = '''{"label":"SOFT SKILL 1","on_match": "None", "pattern": [{"LEMMA":"decision"},{"OP":"*"},{"LEMMA":"make"}]}
{"label":"SOFT SKILL 2","on_match": "None", "pattern": [{"LEMMA":"make"},{"OP":"*"},{"LEMMA":"decision"}]}
{"label":"SOFT SKILL 3","on_match": "None", "pattern": [{"LEMMA":"solve"},{"OP":"*"},{"DEP":"dobj"}]}
{"label":"SOFT SKILL 4","on_match": "None", "pattern": [{"LEMMA":"problem"},{"OP":"*"},{"LEMMA":"solve"}]}
{"label":"SOFT SKILL 5","on_match": "None", "pattern": [{"TEXT": {"REGEX": "judge"}}]}
{"label":"SOFT SKILL 6","on_match": "None", "pattern": [{"TEXT": {"REGEX": "originality"}}]}
{"label":"SOFT SKILL 7","on_match": "None", "pattern": [{"LEMMA":"active"},{"OP":"*"},{"TEXT":{"REGEX":"ing$"}}]}
{"label":"SOFT SKILL 8","on_match": "None", "pattern": [{"TEXT":{"REGEX":"learn"}},{"TEXT":{"REGEX":".*?"}, "IS_LOWER": true},{"TEXT":{"REGEX":"active"}}]}
{"label":"SOFT SKILL 9","on_match": "None", "pattern": [{"POS":"ADJ"},{"TEXT":{"REGEX":"think|thinking"}}]}
{"label":"SOFT SKILL 10","on_match": "None", "pattern": [{"POS":{"REGEX": "ADJ|NOUN|VERB"}},{"LEMMA":"comprehension"}]}
{"label":"SOFT SKILL 11","on_match": "None", "pattern": [{"POS":"ADJ","OP":"?"},{"LOWER":"reasoning"}]}
{"label":"SOFT SKILL 12","on_match": "None", "pattern": [{"LEMMA":{"IN":["write","oral","read"]}},{"LEMMA":"expression"}]}
{"label":"SOFT SKILL 13","on_match": "None", "pattern": [{"LEMMA":"cope"},{"OP":"*"},{"POS":"NOUN"},{"LOWER":"and","OP":"?"},{"POS":"NOUN","OP":"?"}]}
{"label":"SOFT SKILL 14","on_match": "None", "pattern": [{"LEMMA":"team","DEP":"compound"},{"TEXT":{"REGEX":".*?"},"OP":"?"},{"DEP":"conj"}]}
{"label":"SOFT SKILL 15","on_match": "None", "pattern": [{"DEP":"conj"},{"TEXT":{"REGEX":".*?"},"OP":"?"},{"LEMMA":"team","DEP":"dobj"}]}
{"label":"SOFT SKILL 16","on_match": "None", "pattern": [{"LEMMA":"work"},{"POS":"ADV"}]}
{"label":"SOFT SKILL 17","on_match": "None", "pattern": [{"LEMMA":"work"},{"OP":"*"},{"DEP":"pobj"}]}
{"label":"SOFT SKILL 18","on_match": "None", "pattern": [{"DEP":{"REGEX":"amod|compound"}},{"OP":"*"},{"LEMMA":{"REGEX":"think|thinking"}}]}
{"label":"SOFT SKILL 19","on_match": "None", "pattern": [{"TAG":{"REGEX":"VB"}},{"OP":"*"},{"LEMMA":{"REGEX":" relation"}}]}
{"label":"SOFT SKILL 20","on_match": "None", "pattern": [{"DEP":"amod"},{"LEMMA":{"REGEX":" relation"}}]}
{"label":"SOFT SKILL 21","on_match": "None", "pattern": [{"LOWER":"act"},{"LOWER":"as"},{"OP":"*"},{"POS":"NOUN"}]}
{"label":"CLUE","on_match": [{"TRUNCR":0},{"TRUNCL":2}], "pattern": [{"LOWER":"ability"},{"LOWER":"to"},{"OP":"*"},{"DEP":"dobj"}]}'''

result = [json.loads(jline) for jline in jsonl_content.split('\n')]
result

[{'label': 'SOFT SKILL 1',
  'on_match': 'None',
  'pattern': [{'LEMMA': 'decision'}, {'OP': '*'}, {'LEMMA': 'make'}]},
 {'label': 'SOFT SKILL 2',
  'on_match': 'None',
  'pattern': [{'LEMMA': 'make'}, {'OP': '*'}, {'LEMMA': 'decision'}]},
 {'label': 'SOFT SKILL 3',
  'on_match': 'None',
  'pattern': [{'LEMMA': 'solve'}, {'OP': '*'}, {'DEP': 'dobj'}]},
 {'label': 'SOFT SKILL 4',
  'on_match': 'None',
  'pattern': [{'LEMMA': 'problem'}, {'OP': '*'}, {'LEMMA': 'solve'}]},
 {'label': 'SOFT SKILL 5',
  'on_match': 'None',
  'pattern': [{'TEXT': {'REGEX': 'judge'}}]},
 {'label': 'SOFT SKILL 6',
  'on_match': 'None',
  'pattern': [{'TEXT': {'REGEX': 'originality'}}]},
 {'label': 'SOFT SKILL 7',
  'on_match': 'None',
  'pattern': [{'LEMMA': 'active'}, {'OP': '*'}, {'TEXT': {'REGEX': 'ing$'}}]},
 {'label': 'SOFT SKILL 8',
  'on_match': 'None',
  'pattern': [{'TEXT': {'REGEX': 'learn'}},
   {'TEXT': {'REGEX': '.*?'}, 'IS_LOWER': True},
   {'TEXT': {'REGEX': 'active'}}]},
 {'label': 'SOFT SKILL 