In [None]:
# ! pip install spacy

In [None]:
# ! python -m spacy download en_core_web_lg

In [1]:
import spacy

In [2]:
# Load the large pretrained English pipeline; it has to be downloaded first
# (python -m spacy download en_core_web_lg).
nlp = spacy.load('en_core_web_lg')

In [3]:
# Run the pretrained pipeline on a sample sentence to see which entities it predicts.
doc = nlp("Nepal is a country situated in the South Asian region")

In [4]:
# Entities the pipeline recognised in the sample sentence.
doc.ents

(Nepal, South Asian)

In [5]:
# Each entity is a spaCy Span object rather than a plain string.
doc.ents[0], type(doc.ents[0])

(Nepal, spacy.tokens.span.Span)

In [6]:
from spacy import displacy
# Highlight the recognised entities inline in the notebook output.
displacy.render(doc, style='ent', jupyter=True)

In [7]:
import json
# Location of the annotated JSON dataset.
# NOTE(review): absolute, user-specific path — it only resolves on the author's machine;
# point it at your own copy of the file before running the notebook.
file = '/Users/prabinnepal/Downloads/Corona2.json'

In [8]:
# Read the annotated dataset into memory. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding (which is not UTF-8 everywhere).
with open(file, 'r', encoding='utf-8') as f:
    data=json.load(f)

In [9]:
# Peek at the first record: it carries 'id', 'content', 'metadata' and 'annotations'.
data['examples'][0]

{'id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'content': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'metadata': {},
 'annotations': [{'id': '0825a1

In [10]:
# Build the training records: one {'text', 'entities'} dict per example, where every
# entity is a (start, end, LABEL) tuple of character offsets plus the upper-cased tag name.
training_data = {'classes' : ['MEDICINE', "MEDICALCONDITION", "PATHOGEN"], 'annotations' : []}
for example in data['examples']:
    temp_dict = {
        'text': example['content'],
        'entities': [
            (annotation['start'], annotation['end'], annotation['tag_name'].upper())
            for annotation in example['annotations']
        ],
    }
    training_data['annotations'].append(temp_dict)

print(training_data)

{'classes': ['MEDICINE', 'MEDICALCONDITION', 'PATHOGEN'], 'annotations': [{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]", 'entities': [(360, 371, 

In [11]:
# spaCy reads its training data from a serialized DocBin, so the annotated examples have to be packed into a DocBin object
from spacy.tokens import DocBin


In [12]:
# A blank English pipeline is enough here: it is only used below to tokenize text via make_doc.
# NOTE: this rebinds `nlp`, replacing the en_core_web_lg pipeline loaded earlier in the notebook.
nlp = spacy.blank('en')
# Container that collects the annotated Doc objects for serialization.
doc_bin=DocBin()

In [14]:
from spacy.util import filter_spans

# Turn every annotated example into a Doc carrying its entity spans and collect it in doc_bin.
for training_example in training_data['annotations']:
    doc = nlp.make_doc(training_example['text'])
    ents = []
    for start, end, label in training_example['entities']:
        span = doc.char_span(start, end, label=label, alignment_mode='contract')
        # char_span gives back None when the character offsets cannot be mapped to tokens.
        if span is not None:
            ents.append(span)
        else:
            print('Skipping entity')
    # filter_spans resolves overlapping spans before they are assigned as entities.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

# Persist the collected docs in spaCy's binary training format.
doc_bin.to_disk('training_data.spacy')

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


In [17]:
# Expand base_config.cfg into a complete config.cfg with every remaining setting filled in.
# NOTE(review): base_config.cfg is not created anywhere in the visible notebook — it is
# presumably expected to exist in the working directory already; confirm before running.
! python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [18]:
# Train the pipeline described by config.cfg and save the resulting model under ./output.
# NOTE(review): --paths.train and --paths.dev point to the same file, so the reported
# ENTS_F / ENTS_P / ENTS_R scores are measured on the training data itself, not on
# held-out examples.
! python -m spacy train config.cfg --output ./output --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2023-03-07 13:03:15,352] [INFO] Set up nlp object from config
[2023-03-07 13:03:15,356] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-03-07 13:03:15,358] [INFO] Created vocabulary
[2023-03-07 13:03:16,003] [INFO] Added vectors: en_core_web_lg
[2023-03-07 13:03:17,714] [INFO] Finished initializing nlp object
[2023-03-07 13:03:21,609] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     59.36    0.00    0.00    0.00    0.00
  3     200        325.87   3270.19   70.42   72.02   68.90    0.70
  7     400         82.63   1001.90   91.09   89.69   92.52    0.91
 

In [21]:
# Load the model written by the training run above.
nlp_ner = spacy.load("output/model-last")

# Sample passage for a qualitative check of the trained model.
# The text is assembled from adjacent string literals, each ending in an explicit space
# or "\n". A backslash line continuation *inside* a string literal removes the newline
# without inserting anything, which previously glued words together
# (e.g. "HIV-infectedindividuals", "thebenefits") and fed the model corrupted text.
sample_text = (
    "Antiretroviral therapy (ART) is recommended for all HIV-infected "
    "individuals to reduce the risk of disease progression.\n"
    "ART also is recommended for HIV-infected individuals for the prevention "
    "of transmission of HIV.\n"
    "Patients starting ART should be willing and able to commit to treatment "
    "and understand the benefits and risks of therapy and the importance of "
    "adherence. Patients may choose to postpone therapy, and providers, on a "
    "case-by-case basis, may elect to defer therapy on the basis of clinical "
    "and/or psychosocial factors."
)
doc = nlp_ner(sample_text)

# One highlight colour per custom entity label.
colors = {"PATHOGEN": "#F67DE3", "MEDICINE": "#7DF6D9", "MEDICALCONDITION":"#FFFF44"}
options = {"colors": colors}

displacy.render(doc, style="ent", options=options, jupyter=True)

In [None]:
def load_file(file_path):
    """Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; decode explicitly so the result does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Reload the dataset through the helper; `file` is the path defined earlier in the notebook.
data = load_file(file)

In [None]:
# creating training data
def training_data(dataset=None):
    """Convert the annotation export into the structure used to build spaCy docs.

    Args:
        dataset: Dict with an 'examples' list. Each example needs a 'content'
            string and an 'annotations' list whose items provide 'start', 'end'
            and 'tag_name'. When omitted, the notebook-level ``data`` variable
            is used, which keeps existing zero-argument calls working.

    Returns:
        A dict with two keys: 'classes' (the three entity labels) and
        'annotations', a list of ``{'text': str, 'entities': [(start, end, LABEL), ...]}``
        records. Labels are the upper-cased 'tag_name' values.
    """
    if dataset is None:
        # Backward-compatible default: fall back to the dataset loaded into the
        # notebook namespace.
        dataset = data

    annotations = []
    for example in dataset['examples']:
        entities = [
            (annotation['start'], annotation['end'], annotation['tag_name'].upper())
            for annotation in example['annotations']
        ]
        annotations.append({'text': example['content'], 'entities': entities})

    return {'classes' : ['MEDICINE', "MEDICALCONDITION", "PATHOGEN"], 'annotations' : annotations}

In [None]:
# Convert dataset into docbin object
def convert(lang, output_path):
    """Serialize the annotated examples into a spaCy DocBin file.

    Args:
        lang: Language code passed to ``spacy.blank`` (e.g. 'en'); the blank
            pipeline is only used to tokenize the example texts.
        output_path: Destination path of the serialized DocBin.

    Returns:
        None. The DocBin is written to ``output_path``.
    """
    nlp = spacy.blank(lang)
    doc_bin = DocBin()

    # Bind the result to a different name: assigning to `training_data` inside this
    # function would make that name local, and the call itself would then fail with
    # UnboundLocalError before the function could be looked up.
    dataset = training_data()

    for training_example in dataset['annotations']:
        text = training_example['text']
        labels = training_example['entities']
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in labels:
            span = doc.char_span(start, end, label=label, alignment_mode='contract')
            # char_span gives back None when the character offsets cannot be mapped to tokens.
            if span is None:
                print('Skipping entity')
            else:
                ents.append(span)
        # filter_spans resolves overlapping spans before they are assigned as entities.
        doc.ents = filter_spans(ents)
        doc_bin.add(doc)

    # Honour the caller-supplied destination instead of a hard-coded filename.
    doc_bin.to_disk(output_path)
    