In [33]:
import spacy
import random
import json

In [36]:
def transform_json(path='med-corpus.json'):
    """Read a JSON-lines annotation file and convert it to NER training tuples.

    Every non-blank line of the file is parsed as one JSON object. The object
    must have a 'content' entry (the text) and an 'annotation' entry (a list
    of annotation objects, or null/empty when the text has no annotations).
    For each annotation only the first element of its 'points' list is used;
    its 'start' and 'end' offsets are combined with each label of the
    annotation.

    Parameters
    ----------
    path : str, optional
        Location of the JSON-lines file. Defaults to 'med-corpus.json' in the
        current working directory.

    Returns
    -------
    list of tuple
        One ``(text, {"entities": [(start, end + 1, label), ...]})`` tuple per
        non-blank input line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks 'content' or 'annotation', or an annotation lacks
        'points' / 'label'.
    """
    train_data = []  # training examples with marked-up entities
    # The context manager guarantees the handle is closed even when parsing
    # fails; UTF-8 is the standard JSON encoding and avoids depending on the
    # platform default for non-ASCII text.
    with open(path, 'r', encoding='utf-8') as corpus:
        for line in corpus:
            if not line.strip():
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            record = json.loads(line)
            text = record['content']
            entities = []
            # `or []` tolerates a null annotation list (text without entities).
            for annotation in record['annotation'] or []:
                point = annotation['points'][0]
                labels = annotation['label']
                # A single label may be stored as a bare value instead of a list.
                if not isinstance(labels, list):
                    labels = [labels]
                for label in labels:
                    # end + 1: presumably the stored end offset is inclusive and
                    # the consumer wants an exclusive one -- confirm against the
                    # annotation tool's export format.
                    entities.append((point['start'], point['end'] + 1, label))
            train_data.append((text, {"entities": entities}))
    return train_data
# Parse the corpus once when this cell runs; the file 'med-corpus.json' is
# looked up relative to the current working directory.
TRAIN_DATA = transform_json()

In [37]:

def train_spacy(data, iterations):
    """Train a named-entity recogniser on a blank 'de' spaCy pipeline.

    Parameters
    ----------
    data : iterable of (str, dict)
        Training examples as ``(text, annotations)`` pairs, where
        ``annotations`` may hold an 'entities' list of
        ``(start, end, label)`` tuples. The caller's sequence is not
        modified: shuffling happens on an internal copy.
    iterations : int
        Number of passes over the training data.

    Returns
    -------
    The ``nlp`` object created by ``spacy.blank('de')`` after training.

    Side effects
    ------------
    Prints the iteration number before each pass and the accumulated
    losses dict after it. Consumes the global ``random`` state.
    """
    # Shallow copy: random.shuffle below works in place and would otherwise
    # reorder the list the caller passed in.
    train_data = list(data)

    nlp = spacy.blank('de')  # create blank Language class
    # create the built-in pipeline component and add it to the pipeline;
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even if the pipeline already ships the component.
        ner = nlp.get_pipe('ner')

    # register every entity label that occurs in the training data
    for _, annotations in train_data:
        # default to [] so examples without an 'entities' key are skipped
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}  # filled in by nlp.update during this pass
            for text, annotations in train_data:
                # one example per update call (batch size 1)
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp


# Train the NER model on the parsed corpus for 20 passes over the data.
prdnlp = train_spacy(TRAIN_DATA, 20)

# Save the trained model to the location typed at the prompt.
# Interactive: execution blocks here until the user answers.
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)

# Run the trained pipeline on a user-supplied text and print, for every
# recognised entity: its text, start character offset, end character offset
# and label.
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Statring iteration 0
{'ner': 921.5246044749737}
Statring iteration 1
{'ner': 455.332005107521}
Statring iteration 2
{'ner': 279.35376301661586}
Statring iteration 3
{'ner': 191.45634355634186}
Statring iteration 4
{'ner': 166.4258137773611}
Statring iteration 5
{'ner': 113.97412044521946}
Statring iteration 6
{'ner': 95.71254520028393}
Statring iteration 7
{'ner': 70.31656847515129}
Statring iteration 8
{'ner': 103.21219870754994}
Statring iteration 9
{'ner': 71.99977922312672}
Statring iteration 10
{'ner': 64.97002894169115}
Statring iteration 11
{'ner': 93.47373777777473}
Statring iteration 12
{'ner': 36.97314015377853}
Statring iteration 13
{'ner': 30.076374769847774}
Statring iteration 14
{'ner': 47.88700323790153}
Statring iteration 15
{'ner': 40.13505013365079}
Statring iteration 16
{'ner': 42.42222016517817}
Statring iteration 17
{'ner': 35.32690847428409}
Statring iteration 18
{'ner': 31.906044981813064}
Statring iteration 19
{'ner': 42.275175095313685}
Enter your Model Name: m