In [20]:
import json
import platform

import spacy

# Record the interpreter and library versions this notebook was run with.
# json and spacy are imported here so the cell also works on a fresh kernel
# (they were previously only imported by later cells).
print(platform.python_version())
print(spacy.__version__)
print(json.__version__)

3.7.8
2.3.2
2.0.9


In [21]:
import json
import logging
import sys

def _group_mentions(label_dict):
    """Build the ``annotation`` list of one record from ``{label: [mention, ...]}``.

    Mentions of the same label that share the same text are merged into a
    single annotation whose ``points`` list holds every occurrence, in the
    order they were seen. Mentions whose text is the empty string are dropped.
    """
    annotations = []
    for label, mentions in label_dict.items():
        points_by_text = {}
        for mention in mentions:
            text = mention['text']
            if text == '':
                continue
            if text not in points_by_text:
                # First occurrence: the mention dict itself (text, start, end).
                points_by_text[text] = [mention]
            else:
                # Later occurrences keep the start/end/text key order that this
                # converter has always written, so regenerated files stay
                # byte-identical.
                points_by_text[text].append({
                    'start': mention['start'],
                    'end': mention['end'],
                    'text': text,
                })
        for points in points_by_text.values():
            annotations.append({'label': [label], 'points': points})
    return annotations


def tsv_to_json(input_path, output_path, unknown_label):
    """Convert a ``word<TAB>label`` corpus into one JSON record per sentence.

    Every input line is expected to hold exactly one tab separating a word
    from its label. A line that is exactly ``.<TAB>O`` closes the current
    sentence: a JSON object ``{"content": ..., "annotation": [...]}`` followed
    by a newline is written to ``output_path``. ``content`` is the sentence's
    words, each followed by a single space. Character offsets in the
    annotation points refer to that string, and ``end`` is the offset of the
    word's last character (inclusive).

    Words are annotated unless their label equals ``unknown_label`` or is
    exactly one character long. Words that follow the last ``.<TAB>O`` line
    are not written. Completely empty lines are skipped.

    Args:
        input_path: Path of the UTF-8 encoded TSV file to read.
        output_path: Path of the UTF-8 encoded JSON-lines file to (over)write.
        unknown_label: Label value that must not produce an annotation.

    Raises:
        Exception: Any error raised while reading, parsing or writing is
            logged with ``logging.exception`` and then re-raised.
    """
    try:
        # Context managers guarantee both files are flushed and closed,
        # including when a malformed line aborts the conversion.
        with open(input_path, 'r', encoding='UTF-8') as input_file, \
                open(output_path, 'w', encoding='UTF-8') as output_file:
            label_dict = {}
            s = ''
            start = 0
            for raw_line in input_file:
                # Strip only the line terminator, so a final line without a
                # trailing newline is parsed like every other line.
                line = raw_line.rstrip('\n')
                if line == '':
                    continue
                if line != '.\tO':
                    word, entity = line.split('\t')
                    s += word + " "
                    if entity != unknown_label and len(entity) != 1:
                        label_dict.setdefault(entity, []).append({
                            'text': word,
                            'start': start,
                            'end': start + len(word) - 1,
                        })
                    # +1 accounts for the space appended after each word.
                    start += len(word) + 1
                else:
                    data_dict = {
                        'content': s,
                        'annotation': _group_mentions(label_dict),
                    }
                    json.dump(data_dict, output_file, ensure_ascii=False)
                    output_file.write('\n')
                    # Reset the per-sentence state.
                    s = ''
                    label_dict = {}
                    start = 0
    except Exception as e:
        logging.exception("Unable to process file\nerror = " + str(e))
        raise

# Convert the tab-separated corpus into JSON lines; a token whose label equals
# the unknown_label argument ('abc' here) gets no annotation.
tsv_to_json(
    input_path="med-corpus.tsv",
    output_path='med-corpus.json',
    unknown_label='abc',
)

In [22]:
import spacy
import json
import random
# Read the JSON Lines file (one JSON object per line) and transform it into a list of training examples
def transform_json(json_file_path):
    """Load a JSON-lines corpus and convert it into spaCy-style training tuples.

    Each non-blank line of the file must be a JSON object with a ``content``
    string and an ``annotation`` list. Every annotation carries a ``label``
    (a string or a list of strings) and a ``points`` list of dicts with
    inclusive ``start`` / ``end`` character offsets.

    Args:
        json_file_path: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        A list of ``(text, {"entities": [(start, end_exclusive, label), ...]})``
        tuples, one per input record. ``end_exclusive`` is the stored ``end``
        plus one.
    """
    training_data = []  # train data with marked up medical entities
    # The corpus is written as UTF-8 with non-ASCII characters kept verbatim,
    # so read it with the same encoding instead of the platform default.
    with open(json_file_path, 'r', encoding='UTF-8') as corpus:
        for line in corpus:
            if not line.strip():
                continue
            res = json.loads(line)
            entities = []
            # A null annotation is treated as "no entities".
            for annotation in res['annotation'] or []:
                labels = annotation['label']
                if not isinstance(labels, list):
                    labels = [labels]
                # One annotation may hold several points (the same word
                # occurring more than once in the text); emit an entity for
                # each occurrence rather than only the first.
                for point in annotation['points']:
                    for label in labels:
                        entities.append((point['start'], point['end'] + 1, label))
            training_data.append((res['content'], {"entities": entities}))
    return training_data
# Load the converted corpus as a list of (text, {"entities": [...]}) tuples.
training_data = transform_json(json_file_path='med-corpus.json')

In [23]:

def train_spacy(data, iterations):
    """Train a blank German spaCy pipeline containing only an NER component.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose ``'entities'`` entry is a list of
            ``(start, end, label)`` tuples. The iterable is copied; the
            caller's object is not reordered.
        iterations: Number of passes over ``data``.

    Returns:
        The trained ``nlp`` object.
    """
    # Work on a private copy so the per-iteration shuffle below does not
    # reorder the caller's list as a side effect.
    examples = list(data)

    nlp = spacy.blank('de')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even if the pipeline already provides the component.
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in examples:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp


import os

prdnlp = train_spacy(training_data, 20)

# Save our trained model under models/<name>. The parent directory is created
# first so saving also works when models/ does not exist yet.
modelfile = 'models/' + input("Enter your Model Name: ")
os.makedirs(os.path.dirname(modelfile) or '.', exist_ok=True)
prdnlp.to_disk(modelfile)

Statring iteration 0
{'ner': 967.3993598310248}
Statring iteration 1
{'ner': 464.7793052301019}
Statring iteration 2
{'ner': 323.11242348422314}
Statring iteration 3
{'ner': 224.23507634328385}
Statring iteration 4
{'ner': 181.11386628508345}
Statring iteration 5
{'ner': 151.26647821432104}
Statring iteration 6
{'ner': 131.36345293346355}
Statring iteration 7
{'ner': 70.64205344676871}
Statring iteration 8
{'ner': 62.13054990646648}
Statring iteration 9
{'ner': 73.58742452194633}
Statring iteration 10
{'ner': 79.40042562043193}
Statring iteration 11
{'ner': 122.54378101807364}
Statring iteration 12
{'ner': 60.725216368008624}
Statring iteration 13
{'ner': 60.85079054901087}
Statring iteration 14
{'ner': 59.25991630099659}
Statring iteration 15
{'ner': 54.13558897673037}
Statring iteration 16
{'ner': 43.39382800352808}
Statring iteration 17
{'ner': 45.852393317934144}
Statring iteration 18
{'ner': 57.0371787529803}
Statring iteration 19
{'ner': 37.60677288285902}


In [24]:
# Run the trained pipeline on user-supplied text and print every entity found
# as: text, start offset, end offset, label.
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)
for ent in doc.ents:
    print(" ".join(str(field) for field in (ent.text, ent.start_char, ent.end_char, ent.label_)))

Medikament 54 64 b-med
Placebos 116 124 b-med
Medikamente 155 166 b-med
Wirkstoff 213 222 b-med
Wirkstoff 403 412 b-med
Körper 416 422 b-med
Wirkstoff 493 502 b-med
Zopiclon 779 787 b-med
