In [None]:
import json
from IPython import get_ipython
import pandas as pd
import numpy as np
from spacy import displacy
from spacy.training import Example
from pathlib import Path
from tqdm import tqdm
import numpy as np
import shutil
import spacy
import os
import re

In [None]:
# Load the raw annotation export. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so the result does not depend on the platform's
# locale default, which matters for non-ASCII characters in the texts.
with open('annotation_data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# First Format

In [None]:
# Reshape every annotation record into a (text, {'entities': [...]}) pair,
# turning each label entry into a tuple along the way.
train_format = [
    (record['data'], {'entities': [tuple(span) for span in record['label']]})
    for record in data
]
    

In [None]:
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Each ``(start, end, label)`` annotation is shrunk from both sides until
    its boundaries no longer sit on whitespace characters. The two boundaries
    are never allowed to cross: a span that consists only of whitespace
    collapses to an empty span (``start == end``) instead of producing an
    inverted one. An ``end`` offset beyond the end of the text is clamped to
    ``len(text)`` rather than raising ``IndexError``.

    Args:
        data (list): The data to be cleaned, as ``(text, annotations)`` pairs
            where ``annotations['entities']`` is an iterable of
            ``(start, end, label)`` character-offset triples.

    Returns:
        list: The cleaned data, one ``[text, {'entities': [...]}]`` entry per
            input pair, with each entity as ``[start, end, label]``. The
            number and order of entities are preserved.
    """
    invalid_span_tokens = re.compile(r'\s')
    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            # Clamp so that indexing text[valid_end - 1] below is always safe.
            valid_end = min(end, len(text))
            # Advance the start past leading whitespace, but never beyond
            # the end of the span.
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            # Pull the end back over trailing whitespace, but never before
            # the (already trimmed) start.
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [None]:
# Strip leading/trailing whitespace from every annotated entity span.
cleaned_data = trim_entity_spans(train_format)

In [None]:
# Number of examples left over for validation when 80% (rounded) of the
# cleaned examples are used for training.
len(cleaned_data) - round(len(cleaned_data)*0.8)

In [None]:
# 80/20 train/validation split. The boundary is derived from the dataset
# size instead of being hard-coded, so the split stays correct when the
# number of annotated examples changes.
split_index = round(len(cleaned_data) * 0.8)
X_train = cleaned_data[:split_index]
X_val = cleaned_data[split_index:]

# Final Format

In [None]:
import spacy
from spacy.tokens import DocBin


def create_spacy_model(data):
    """Convert character-offset annotations into a spaCy ``DocBin``.

    Despite its name, this function does not train or build a model: it
    tokenizes each text, maps every ``(start, end, label)`` annotation onto
    token boundaries and collects the resulting ``Doc`` objects.

    Args:
        data: Iterable of ``(text, annotations)`` pairs where
            ``annotations["entities"]`` is an iterable of
            ``(start, end, label)`` character-offset triples.

    Returns:
        DocBin: One ``Doc`` per input text, with ``doc.ents`` set to the
        entities that could be aligned to tokens. Annotations that cannot be
        aligned are reported with "Skipping entity" and left out; where
        aligned spans overlap, only a non-overlapping subset is kept.
    """
    # Local import: keeps this definition self-contained within the notebook.
    from spacy.util import filter_spans

    nlp = spacy.load("en_core_web_sm")  # only used for tokenization via make_doc
    db = DocBin()
    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            # "contract" snaps the span inward to the nearest token boundaries;
            # char_span returns None when no tokens remain inside the span.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        # doc.ents rejects overlapping spans with a ValueError, so resolve
        # overlaps first (filter_spans prefers the longest span).
        non_overlapping = filter_spans(ents)
        dropped = len(ents) - len(non_overlapping)
        if dropped:
            print(f"Dropped {dropped} overlapping entities")
        doc.ents = non_overlapping
        db.add(doc)
    return db

In [None]:
# Convert both splits to DocBin objects and write them to disk under the
# file names passed to the `spacy train` command.
train_spacy = create_spacy_model(X_train)
train_spacy.to_disk('train.spacy')
valid_spacy = create_spacy_model(X_val)
valid_spacy.to_disk('valid.spacy')

In [None]:
# Expand base_config.cfg into a complete training config written to config.cfg.
! python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
! python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy 

In [None]:
# Sample job description used to sanity-check the trained model. The text
# contains one line break; it is written as two adjacent string literals with
# an explicit "\n" because a double-quoted literal cannot span source lines.
text = (
    "Address scientific issues within drug discovery using existing and new innovative statistical methods through collaborating closely with internal and external partners. Designing experiments to address scientific questions clearly and accurately. Deliver statistical methodology training. Communicating statistical concepts, design recommendations, and the results of statistical analyses clearly and accurately to others. Validate the statistical aspects of Specific Pharmacology Reports written by scientists for the Dossier for all programs. Lead statistician on multiple complex projects regarding Non-Clinical Efficacy and Safety (NCES) related Research activities with minimal direction from group head. Working with other statisticians on internal department initiatives, e.g. to develop specific technical or consulting skills, new processes, etc. Reviewing literature and writing simulations to evaluate statistical methods for application to drug discovery problems. Representing our statistics team in collaborations with other internal statistics teams and scientists. Contribute to the implementation of end-user statistical applications for routine analyses within labs. Represents statistics in regulatory meetings Represent statistics in cross function working groups. , Master’s degree or PhD in Biostatistics or related field. Experience with developing new statistical methods to improve the design, analysis, and interpretation of scientific studies. Ability to collaborate well with statisticians and scientists from other disciplines. Strong user of SAS/R (R-Shiny). Explain novel and standard statistical methods clearly to scientific and non-scientific colleagues. To evaluate, recommend, and implement statistical tools for addressing scientific problems. Experience in Pre-Clinical & Research Development. \n"
    "Experience with some of the following techniques; Bayesian Statistics, Mixed Models, Machine Learning Algorithms, Experimental Design, Analysis of high-dimensional Data, Linear and non-Linear Regression."
)

In [None]:
# Load the best model saved under ./output by the training run. The path is
# built with pathlib instead of string concatenation.
nlp = spacy.load(Path.cwd() / "output" / "model-best")

In [None]:
#doc = nlp("s. You develop machine learning solutions and translate them into mircoservices. At the same time, you guarantee a high quality of service including the corresponding monitoring. You are responsible for the implementation of our MLOps approach and bring in your experience in the context of software delivery processes. You build data flows with relational and non-relational data using appropriate technologies in a hybrid cloud environment. In close cooperation with our IT department, you will support the development and improvement of our data science service landscape and thus help shape the future data architecture of MYTOYS Group. , Degree with a scientific/technical focus or equivalent working experience 2 - 3 years of practical experience in the development/ live provisioning of data science services (ideally in the e-commerce sector) incl. profound experience with at least one of the following languages: Python, R, Java Very good knowledge of SQL/NoSQL Databases and confident handling of software development tools (e.g. Git, GitHub, IDEs, ...) Understanding of container technologies (such as Docker), and microservice architectures in the context of Big Data and experience in building CI/CD pipelines and Infrastructure as Code processes for cloud platforms (AWS or GC) is desirable Very good analytical and conceptual skills and high willingness to learn and enjoy an iterative, experimental approach You have a hands-on mentality, high degree of initiative and a structured way of working and very good language skills in German or English , A demanding and interesting role with a lot of autonomy, responsibility, and individual training opportunities Truly flexible working hours and self-reliant work for an optimal compatibility of family and job A ")


# Run the loaded pipeline on the sample text and render the predicted
# entities inline in the notebook.
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True)

# Last expression of the cell: display the predicted entity spans.
doc.ents

# Import from function