In [1]:
import pandas as pd
import numpy as np

# Visualization library
import altair as alt
alt.data_transformers.enable('default', max_rows=None)

# Dates management
import datetime

# For the computation of Kaplan-Meier estimates and log-rank tests
import lifelines

# Use edsnlp to extract the 3 entities mentioned above:
import spacy
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Load declared pipelines
# from edsnlp import components
from edsnlp.processing.parallel import pipe as parallel_pipe
from spacy import displacy

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Patients
from dedup_proba import df_person_dedup_proba as df_person
# Visits
from dedup_proba import df_visit_dedup_proba as df_visit
# Diagnosis (condition)
from dedup_proba import df_condition_dedup_proba as df_condition
# Cancer patients
from dedup_proba import df_cancer_dedup_proba as df_cancer
# Number of cancer patients
from dedup_proba import nbre_patients_cancer_dedup_proba as nbre_patients_cancer
# risk factors
from dedup_proba import risk_factors

In [5]:
# Load the notes table from a pickle stored under the relative path data/.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df_note = pd.read_pickle('data/df_note.pkl')

In [6]:
# Clinicians indicated that the following synonyms could be used.
# Each key names a risk factor; its list holds the accepted spellings
# (capitalised and lower-case variants are both listed explicitly).
terms = {
    'tabac': [
        'tabac', 'fumeur', 'tabagisme', 'nicotine', 'cigarette', 'cigarettes',
        'fume', 'fumer', 'fumait', 'tabagique', 'fumeuse', 'fumé',
    ],
    'alcool': [
        'alcool', 'alcoolisme', 'Alcoolodépendance', 'Alcoolodépendant',
        'Alcoolisation', 'alcoolique',
    ],
    'diabete': [
        'Diabète', 'Hyperglycémie', 'Hypoglycémie', 'Diabétique',
        'Polyurie', 'Polydipsie', 'Polyphagie',
        'diabète', 'hyperglycémie', 'hypoglycémie', 'diabétique',
        'polyurie', 'polydipsie', 'polyphagie',
    ],
    'sub_psy': [
        'Psychoactive', 'Psychotrope', 'Psychostimulant', 'Psychodysleptique',
        'Psychodélirant', 'Narcotique', 'Sédatif', 'Hypnotique', 'Anxiolytique',
        'Antidépresseur', 'Stimulant', 'Hallucinogène', 'Dépresseur', 'Opiacé',
        'Cannabinoïde', 'Dissociatif', 'Inhalant',
        'psychoactive', 'psychotrope', 'psychostimulant', 'psychodysleptique',
        'psychodélirant', 'narcotique', 'sédatif', 'hypnotique', 'anxiolytique',
        'antidépresseur', 'stimulant', 'hallucinogène', 'dépresseur', 'opiacé',
        'cannabinoïde', 'dissociatif', 'inhalant',
    ],
    'tum_herit': ['antécédents familiaux de tumeur maligne du sein'],
}

In [7]:
def pick_results(doc):
    """
    Turn the entities of a processed document into a list of flat records.

    One dict is produced per entity in ``doc.ents``, holding:
    - ``note_id`` and ``visit_occurrence_id``: read from the custom
      extensions (``._``) of the entity's parent document,
    - ``lexical_variant``: the entity text,
    - ``label``: the entity label,
    - ``negation``: the entity's ``negation`` extension value.

    Returns an empty list when the document has no entities.
    """
    records = []
    for entity in doc.ents:
        parent_ext = entity.doc._
        records.append({
            'note_id': parent_ext.note_id,
            'visit_occurrence_id': parent_ext.visit_occurrence_id,
            'lexical_variant': entity.text,
            'label': entity.label_,
            'negation': entity._.negation,
        })
    return records

In [8]:
def calc_value(x):
    """
    Summarise the mentions in ``x`` into one status string per risk factor.

    ``x`` is iterated row by row and must expose ``label`` and ``negation``
    per row. Every factor starts as ``"unknown"``. A non-negated mention moves
    ``"unknown"`` to ``"True"``; a negated one moves it to ``"False"``. A
    mention contradicting the current ``"True"``/``"False"`` status moves it to
    ``"ambiguous"``, which is never left afterwards. Rows whose label is not
    one of the tracked factors are ignored.

    Returns a dict mapping each factor name to its status string.
    """
    # (current status, mention is negated) -> next status.
    # Any pair not listed here leaves the status untouched.
    next_status = {
        ('unknown', True): 'False',
        ('True', True): 'ambiguous',
        ('unknown', False): 'True',
        ('False', False): 'ambiguous',
    }
    state = dict.fromkeys(
        ['tabac', 'alcool', 'diabete', 'sub_psy', 'tum_herit'], 'unknown'
    )
    for _, mention in x.iterrows():
        for factor in state:
            if mention['label'] == factor:
                current = state[factor]
                negated = bool(mention['negation'])
                state[factor] = next_status.get((current, negated), current)
    return state

In [10]:
# Start from an empty French pipeline and register the components in order:
# sentence splitting, normalisation, term matching, then negation detection.
nlp = spacy.blank("fr")
for preprocessing_step in ("eds.sentences", "eds.normalizer"):
    nlp.add_pipe(preprocessing_step)
# Matcher component, driven by the `terms` synonym dictionary
nlp.add_pipe("eds.matcher", config={"terms": terms})
# Negation qualifier (supplies the `negation` flag read in pick_results)
nlp.add_pipe("eds.negation")

<edsnlp.pipelines.qualifiers.negation.negation.Negation at 0x1e1acdf51c8>

In [11]:
# Run the pipeline over the notes table and collect one record per entity
# (records are built by pick_results). The `context` columns are the ones
# pick_results reads back from each document's extensions.
ents = parallel_pipe(
    df_note,
    nlp,
    context=["note_id", "visit_occurrence_id"],
    results_extractor=pick_results,
    n_jobs=1,
    progress_bar=False,
)



In [24]:
# One status dict per visit: calc_value is applied to each visit's entity rows
# and the result is stored in a 'state' column.
state_per_visit = ents.groupby('visit_occurrence_id').apply(calc_value)
ents_grouped = state_per_visit.to_frame('state').reset_index()

# Expand the status dict into one boolean column per risk factor.
# Only the exact status 'True' yields True; 'False', 'ambiguous' and 'unknown'
# all become False.
for factor in risk_factors:
    ents_grouped[factor] = ents_grouped['state'].map(
        lambda status: status[factor] == 'True'
    )

In [25]:
# Display the per-visit risk-factor flags without the raw 'state' dict column.
# This is the only statement in the cell on purpose: a notebook cell shows just
# its last expression, so any earlier bare expression would be discarded.
ents_grouped.drop(columns=['state'])


Unnamed: 0,visit_occurrence_id,tabac,alcool,diabete,sub_psy,tum_herit
0,80001324.0,True,False,False,False,False
1,80001693.0,True,False,False,False,False
2,80002115.0,False,False,False,False,False
3,80002464.0,False,False,False,False,False
4,80002814.0,False,False,False,False,False
...,...,...,...,...,...,...
16104,89998804.0,True,False,False,False,False
16105,89999192.0,False,False,False,False,False
16106,89999415.0,True,False,False,False,False
16107,89999467.0,True,True,False,False,False


In [32]:
# Number of distinct person_id values present in the visits table
df_visit['person_id'].nunique()


16168