In [5]:
import pandas as pd
import numpy as np

# Visualization library
import altair as alt
alt.data_transformers.enable('default', max_rows=None)

# Dates management
import datetime

# For the computation of Kaplan-Meier estimates and log-rank tests
import lifelines

# Use edsnlp to extract the 3 entities mentioned above:
import spacy
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Load declared pipelines
# from edsnlp import components
from edsnlp.processing.parallel import pipe as parallel_pipe
from spacy import displacy

In [6]:
# Load the six pickled tables used by the rest of the notebook.
# Names on the left are matched positionally with the paths on the right.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
(
    df_person,
    df_visit,
    df_condition,
    df_dedup_proba,
    df_note,
    df_bio,
) = map(
    pd.read_pickle,
    [
        'data/df_person.pkl',
        'data/df_visit.pkl',
        'data/df_condition.pkl',
        'data/df_dedup_proba.pkl',
        'data/df_note.pkl',
        'data/df_bio.pkl',
    ],
)

In [7]:
# Diagnosis codes (matched against `condition_source_value`) that define the
# cancer cohort. Presumably ICD-10 codes -- TODO confirm.
cancer_condition = {
    'C50',
    'C500',
    'C501',
    'C502',
    'C503',
    'C504',
}

# Diagnosis codes grouped by risk factor. The keys are the risk-factor names
# reused as column names and as matcher labels in the rest of the notebook.
risk_factors = dict(
    tabac=['Z587', 'Z720'],
    alcool=['T51', 'K70', 'F10'],
    diabete=['E10', 'E11', 'E12'],
    sub_psy=['Z864'],
    tum_herit=['Z803'],
)

In [8]:
# Clinicians indicated that the following synonyms could be used.
# One entry per risk factor (same keys as `risk_factors`); each list holds the
# surface forms handed to the matcher. Several forms are listed in both
# capitalised and lower-case spelling -- presumably because matching is done
# on the raw text; verify against the matcher configuration.
terms = {
    'tabac': [
        'tabac', 'fumeur', 'tabagisme', 'nicotine', 'cigarette', 'cigarettes',
        'fume', 'fumer', 'fumait', 'tabagique', 'fumeuse', 'fumé',
    ],
    'alcool': [
        'alcool', 'alcoolisme', 'Alcoolodépendance', 'Alcoolodépendant',
        'Alcoolisation', 'alcoolique',
    ],
    'diabete': [
        'Diabète', 'Hyperglycémie', 'Hypoglycémie', 'Diabétique', 'Polyurie',
        'Polydipsie', 'Polyphagie',
        'diabète', 'hyperglycémie', 'hypoglycémie', 'diabétique', 'polyurie',
        'polydipsie', 'polyphagie',
    ],
    'sub_psy': [
        'Psychoactive', 'Psychotrope', 'Psychostimulant', 'Psychodysleptique',
        'Psychodélirant', 'Narcotique', 'Sédatif', 'Hypnotique', 'Anxiolytique',
        'Antidépresseur', 'Stimulant', 'Hallucinogène', 'Dépresseur', 'Opiacé',
        'Cannabinoïde', 'Dissociatif', 'Inhalant',
        'psychoactive', 'psychotrope', 'psychostimulant', 'psychodysleptique',
        'psychodélirant', 'narcotique', 'sédatif', 'hypnotique', 'anxiolytique',
        'antidépresseur', 'stimulant', 'hallucinogène', 'dépresseur', 'opiacé',
        'cannabinoïde', 'dissociatif', 'inhalant',
    ],
    'tum_herit': ['antécédents familiaux de tumeur maligne du sein'],
}

In [9]:
def deduplicate_proba(df_person: pd.DataFrame, df_dedup_proba: pd.DataFrame, score: float) -> pd.DataFrame:
    """
    Keep a single row per patient using probabilistic record-linkage results.

    Parameters
    ----------
    df_person : pd.DataFrame
        Person table; must contain a `person_id` column.
    df_dedup_proba : pd.DataFrame
        Linkage table with the columns `person_id`, `unique_person_id` and
        `prob`.
    score : float
        Probability threshold. Only linkage rows with `prob` strictly greater
        than this value are trusted.

    Returns
    -------
    pd.DataFrame
        The merged table with exactly one row per `unique_person_id`. Persons
        without a trusted linkage keep their own `person_id` as
        `unique_person_id`. The columns of `df_dedup_proba` (e.g. `prob`) are
        part of the result. Neither input frame is modified.
    """
    # Discard linkages that are not confident enough.
    confident_links = df_dedup_proba[df_dedup_proba['prob'] > score]

    # NOTE(review): the outer join also keeps linkage rows whose person_id is
    # absent from df_person -- confirm that this is intended (a left join
    # would restrict the result to known persons).
    merged = pd.merge(df_person, confident_links, on='person_id', how='outer')

    # A person without a trusted linkage is its own unique patient.
    merged['unique_person_id'] = merged['unique_person_id'].fillna(merged['person_id'])

    # Of the records sharing a unique_person_id, only the first one is kept.
    return merged.drop_duplicates(['unique_person_id'], keep='first')

In [10]:
def pick_results(doc):
    """
    Turn the entities of a processed document into a list of flat records.

    Each entity in `doc.ents` yields one dict with the keys:
    - 'note_id' and 'visit_occurrence_id': read from the document's custom
      attributes (`doc._`),
    - 'lexical_variant': the matched text,
    - 'label': the entity label,
    - 'negation': the entity's `negation` custom attribute.

    A document without entities yields an empty list.
    """
    # Iterating over an empty `doc.ents` already produces nothing, so no extra
    # emptiness filter is needed inside the comprehension.
    return [
        {
            'note_id': ent.doc._.note_id,
            'visit_occurrence_id': ent.doc._.visit_occurrence_id,
            'lexical_variant': ent.text,
            'label': ent.label_,
            'negation': ent._.negation,
        }
        for ent in doc.ents
    ]

In [11]:
def calc_value(x):
    """
    Summarise the entity mentions of one group into a per-risk-factor status.

    `x` is a DataFrame with at least the columns 'label' and 'negation'.
    Rows are read in order. For every risk factor the status starts as
    'unknown' and evolves as follows:
    - a negated mention turns 'unknown' into 'False',
    - a non-negated mention turns 'unknown' into 'True',
    - a mention contradicting the current 'True'/'False' status turns it into
      'ambiguous', which is never left afterwards.
    Rows whose label is not one of the risk factors are ignored.

    Returns a dict mapping each risk factor to one of the strings
    'unknown', 'True', 'False' or 'ambiguous'.
    """
    state = dict.fromkeys(
        ['tabac', 'alcool', 'diabete', 'sub_psy', 'tum_herit'], 'unknown'
    )
    for label, negated in zip(x['label'], x['negation']):
        if label not in state:
            continue
        # What this mention asserts, and the status it would contradict.
        if negated:
            asserted, contradicted = "False", "True"
        else:
            asserted, contradicted = "True", "False"
        current = state[label]
        if current == "unknown":
            state[label] = asserted
        elif current == contradicted:
            state[label] = "ambiguous"
    return state

In [12]:
# Harmonise the gender coding: spelled-out values are mapped to their
# one-letter code ('f' / 'm'); every other value is left untouched.
df_person['gender_source_value'] = df_person['gender_source_value'].replace(
    {'female': 'f', 'male': 'm'}
)

In [13]:
# One row per patient: trust linkages whose probability exceeds 0.90.
df_person = deduplicate_proba(df_person, df_dedup_proba, score=0.90)

# Restrict every table to the retained persons, cascading through visits.
# NOTE(review): rows attached to a person_id that was dropped as a duplicate
# are discarded here rather than re-assigned to the surviving record --
# confirm that this is intended.
kept_person_ids = df_person.person_id
df_condition = df_condition[df_condition.person_id.isin(kept_person_ids)]
df_visit = df_visit[df_visit.person_id.isin(kept_person_ids)]

kept_visit_ids = df_visit.visit_occurrence_id
df_note = df_note[df_note.visit_occurrence_id.isin(kept_visit_ids)]
df_bio = df_bio[df_bio.visit_occurrence_id.isin(kept_visit_ids)]

# Conditions carrying one of the cancer codes, and the number of distinct
# patients they concern.
df_cancer = df_condition[df_condition['condition_source_value'].isin(cancer_condition)]
nbre_patients_cancer = df_cancer.person_id.nunique()

In [14]:
# Start from an empty French pipeline and add the components one by one.
nlp = spacy.blank('fr')
# Sentence segmentation
nlp.add_pipe('eds.sentences')
# Text normalisation
nlp.add_pipe('eds.normalizer')
# Dictionary matcher: the keys of `terms` become the entity labels
nlp.add_pipe('eds.matcher', config={'terms': terms})
# Negation qualifier (pick_results reads the entities' `negation` attribute)
nlp.add_pipe('eds.negation')

<edsnlp.pipelines.qualifiers.negation.negation.Negation at 0x27692b169c8>

In [15]:
# Run the pipeline over the notes and collect one record per matched entity.
# `context` lists the df_note columns forwarded to each document, which
# pick_results reads back through `doc._`.
ents = parallel_pipe(
    df_note,
    nlp,
    context=['note_id', 'visit_occurrence_id'],
    results_extractor=pick_results,
    n_jobs=1,
    progress_bar=False,
)



In [76]:
# One row per visit: `state` holds the dict returned by calc_value for the
# entities of that visit.
ents_grouped = (
    ents
    .groupby('visit_occurrence_id')
    .apply(calc_value)
    .to_frame('state')
    .reset_index()
)

# One boolean column per risk factor: True only when the status is 'True'
# ('False', 'ambiguous' and 'unknown' all become False).
for factor in risk_factors:
    ents_grouped[factor] = ents_grouped.state.map(
        lambda visit_state: visit_state[factor] == 'True'
    )

In [77]:
# The per-visit state dict is no longer needed once the boolean columns exist.
# errors='ignore' keeps this cell re-runnable after the column has been dropped.
ents_grouped = ents_grouped.drop(columns=['state'], errors='ignore')

# Preview: must be the last expression of the cell to be displayed.
ents_grouped.head()


In [78]:
# Reverse lookup: diagnosis code -> name of the risk factor it belongs to.
code_to_risk_factor = {
    code: factor
    for factor, codes in risk_factors.items()
    for code in codes
}

In [79]:
# Tag every condition with its risk factor (NaN when the code is not one).
# `assign` builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
df_condition = df_condition.assign(
    risk_factor=df_condition.condition_source_value.map(code_to_risk_factor)
)

# One row per visit, one boolean column per risk factor found in the coded
# diagnoses of that visit.
df_risk_factors = (
    df_condition
    .loc[df_condition.risk_factor.notna(), ['visit_occurrence_id', 'risk_factor']]
    # Several codes of the same visit can map to the same factor; `pivot`
    # raises on duplicated (index, column) pairs, so collapse them first.
    .drop_duplicates()
    .assign(value=True)
    .pivot(index='visit_occurrence_id', columns='risk_factor', values='value')
    # Guarantee one column per known factor (alphabetical, as pivot would
    # order them) even when a factor never appears in the coded diagnoses.
    .reindex(columns=sorted(risk_factors))
    # Cells are either True or missing, so "not missing" is the boolean value.
    .notna()
    .reset_index()
)


In [80]:
# Put both sources side by side, keeping visits present in either one.
# Factor columns that exist in both frames receive pandas' default suffixes:
# `_x` for the left frame (coded diagnoses) and `_y` for the right one (NLP).
# Missing values are filled with False.
df_risk_factors = pd.merge(
    df_risk_factors,
    ents_grouped,
    how='outer',
    on='visit_occurrence_id',
).fillna(False)
df_risk_factors

Unnamed: 0,visit_occurrence_id,alcool_x,diabete_x,sub_psy_x,tabac_x,tum_herit_x,tabac_y,alcool_y,diabete_y,sub_psy_y,tum_herit_y
0,80001693.0,True,False,False,False,False,True,False,False,False,False
1,80003678.0,False,False,True,False,False,True,False,False,False,False
2,80007123.0,True,False,False,False,False,False,False,False,False,False
3,80008076.0,True,False,False,False,True,False,False,False,False,False
4,80008149.0,True,True,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
15360,89998804.0,False,False,False,False,False,True,False,False,False,False
15361,89999192.0,False,False,False,False,False,False,False,False,False,False
15362,89999415.0,False,False,False,False,False,True,False,False,False,False
15363,89999467.0,False,False,False,False,False,True,True,False,False,False


In [87]:
# A risk factor is present for a visit when either source reports it:
# `<factor>_x` comes from the coded diagnoses, `<factor>_y` from the NLP
# extraction. Driving the loop with `risk_factors` keeps this cell in sync
# with the factor list instead of repeating one line per factor.
for factor in risk_factors:
    # Explicit bool cast: after an outer merge + fillna the columns may be of
    # object dtype, on which `|` is not a reliable element-wise boolean OR.
    df_risk_factors[factor] = (
        df_risk_factors[f'{factor}_x'].astype(bool)
        | df_risk_factors[f'{factor}_y'].astype(bool)
    )

# Drop the per-source columns now that they are combined.
df_risk_factors = df_risk_factors.drop(
    columns=[f'{factor}{suffix}' for suffix in ('_x', '_y') for factor in risk_factors]
)

In [89]:
# Attach the patient identifier to each visit-level row.
# No `on`/`how` is given, so pandas inner-joins on the columns shared by both
# frames (expected to be `visit_occurrence_id` only); rows whose visit is not
# in df_visit are dropped.
visit_to_person = df_visit[['visit_occurrence_id', 'person_id']]
df_risk_factors = pd.merge(df_risk_factors, visit_to_person)

Unnamed: 0,visit_occurrence_id,alcool,tabac,diabete,sub_psy,tum_herit
0,80001693.0,True,True,False,False,False
1,80003678.0,False,True,False,True,False
2,80007123.0,True,False,False,False,False
3,80008076.0,True,False,False,False,True
4,80008149.0,True,True,True,False,True
...,...,...,...,...,...,...
15360,89998804.0,False,True,False,False,False
15361,89999192.0,False,False,False,False,False
15362,89999415.0,False,True,False,False,False
15363,89999467.0,True,True,False,False,False
