# Clinical notes and natural language processing

In [2]:
import pandas as pd
import numpy as np

# Visualization library
import altair as alt
alt.data_transformers.enable('default', max_rows=None)

# Dates management
import datetime

# For the computation of Kaplan-Meier estimates and log-rank tests
import lifelines

 # Table of content

# 1. Data Exploration

## 1.1 Data extracted from the Clinical Data Warehouse

In [3]:
# Patients
from dedup_proba import df_person_dedup_proba as df_person
# Visits
from dedup_proba import df_visit_dedup_proba as df_visit
# Diagnosis (condition)
from dedup_proba import df_condition_dedup_proba as df_condition
# Cancer patients
from dedup_proba import df_cancer_dedup_proba as df_cancer
# nbre de patients de cancer
from dedup_proba import nbre_patients_cancer_dedup_proba as nbre_patients_cancer

In [4]:
df_visit_cancer = pd.merge(df_visit, df_cancer, on = ['visit_occurrence_id','person_id'], how = 'left')
df_visit_cancer.head()

Unnamed: 0,visit_occurrence_id,care_site_id,visit_start_datetime,visit_end_datetime,visit_source_value,person_id,condition_occurrence_id,condition_source_value
0,82199313.0,Clinique L.Pasteur,2020-01-12,2020-01-27,Hospitalisés,87118775,,
1,84796321.0,Hopital M.Bres,2023-09-05,2023-09-21,Hospitalisés,85828583,,
2,83914646.0,Centre F.Sinoussi,2019-12-28,2020-01-12,Hospitalisés,88459112,,
3,84893973.0,Centre F.Sinoussi,2020-12-26,2020-12-26,Hospitalisés,88291057,,
4,88339649.0,GHU A.Fleming,2024-12-14,2025-01-03,Hospitalisés,89239332,,


How many patients are in the study?
<br>We suppose deduplication has already been done.

In [5]:
print(f"We have {df_person.person_id.nunique()} unique patient ids in this dataset.")

We have 15406 unique patient ids in this dataset.


In [6]:
df_value_count = df_visit_cancer.person_id.value_counts()
n_numerous = df_value_count[df_value_count > 1].size
print("{} patients have more than one visit".format(n_numerous))

0 patients have more than one visit


How many patients have had cancer

In [7]:
print(f" {nbre_patients_cancer} patients have had cancer.")

 4114 patients have had cancer.


How many patients have had no cancer

In [8]:
print(f"{df_person.person_id.nunique() - nbre_patients_cancer} patients have had no cancer.")

11292 patients have had no cancer.


## 1.2 Clinical Notes

In [9]:
df_note = pd.read_pickle('data/df_note.pkl')
df_note.info()
df_note.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16168 entries, 0 to 216
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   visit_occurrence_id  16168 non-null  float64       
 1   note_datetime        16168 non-null  datetime64[ns]
 2   note_id              16168 non-null  float64       
 3   cdm_source           16168 non-null  object        
 4   note_text            16168 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 757.9+ KB


Unnamed: 0,visit_occurrence_id,note_datetime,note_id,cdm_source,note_text
0,86526573.0,2023-03-21,87594092.0,EHR 1,Compte rendu de consultation\n\nPatient : [Nom...
1,82217661.0,2024-07-19,89266782.0,EHR 1,Compte rendu de consultation\n\nPatient : [Nom...
2,80088693.0,2022-07-22,88212468.0,EHR 1,Compte rendu de consultation\n\nPatient : [Nom...
3,80333633.0,2025-03-07,80660564.0,EHR 1,Compte rendu de consultation\n\nPatient : [Nom...
4,84533955.0,2020-08-26,80382651.0,EHR 1,Compte rendu de consultation\n\nPatient : [Nom...


In [10]:
print(df_note.sample()['note_text'].squeeze())

Compte rendu de consultation

Patient : [Nom du patient]
Âge : [Âge du patient]
Sexe : [Sexe du patient]
Numéro de dossier : [Numéro de dossier du patient]

Motif de la consultation:
La patiente a été informée que son dépistage au cancer du sein était négatif, mettant ainsi en évidence l'absence de tumeur maligne ou de signes préoccupants dans ses résultats d'imagerie.

Antecedents familiaux :
Elle a été encouragée à maintenir un mode de vie sain et à continuer à éviter l'exposition à la fumée de cigarette, même à l'extérieur de son foyer.

Examen du patient:
La patiente a admis ouvertement être une fumeuse active et rapporte consommer environ un paquet de cigarettes par jour.

Signature du médecin :
[Nom du médecin]
[Titre/Spécialité]
[Hôpital/Service]


# 2. First steps with natural language processing


Let's now define a new rule-based NLP algorithm that extracts drugs mentioned in clinical notes. 

We define a rule-based algorithm that consists in looking for mentions of cancer and its risk factors in the texts and in discarding false positive detections by predicting modifiers (negation, etc.).

**Step 1: Definition of the vocabularies**

In [45]:
# Clinicians indicated that the following synonyms could be used

terms = dict(
   tabac=['tabac','fumeur','tabagisme','nicotine','cigarette','cigarettes','fume','fumer','fumait','tabagique','fumeuse','fumé'],
   alcool=['alcool','alcoolisme','Alcoolodépendance','Alcoolodépendant','Alcoolisation','alcoolique']
)


**Step 2: Definition of a natural language processing pipeline (rule-based)**

We now integrate this dictionary in a NLP-pipeline as described in the *eds-nlp* documentation, in order to realize the various pre-processing steps necessary to extract a meaningful variable:

In [46]:
#Utiliser edsnlp pour extraitre les 3 entités ci dessus : 
import spacy
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Load declared pipelines
# from edsnlp import components
from edsnlp.processing.parallel import pipe as parallel_pipe
from spacy import displacy

Creating a first pipeline using :

In [47]:
nlp = spacy.blank("fr")
# sentencizer component
nlp.add_pipe('eds.sentences')
nlp.add_pipe("eds.normalizer")
# Matcher component
nlp.add_pipe("eds.matcher", config=dict(terms=terms))
nlp.add_pipe("eds.negation")

<edsnlp.pipelines.qualifiers.negation.negation.Negation at 0x24eb55dce48>

In [48]:
text = df_note.query("note_id==85488297.0")['note_text'].squeeze()
# Process your text in one call !
doc = nlp(text)

colors = {
    "tabac": "orange",
    "alcool":"red"
}
options = {
    "colors": colors,
}

displacy.render(doc, style="ent", options=options)

**Step 3: Application of the natural language processing pipeline on all the clinical notes**

Apply this NLP pipeline to the texts of our dataset to extract entities by using the `edsnlp.parallel_pipe()` function

A `pick_result` function is given to standardise the output.


In [49]:
def pick_results(doc):
    """
    This function provides the entities that must be collected by the nlp process.
    """
    return [{
             'note_id':e.doc._.note_id,
             'visit_occurrence_id':e.doc._.visit_occurrence_id,
             'lexical_variant':e.text,
             'label':e.label_,
             'negation':e._.negation
             } 
             for e in doc.ents if doc.ents]

In [50]:
ents = parallel_pipe(
                df_note,
                nlp,
                context=['note_id', 'visit_occurrence_id'],
                progress_bar=False,
                n_jobs=1, 
                results_extractor = pick_results,
                )



Show the first rows of the dataframe `ents`

In [51]:
ents.head()

Unnamed: 0,note_id,visit_occurrence_id,lexical_variant,label,negation
0,87594092.0,86526573.0,tabagisme,tabac,False
1,89266782.0,82217661.0,fumer,tabac,True
2,89266782.0,82217661.0,tabac,tabac,False
3,89266782.0,82217661.0,fume,tabac,False
4,88212468.0,80088693.0,tabagisme,tabac,False


**Step 4: Create rules to detect each risk factor**

In [69]:
def calc_value(x):
    state = {'tabac':'unknown'}
    for _, row in x.iterrows():
        for factor in state.keys():
            if row['label'] == factor and row['negation']:
                if state[factor] == "unknown" :
                    state[factor] = "False"
                elif state[factor] == "True" :
                    state[factor] = "ambiguous"
            if row['label'] == factor and not(row['negation']):
                if state[factor] == "unknown" :
                    state[factor] = "True"
                elif state[factor] == "False" :
                    state[factor] = "ambiguous"
    return state

undesired_state = ['unknown', 'ambiguous']
ents_grouped = ents.groupby('visit_occurrence_id').apply(calc_value).to_frame('state').reset_index()
ents_grouped['tabac'] = ents_grouped.state.apply(lambda x : x['tabac'])
ents_grouped.head()

Unnamed: 0,visit_occurrence_id,state,tabac
0,80001324.0,{'tabac': 'True'},True
1,80001693.0,{'tabac': 'True'},True
2,80002115.0,{'tabac': 'ambiguous'},ambiguous
3,80002464.0,{'tabac': 'False'},False
4,80002814.0,{'tabac': 'False'},False


**Step 5: Compare nlp results to structured data provided by the hospitals**

In [72]:
df_state = df_visit[['visit_occurrence_id','person_id']].merge(ents_grouped[['visit_occurrence_id','tabac']], on='visit_occurrence_id', how='left')
df_state = df_state.fillna({'tabac':'False'})
print("le nombre de patients qui fument détecté par l'algorithme : ", df_state.query("tabac=='True'").person_id.nunique())

le nombre de patients qui fument détecté par l'algorithme :  8352


In [60]:
from plot_hist import risk_factors
print('le nombre de patients qui fument en utilisant la table condition : ',df_condition[df_condition['condition_source_value'].isin(risk_factors['tabac'])].person_id.nunique())

le nombre de patients qui fument en utilisant la table condition :  663


**Step 6: evaluate the performance of the nlp algorithm**

On a vérifié à la main les résultats fournis par notre algorithme de détection du tabagisme pour 50 patients afin de vérifier sa perfomrance

On a calculé les vrais positifs, vrais négatifs, faux positifs et faux négatifs

In [130]:
VP = 20
VN = 17
FP = 3
FN = 10
print('sensitivity = ', round(VP/(VP+FN),2), '\nspecificity = ', round(VN/(VN+FP),2))  

sensitivity =  0.67 
specificity =  0.85


la sensitivité de notre test est faible dans notre cas et on ne peut pas l'accepter car elle peut engendrer beaucoup d'erreurs.