## Import libraries and define parameters

In [1]:
import os
import sys  

from tqdm import tqdm

import pandas as pd
import numpy as np
import json
import re

import nlpaug.augmenter.word as naw

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make the project's `src` package importable by putting the project root on
# sys.path rather than temporarily changing the working directory: a failed
# import can no longer leave the kernel stranded in the project root, and the
# package stays importable for the rest of the session.
PROJECT_ROOT = '/Users/Antoine/data_science_projects/3_nlp/drug_prescription_understanding'
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.data_augmentation.back_translation import find_position, process
from src.Utils.utils import read_jsonl_df

# Keep the same final working directory as before: the notebooks folder.
os.chdir(os.path.join(PROJECT_ROOT, 'notebooks'))

In [6]:
# Folder holding trainset.jsonl and the generated data_with_bt.csv.
# The location is machine-specific, so it can be overridden with the
# DRUG_PRESCRIPTION_DATA_PATH environment variable; the original hard-coded
# path remains the default, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "DRUG_PRESCRIPTION_DATA_PATH",
    "/Users/Antoine/Desktop/Cours/X-HEC/HEC/Cours/Quinten - drug prescription",
)

## Import and shape data

In [4]:
# Read the JSON-lines training set (one JSON document per line).
# UTF-8 is requested explicitly so the accented French text is decoded the
# same way regardless of the platform's default encoding.
with open(os.path.join(DATA_PATH, 'trainset.jsonl'), 'r', encoding='utf-8') as json_file:
    # Blank lines (e.g. a trailing newline at end of file) are not JSON
    # documents and would make json.loads raise, so they are skipped.
    json_list = [line for line in json_file if line.strip()]

# Accumulate each field in a plain Python list and build the frame once.
# Growing NumPy arrays with np.append re-allocates and copies the whole array
# on every iteration, which makes the loop quadratic.
ids = []
texts = []
labels = []
comments = []

for json_str in tqdm(json_list):
    result = json.loads(json_str)
    ids.append(result['id'])
    texts.append(result['text'])
    labels.append(result['labels'])
    comments.append(result['Comments'])  # the source key is capitalised

df = pd.DataFrame({'id': ids, 'text': texts, 'labels': labels, 'comments': comments})
df['id'] = df['id'].astype(int)

print(df.shape)
df.head()

100%|██████████| 567/567 [00:13<00:00, 41.32it/s] 


(567, 4)


Unnamed: 0,id,text,labels,comments
0,1,Un homme âgé de 77 ans présentait des signes c...,"[[692, 712, Treatment]]",[]
1,2,"Mr. R,R âgé de 53 ans sans antécédents patholo...","[[1224, 1237, Treatment]]",[]
2,3,Un homme de 47 ans aux antécédents d’infarctus...,[],[]
3,4,"Mme C.S..., âgée de 25 ans, célibataire, sans ...",[],[]
4,5,Un adénocarcinome prostatique avait été diagno...,"[[183, 196, Treatment], [1374, 1388, Treatment...",[]


In [5]:
# Explode each labelled span into its own column, named after the label type
# OPTIONAL
#for index, row in tqdm(df.iterrows()):
#    text = df.text[index]
#    for label in df.labels[index]:
#        type = label[-1]
#        df.at[index, f"{type}"] = text[label[0]: label[1]]
#df.head()

## Apply model
### We use a back translation model taken from nlpaug

In [6]:
# Back-translation augmenter from nlpaug. Going by the model names, the text is
# translated French -> English and then English -> French.
fr_to_en_model = 'Helsinki-NLP/opus-mt-fr-en'
en_to_fr_model = 'Helsinki-NLP/opus-mt-en-fr'

back_translation_aug = naw.BackTranslationAug(
    from_model_name=fr_to_en_model,
    to_model_name=en_to_fr_model,
)

Back-translation applied to whole sentences changes the number of words, so the original positions of the labelled words are no longer valid in the new text.  
We therefore try applying the translation to specific words or expressions.  

In [7]:
#def find_position(word, text, type):
#    '''
#    Finds the positions of all the occurrences of the word in the text.
#    '''
#    word_length = len(word)
#    first_letter_positions = [m.start() for m in re.finditer(word, text)]
#    return [[start, start+word_length, type] for start in first_letter_positions]

In [8]:
#def process(text, label_position_list, nlpaug_model):
#    '''
#    Process back-translation to generate new text.
#    Finds the labels in the new text.
#
#    Input : 
#        text, str
#        label_position_list, list(int, int, str)
#    Output : 
#        transformed_text, str
#        new_labels, list(int, int, str)
#    '''
#    transformed_text= nlpaug_model.augment(text)[0]
#
#    if len(label_position_list)==0:
#        return transformed_text, []
#
#    # Extract labelled words and back-translate them
#    label_list = [text[int(position[0:2][0]): int(position[0:2][1])] for position in np.array(label_position_list)[0:1]] 
#    transformed_labels = nlpaug_model.augment(label_list)
#
#    if transformed_text==text:
#        print('Back-translation did not change the text.')
#        return None, None
#    
#    # Iterating through the labelled words to see if we can find them in the back-translated text
#    new_labels = []
#    for label_ind in range(len(label_list)):   
#        type = np.array(label_position_list)[:,2][label_ind]
#        
#        label = label_list[label_ind]
#        label_transformed = transformed_labels[label_ind] 
#        
#        # If a labelled word is lost, we drop the back-translation
#        if label not in transformed_text and label_transformed not in transformed_text :
#            print('Label lost in translation.')
#            print(f"Label: {label}")
#            print(f"Transformed label: {label_transformed}")
#            return None, None
#
#        elif label in transformed_text :
#            print(f'Label "{label}" found in transformed text.')
#            new_positions = find_position(label_list[label_ind], transformed_text, type)
#            new_labels += new_positions
#
#        elif label_transformed in transformed_text:
#            print(f'Transformed label "{label_transformed}" found in transformed text.')
#            new_positions = find_position(transformed_labels[label_ind], transformed_text, type)
#            new_labels += new_positions
#
#    return transformed_text, new_labels

## Apply back-translation and update the dataframe

In [9]:
# Back-translate a small sample (rows up to and including index label 1) and
# store the new text and the re-located labels next to the originals.
# Slice first, then copy, so df2 is an independent frame that can safely
# receive new columns.
df2 = df.loc[:1, :].copy()

# Read each row by column name: integer keys on a label-indexed Series
# (x[0], x[1]) are treated as positions only through a deprecated fallback.
res = df2[["text", "labels"]].apply(
    lambda row: process(row["text"], row["labels"], back_translation_aug),
    axis=1,
)

# Reuse df2's index: assigning a DataFrame to several columns aligns on the
# index, so a fresh 0..n-1 index would only match by coincidence. Explicit
# column names also keep the two-column shape when the sample is empty.
df2[["transformed_text", "new_labels"]] = pd.DataFrame(
    res.to_list(),
    index=df2.index,
    columns=["transformed_text", "new_labels"],
)

df2.to_csv(os.path.join(DATA_PATH, 'data_with_bt.csv'), header=True, index=False)
print(df2.shape)
df2.head()

Transformed label "blocage androgène" found in transformed text.
Label "radiothérapie" found in transformed text.
(2, 6)


Unnamed: 0,id,text,labels,comments,transformed_text,new_labels
0,1,Un homme âgé de 77 ans présentait des signes c...,"[[692, 712, Treatment]]",[],Un mâle de 77 ans a montré des signes clinique...,"[[656, 673, Treatment]]"
1,2,"Mr. R,R âgé de 53 ans sans antécédents patholo...","[[1224, 1237, Treatment]]",[],L'examen clinique n'était pas inhabituel. Le t...,"[[755, 768, Treatment]]"


In [7]:
# Reload the back-translated dataset from the CSV stored under DATA_PATH.
# NOTE: this rebinds `df`. List-valued columns such as `labels` and
# `new_labels` are stored as text in a CSV, so they come back as strings
# rather than Python lists.
bt_csv_path = os.path.join(DATA_PATH, "data_with_bt.csv")
df = pd.read_csv(bt_csv_path)
df.head()

Unnamed: 0,id,text,labels,Comments,transformed_text,new_labels
0,1,Un homme âgé de 77 ans présentait des signes c...,"[[692, 712, 'Treatment']]",[],Un mâle de 77 ans a montré des signes clinique...,"[[656, 673, 'Treatment']]"
1,2,"Mr. R,R âgé de 53 ans sans antécédents patholo...","[[1224, 1237, 'Treatment']]",[],L'examen clinique n'était pas inhabituel. Le t...,"[[755, 768, 'Treatment']]"
