In [1]:
!pip install -q -r requirements.txt

You should consider upgrading via the '/Users/nicolas.dominutti/Desktop/ml/medical-qa-system/.venv/bin/python3.10 -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', None)  
RANDOM_SEED = 42
CHUNK_MAX_LENGTH = 512
TEST_DATA_FRAC = .2

## Read data

In [3]:
df = pd.read_csv('data/intern_screening_dataset.csv')

In [4]:
# we have null answers (5)
df = df[~df.answer.isnull()]

## Text cleansing

In [5]:
# repetition of "(are)", like in What is (are) Hyperthyroidism ?
df.loc[:, 'question'] = df['question'].apply(lambda q: q.replace("(are)", "").replace('? ?','?').strip())

In [6]:
import re
def remove_pii(text: str) -> str:
    text = re.sub(r'\b\S+@\S+\.\S+\b', '', text)
    text = re.sub(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', '', text)
    text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '', text)
    text = re.sub(r'\d{1,5}\s[\w\s]{2,30}(Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln)\b', '', text, flags=re.IGNORECASE)
    return text
df.loc[:, 'answer'] = df['answer'].apply(lambda ans: remove_pii(ans.strip()))

## Chunking

In [7]:
from llama_index.core.node_parser import SentenceSplitter 

In [8]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
splitter = SentenceSplitter(
    chunk_size=CHUNK_MAX_LENGTH,
    tokenizer=tokenizer.tokenize,
    chunk_overlap=200,
    separator=".",
    paragraph_separator="\n"
)

In [10]:
df.loc[:, 'answer'] = df['answer'].apply(lambda ans: splitter.split_text(ans))
df = df.explode('answer').reset_index(drop=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (1191 > 512). Running this sequence through the model will result in indexing errors


In [11]:
#remove some duplicates
print(f"previous: {df.shape}")
df = df.drop_duplicates(subset=['question', 'answer'])
print(f"after: {df.shape}")

previous: (20044, 2)
after: (19935, 2)


In [12]:
from sentence_transformers import SentenceTransformer
import torch
NEGATIVES_N = 15
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('BAAI/bge-small-en-v1.5', device=DEVICE)

In [13]:
from utils import TripletsMiner
tm = TripletsMiner(model, RANDOM_SEED)
negatives = tm.get_negatives(df, 'soft', NEGATIVES_N)

In [14]:
unique_questions = pd.DataFrame(df.question.unique(), columns=['question']).reset_index(names='question_id')
unique_answers = pd.DataFrame(df.answer.unique(), columns=['answer']).reset_index(names='chunk_id')

df = df.merge(
    unique_answers,
    how='left',
    on='answer',
)

df = df.merge(
    unique_questions,
    how='left',
    on='question',
)

In [15]:
gb_gen = df.groupby(['question_id','question'])
df = gb_gen['answer'].apply(list).reset_index()
chunks = gb_gen['chunk_id'].apply(list).reset_index()
df['neg'] = negatives
df = df.merge(chunks[['question_id','chunk_id']], on='question_id', how='left')

In [16]:
df = df.rename(columns={'answer': 'pos'})
df = df.rename(columns={'question': 'query'})

In [17]:
from datasets import Dataset
dataset = Dataset.from_pandas(df, preserve_index=False)
splits = dataset.train_test_split(test_size=TEST_DATA_FRAC, seed=RANDOM_SEED)
train_df = splits['train']
val_df = splits['test']

In [18]:
train_df.to_json("data/training.json")

Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Creating json from Arrow format: 100%|██████████| 12/12 [00:01<00:00,  9.43ba/s]


217313593

### Test sets

In [19]:
queries = val_df.select_columns(column_names=["question_id", "query"])
queries = queries.rename_columns({"query": "text", "question_id": "id"})
queries[0]

{'id': 12782,
 'text': 'How many people are affected by von Hippel-Lindau syndrome ?'}

In [20]:
corpus = Dataset.from_pandas(unique_answers, preserve_index=False)
corpus = corpus.select_columns(column_names=["chunk_id", "answer"])
corpus = corpus.rename_columns({"answer": "text", "chunk_id": "id"})

In [21]:
qrels = val_df.select_columns(["question_id"])
qrels = qrels.rename_column("question_id", "qid")
qrels = qrels.add_column("docid", list(val_df["chunk_id"]))
qrels = qrels.add_column("relevance", [1]*len(list(val_df["chunk_id"])))
qrels[0]

Flattening the indices: 100%|██████████| 2996/2996 [00:00<00:00, 213771.58 examples/s]


{'qid': 12782, 'docid': [15870], 'relevance': 1}

In [22]:
queries.to_json("data/test_queries.jsonl")
corpus.to_json("data/corpus.jsonl")
qrels.to_json("data/test_qrels.jsonl")

Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 303.03ba/s]
Creating json from Arrow format: 100%|██████████| 19/19 [00:00<00:00, 88.22ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 621.22ba/s]


133591

### Saving to S3

In [23]:
import boto3
import os
from dotenv import load_dotenv

load_dotenv(".env")
AWS_REGION = os.getenv("AWS_REGION_NAME")
AWS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET = os.getenv("AWS_SECRET_ACCESS_KEY")

boto3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_KEY,
    aws_secret_access_key=AWS_SECRET,
    region_name=AWS_REGION,
)

In [24]:
# for file in ["training.json", "test_queries.jsonl", "corpus.jsonl", "test_qrels.jsonl"]:
#     boto3_client.upload_file(
#         f"data/{file}",
#         "medical-qa-data",
#         file,
#     )