# Система выбора подходящих медицинских записей

По итогам тестирования моделей выберем модель голосования как наиболее успешную: мера precision для метки YES у неё наибольшая, а значит, она точнее всего отбирает тех пациентов, которые действительно подходят под критерии исследования.

В соответствии с этим выбором построим систему, которая по заданному тексту критериев исследования будет отбирать из базы (папки с медицинскими записями в формате .txt) подходящие медицинские записи и выдавать названия соответствующих файлов.

## 1. Prerequisites

In [None]:
!pip install langchain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langchain
  Downloading langchain-0.0.188-py3-none-any.whl (969 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m969.4/969.4 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 54.0 MB/s eta 0:00:00
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [None]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.7-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.7


In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.indexes.vectorstore import VectorstoreIndexCreator
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.chains import SequentialChain

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
import os

# OpenAI API key handed to every ChatOpenAI client in this notebook.
# Read it from the environment instead of pasting the secret into the notebook:
# set OPENAI_API_KEY before running this cell.
# NOTE(review): TOKEN is None when the variable is unset; confirm how ChatOpenAI
# reports a missing key in the installed langchain version.
TOKEN = os.environ.get("OPENAI_API_KEY")

In [None]:
import os, glob

## 2. Определение всех моделей, участвующих в модели голосования

In [None]:
# Direct classification prompt: the raw trial criteria and the raw medical record
# are shown to the model, which is asked for a bare YES/NO verdict.
strict_forward = """Read the clinical trial criteria provided below. After that, decide by the patient's medical record if the patient is suitable for the trial. Answer YES or NO only.  

Clinical Trial Criteria: {trial}

Medical Record: {record}

Answer:"""
# Template placeholders: {trial} and {record}.
PROMPT = PromptTemplate(
    template=strict_forward, input_variables=["trial", "record"]
)
# max_tokens=1 caps the completion at a single token; temperature=0 is requested
# for this classifier.
llm = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1, temperature=0)


In [None]:
# Step 1: normalise the trial text (age format, gender -> sex, weight in kg,
# height in m) before classification.
# NOTE(review): n=2 requests two completions per call — confirm that more than
# the first one is actually consumed downstream; otherwise it only adds cost.
llm4 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1500, n=2)
trial_preprocess = """Rewrite and print the clinical trial text following the next rules:
1. If there is an age criterion:
1.1 If there is a minimum and maximum age, rewrite the age criterion as 'Age: x-y years old', where x is the minimum age and y is the maximum age.
1.2 If there is no minimum age, rewrite the age criterion as 'Age: 0-x years old', where x is the maximum age.
1.3 If there is no maximum age, rewrite the age criterion as 'Age: x and older', where x is the minimum age
2. If there is a gender criterion, rename gender criterion to sex criterion.
3. If there is a weight criterion, convert weight to kilograms.
4. If there is a height criterion, convert height to meters.
5. Do not generate new information.

Clinical Trial Text: {trial}
Result:"""
PROMPT4 = PromptTemplate(
    template=trial_preprocess, input_variables=['trial'])
# Reads 'trial', publishes the rewritten text under 'preprocessed_trial'.
chain1 = LLMChain(llm=llm4, prompt=PROMPT4, output_key='preprocessed_trial')

# Step 2: normalise the medical record (weight in kg, height in m).
llm5 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1500, n=2)
record_preprocess = """Rewrite and print the medical record text following the next rules:
1. If the patient's weight is mentioned, convert weight to kilograms.
2. If the patient's height is mentioned, convert height to meters.
3. Do not generate new information.

Medical Record Text: {record}
Result:"""
PROMPT5 = PromptTemplate(
    template=record_preprocess, input_variables=['record'])
# Reads 'record', publishes the rewritten text under 'preprocessed_record'.
chain2 = LLMChain(llm=llm5, prompt=PROMPT5, output_key='preprocessed_record')

# Step 3: same YES/NO question as the direct prompt, but over the preprocessed
# texts. This rebinds the name `strict_forward`; PROMPT was already built from
# the earlier string, so it is not affected.
strict_forward = """Read the clinical trial criteria provided below. After that, decide by the patient's medical record if the patient is suitable for the trial. Answer YES or NO only.  

Clinical Trial Criteria: {preprocessed_trial}

Medical Record: {preprocessed_record}

Answer:"""
simplePROMPT = PromptTemplate(
    template=strict_forward, input_variables=["preprocessed_trial", "preprocessed_record"]
)
# Single-token completion with temperature=0 for the final verdict.
llm0 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1, temperature=0)
# Publishes the verdict under 'answer'.
simple_chain = LLMChain(llm=llm0, prompt=simplePROMPT, output_key='answer')

In [None]:
# Step 1: turn the raw trial text into a bullet list of 'Must ...' criteria.
# NOTE(review): n=2 requests two completions per call — confirm that more than
# the first one is actually consumed downstream.
llm1 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1500, n=2)
criteria_list = """Print all criteria from the given clinical trial as one list with points. Start each point with 'Must'.
Example:
  - Must be between 15 and 25 years old.
  - Must not have HIV.
  - Must not be pregnant.


Clinical Trial: {trial}.

List of criteria:"""
PROMPT1 = PromptTemplate(
    template=criteria_list, input_variables=['trial'])
# Reads 'trial', publishes the list under 'criteria_list'.
chain3 = LLMChain(llm=llm1, prompt=PROMPT1, output_key='criteria_list')

# Step 2: tag every criterion with YES/NO against the raw medical record.
llm2 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1500, n=1)
# NOTE(review): rules 3 and 4 of this prompt end with the same wording
# ("must fall under the criterion") — confirm rule 4 is phrased as intended.
yes_no_list = """Given the medical record and the criteria list, for each criterion of the list print YES if the patient fits the criterion, or print NO if the patient does not fit the criterion.
Follow these rules:
  1. If any disease from the criterion is not mentioned in the medical record, the patient does not have it.
  2. If the activity from the criterion is not mentioned in the medical record, the patient does not have any limitations to perform it.
  3. If the point starts with 'Must', the patient must fall under the criterion.
  4. If the point starts with 'Must not', the patient must fall under the criterion.
  5. If the criterion lists some points with 'or', the patient must fall under at least one criterion from the list.
  6. If the criterion lists some points with 'and', the patient must fall under all criteria from the list.

Criteria List: {criteria_list}.
Medical Record: {record}

List of YES and NO:"""
PROMPT2 = PromptTemplate(
    template=yes_no_list, input_variables=['criteria_list', 'record'])
# Reads 'criteria_list' and 'record', publishes the tags under 'tagged_list'.
chain4 = LLMChain(llm=llm2, prompt=PROMPT2, output_key='tagged_list')

# Step 3: collapse the tagged list into a single YES/NO token.
# NOTE(review): temperature=1 here, while the other single-token classifiers
# (llm, llm0) use temperature=0 — confirm this is intentional.
llm3 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1, temperature=1)
yes_no_answer = """Read the criteria list with YES or NO tags.
If the tags are all YES, print YES only. If there is at least one NO, print NO only.

Criteria List: {tagged_list}

Answer:"""
PROMPT3 = PromptTemplate(
    template=yes_no_answer, input_variables=['tagged_list'])
# Publishes the verdict under 'answer'.
chain5 = LLMChain(llm=llm3, prompt=PROMPT3, output_key='answer')



In [None]:
# This cell rebinds llm1..llm5, PROMPT1..PROMPT5 and the prompt-string names.
# Chains created before this cell keep the objects they were constructed with.

# Step 1: normalise the trial text (age format, gender -> sex, weight in kg,
# height in m).
# NOTE(review): n=2 requests two completions per call — confirm that more than
# the first one is actually consumed downstream.
llm4 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1500, n=2)
trial_preprocess = """Rewrite and print the clinical trial text following the next rules:
1. If there is an age criterion:
1.1 If there is a minimum and maximum age, rewrite the age criterion as 'Age: x-y years old', where x is the minimum age and y is the maximum age.
1.2 If there is no minimum age, rewrite the age criterion as 'Age: 0-x years old', where x is the maximum age.
1.3 If there is no maximum age, rewrite the age criterion as 'Age: x and older', where x is the minimum age
2. If there is a gender criterion, rename gender criterion to sex criterion.
3. If there is a weight criterion, convert weight to kilograms.
4. If there is a height criterion, convert height to meters.
5. Do not generate new information.

Clinical Trial Text: {trial}
Result:"""
PROMPT4 = PromptTemplate(
    template=trial_preprocess, input_variables=['trial'])
# Reads 'trial', publishes 'preprocessed_trial'.
chain11 = LLMChain(llm=llm4, prompt=PROMPT4, output_key='preprocessed_trial')

# Step 2: normalise the medical record (weight in kg, height in m).
llm5 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1500, n=2)
record_preprocess = """Rewrite and print the medical record text following the next rules:
1. If the patient's weight is mentioned, convert weight to kilograms.
2. If the patient's height is mentioned, convert height to meters.
3. Do not generate new information.

Medical Record Text: {record}
Result:"""
PROMPT5 = PromptTemplate(
    template=record_preprocess, input_variables=['record'])
# Reads 'record', publishes 'preprocessed_record'.
chain22 = LLMChain(llm=llm5, prompt=PROMPT5, output_key='preprocessed_record')

# Step 3: extract a 'Must ...' criteria list from the preprocessed trial text.
llm1 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1500, n=2)
criteria_list = """Print all criteria for the given clinical trial as one list with points. Start each point with 'Must'.
Example:
  - Must be between 15 and 25 years old.
  - Must not have HIV.
  - Must not be pregnant.


Clinical Trial: {preprocessed_trial}.

List of criteria:"""
PROMPT1 = PromptTemplate(
    template=criteria_list, input_variables=['preprocessed_trial'])
# Reads 'preprocessed_trial', publishes 'criteria_list'.
chain33 = LLMChain(llm=llm1, prompt=PROMPT1, output_key='criteria_list')

# Step 4: tag every criterion with YES/NO against the preprocessed record.
# NOTE(review): max_tokens=150 here versus 1500 for the llm2 behind chain4 —
# confirm the smaller limit is intended and not a typo.
llm2 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=150, n=1)
# NOTE(review): rules 3 and 4 of this prompt end with the same wording
# ("must fall under the criterion") — confirm rule 4 is phrased as intended.
yes_no_list = """Given the medical record and the criteria list, for each criterion of the list print YES if the patient is suitable for the criterion, or print NO if the patient is not suitable for the criterion.
Follow these rules:
  1. If any disease from the criterion is not mentioned in the medical record, the patient does not have it.
  2. If the activity from the criterion is not mentioned in the medical record, the patient does not have any limitations to perform it. 
  3. If the point starts with 'Must', the patient must fall under the criterion.
  4. If the point starts with 'Must not', the patient must fall under the criterion.
  5. If the criterion lists some points with 'or', the patient must fall under at least one criterion from the list.
  6. If the criterion lists some points with 'and', the patient must fall under all criteria from the list.

Criteria List: {criteria_list}.
Medical Record: {preprocessed_record}

List of YES and NO:"""
PROMPT2 = PromptTemplate(
    template=yes_no_list, input_variables=['criteria_list', 'preprocessed_record'])
# Reads 'criteria_list' and 'preprocessed_record', publishes 'tagged_list'.
chain44 = LLMChain(llm=llm2, prompt=PROMPT2, output_key='tagged_list')

# Step 5: collapse the tagged list into a single YES/NO token.
# No temperature is passed here, so the library default applies.
llm3 = ChatOpenAI(openai_api_key=TOKEN, model_name="gpt-3.5-turbo", max_tokens=1, n=1)
yes_no_answer = """Read the criteria list with YES or NO tags. 
If the tags are all YES, print YES only. If there is at least one NO, print NO only.

Criteria List: {tagged_list}

Answer:"""
PROMPT3 = PromptTemplate(
    template=yes_no_answer, input_variables=['tagged_list'])
# Publishes the verdict under 'answer'.
chain55 = LLMChain(llm=llm3, prompt=PROMPT3, output_key='answer')

In [None]:
# Model 1: a single direct YES/NO prompt over the raw trial text and record.
simple = LLMChain(prompt=PROMPT, llm=llm)

# Every pipeline below takes the raw 'trial' and 'record' texts and exposes
# only the final 'answer'.

# Model 2: preprocess the trial and the record, then ask the direct question.
preprocessed_simple = SequentialChain(
    input_variables=["trial", "record"],
    output_variables=["answer"],
    chains=[chain1, chain2, simple_chain],
)

# Model 3: extract a criteria list, tag each criterion, reduce to one verdict.
list_criteria = SequentialChain(
    input_variables=["trial", "record"],
    output_variables=["answer"],
    chains=[chain3, chain4, chain5],
)

# Model 4: the criteria-list pipeline run on preprocessed trial and record.
preprocessed_list_criteria = SequentialChain(
    input_variables=["trial", "record"],
    output_variables=["answer"],
    chains=[chain11, chain22, chain33, chain44, chain55],
)

## 3. Определение модели голосования

In [None]:
def voting(criteria, path_to_records, model1=simple, model2=preprocessed_simple, model3=list_criteria, model4=preprocessed_list_criteria):
  """Return the record files that most of the models judge suitable for a trial.

  Every ``*.txt`` file directly inside ``path_to_records`` is read and passed,
  together with ``criteria``, to each of the four models. A file is selected
  when more than two of the four answers contain ``'YES'``.

  Args:
    criteria: Text of the clinical trial eligibility criteria.
    path_to_records: Directory holding the medical records as ``.txt`` files.
    model1, model2, model3, model4: Objects exposing
      ``run({'trial': ..., 'record': ...})`` that return the answer text.

  Returns:
    List of file paths as produced by ``glob``, each listed at most once.
  """
  models = (model1, model2, model3, model4)
  suitable = []
  for filename in glob.glob(os.path.join(path_to_records, '*.txt')):
    # Read the record and close the file before the (slow) model calls.
    with open(os.path.join(os.getcwd(), filename), 'r') as f:
      record = f.read()
    # A fresh input dict per call, so no model can see another's mutations.
    answers = [model.run({'trial': criteria, 'record': record}) for model in models]
    # One vote per model: count the answers that say YES, not the number of
    # 'YES' substrings inside a single answer.
    yes_votes = sum('YES' in answer for answer in answers)
    if yes_votes > 2:
      suitable.append(filename)
  return suitable

## 4. Пример работы системы

In [None]:
import warnings
# Silence all Python warnings for the rest of the notebook run.
warnings.filterwarnings("ignore")

In [None]:
# Example trial eligibility text (inclusion and exclusion criteria) used as the
# `criteria` argument of voting() in the demo below.
criteria = '''Inclusion Criteria: Must meet one of the following criteria: - Overweight - Previous knee injury or surgery - Knee pain during the past year. Participants do not need to have current knee pain to take part in the study. - Parent or sibling who had knee replacement Exclusion Criteria: - Rheumatoid arthritis - Joint replacements in both knees - Unable to walk without assistance - Unable to undergo MRI of the knee'''

In [None]:
# Directory whose *.txt files voting() scans for medical records.
# NOTE(review): looks like a Google Drive mount path in Colab — adjust to your environment.
path = '/content/drive/MyDrive/mag diplom/system'

In [None]:
# Run the voting model over every record in `path` for the example criteria.
docs = voting(criteria, path)

In [None]:
docs # no suitable documents were found in the given folder for these criteria

[]