In [2]:
from utils import *

config = load_config()
PROJECT_PATH = config.project_path
DATA_PATH = PROJECT_PATH.joinpath('data/processed')

with open(DATA_PATH.joinpath("zeroshot_mistral.pkl"), 'rb') as f :
    mistral_zeroshot_result = pickle.load(f)
    mistral_zeroshot_result = list(map(lambda x : x[0]['generated_text'], mistral_zeroshot_result))

with open(DATA_PATH.joinpath("fewshot_mistral.pkl"), 'rb') as f :
    mistral_fewshot_result = pickle.load(f)
    mistral_fewshot_result = list(map(lambda x : x[0]['generated_text'], mistral_fewshot_result))


with open(DATA_PATH.joinpath("mergedData.pkl"), 'rb') as f :
    mergedData = pickle.load(f)

In [3]:
print(mistral_zeroshot_result[54])


1. Rheumatoid arthritis - This is a chronic autoimmune disorder that affects the joints and causes inflammation and pain.
2. High cholesterol - Also known as hyperlipidemia, this condition increases the risk of heart disease and stroke.
3. Hypothyroidism - This is a condition where the thyroid gland does not produce enough thyroid hormones, leading to various symptoms such as fatigue, weight gain, and cold intolerance.
4. History of Hodgkin's lymphoma - This is a type of cancer that affects the lymphatic system. The patient had undergone chemotherapy and radiation many years ago.
5. History of bilateral bunionectomies - This is a surgical procedure to remove or realign a bunion, a deformity of the big toe joint.
6. Positive hepatitis C antibody - This means the patient has been exposed to the hepatitis C virus, but it does not necessarily mean they have an active infection. The patient has negative viral loads and normal liver function tests.
7. Carpal tunnel syndrome - This is a cond

In [4]:
mergedData['zeroshot'] = mistral_zeroshot_result
mergedData['fewshot'] = mistral_fewshot_result

In [5]:
mergedData['Phrase'] = mergedData['Phrase'].str.replace(',','\n')

In [157]:
mergedData[['Phrase', 'zeroshot']].to_excel('../data/processed/mistral_zeroshot.xlsx')
mergedData[['Phrase', 'fewshot']].to_excel('../data/processed/mistral_fewshot.xlsx')

## ============================= Analyzing the results using Bertscore

In [69]:
from evaluate import load
bertscore = load('bertscore')

In [70]:
# compute zeroshot

zeroshot_results = bertscore.compute(predictions=mistral_zeroshot_result, 
                  references=mergedData['Phrase'],
                  lang='en')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [80]:
import numpy as np
import pandas as pd

p = np.average(zeroshot_results['precision'])
r = np.average(zeroshot_results['recall'])
f1 = np.average(zeroshot_results['f1'])



In [74]:
# compute fewshot

fewshot_results = bertscore.compute(predictions=mistral_fewshot_result, 
                  references=mergedData['Phrase'],
                  lang='en')

## ==================== After Reformatting, analyzing the results

In [114]:
def classifier(word1, word2) :

    # normalization
    word1, word2 = word1.lower(), word2.lower()
    if (word1 in word2) or (word2 in word1) :
        return 1
    
    else :
        return 0

In [2]:
def precision_function(label, pred) :

    denominator = len(pred)
    tp = 0
    for prediction in pred :
        for true_label in label :
            score = classifier(prediction, true_label)

            if score : tp += 1
            else : pass

    return tp / denominator

def recall_function(label, pred) :

    denominator = len(label)
    tp = 0
    for true_label in label :
        for prediction in pred :
            score = classifier(prediction, true_label)

            if score : tp += 1
            else : pass

    return tp / denominator


In [72]:
import pandas as pd

mz = pd.read_excel(DATA_PATH.joinpath("mistral_zeroshot_formatted.xlsx"))
mz = mz.drop(columns = 'Unnamed: 0')

In [109]:
import re
p = re.compile("\d+\.\s+(.+)")
label = mz['Phrase'].str.findall(p)
pred =  mz['zeroshot'].str.findall(p)



In [143]:
mz['zeroshot'][23]

'1. Abdominal pain that has been increasing for two weeks and becoming debilitating.\n2. Loss of appetite and nausea, leading to reduced food intake.\n3. Pharyngitis, for which the patient has been prescribed amoxicillin.\n4. Mid-back pain, which could be related to the abdominal pain.\n5. Possible gastrointestinal issues, such as a problem with the gallbladder or liver, given the abdominal pain and tender mass in the right mid epigastrium.\n6. History of chronic myelogenous leukemia and chronic obstructive pulmonary disease.'

In [119]:
precision_scores = []
for l, p in zip(label, pred) :
    score = precision_function(l, p)
    precision_scores.append(score)

TypeError: object of type 'float' has no len()

In [117]:
score

0.5

In [87]:
label[1], pred[1]

(['a1c', 'diabetes', 'CLYCOHEMOGLOBIN A1C', 'HGBA1C', ''],
 ['Very high A1c and glucose levels.',
  'Diabetes - The note mentions the patient needs to follow up in the diabetes clinic to adjust their medications.',
  'Neck pain and prior fusion - The patient had a cervical spine examination with 5 views and obliques.',
  'Mild osteopenia - Observed in the cervical spine examination.',
  'Interbody fusion with solid bony bridging with interbody graft and anterior plating at C5 and 6 is stable.',
  'Mild degenerative changes at C3-C4, C6-C7, and the mid to lower facet arthropathy.',
  'Uncovertebral degenerative changes.',
  'Mild progression in degenerative changes.',
  'Neural foramina are normal on the right side, but there is probable mild narrowing at the C3-C4 left side.',
  'High blood urea nitrogen (BUN) and creatinine levels, which may indicate kidney issues.',
  'High triglycerides and low HDL cholesterol levels, which may indicate issues with lipid metabolism.',
  'High glycoh

In [79]:
mz['Phrase'][0]

'1. Diarrhea-predominant\n2. irritable bowel syndrome\n3. sigmoidoscopy\n4. IBD\n5. screening colonoscopy\n6. hemorrhoids\n7. migraines\n8. postnasal\n9. pituitary microadenoma\n10. steatohepatitis\n11. Hypertriglyceridemia\n12. vestibular neuritis\n13. hypertension\n14. Claritin\n15. Lisinopril\n16. Prilosec\n17. irritable bowel syndrome flareup\n18. Gastroesophageal reflux\n19. '

## ============================================= Fewshot results using openai

In [1]:
from openai import OpenAI

key = "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"
client = OpenAI(api_key=key)

In [39]:
ex = mistral_fewshot_result[0]

In [53]:
MODEL='gpt-3.5-turbo'

response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", 
         "content": """
         You are a helpful assistant. Get the key points from the given text and summarize it using short words or phrases, not as sentences.
         Separate the items using commas and remove all the numberings or any * or - in the front. 
         Also, do not include any descriptive texts.  
         The format should be like
        
         keypoint1, keypoint2, .... keypoint
         """},
        {"role": "user", 
         "content": ex},
    ],
    temperature=0.9,
)

In [6]:
MODEL = 'gpt-3.5-turbo'

responses = []
for text in mistral_fewshot_result :
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", 
            "content": """
            You are a helpful assistant. Get the key points from the given text and summarize it using short words or phrases, not as sentences.
            Separate the items using commas and remove all the numberings or any * or - in the front. 
            Also, do not include any descriptive texts.  
            The format should be like
            
            keypoint1, keypoint2, .... keypoint
            """},
            {"role": "user", 
            "content": text},
        ],
        temperature=0.9,
    )
    responses.append(response)


NameError: name 'MODEL' is not defined

In [55]:
print(response.choices[0].message.content)

follow-up appointment, reassess symptoms, effectiveness of treatment, further evaluation, medication regimen, smoking cessation, weight loss, blood sugar monitoring, complete eye examination, follow-up with primary care provider, vaccination, overweight, elevated blood pressure, lifestyle modifications


In [60]:
responses = list(map(lambda x : x.choices[0].message.content, responses))

In [62]:
mergedData['fewshot'] = responses

In [10]:
import pandas as pd
# zeroshot = pd.read_excel("../data/processed/reformatted_mistral_zeroshot.xlsx")

def preprocess_goldstandard(text) :

    splited = text.split("\n")
    splited = list(map(lambda x : x.lower().strip(), splited))
    splited = list(filter(lambda x : len(x) > 0, splited))
    splited = set(splited)
    splited = list(splited)

    return splited

def preprocess_zeroshot(text) :

    splited = text.split(',')
    splited = list(map(lambda x : x.lower().strip(), splited))
    splited = list(filter(lambda x : len(x) > 0, splited))
    splited = set(splited)
    splited = list(splited)

    return splited



def calculate_precision_recall_f1(true_list, pred_list) :

    cnt = 0
    for true in true_list :
        for pred in pred_list :
            if (true in pred) | (pred in true) :
                cnt += 1
    
    recall, precision = len(true_list), len(pred_list)
    recall = cnt / recall
    precision = cnt / precision
    if precision+recall == 0 : 
        f1 = 2*(precision*recall)/0.001
    else : 
        f1 = 2*(precision*recall)/(precision+recall)
    return precision, recall, f1



def get_scores(results) :
    num_cases = len(results)

    recall, precision, f1 = 0, 0, 0
    for res in results : 
        precision += res[0]
        recall += res[1]
        f1 += res[2]
    
    return precision/num_cases, recall/num_cases, f1/num_cases

# zeroshot['phrase']= zeroshot["phrase"].apply(preprocess_goldstandard)
# zeroshot['zeroshot'] = zeroshot['zeroshot'].apply(preprocess_zeroshot)
# results = zeroshot.apply(lambda x : calculate_precision_recall_f1(x['phrase'], x['zeroshot']), axis=1)

# zp, zr, rf1 = get_scores(results)
# print("the zeroshot result precision : %.3f" %zp)
# print("the zeroshot result recall : %.3f" %zr)
# print("the zeroshot result f1-score : %.3f" %rf1)


## =========== Calculating Mixtral fewshot results

In [11]:
mergedData[['Phrase','fewshot']].to_excel('../data/processed/mistral_fewshot.xlsx')

In [22]:
print(mergedData['fewshot'].iloc[3])


1. Nonischemic cardiomyopathy
2. GERD
3. Fatty liver
4. Hives
5. Sinus polyposis
6. Asthma
7. Aspirin allergy
8. Knee pain
9. Left bicipital tendon tear
10. Hypertension.


In [12]:
fewshot = mergedData[['Phrase', 'fewshot']].copy()
fewshot = fewshot.rename(columns = {'Phrase': 'phrase'})

In [13]:
fewshot['phrase']= fewshot["phrase"].apply(preprocess_goldstandard)
fewshot['fewshot'] = fewshot['fewshot'].apply(preprocess_zeroshot)


In [None]:
results = fewshot.apply(lambda x : calculate_precision_recall_f1(x['phrase'], x['fewshot']), axis=1)

zp, zr, rf1 = get_scores(results)
print("the fewshot result precision : %.3f" %zp)
print("the fewshot result recall : %.3f" %zr)
print("the fewshot result f1-score : %.3f" %rf1)

0                                     (0.0, 0.0, 0.0)
1                                     (0.0, 0.0, 0.0)
2       (0.4166666666666667, 5.0, 0.7692307692307693)
3                   (0.06666666666666667, 1.0, 0.125)
4      (0.46153846153846156, 6.0, 0.8571428571428571)
                            ...                      
101                                   (0.0, 0.0, 0.0)
102                                   (0.0, 0.0, 0.0)
103                                   (0.0, 0.0, 0.0)
104                                   (0.0, 0.0, 0.0)
105                                   (0.0, 0.0, 0.0)
Length: 106, dtype: object

In [1]:
mergedData['fewshot']

NameError: name 'mergedData' is not defined