In [1]:
# Install pre-release PyTorch packages from the nightly CPU wheel index.
# The commented-out line is an earlier variant of the install command.
#%pip install --upgrade pytorch torchvision torchaudio -c pytorch-nightly
%pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu


[0mLooking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os
from mistralai import Mistral
from dotenv import load_dotenv
load_dotenv()
import os

In [3]:
# Evaluation test cases, keyed by patient name.
# Each case pairs free-text "medical_notes" (inserted into the prompt) with the
# "golden_answer" dict that the model's JSON reply is compared against.
prompts = {
    "Johnson": {
        "medical_notes": "A 60-year-old male patient, Mr. Johnson, presented with symptoms of increased thirst, frequent urination, fatigue, and unexplained weight loss. Upon evaluation, he was diagnosed with diabetes, confirmed by elevated blood sugar levels. Mr. Johnson's weight is 210 lbs. He has been prescribed Metformin to be taken twice daily with meals. It was noted during the consultation that the patient is a current smoker. ",
        "golden_answer": {
            "age": 60,
            "gender": "male",
            "diagnosis": "diabetes",
            "weight": 210,
            "smoking": "yes",
        },
    },
    "Smith": {
        "medical_notes": "Mr. Smith, a 55-year-old male patient, presented with severe joint pain and stiffness in his knees and hands, along with swelling and limited range of motion. After a thorough examination and diagnostic tests, he was diagnosed with arthritis. It is important for Mr. Smith to maintain a healthy weight (currently at 150 lbs) and quit smoking, as these factors can exacerbate symptoms of arthritis and contribute to joint damage.",
        "golden_answer": {
            "age": 55,
            "gender": "male",
            "diagnosis": "arthritis",
            "weight": 150,
            "smoking": "yes",
        },
    },
}

In [4]:
def run_mistral(user_message, model="mistral-large-latest"):
    """Send one user message to a Mistral chat model and return the reply text.

    Args:
        user_message: Prompt text, sent as the only (user-role) message.
        model: Name of the Mistral model to query.

    Returns:
        The message content of the first choice in the chat response. JSON
        output is requested through ``response_format``, so callers are
        expected to parse the result with ``json.loads``.

    Raises:
        RuntimeError: If the ``MISTRAL_API_KEY`` environment variable is
            unset or empty.
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        # Fail fast with an actionable message instead of letting a missing
        # key surface later as an opaque error from the remote call.
        raise RuntimeError(
            "MISTRAL_API_KEY is not set; export it or add it to your .env file."
        )

    client = Mistral(api_key=api_key)
    messages = [{"role": "user", "content": user_message}]
    chat_response = client.chat.complete(
        model=model,
        messages=messages,
        response_format={"type": "json_object"},
    )
    return chat_response.choices[0].message.content


# Prompt template for the extraction task. `{medical_notes}` is the only
# placeholder and is filled in with str.format(); the doubled braces `{{` / `}}`
# are format escapes that render as literal `{` / `}` in the schema text.
# NOTE(review): the schema text ends with a trailing comma after the "smoking"
# entry, so it is not strictly valid JSON — confirm whether that is intentional.
prompt_template = """
Extract information from the following medical notes:
{medical_notes}

Return json format with the following JSON schema: 

{{
        "age": {{
            "type": "integer"
        }},
        "gender": {{
            "type": "string",
            "enum": ["male", "female", "other"]
        }},
        "diagnosis": {{
            "type": "string",
            "enum": ["migraine", "diabetes", "arthritis", "acne", "common cold"]
        }},
        "weight": {{
            "type": "integer"
        }},
        "smoking": {{
            "type": "string",
            "enum": ["yes", "no"]
        }},
        
}}
"""

In [5]:
import json

def compare_json_objects(obj1, obj2):
    """Return the percentage of fields on which two JSON-like dicts agree.

    A field counts as identical when its key is present in both dicts and the
    two values compare equal. The score is taken over the union of keys, so a
    field that is missing from either side lowers the score instead of being
    ignored (previously a reply containing a single correct field scored 100%).

    Args:
        obj1: First dict (e.g. the parsed model response).
        obj2: Second dict (e.g. the golden answer).

    Returns:
        A float between 0 and 100. Two empty dicts score 0.0.
    """
    keys1, keys2 = set(obj1), set(obj2)
    identical_fields = sum(obj1[key] == obj2[key] for key in keys1 & keys2)
    # max(..., 1) avoids dividing by zero when both dicts are empty.
    total_fields = max(len(keys1 | keys2), 1)
    return (identical_fields / total_fields) * 100

In [6]:
accuracy_rates = []

# Score every test case: fill the template, query the model, then compare
# the parsed JSON reply with that case's golden answer.
for name, case in prompts.items():
    user_message = prompt_template.format(medical_notes=case["medical_notes"])
    response = json.loads(run_mistral(user_message))
    accuracy_rates.append(compare_json_objects(response, case["golden_answer"]))

# Mean accuracy (in percent) across all test cases
sum(accuracy_rates) / len(accuracy_rates)

100.0

## Another Mistral Use-case

In [7]:
# Extra dependencies for the aspect-extraction section that follows.
#!pip -q install langchain
!pip -q install huggingface_hub
!pip -q install  git+https://github.com/huggingface/transformers # need to install from github
!pip -q install kor
#!pip install pandas --upgrade
# NOTE(review): pathlib ships with the Python 3 standard library — confirm
# that this install is actually needed.
!pip install Pathlib
# NOTE(review): the lines above use `!pip` while this one uses `%pip`; also the
# `pip show protobuf` output later in the notebook reports 3.20.3, not the
# 3.20.0 pinned here — confirm which version is intended.
%pip install protobuf==3.20.0

[0mNote: you may need to restart the kernel to use updated packages.


In [8]:
!pip show langchain

Name: langchain
Version: 0.2.15
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/mattsalomon/miniconda3/lib/python3.10/site-packages
Requires: aiohttp, async-timeout, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-community


In [9]:
!pip show kor

Name: kor
Version: 2.0.0
Summary: Extract information with LLMs from text
Home-page: https://www.github.com/eyurtsev/kor
Author: Eugene Yurtsev
Author-email: eyurtsev@gmail.com
License: MIT
Location: /Users/mattsalomon/miniconda3/lib/python3.10/site-packages
Requires: langchain-core, pandas, pydantic
Required-by: 


In [10]:
!pip show protobuf

Name: protobuf
Version: 3.20.3
Summary: Protocol Buffers
Home-page: https://developers.google.com/protocol-buffers/
Author: 
Author-email: 
License: BSD-3-Clause
Location: /Users/mattsalomon/miniconda3/lib/python3.10/site-packages
Requires: 
Required-by: googleapis-common-protos, onnxruntime, opentelemetry-proto, tensorboard, tensorflow, tensorflow-datasets, tensorflow-metadata


In [11]:
import warnings
warnings.filterwarnings("ignore")

import torch
from dotenv import load_dotenv
import os
import textwrap
import enum
import re
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

from operator import itemgetter
from typing import List, Union, Optional

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import GenerationConfig

import langchain
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.prompts.chat import ChatPromptTemplate,SystemMessagePromptTemplate,AIMessagePromptTemplate,HumanMessagePromptTemplate
from langchain.prompts import StringPromptTemplate

from langchain import LLMChain
from langchain.llms import HuggingFaceHub, HuggingFacePipeline
from langchain.schema.runnable import ConfigurableField

from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number
from pydantic import BaseModel, Field, validator
from kor import extract_from_documents, from_pydantic

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Load environment variables (python-dotenv; presumably from a local .env file).
load_dotenv()
# Show up to 500 characters per column so long review texts are not truncated.
pd.set_option('display.max_colwidth', 500)
# Copy the token stored under `hf_token` to HUGGINGFACEHUB_API_TOKEN.
# Raises KeyError if `hf_token` is not defined in the environment.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.environ['hf_token']

In [12]:
# Pick the compute device: CUDA first, then the MPS backend, otherwise CPU.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
device

'mps'

In [13]:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# print(device)

## Load Dataset
**Restaurant reviews annotated with the following aspects:**

- Ambience
- Anecdotes
- Food
- Price
- Staff
- Miscellaneous

In [14]:
review_domain = 'restaurant'

# Root folder that holds one sub-folder per review domain. Set the
# DATASETS_DIR environment variable to point somewhere else; the default is
# the original local path, so existing setups keep working unchanged.
data_dir = Path(os.environ.get('DATASETS_DIR', '/Users/mattsalomon/Downloads/datasets/'))
# Folder with this domain's files (test.txt and test_label.txt are read from it).
input_dir = data_dir / review_domain

In [15]:
def load_dataset(filename):
    """Read a text file with one entry per line into a single-column DataFrame.

    Prints the number of lines read as a side effect.

    Args:
        filename: Path (str or Path) to a UTF-8 encoded text file.

    Returns:
        A DataFrame with one row per line, newline characters stripped from
        both ends. The column is named 'labels' when 'test_label' occurs in
        the path, otherwise 'text_org'.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, 'r', encoding='utf-8') as f:
        sentences = [line.strip('\n') for line in f]
    print('Total Reviews: ', len(sentences))

    col = 'labels' if 'test_label' in str(filename) else 'text_org'
    return pd.DataFrame({col: sentences})

In [16]:
df_label = load_dataset(input_dir/'test_label.txt')
print(df_label.shape)

# Every row is a space-separated list of labels. explode() flattens the
# per-row lists in a single pass; summing the lists would concatenate them
# one by one, which is quadratic in the number of rows.
all_labels = list(set(df_label['labels'].str.split(' ').explode()))
print(all_labels)
drop_labels = ['Positive', 'Neutral']
all_labels = sorted([label for label in all_labels if label not in drop_labels])

# Boolean mask of rows whose label string mentions any dropped label. It stays
# available as `drop_index` so the same rows can be removed from the review text.
drop_index = pd.Series(False, index=df_label.index)
for label in drop_labels:
    drop_index = drop_index | df_label['labels'].str.contains(label)

print(all_labels)

df_label = df_label[~drop_index].copy()
print(df_label.shape)

# One 0/1 indicator column per remaining label
for label in all_labels:
    df_label[label] = np.where(df_label['labels'].str.contains(label), 1, 0)
print(df_label.shape)

df_label.head()

Total Reviews:  3328
(3328, 1)
['Staff', 'Neutral', 'Ambience', 'Food', 'Miscellaneous', 'Anecdotes', 'Price', 'Positive']
['Ambience', 'Anecdotes', 'Food', 'Miscellaneous', 'Price', 'Staff']
(3315, 1)
(3315, 7)


Unnamed: 0,labels,Ambience,Anecdotes,Food,Miscellaneous,Price,Staff
0,Food Ambience,1,0,1,0,0,0
1,Staff,0,0,0,0,0,1
2,Ambience,1,0,0,0,0,0
3,Miscellaneous,0,0,0,1,0,0
4,Miscellaneous,0,0,0,1,0,0


In [17]:
# Load the raw review text, one review per line.
# NOTE(review): assumes test.txt rows line up with test_label.txt rows — confirm.
df = load_dataset(input_dir/'test.txt')
print(df.shape)

# Remove the same rows that were dropped from df_label so both frames stay aligned.
df = df[~drop_index].copy()
print(df.shape)

# Column-wise join on the shared index: review text next to its labels and indicator columns.
df = pd.concat([df, df_label], axis=1)

df.head()

Total Reviews:  3328
(3328, 1)
(3315, 1)


Unnamed: 0,text_org,labels,Ambience,Anecdotes,Food,Miscellaneous,Price,Staff
0,Always a fun place ... the food is deeelish !,Food Ambience,1,0,1,0,0,0
1,"The staff is n't the friendliest or most competent , and I am stickler for service , but everything else about this place makes up for it .",Staff,0,0,0,0,0,1
2,"Great for groups , great for a date , great for early brunch or a nightcap .",Ambience,1,0,0,0,0,0
3,Another great place to take out-of-towners !,Miscellaneous,0,0,0,1,0,0
4,: ),Miscellaneous,0,0,0,1,0,0


In [18]:
# Share of reviews tagged with each aspect (mean of the 0/1 indicator columns).
df[all_labels].mean()

Ambience         0.113424
Anecdotes        0.129412
Food             0.370136
Miscellaneous    0.276621
Price            0.089894
Staff            0.166817
dtype: float64

## Setup LLM: GPT-2 (`openai-community/gpt2`)

In [19]:
# !pip install transformers==4.20
# %pip install sentencepiece


In [62]:
# repo_id = 'distilbert/distilgpt2'
repo_id = 'openai-community/gpt2'
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype="auto", low_cpu_mem_usage=True)
# torch_dtype and low_cpu_mem_usage control how model weights are loaded; they
# do not apply to a tokenizer, so it is loaded without them.
tokenizer = AutoTokenizer.from_pretrained(repo_id)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [63]:
# Load model directly
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# repo_id = 'google-t5/t5-base'
# tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
# model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

In [64]:
# Build a text-generation pipeline around the model and tokenizer loaded above.
pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_new_tokens = 50,  # cap on tokens generated per call
    device=device,
    pad_token_id=tokenizer.eos_token_id,  # reuse the end-of-sequence token id for padding
    # NOTE(review): temperature only has an effect when sampling is enabled —
    # confirm the effective do_sample setting for this model/pipeline.
    temperature=0.01,
)

# Wrap the pipeline so it can be passed as the LLM to the extraction chain.
llm = HuggingFacePipeline(pipeline=pipe)

In [65]:
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x3c43b93c0>)

## Setup Aspect Extractor

In [66]:
# Closed set of aspect labels the extractor may return. The values match the
# label names found in the dataset's label file.
class Aspects(enum.Enum):
    staff = "Staff"
    ambience = "Ambience"
    anecdotes = "Anecdotes"
    food = "Food"
    price = 'Price'
    miscellaneous = 'Miscellaneous'
    
# Extraction schema: a review maps to a list of Aspects values.
class ReviewABSA(BaseModel):
    # `description` is an f-string, so it is evaluated once at class-definition
    # time using the module-level `review_domain`.
    # `examples` pairs a sample review sentence with its expected aspect label.
    # NOTE(review): `many=True` is not a standard pydantic Field argument —
    # presumably it is read by kor's from_pydantic; confirm.
    aspect: List[Aspects] = Field(
        description=f"The key features that customers are talking about in their {review_domain} reviews.",
        examples=[
            ("The Singapore Mai Fun had NO curry flavor whatsoever.", "Food"),
            ("Go for it !", "Miscellaneous"),
            ("While the staff at this little bistro is very friendly , I have never experienced more incompetency.", "Staff"),
            ("Went on a double date with friend and his girlfriend for a few drinks and appetizers .", "Anecdotes"),
            ("When you enter , you feel like you have entered your mom 's kitchen .", "Ambience"),
            ("Pricey , but worth a try , at least once .", "Price")
        ],
        many=True,
    )

In [67]:
# Custom instruction prompt for the extraction chain. The text frames the task
# as a system turn and then opens a user turn, using <|im_start|> / <|im_end|>
# markers. `type_description` and `format_instructions` are the two template
# inputs.
# NOTE(review): the user turn is left open here; the caller is expected to
# append the closing <|im_end|> after the review text — confirm.
instruction_template = PromptTemplate(
    input_variables=["format_instructions", "type_description"],
    template=(
        "<|im_start|>system\nYour goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.\n\n"
        "{type_description}\n<|im_end|>\n<|im_start|>user\n{format_instructions}\n"
    ),
)

In [68]:
#from langchain.chains import create_extraction_chain
# Derive the kor schema and its validator from the ReviewABSA pydantic model.
# NOTE(review): this rebinds the name `validator`, which the imports cell also
# imports from pydantic — confirm the pydantic decorator is not needed afterwards.
schema, validator = from_pydantic(ReviewABSA)   
# Build the extraction chain from the LLM, the schema, the validator, a JSON
# encoder and the custom instruction template.
chain = create_extraction_chain(
    llm, schema, validator=validator, 
    encoder_or_encoder_class="json",
    instruction_template=instruction_template
)

In [69]:
print(chain.get_prompts()[0].format_prompt(text="[user input]").to_string())

<|im_start|>system
Your goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.

```TypeScript

reviewabsa: { // 
 aspect: Array<"Staff" | "Ambience" | "Anecdotes" | "Food" | "Price" | "Miscellaneous"> // The key features that customers are talking about in their restaurant reviews.
}
```

<|im_end|>
<|im_start|>user
Please output the extracted information in JSON format. Do not output anything except for the extracted information. Do not add any clarifying information. Do not add any fields that are not in the schema. If the text contains attributes that do not appear in the schema, please ignore them. All output must be in JSON format and follow the schema specified above. Wrap the JSON in <json> tags.


Input: The Singapore Mai Fun had NO curry flavor whatsoever.
Output: <json>{"re

In [70]:
#print(chain.prompt.format_prompt(text="[user input]").to_string())

## Aspect Extraction

In [71]:
%%time
sample = 100
reviews = df['text_org'].values.tolist()
reviews_sampled=reviews[0:sample]
responses = []
aspects = []
    
for text in tqdm(reviews_sampled):
    response = chain.invoke(input=text+'<|im_end|>\n')
    responses.append(response)
    aspects.append(response['data']['reviewabsa']['aspect'] if len(response['data'].keys())>0 
                   else ['Miscellaneous'])


all_labels_sampled=all_labels[0:sample]
aspects_sampled=aspects[0:sample]

len(reviews_sampled), len(responses), len(aspects)

100%|█████████████████████████████████████████| 100/100 [06:32<00:00,  3.92s/it]

CPU times: user 6min, sys: 31.1 s, total: 6min 31s
Wall time: 6min 32s





(100, 100, 100)

In [72]:
# Each prediction is a list of aspect names; join them into one string per review.
predicted_text = pd.Series([' '.join(aspect) for aspect in aspects_sampled])

# One 0/1 indicator column per label, followed by the joined prediction string.
df_test_pred = pd.DataFrame({
    f'predicted_{label}': np.where(predicted_text.str.contains(label), 1, 0)
    for label in all_labels_sampled
})
df_test_pred['predicted_labels'] = predicted_text

# Reuse the index of the sampled reviews so the frames can be joined column-wise.
df_test_pred.index = df.iloc[0:sample].index
print(df_test_pred.shape)

(100, 7)


## Evaluate Results

In [73]:
# Put the sampled reviews (text + true labels) next to their predictions.
df_final = pd.concat([df.iloc[0:sample], df_test_pred], axis=1)
print(df_final.shape)

# Column order: text and label strings, true indicators, predicted indicators.
predicted_columns = [f'predicted_{label}' for label in all_labels_sampled]
df_final = df_final[
    ['text_org', 'labels', 'predicted_labels', *all_labels_sampled, *predicted_columns]
]
df_final.head()

(100, 15)


Unnamed: 0,text_org,labels,predicted_labels,Ambience,Anecdotes,Food,Miscellaneous,Price,Staff,predicted_Ambience,predicted_Anecdotes,predicted_Food,predicted_Miscellaneous,predicted_Price,predicted_Staff
0,Always a fun place ... the food is deeelish !,Food Ambience,Ambience,1,0,1,0,0,0,1,0,0,0,0,0
1,"The staff is n't the friendliest or most competent , and I am stickler for service , but everything else about this place makes up for it .",Staff,Ambience,0,0,0,0,0,1,1,0,0,0,0,0
2,"Great for groups , great for a date , great for early brunch or a nightcap .",Ambience,Ambience,1,0,0,0,0,0,1,0,0,0,0,0
3,Another great place to take out-of-towners !,Miscellaneous,Ambience,0,0,0,1,0,0,1,0,0,0,0,0
4,: ),Miscellaneous,Miscellaneous,0,0,0,1,0,0,0,0,0,1,0,0


In [74]:
interested_labels = ['Ambience', 'Food', 'Price', 'Staff']

# (caption, scorer) pairs, printed in this order for every label
scorers = (
    ('Accuracy: ', accuracy_score),
    ('F1 Score: ', f1_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
)

for label in interested_labels:
    print('Label: ', label)
    y_true, y_pred = df_final[label], df_final[f'predicted_{label}']
    for caption, scorer in scorers:
        print(caption, np.round(scorer(y_true, y_pred), 3))
    print('***************')

Label:  Ambience
Accuracy:  0.26
F1 Score:  0.302
Precision:  0.18
Recall:  0.941
***************
Label:  Food
Accuracy:  0.76
F1 Score:  0.0
Precision:  0.0
Recall:  0.0
***************
Label:  Price
Accuracy:  0.97
F1 Score:  0.0
Precision:  0.0
Recall:  0.0
***************
Label:  Staff
Accuracy:  0.81
F1 Score:  0.0
Precision:  0.0
Recall:  0.0
***************


In [75]:
other_labels = ['Anecdotes', 'Miscellaneous']

# (caption, scorer) pairs, printed in this order for every label
scorers = (
    ('Accuracy: ', accuracy_score),
    ('F1 Score: ', f1_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
)

for label in other_labels:
    print('Label: ', label)
    y_true, y_pred = df_final[label], df_final[f'predicted_{label}']
    for caption, scorer in scorers:
        print(caption, np.round(scorer(y_true, y_pred), 3))
    print('***************')

Label:  Anecdotes
Accuracy:  0.82
F1 Score:  0.0
Precision:  0.0
Recall:  0.0
***************
Label:  Miscellaneous
Accuracy:  0.72
F1 Score:  0.364
Precision:  0.727
Recall:  0.242
***************
