In [0]:
%pip install openai==1.12.0
%pip install tiktoken==0.6.0
%pip install langchain==0.1.16
%pip install langchain-openai==0.0.36
%pip install langchain-community==0.0.33
%pip install mlflow==2.12.1

In [0]:
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter

In [0]:
# Read the raw clinical-notes CSV into a DataFrame
file_path = (
    "/Volumes/ads-predictive-analytics/default/test/clinical_notes_dataset.csv"
)
df = pd.read_csv(filepath_or_buffer=file_path)

In [0]:
# Build one retrievable text per patient, formatted as "Patient <id>: <note>".
# Rows with a missing note are skipped: casting NaN to str would otherwise put
# the literal text "nan" into the vector store as if it were a clinical note.
notes_present = df.dropna(subset=["Notes"])
documents = (
    "Patient "
    + notes_present["Patient_ID"].astype(str)
    + ": "
    + notes_present["Notes"].astype(str)
).tolist()

In [0]:
!pip install sentence-transformers # Initialize embeddings and vector database

embeddings = HuggingFaceEmbeddings()
vectorstore = FAISS.from_texts(documents, embeddings)
retriever = vectorstore.as_retriever()

In [0]:
# Initialize RAG summarization model
# The completion-style `OpenAI` wrapper needs a completions model id; "gpt-3.5"
# is not one. Credentials are expected from the environment (OPENAI_API_KEY).
llm = OpenAI(model_name="gpt-3.5-turbo-instruct")

# RetrievalQA cannot be constructed from `llm=` directly: it requires a
# combine-documents chain. The from_chain_type factory builds that chain
# ("stuff" = retrieved documents are placed into a single prompt).
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [0]:
# Create function to summarize notes using RAG
def summarize_text_rag(text):
    """Summarize one clinical note through the retrieval-augmented QA chain.

    Args:
        text: The clinical note. Anything that is not a non-blank string
            (None, NaN, "", whitespace only) is treated as a missing note.

    Returns:
        The chain's response for the summarization query, or the literal
        "No summary available" when the note is missing.
    """
    # Blank notes carry nothing to summarize; skip the LLM call so the model
    # is not asked to summarize an empty prompt.
    if not isinstance(text, str) or not text.strip():
        return "No summary available"

    query = f"Summarize the following clinical note: {text}"
    return qa_chain.run(query)


In [0]:
# Generate a RAG summary for every note and store it alongside the source row
df["Generated_Summary"] = df["Notes"].map(summarize_text_rag)

In [0]:
# Persist the notes together with their generated summaries (row index omitted)
output_file = (
    "/Volumes/ads-predictive-analytics/default/test/clinical_notes_with_rag_summaries.csv"
)
df.to_csv(path_or_buf=output_file, index=False)

In [0]:
# Report the path held in `output_file` so the saved CSV is easy to locate
print(f"Summarized clinical notes saved to {output_file}")

In [0]:
import pandas as pd
from rouge_score import rouge_scorer

# Reload the summarized notes from disk so evaluation runs on the saved CSV
file_path = (
    "/Volumes/ads-predictive-analytics/default/test/clinical_notes_with_rag_summaries.csv"
)
df = pd.read_csv(filepath_or_buffer=file_path)

# ROUGE scorer for unigram, bigram and longest-common-subsequence overlap,
# with stemming enabled
scorer = rouge_scorer.RougeScorer(
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)

# Define the per-row ROUGE scoring helper
def evaluate_summary(reference, generated):
    """Compute ROUGE F-measures between a reference text and a generated summary.

    Args:
        reference: Reference text (here, the original clinical note).
        generated: Generated summary to score against the reference.

    Returns:
        dict with keys "ROUGE-1", "ROUGE-2" and "ROUGE-L", each mapping to the
        F-measure of that metric. When either input is not a string, every
        value is NaN so the row is skipped by pandas' default mean instead of
        aborting the whole evaluation.
    """
    # After a CSV round-trip, empty cells come back as NaN floats; the scorer
    # expects text, so such rows are reported as "not scored".
    if not (isinstance(reference, str) and isinstance(generated, str)):
        not_scored = float("nan")
        return {
            "ROUGE-1": not_scored,
            "ROUGE-2": not_scored,
            "ROUGE-L": not_scored,
        }

    scores = scorer.score(reference, generated)
    return {
        "ROUGE-1": scores["rouge1"].fmeasure,
        "ROUGE-2": scores["rouge2"].fmeasure,
        "ROUGE-L": scores["rougeL"].fmeasure,
    }

# Score every row: the original note is the reference, the generated summary
# is the candidate
results = df.apply(
    lambda row: evaluate_summary(row["Notes"], row["Generated_Summary"]),
    axis=1,
)

# Expand the per-row score dicts into one column per metric
df_results = pd.DataFrame(list(results))

# Attach each metric column to the main frame
for metric in ("ROUGE-1", "ROUGE-2", "ROUGE-L"):
    df[metric] = df_results[metric]

# Write the notes, summaries and their scores to a new CSV (row index omitted)
output_file = (
    "/Volumes/ads-predictive-analytics/default/test/evaluated_summaries.csv"
)
df.to_csv(path_or_buf=output_file, index=False)

# Report the mean of each metric across all rows
print("Average ROUGE Scores:")
print(df_results.mean())
