# Installing Dependencies

In [11]:
!pip install pyPDF2
!pip install pytesseract pillow
!pip install reportlab



# API CONFIGURATION

In [10]:
# Configure the Gemini API client

import google.generativeai as genai
from google.colab import userdata

# Fetch the key from Colab's `userdata` store (imported above) instead of
# hard-coding it in the notebook, then hand it to the Gemini client.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key = GOOGLE_API_KEY)

# Text Extraction for PDF and Images

In [14]:
import PyPDF2
from PIL import Image
import pytesseract
import os

# function to extract text from PDF file
def extract_text_from_pdf(pdf_path):
  """Return the concatenated text of every page of the PDF at ``pdf_path``.

  Pages are read in order with ``PyPDF2.PdfReader``; the text of each page is
  followed by a newline character.
  """
  with open(pdf_path, "rb") as pdf_stream:
    pdf_reader = PyPDF2.PdfReader(pdf_stream)
    # Extraction happens while the file is still open.
    page_chunks = [pdf_page.extract_text() + "\n" for pdf_page in pdf_reader.pages]
  return "".join(page_chunks)

# function to extract text from Image
def extract_text_from_image(image_paths):
  """Run OCR on each image and return the recognised text per file.

  Args:
    image_paths: Iterable of image file paths.

  Returns:
    dict mapping the base filename of every path to its OCR text, or to
    ``"Error: <message>"`` when that image could not be opened or read.
    An empty input yields an empty dict. Paths that share a base filename
    overwrite each other because the base filename is the dictionary key.
  """
  result = {}
  for path in image_paths:
    try:
      # The context manager releases the file handle once OCR is done.
      with Image.open(path) as image:
        text = pytesseract.image_to_string(image)
      result[os.path.basename(path)] = text
    except Exception as e:
      # Best effort: record the failure and carry on with the remaining images.
      result[os.path.basename(path)] = f"Error: {e}"
  # Return only after every path was processed; returning from inside the
  # loop would stop after the first image and give None for an empty list.
  return result

## Summarizer + Key Terms from Report

In [15]:
# Function that calls the Gemini model to produce the summary and the key-terms explanation

def extract_summary_and_key_terms(text):
  """Ask the Gemini model for a summary and key-term explanation of ``text``.

  The extracted report text is appended to a fixed instruction prompt and sent
  to the "gemini-1.5-flash" model; the text of the model's response is returned.
  """
  gemini = genai.GenerativeModel("gemini-1.5-flash")
  instructions = f"Summarize and explain the following medical document. Also create seperate heading to explain all difficult medical terms present in image very briefly but in simple language. Give in depth summary and other headings such as at home cures which are not something risky but known ways such as suggested foods and diets. Ensure to add warning to seek medical professional advice before following any advice. If the text is not relevant to any medical report or is not related to medical information, decline providing the information very politely and ask user to upload health, fitness, medical reports or medicine related docs only. Properly format with both headings and key pointers and general paragraphs. Do not mention any names of doctors, patients, etc. Try to not use your own knowledge and reply mostly from pdf. Here is the text to summarize: \n\n {text}"
  return gemini.generate_content(instructions).text

In [28]:
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib.enums import TA_CENTER, TA_JUSTIFY
from reportlab.pdfgen import canvas
from reportlab.lib import colors
import os

def create_pdf_report(output_path, explanation_text, logo_path):
    """Render the explanation text into a styled A4 PDF report.

    The text is processed line by line with a light Markdown-like convention
    (all '*' characters are removed from the rendered text):

    * blank lines become vertical space;
    * lines starting with ``*`` followed by whitespace become bullets, with the
      part before the first ':' set in bold;
    * other lines ending in ':' (including bold ``**Heading:**`` lines) become
      sub-headings;
    * everything else becomes a justified paragraph.

    Args:
        output_path: Destination path of the generated PDF.
        explanation_text: Text to lay out; ``None`` is treated as empty.
        logo_path: Image drawn as a centred watermark on every page. The
            watermark is skipped when the path is empty or does not exist.
    """
    # Local import (stdlib) so this cell does not depend on another cell's imports.
    from xml.sax.saxutils import escape

    # --- Document setup ---
    doc = SimpleDocTemplate(
        output_path,
        pagesize=A4,
        rightMargin=40,
        leftMargin=40,
        topMargin=80,
        bottomMargin=50
    )

    # --- Styles ---
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle('CenterTitle', alignment=TA_CENTER, fontSize=18, spaceAfter=30))
    styles.add(ParagraphStyle('Justify', alignment=TA_JUSTIFY, leading=16))
    styles.add(ParagraphStyle('SubHeading', fontSize=14, spaceBefore=12, spaceAfter=6, leading=16))

    flowables = []

    # Add heading with more spacing
    flowables.append(Paragraph("<b>Clinically – Medical Report Analyzer</b>", styles['CenterTitle']))
    flowables.append(Spacer(1, 12))  # Extra space after heading

    # --- Parse explanation text ---
    for orig in (explanation_text or "").splitlines():
        line = orig.strip()

        # "* item" is a bullet, whereas "**Bold:**" is emphasis (typically a
        # heading) and must not be mistaken for a bullet.
        is_bullet = line.startswith('*') and line[1:2].isspace()

        # Paragraph interprets its text as XML-like markup, so literal '<', '>'
        # and '&' coming from the model (e.g. "<5.7%") have to be escaped.
        clean = escape(line.replace('*', '').strip())

        if not clean:
            # Blank line, or a line consisting only of asterisks.
            flowables.append(Spacer(1, 8))
            continue

        if is_bullet:
            term, colon, expl = clean.partition(':')
            if colon:
                flowables.append(Paragraph(f"• <b>{term.strip()}:</b> {expl.strip()}", styles['Justify']))
            else:
                flowables.append(Paragraph(f"• {clean}", styles['Justify']))
        elif clean.endswith(':'):
            flowables.append(Paragraph(f"<b>{clean}</b>", styles['SubHeading']))
        else:
            flowables.append(Paragraph(clean, styles['Justify']))

    flowables.append(PageBreak())

    # --- Watermark logo centered and light ---
    def draw_header(c: canvas.Canvas, doc):
        width, height = A4

        # Draw watermark-style logo (skipped when no usable logo path is given)
        if logo_path and os.path.exists(logo_path):
            watermark_width = 4 * inch
            watermark_height = 4 * inch
            c.saveState()
            c.translate((width - watermark_width) / 2, (height - watermark_height) / 2)
            c.setFillAlpha(0.2)  # Very light opacity
            c.drawImage(
                logo_path,
                0,
                0,
                width=watermark_width,
                height=watermark_height,
                mask='auto'
            )
            c.restoreState()

    # --- Build PDF ---
    doc.build(flowables, onFirstPage=draw_header, onLaterPages=draw_header)
    print(f"PDF successfully saved at: {output_path}")


In [29]:
# Accept either a list of image paths or a single PDF path, detect which one was given, and generate the PDF report accordingly

import os

def is_image_file(file_path):
    """Return True when ``file_path`` ends with a supported image extension (case-insensitive)."""
    image_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".tiff")
    lowered = file_path.lower()
    return any(lowered.endswith(suffix) for suffix in image_suffixes)

def is_pdf_file(file_path):
    """Return True when ``file_path`` ends with ".pdf" (case-insensitive)."""
    pdf_suffix = ".pdf"
    return file_path.lower()[-len(pdf_suffix):] == pdf_suffix

def process_medical_report(input_path,
                           output_path="/content/medical_report_clinically_test1.pdf",
                           logo_path="/content/logo.png"):
    """Summarize a medical report and write the summary to a PDF.

    Args:
        input_path: Either a list of image paths (each image is OCR'd and
            summarized separately) or a string path to a single PDF file.
        output_path: Destination PDF. When several images are processed, each
            one gets its own file derived from this path
            (``<base>_<index>_<image stem><ext>``) so that later reports do
            not overwrite earlier ones.
        logo_path: Watermark image passed through to ``create_pdf_report``.

    Prints the summary for each processed document; prints an error message
    and returns without output for unsupported input.
    """
    def _summarize_to_pdf(label, name, text, pdf_path):
        # Shared tail of both branches: summarize, echo, render.
        summary = extract_summary_and_key_terms(text)
        print(f"\n{label} for {name}:\n{summary}")
        create_pdf_report(pdf_path, summary, logo_path)

    if isinstance(input_path, list):
        if not input_path:
            print("Error: No image files provided.")
            return
        # Check if it's a list of images
        if not all(is_image_file(p) for p in input_path):
            print("Error: List contains non-image files.")
            return

        # `or {}` keeps this safe even if the OCR helper returns nothing.
        extracted_texts = extract_text_from_image(input_path) or {}
        several = len(extracted_texts) > 1
        base, ext = os.path.splitext(output_path)
        for index, (filename, text) in enumerate(extracted_texts.items(), start=1):
            if several:
                # One PDF per image; the index guarantees unique file names.
                stem = os.path.splitext(filename)[0]
                pdf_path = f"{base}_{index}_{stem}{ext}"
            else:
                pdf_path = output_path
            _summarize_to_pdf("Summary", filename, text, pdf_path)

    elif isinstance(input_path, str):
        # Check if it's a single PDF file
        if is_pdf_file(input_path):
            extracted_text = extract_text_from_pdf(input_path)
            _summarize_to_pdf("Report Brief", os.path.basename(input_path), extracted_text, output_path)
        else:
            print("Error: Input string is not a PDF file.")

    else:
        print("Unsupported input format.")



# # Test 1: Image list input
# image_list1 = ["/content/Image_test_for_medical_chatbot.png"]
# image_list2 = ["/content/medical_report_img-test1.png", "/content/medical_report_img-test2.png"]
# process_medical_report(image_list2)

# Test 2: PDF input — run the full pipeline (extract text, summarize, write PDF) on one report.
pdf_file = "/content/medical_report_test1.pdf"
# Not used in this run; presumably a non-medical document kept for checking the model's refusal path.
non_med_pdf_file2 = "/content/Nitya Social Media LOR Draft.pdf"
process_medical_report(pdf_file)


Report Brief for medical_report_test1.pdf:
The provided document is a comprehensive laboratory report detailing the results of various blood and urine tests conducted on a 41-year-old male patient.  The report includes a complete blood count (CBC), lipid profile, blood glucose and HbA1c levels, thyroid function tests, kidney function tests, liver function tests, iron studies, vitamin levels (B12 and D), homocysteine levels, prostate-specific antigen (PSA), IgE levels, HIV and Hepatitis B screening, and hemoglobin electrophoresis.

**Summary of Key Findings:**

The report reveals several notable findings that require further investigation and clinical correlation by a medical professional:


* **Elevated Fasting Blood Sugar and HbA1c:**  The fasting blood sugar (141.0 mg/dL) and HbA1c (7.10%) levels are significantly higher than the reference intervals, suggesting uncontrolled diabetes. This needs careful monitoring and management.


* **Elevated Homocysteine:** The homocysteine level 

# Summary Tester

In [7]:
# Test input paths: single PDFs and lists of images.
# Only `pdf_file` is used in this cell; the other paths are defined but not used here.
pdf_file = "/content/medical_report_test1.pdf"
non_med_pdf_file2 = "/content/Nitya Social Media LOR Draft.pdf"
image_list1 = ["/content/Image_test_for_medical_chatbot.png"]
image_list2 = ["/content/medical_report_img-test1.png","/content/medical_report_img-test2.png"]

# Extract the PDF text, then ask the model for the summary and key terms.
extracted_text = extract_text_from_pdf(pdf_file)
summary = extract_summary_and_key_terms(extracted_text)
print("Report Brief : \n", summary)

Report Brief : 
 This document is a comprehensive laboratory report detailing the results of various blood and urine tests conducted on a 41-year-old male patient.  The report covers a wide range of parameters, including complete blood count (CBC), lipid profile, blood glucose levels, thyroid function, kidney function, liver function, vitamin levels, and infectious disease markers.


**Summary of Key Findings:**

The patient's complete blood count shows a slightly elevated white blood cell count (10,570/cmm, slightly above the upper limit of normal), while other blood cell parameters (red blood cells, hemoglobin, hematocrit, etc.) are within the normal ranges.  His lipid profile reveals borderline high triglycerides (168 mg/dL), while other lipid values like cholesterol and HDL are within acceptable limits. The patient's fasting blood sugar is elevated (141.0 mg/dL), indicating potential hyperglycemia. His HbA1c level is 7.10%, which is significantly above the normal range, indicating 

In [9]:
# Summarize report images: OCR every image in the list, then summarize each image's text.

image_list1 = ["/content/Image_test_for_medical_chatbot.png"]
image_list2 = ["/content/medical_report_img-test1.png","/content/medical_report_img-test2.png"]
# Returns a dict keyed by base filename (OCR text, or an "Error: ..." string on failure).
extracted_texts = extract_text_from_image(image_list2)
for filename, text in extracted_texts.items():
    summary = extract_summary_and_key_terms(text)
    print(f"\nSummary for {filename}:\n{summary}")


Summary for medical_report_img-test1.png:
This document is a complete blood count (CBC) report for a 21-year-old male patient.  The report indicates potential anemia requiring further investigation.

**Summary of Findings:**

The patient's hemoglobin level (Hb) is 12.5 g/dL, which falls below the reference range of 13.0-17.0 g/dL, suggesting anemia.  Other blood indices, while mostly within normal ranges, support this suspicion:

* **Red Blood Cell (RBC) Count:** Slightly elevated at 5.2 million/µL.
* **Packed Cell Volume (PCV):** Slightly elevated at 57.5%.
* **Mean Corpuscular Volume (MCV):** Slightly below the normal range at 87.75 fL (normal range 83-101 fL), suggesting that the red blood cells may be smaller than normal (microcytic anemia), although this is a minor deviation and does not clearly confirm this type of anemia.
* **Mean Corpuscular Hemoglobin (MCH):** Within the normal range.
* **Mean Corpuscular Hemoglobin Concentration (MCHC):**  Within the normal range.
* **Red ce

# Explain Medical Terms in Report

In [None]:
def explain_medical_terms(text):
  """Ask the Gemini model to explain the difficult medical terms found in ``text``.

  The text is appended to a fixed instruction prompt and sent to the
  "gemini-1.5-flash" model; the text of the model's response is returned.
  """
  gemini = genai.GenerativeModel("gemini-1.5-flash")
  instructions = f"Explain all difficult medical terms present in image very briefly but in simple language. If the text is not relevant to any medical report or is not related to medical information, decline providing the summary very politely and ask user to upload health, fitness, medical reports or medicine related docs only. Here is the text: \n\n {text}"
  return gemini.generate_content(instructions).text

# Medical Terms Explain Tester

In [None]:
# PDF input: extract the report text and ask the model to explain its medical terms.

pdf_file = "/content/medical_report_test1.pdf"
# Defined but not used in this cell.
non_med_pdf_file2 = "/content/FeeReceipt.pdf"
extracted_text = extract_text_from_pdf(pdf_file)
explain_terms = explain_medical_terms(extracted_text)
print("Key terms : \n", explain_terms)

Key terms : 
 This report shows the results of a comprehensive blood and urine test. Here's a simplified explanation of the complex terms:

**Blood Tests:**

* **Complete Blood Count (CBC):** A basic blood test checking various blood components.
    * **Hemoglobin (Hb):** Protein in red blood cells carrying oxygen.
    * **RBC Count:** Number of red blood cells.
    * **Hematocrit:** Percentage of red blood cells in blood.
    * **MCV:** Average size of red blood cells.
    * **MCH:** Average amount of hemoglobin in each red blood cell.
    * **MCHC:** Average concentration of hemoglobin in red blood cells.
    * **RDW:** Variation in the size of red blood cells.
    * **WBC Count:** Number of white blood cells (fight infection).
    * **Neutrophils, Lymphocytes, Eosinophils, Monocytes, Basophils:** Types of white blood cells, each with a specific function.
    * **Platelet Count:** Number of platelets (help blood clot).
    * **MPV:** Average size of platelets.
    * **RBC, WBC, Plate

In [None]:

# Image input: OCR every image, then explain the medical terms of each image separately.
image_list1 = ["/content/Image_test_for_medical_chatbot.png"]
image_list2 = ["/content/medical_report_img-test1.png","/content/medical_report_img-test2.png"]
extracted_texts = extract_text_from_image(image_list2)
for filename, text in extracted_texts.items():
    # Pass this image's OCR text, not the whole {filename: text} dict.
    explain_terms = explain_medical_terms(text)
    print("Key terms : \n", explain_terms)

Key terms : 
 This is a blood test report.  Here's a simplified explanation of some terms:

* **Hemoglobin (Hb):**  The protein in red blood cells that carries oxygen.  The patient's level (125 g/dL) is high; the normal range is 13.0-17.0 g/dL.

* **RBC count:** Red blood cell count. The patient's count (52 million/cu mm) is high; the normal range is 45-55 million/cu mm.

* **PCV (Packed Cell Volume):** The percentage of red blood cells in the blood. The patient's PCV (57.5%) is high; the normal range is 40-50%.

* **MCV (Mean Corpuscular Volume):** The average size of red blood cells. The patient's MCV (87.75 fl) is slightly high; the normal range is 83-101 fl.

* **WBC count:** White blood cell count. The patient's count (9000/cu mm) is within the normal range (4900-11000/cu mm).  White blood cells fight infection.

* **Neutrophils, Lymphocytes, Eosinophils, Monocytes, Basophils:** Types of white blood cells.  The percentages given show the proportion of each type.

* **Platelet coun

# Chat with PDF

In [None]:
def chat_with_medical_report(text, question):
  """Ask the Gemini model to answer ``question`` based on the report ``text``.

  The question and the report text are inserted into a fixed instruction
  prompt and sent to the "gemini-1.5-flash" model; the text of the model's
  response is returned.
  """
  gemini = genai.GenerativeModel("gemini-1.5-flash")
  instructions = f"Answer question asked based on below medical related test briefly and in simple language. If any of the text or question is not relevant to any medical report or is not related to medical information, decline providing answer very politely and ask user to upload or ask questions related to health, fitness, medical reports or medicine only. here is the question \n\n {question} \n\n Here is the text: \n\n {text}"
  return gemini.generate_content(instructions).text

# Chat with PDF Tester

In [None]:
# PDF input: extract the report text and ask the model a question about it.

pdf_file = "/content/medical_report_test1.pdf"
# Defined but not used in this cell.
non_med_pdf_file2 = "/content/FeeReceipt.pdf"
extracted_text = extract_text_from_pdf(pdf_file)

# One on-topic and one off-topic question; only the relevant one is sent below.
question_relevant = "What are key abnormalities in the report?"
question_irrelevant = "When is Modi Ji's Birthday?"
answer = chat_with_medical_report(extracted_text,question_relevant)
print("Answer :\n",answer)

Answer :
 The report shows several abnormalities:

* **High Fasting Blood Sugar:**  The fasting blood sugar level of 141.0 mg/dL is above the normal range (74-106 mg/dL), suggesting hyperglycemia.  This, combined with the HbA1c result (see below), indicates a potential problem with blood sugar control.

* **High HbA1c:** The HbA1c level of 7.10% is significantly above the normal range for non-diabetics (<5.7%) and even above the good control range for diabetics (6.0-7.0%), suggesting poorly controlled diabetes.

* **High Homocysteine:** The homocysteine level of 23.86 micromol/L is well above the normal range (6.0-14.8 micromol/L), indicating hyperhomocysteinemia.  This is a risk factor for cardiovascular disease.

* **Low Vitamin B12:** The Vitamin B12 level is below the normal range (<148 pg/mL vs 187-833 pg/mL), suggesting a deficiency.

* **Elevated IgE:** The IgE level of 492.30 IU/mL is much higher than the normal range (0-87 IU/mL), suggesting a possible allergic condition.  Fur