# CBC IMAGE PREDICTOR (Using Saved Model + Raw Data for Normalization)

# --- 1) Install OCR Tools ---

In [1]:
print("Installing OCR tools...")
!sudo apt-get install tesseract-ocr > /dev/null
!pip install pytesseract > /dev/null
print("Done.")

Installing OCR tools...
Done.


# --- 2) Imports ---

In [2]:
import os
import re
import joblib
import numpy as np
import pandas as pd
import pytesseract
try:
    from PIL import Image
except ImportError:
    import Image
from sklearn.preprocessing import StandardScaler

# --- 3) Configuration ---

In [3]:
# Saved joblib bundle; the loading cell reads the keys 'pipeline',
# 'label_encoder' and 'feature_columns' from it.
# NOTE(review): the path is under /content/drive, but no drive.mount() call is
# visible in this notebook -- presumably the mount happens elsewhere; confirm.
MODEL_FILE = "/content/drive/MyDrive/ML for CBC Project/best_model_pipeline.joblib"
# Raw CBC table; used only to fit the StandardScaler, not to train the model.
RAW_DATA_FILE = "/content/cbc_dataframe.csv"
# Report image that is passed to run_diagnosis().
IMAGE_FILE = "/content/ubnormal.jpg"

# --- 4) Load Resources ---

In [4]:
print("\n--- Loading Model and Data ---")

# A. Load the Saved Model
# The bundle is a dict holding the fitted pipeline, the label encoder and the
# ordered list of feature columns the pipeline expects.
# NOTE: joblib.load unpickles arbitrary Python objects, so MODEL_FILE must come
# from a trusted source.
try:
    saved_bundle = joblib.load(MODEL_FILE)
    model_pipeline = saved_bundle['pipeline']
    label_encoder = saved_bundle['label_encoder']
    required_features = saved_bundle['feature_columns']
    print("Model loaded successfully")
    print(f"Model expects {len(required_features)} features: {required_features}")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# B. Create the "Translator" (Scaler) using Raw Data
# We DO NOT train the model. We just learn the Mean/Std Dev to normalize inputs.
print("   Building Normalizer (Scaler) from raw data...")
try:
    df_raw = pd.read_csv(RAW_DATA_FILE)

    # 1. Compute NLR (neutrophil / lymphocyte ratio) from the raw counts.
    #    It is always recomputed here, with 0 wherever the lymphocyte count is
    #    not positive.
    ne_col = next((c for c in ('NE#', 'NE') if c in df_raw.columns), None)
    ly_col = next((c for c in ('LY#', 'LY') if c in df_raw.columns), None)
    if ne_col is None or ly_col is None:
        # Without both source columns there is nothing to divide; the previous
        # scalar fallback (0 / 0 on plain ints) raised ZeroDivisionError here.
        df_raw['NLR'] = 0.0
    else:
        ne = df_raw[ne_col]
        ly = df_raw[ly_col]
        df_raw['NLR'] = np.where(ly > 0, ne / ly, 0)

    # 2. Select ONLY the columns the model needs.
    #    A column absent from the raw data cannot contribute statistics, so it
    #    is filled with a constant 0: StandardScaler then learns mean 0 and,
    #    because the variance is zero, a scale of 1 -- i.e. that feature passes
    #    through transform() unchanged instead of being shifted by an
    #    unrelated number.
    for col in required_features:
        if col not in df_raw.columns:
            print(f"   Warning: '{col}' missing in raw data. Filling with 0 (feature will not be scaled).")
            df_raw[col] = 0.0

    X_raw_for_scaling = df_raw[required_features]

    # 3. Fit the Scaler
    scaler = StandardScaler()
    scaler.fit(X_raw_for_scaling)
    print("Normalizer ready. (Learned stats from raw CSV)")

except Exception as e:
    print(f"Error building normalizer: {e}")
    print("Please ensure 'cbc_dataframe.csv' is uploaded.")
    raise


--- Loading Model and Data ---
Model loaded successfully
Model expects 21 features: ['WBC', 'LY%', 'MO%', 'NE%', 'EO%', 'BA%', 'LY#', 'MO#', 'NE#', 'EO#', 'BA#', 'RBC', 'HGB', 'HCT', 'MCV', 'MCHC', 'MCH', 'RDW', 'PLT', 'MPV', 'NLR']
   Building Normalizer (Scaler) from raw data...
Normalizer ready. (Learned stats from raw CSV)


# --- 5) The OCR Function ---

In [10]:
def extract_data_from_image(image_path, feature_list):
    """Run Tesseract OCR on a CBC report image and parse numeric values from it.

    Args:
        image_path: Path of the image file to scan.
        feature_list: Names of the features to look for. 'NLR' is skipped
            (the caller derives it); a feature with no entry in the synonym
            table is searched for under its own name.

    Returns:
        A dict mapping every requested feature except 'NLR' to the parsed
        float, or to None when no label/value pair was found in the text.
        Returns None if opening the image or running OCR raises.
    """
    print(f"\n--- Scanning Image: {image_path} ---")
    try:
        # Context manager closes the underlying file handle once OCR is done.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img)
    except Exception as e:
        print(f"OCR Error: {e}")
        return None

    # Expanded Synonyms Dictionary to catch full names
    synonyms = {
        # White Blood Cells
        'WBC': ['WBC', 'WHITE BLOOD', 'LEUKOCYTE', 'TOTAL LEUCOCYTIC COUNT', 'TLC'],
        'LY%': ['LY%', 'LYM%', 'LYMPHOCYTE %', 'LYMPHOCYTES %'],
        'MO%': ['MO%', 'MON%', 'MONOCYTE %', 'MONOCYTES %'],
        'NE%': ['NE%', 'NEU%', 'NEUTROPHIL %', 'NEUTROPHILS %', 'POLY %', 'SEGMENTED'],
        'EO%': ['EO%', 'EOS%', 'EOSINOPHIL %', 'EOSINOPHILS %'],
        'BA%': ['BA%', 'BAS%', 'BASOPHIL %', 'BASOPHILS %'],

        # Absolute Counts
        'LY#': ['LY#', 'LYM#', 'ABS LYMPH', 'ABSOLUTE LYMPHOCYTE'],
        'MO#': ['MO#', 'MON#', 'ABS MONO', 'ABSOLUTE MONOCYTE'],
        'NE#': ['NE#', 'NEU#', 'ABS NEUT', 'ABSOLUTE NEUTROPHIL'],
        'EO#': ['EO#', 'EOS#', 'ABS EOS', 'ABSOLUTE EOSINOPHIL'],
        'BA#': ['BA#', 'BAS#', 'ABS BASO', 'ABSOLUTE BASOPHIL'],

        # Red Blood Cells
        'RBC': ['RBC', 'RED BLOOD', 'ERYTHROCYTE', 'TOTAL RBC'],
        'HGB': ['HGB', 'HEMOGLOBIN', 'HAEMOGLOBIN'], # 'HAEMOGLOBIN' is common in non-US reports
        'HCT': ['HCT', 'HEMATOCRIT', 'PACKED CELL VOLUME', 'PCV'],

        # Indices
        'MCV': ['MCV', 'MEAN CORPUSCULAR VOLUME'],
        'MCH': ['MCH', 'MEAN CORPUSCULAR HEMOGLOBIN'],
        'MCHC': ['MCHC', 'MEAN CORP. HEM. CONC'],
        'RDW': ['RDW', 'RED CELL DISTRIBUTION'],

        # Platelets
        'PLT': ['PLT', 'PLATELET', 'THROMBOCYTE', 'PLATELET COUNT'],
        'MPV': ['MPV', 'MEAN PLATELET VOLUME']
    }

    extracted = {}
    text_upper = text.upper()

    # The label must not be glued to a preceding letter/digit, so "RBC" is not
    # picked up from inside "NRBC".
    label_guard = r"(?<![A-Z0-9])"
    # Optional separators, then a number that contains at least one digit and
    # at most one decimal point. Handles "WBC: 8.5", "WBC 8.5", "WBC - 8.5";
    # an OCR glitch such as "8.5.2" yields "8.5", and a stray "." after the
    # label is never mistaken for a value.
    value_part = r"[\s:\.-]*(\d+(?:\.\d+)?)"

    for feature in feature_list:
        if feature == 'NLR':
            continue  # Calculated later

        val = None
        for term in synonyms.get(feature, [feature]):
            # The text is upper-cased, so the search term is upper-cased too.
            pattern = label_guard + re.escape(term.upper()) + value_part
            match = re.search(pattern, text_upper)
            if match:
                # The capture group always holds a valid float literal.
                val = float(match.group(1))
                print(f"  Found {feature}: {val}")
                break

        extracted[feature] = val

    return extracted

# --- 6) The Main Application ---

In [11]:
def run_diagnosis(image_path):
    """OCR a CBC report image, normalize the values and print the prediction.

    Uses the notebook-level `required_features`, `scaler`, `model_pipeline`
    and `label_encoder` objects. Features that OCR could not read are replaced
    by the scaler's per-column mean, and the imputed features are listed so
    the reader can judge how much of the input is real. If no value at all
    could be read, no prediction is made.

    Args:
        image_path: Path of the report image.

    Returns:
        None. Results and errors are printed.
    """
    if not os.path.exists(image_path):
        print("Error: Image file not found.")
        return

    # A. Extract
    data = extract_data_from_image(image_path, required_features)
    if not data:
        return

    # A dict whose values are all None is truthy, so check explicitly:
    # predicting from nothing but imputed means would be meaningless.
    if all(value is None for value in data.values()):
        print("Error: No CBC values could be read from the image; skipping prediction.")
        return

    # B. Calc Hidden Feature (NLR = NE# / LY#, only when both are usable)
    ne = data.get('NE#')
    ly = data.get('LY#')
    if ne is not None and ly is not None and ly > 0:
        data['NLR'] = ne / ly
        print(f"  Calculated NLR: {data['NLR']:.2f}")
    else:
        data['NLR'] = None

    # C. Prepare DataFrame
    input_df = pd.DataFrame([data])

    # Ensure all columns exist
    for col in required_features:
        if col not in input_df.columns:
            input_df[col] = np.nan

    # Reorder to the column order the scaler and model were fitted with
    input_df = input_df[required_features]

    # Force everything to be a number (None / unparsable -> NaN)
    input_df = input_df.apply(pd.to_numeric, errors='coerce')

    # D. Normalize
    print("\n--- Normalizing Data ---")
    try:
        # Explicit float copy: safe to modify in place, independent of input_df.
        input_values = input_df.to_numpy(dtype=float, copy=True)

        # Fill NaNs with column means from the scaler, and say which ones.
        missing_mask = np.isnan(input_values)
        if missing_mask.any():
            imputed = [col for col, is_missing in zip(required_features, missing_mask[0]) if is_missing]
            print(f"  Warning: {len(imputed)} of {len(required_features)} features "
                  f"were not found and are replaced by training means: {imputed}")
            row_idx, col_idx = np.where(missing_mask)
            input_values[row_idx, col_idx] = np.take(scaler.mean_, col_idx)

        # Scale
        input_scaled = scaler.transform(input_values)

        # E. Predict
        print("--- Making Prediction ---")
        pred_idx = model_pipeline.predict(input_scaled)[0]
        diagnosis = label_encoder.inverse_transform([pred_idx])[0]

        # Probabilities (only if the model exposes them)
        if hasattr(model_pipeline, "predict_proba"):
            confidence = np.max(model_pipeline.predict_proba(input_scaled)) * 100
        else:
            confidence = 0.0

        print("\n" + "#"*40)
        print(f"  RESULT: {diagnosis}")
        if confidence > 0:
            print(f"  Confidence: {confidence:.2f}%")
        print("#"*40)

    except Exception as e:
        # Best-effort: report the failure instead of raising.
        print(f"Prediction Error: {e}")

In [12]:
# Run the whole flow (OCR -> NLR -> normalize -> predict) on the configured image.
run_diagnosis(IMAGE_FILE)


--- Scanning Image: /content/ubnormal.jpg ---
  Found HGB: 106.0
  Found MCV: 836.0
  Found MCH: 26.5
  Found PLT: 150.0
  Found MPV: 10.4

--- Normalizing Data ---
--- Making Prediction ---

########################################
  RESULT: Thrombocytopenia
  Confidence: 88.00%
########################################




In [14]:
# ==========================================================
#   UPDATED OCR DIAGNOSTIC CELL (Checks All Synonyms)
# ==========================================================
# Dumps the raw Tesseract text for one image and then reports, per feature,
# which synonym (if any) produced a numeric value.
import pytesseract
try:
    from PIL import Image
except ImportError:
    import Image
import re
import os

# --- CONFIGURATION ---
# Change this to match your uploaded image filename!
IMAGE_TO_CHECK = "/content/ubnormal.jpg"

if os.path.exists(IMAGE_TO_CHECK):
    print("--- 1. RAW TEXT DUMP (What Tesseract sees) ---")
    try:
        # Get the raw text; the context manager closes the image file afterwards.
        with Image.open(IMAGE_TO_CHECK) as img:
            raw_text = pytesseract.image_to_string(img)
        text_upper = raw_text.upper()

        # Print it with a border so you can see the quality
        print("--------------------------------------------------")
        print(raw_text.strip())
        print("--------------------------------------------------")

        print("\n--- 2. PARSING CHECK (Scanning for values...) ---")

        # The full dictionary of terms to search for
        synonyms = {
            # White Blood Cells
            'WBC': ['WBC', 'WHITE BLOOD', 'LEUKOCYTE', 'TOTAL LEUCOCYTIC COUNT', 'TLC'],
            'LY%': ['LY%', 'LYM%', 'LYMPHOCYTE %', 'LYMPHOCYTES %'],
            'MO%': ['MO%', 'MON%', 'MONOCYTE %', 'MONOCYTES %'],
            'NE%': ['NE%', 'NEU%', 'NEUTROPHIL %', 'NEUTROPHILS %', 'POLY %', 'SEGMENTED'],
            'EO%': ['EO%', 'EOS%', 'EOSINOPHIL %', 'EOSINOPHILS %'],
            'BA%': ['BA%', 'BAS%', 'BASOPHIL %', 'BASOPHILS %'],

            # Absolute Counts
            'LY#': ['LY#', 'LYM#', 'ABS LYMPH', 'ABSOLUTE LYMPHOCYTE'],
            'MO#': ['MO#', 'MON#', 'ABS MONO', 'ABSOLUTE MONOCYTE'],
            'NE#': ['NE#', 'NEU#', 'ABS NEUT', 'ABSOLUTE NEUTROPHIL'],
            'EO#': ['EO#', 'EOS#', 'ABS EOS', 'ABSOLUTE EOSINOPHIL'],
            'BA#': ['BA#', 'BAS#', 'ABS BASO', 'ABSOLUTE BASOPHIL'],

            # Red Blood Cells
            'RBC': ['RBC', 'RED BLOOD', 'ERYTHROCYTE', 'TOTAL RBC'],
            'HGB': ['HGB', 'HEMOGLOBIN', 'HAEMOGLOBIN'],
            'HCT': ['HCT', 'HEMATOCRIT', 'PACKED CELL VOLUME', 'PCV'],

            # Indices
            'MCV': ['MCV', 'MEAN CORPUSCULAR VOLUME'],
            'MCH': ['MCH', 'MEAN CORPUSCULAR HEMOGLOBIN'],
            'MCHC': ['MCHC', 'MEAN CORP. HEM. CONC'],
            'RDW': ['RDW', 'RED CELL DISTRIBUTION'],

            # Platelets
            'PLT': ['PLT', 'PLATELET', 'THROMBOCYTE', 'PLATELET COUNT'],
            'MPV': ['MPV', 'MEAN PLATELET VOLUME']
        }

        # The label must not be glued to a preceding letter/digit, so "RBC"
        # is not reported from inside "NRBC".
        label_guard = r"(?<![A-Z0-9])"
        # Optional separators, then a number with at least one digit and at
        # most one decimal point. Matches "WBC 8.5", "WBC: 8.5", "WBC - 8.5";
        # an OCR glitch such as "8.5.2" yields "8.5", and a stray "." after a
        # label is never reported as a value.
        value_part = r"[\s:\.-]*(\d+(?:\.\d+)?)"

        # Loop through every feature and try every synonym
        for feature, search_terms in synonyms.items():
            found_val = None
            matched_term = None

            for term in search_terms:
                pattern = label_guard + re.escape(term) + value_part
                match = re.search(pattern, text_upper)

                if match:
                    found_val = match.group(1)
                    matched_term = term
                    break # Stop searching synonyms for this feature

            # Print results
            if found_val:
                print(f"✅ {feature:<5} : Found '{found_val}' (Matched: '{matched_term}')")
            else:
                # If completely missing, show what we looked for
                print(f"❌ {feature:<5} : NOT FOUND (Checked: {search_terms})")

    except Exception as e:
        print(f"Error running OCR: {e}")
else:
    print(f"Error: File '{IMAGE_TO_CHECK}' not found. Please check the filename.")

--- 1. RAW TEXT DUMP (What Tesseract sees) ---
--------------------------------------------------
TUT TEST REPORT
Patient MR. TRIJUGEE NARAYAN SHUKLA. Reg. No. 2108121930
Age/Gender :68 Y/Male Reg. Date 30-Aug-2021
Ref. By SELF Report Date 30-Aug-2021
Associate SRN DIAGNOSTICS INDORE Laboratory

COMPLETE BLOOD COUNT (CBC)
Parameter Observed Value Unit Biological Reference interval
Hemoglobin 106 gat 13.0 - 17.0
RBC Count 4.00 millionfemm 4.6 - 6.2
Hematrocrit 33.4 % 40-54
mcv 836 i 80 - 96
McH 26.5 Pg 27-33
McHe 7 % 32-36
ROW: CV 142 % 11-16
RDW-SD 50.1 fl 35 - 56
PLATELET COUNT 150 10% iL 150 - 410
MPV 10.4 a 65-120
POW 265 2500 - 65.0
TOTAL COUNT (WBC), EDTA blood 863 10% 40-100
DIFFERENTIAL WBC COUNT (Manual By Microscopy)
Neutrophils (%) 80 % 38-70
Lymphocytes (%) cr) % 20-45
Monocytes (%) 03 % 2-8
Eosinophils (%) 02 % 1-4
Basophils (%) 00 % o-1
Neutrophils (Abs) 695 10% iL
Lymphocytes (Abs) 1.23 10%
Monocytes (Abs) 036 10%
Eosinophils (Abs) 0.02 10%L
Basophils (Abs) 0.07 Jemm,

In