# Milestone 1: Data Ingestion System (Final Testing Phase)



In [1]:
# 1. Install System Dependencies for OCR
# (system-level package; presumably the OCR engine used by pytesseract in read_image — confirm)
!sudo apt-get install tesseract-ocr

# 2. Install Python Packages
# pdfplumber, pytesseract and pandas are imported by the ingestion cell below.
# NOTE(review): kaggle is installed here but is not imported by any visible cell — confirm it is still needed.
!pip install pdfplumber pytesseract pandas kaggle

# Note: The improved classes (Interpreter, Risk Engine) are defined
# directly in the cells below for better visibility and to ensure the latest fixes are applied.

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


## Part 1: System Development & Calibration (We used CBC, Bajaj medical, and other datasets to design our extraction logic.)


## Part 2: System Verification
We use **GitHub Lab Data** (CSV) and **Kaggle CBC** (Images) to verify the pipeline. (We tested our logic on unseen datasets to ensure robustness.)

## Part 3: Manual Upload & Verification
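Upload your own files (PDF, image, or CSV) to test the system. Note: JSON and other file types are not parsed by `CommonInterpreter.read_file` and are reported as "No structured data found".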

In [7]:
# MILESTONE 1: DATA INGESTION & VALIDATION LAYER
import pdfplumber
import pandas as pd
import numpy as np
import re
from google.colab import files
try:
    from PIL import Image, ImageEnhance
    import pytesseract
except ImportError:
    pass

# --- 1. PREPROCESSOR (Data Cleaning) ---
class Preprocessor:
    """Renames lab-report column headers to the canonical parameter names.

    The canonical names (e.g. ``'Haemoglobin'``, ``'RBC Count'``) are the keys
    used by ``DataValidator.rules``; columns that are not recognised are left
    untouched apart from whitespace stripping.
    """

    def __init__(self):
        # Lower-cased alias -> canonical parameter name.
        self.column_mapping = {
            'hb': 'Haemoglobin', 'hemoglobin': 'Haemoglobin', 'hgb': 'Haemoglobin',
            'platelets': 'Platelets', 'platelet count': 'Platelets', 'plt': 'Platelets',
            'wbc': 'White Blood Cells', 'white blood cells': 'White Blood Cells',
            'rbc': 'RBC Count', 'red blood cells': 'RBC Count',
            'pcv': 'Packed Cell Volume', 'hct': 'Packed Cell Volume',
            'mcv': 'Mean Corpuscular Volume', 'mch': 'Mean Corpuscular Hemoglobin',
            'mchc': 'Mean Corpuscular Hemoglobin Concentration',
            'glucose': 'Glucose', 'fbs': 'Glucose',
            'cholesterol': 'Cholesterol'
        }
        # Accept the canonical names themselves in any letter case
        # (e.g. 'haemoglobin', 'RBC COUNT'); existing aliases win on conflict.
        for canonical in list(self.column_mapping.values()):
            self.column_mapping.setdefault(canonical.lower(), canonical)

    def normalize_column_names(self, df):
        """Return a copy of ``df`` whose known columns carry canonical names.

        Args:
            df: DataFrame whose column labels are to be normalised.

        Returns:
            ``df`` itself when it is empty; otherwise a new DataFrame with every
            column label converted to ``str``, stripped of surrounding
            whitespace and, if it matches an alias case-insensitively, replaced
            by the canonical name. The input frame is not modified.
        """
        if df.empty:
            return df
        stripped = [str(col).strip() for col in df.columns]
        renamed = [self.column_mapping.get(name.lower(), name) for name in stripped]
        # Work on a copy so the caller's frame keeps its original labels.
        normalized = df.copy()
        normalized.columns = renamed
        return normalized

# --- 2. VALIDATOR (to accept Dynamic Rules) ---
class DataValidator:
    """Flags parameter values that fall outside a reference range."""

    def __init__(self):
        # Default reference range per canonical parameter name.
        self.rules = {
            "Haemoglobin": {"min": 12.0, "max": 17.0, "unit": "g/dL"},
            "White Blood Cells": {"min": 4000, "max": 11000, "unit": "/cumm"},
            "RBC Count": {"min": 3.8, "max": 5.8, "unit": "mill/cumm"},
            "Platelets": {"min": 150000, "max": 450000, "unit": "/cumm"},
            "Packed Cell Volume": {"min": 36, "max": 50, "unit": "%"},
            "Mean Corpuscular Volume": {"min": 80, "max": 100, "unit": "fL"},
            "Mean Corpuscular Hemoglobin": {"min": 27, "max": 32, "unit": "pg"},
            "Mean Corpuscular Hemoglobin Concentration": {"min": 32, "max": 36, "unit": "g/dL"},
            "Glucose": {"min": 70, "max": 140, "unit": "mg/dL"},
            "Cholesterol": {"min": 0, "max": 200, "unit": "mg/dL"}
        }

    def check_standard_ranges(self, df, rules=None):
        """Annotate each row of ``df`` with its out-of-range parameters.

        Args:
            df: DataFrame whose columns use the canonical parameter names.
            rules: Optional mapping ``name -> {"min": ..., "max": ...}``;
                defaults to ``self.rules``.

        Returns:
            ``df`` itself when it is empty; otherwise a copy with an extra
            ``validation_errors`` column holding ``"<name> out of range"``
            entries joined by ``"; "``, or the string ``"None"`` when the row
            has no violations. Missing and non-numeric cells are skipped.
        """
        if rules is None:
            rules = self.rules  # Use default if not provided

        if df.empty:
            return df
        validated_df = df.copy()
        errors = []
        for _, row in validated_df.iterrows():
            row_errors = []
            for col, rule in rules.items():
                if col not in row or not pd.notna(row[col]):
                    continue
                try:
                    val = float(row[col])
                except (TypeError, ValueError):
                    # Non-numeric cell (e.g. free text): nothing to range-check.
                    continue
                if val < rule['min'] or val > rule['max']:
                    row_errors.append(f"{col} out of range")
            errors.append("; ".join(row_errors) if row_errors else "None")
        validated_df['validation_errors'] = errors
        return validated_df

# --- 3. INTERPRETER (OCR + REGEX PARSING) ---
class CommonInterpreter:
    """Reads a report file (CSV, PDF or image) into a pandas DataFrame.

    CSV files are loaded as-is. PDF pages and images are converted to text and
    the known parameters are extracted with regular expressions, yielding one
    row per PDF page / image that contained at least one recognised value.
    """

    # A value as printed on a report: either digit groups separated by commas
    # ("250,000", "1,50,000") or a plain number, each with optional decimals.
    _NUMBER = r"(\d{1,3}(?:,\d{2,3})+(?:\.\d+)?|\d+(?:\.\d+)?)"
    # Whitespace, colons or dashes between a label and its value.
    _SEPARATOR = r"[\s\:\-]+"
    # Canonical parameter name -> regex for the labels that introduce it.
    # The aliases mirror the ones accepted by Preprocessor.column_mapping.
    _LABELS = {
        "Haemoglobin": r"(?:Ha?emoglobin|Hgb|Hb)",
        "White Blood Cells": r"(?:WBC|White\s+Blood\s+Cells?)(?:\s+Count)?",
        "RBC Count": r"(?:RBC|Red\s+Blood\s+Cells?)(?:\s+Count)?",
        "Platelets": r"(?:Platelets?|PLT)(?:\s+Count)?",
        "Packed Cell Volume": r"(?:PCV|HCT)",
        "Mean Corpuscular Volume": r"MCV",
        # Trailing \b keeps "MCH" from matching the start of "MCHC".
        "Mean Corpuscular Hemoglobin": r"MCH\b",
        "Mean Corpuscular Hemoglobin Concentration": r"MCHC",
        "Glucose": r"(?:Glucose|FBS)",
        "Cholesterol": r"Cholesterol",
    }

    def read_file(self, file_path):
        """Dispatch on the file extension (case-insensitive).

        Returns:
            The parsed DataFrame for ``.csv``, ``.pdf``, ``.jpg``, ``.jpeg``
            and ``.png`` files; an empty DataFrame for any other extension.
        """
        lowered = file_path.lower()
        if lowered.endswith('.csv'):
            return pd.read_csv(file_path)
        if lowered.endswith('.pdf'):
            return self.read_pdf(file_path)
        if lowered.endswith(('.jpg', '.png', '.jpeg')):
            return self.read_image(file_path)
        return pd.DataFrame()

    def read_pdf(self, file_path):
        """Extract one row of parameters per PDF page that yields any values."""
        rows = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    parsed = self.parse_text_to_data(text)
                    if parsed:
                        rows.append(parsed)
        # Build the frame once instead of concatenating single-row frames.
        return pd.DataFrame(rows) if rows else pd.DataFrame()

    def read_image(self, file_path):
        """OCR an image and extract its parameters as a single-row DataFrame.

        Returns an empty DataFrame when OCR fails or no parameter is found; an
        OCR failure is additionally reported through ``warnings.warn`` so that
        it is not mistaken for a page without data.
        """
        import warnings

        try:
            # Grayscale + High Contrast for better OCR
            with Image.open(file_path) as source:  # closes the file handle
                img = source.convert('L')
            img = ImageEnhance.Contrast(img).enhance(2.0)
            # Binarise: pixels brighter than 200 become white, the rest black.
            img = img.point(lambda x: 255 if x > 200 else 0, mode='1')
            text = pytesseract.image_to_string(img)
        except Exception as exc:
            # Best effort: keep the pipeline running, but say why there is no data
            # (also covers PIL/pytesseract being unavailable).
            warnings.warn(f"OCR failed for {file_path}: {exc}")
            return pd.DataFrame()

        parsed = self.parse_text_to_data(text)
        return pd.DataFrame([parsed]) if parsed else pd.DataFrame()

    def parse_text_to_data(self, text):
        """Pull the first occurrence of each known parameter out of ``text``.

        Args:
            text: Raw text of a report page.

        Returns:
            Dict mapping canonical parameter names to ``float`` values. Commas
            used as thousands separators are removed before conversion.
            Parameters that do not occur are absent from the dict.
        """
        data = {}
        if not text:
            return data
        for param, label in self._LABELS.items():
            # Leading \b stops a label matching inside a longer token (e.g. "nRBC").
            pattern = r"\b" + label + self._SEPARATOR + self._NUMBER
            match = re.search(pattern, text, flags=re.IGNORECASE)
            if match:
                data[param] = float(match.group(1).replace(',', ''))
        return data

# --- EXECUTION ---
print("Milestone 1 System Ready. Upload files...")
uploaded = files.upload()

# Build the pipeline stages once, outside the loop, so that they exist even
# when no file is uploaded (later cells reuse these names).
interpreter = CommonInterpreter()
preprocessor = Preprocessor()
validator = DataValidator()

for filename in uploaded.keys():
    print(f"\nProcessing {filename}...")
    try:
        # Pipeline: read -> normalise column names -> range-check
        df = interpreter.read_file(filename)
        df_norm = preprocessor.normalize_column_names(df)
        df_val = validator.check_standard_ranges(df_norm)

        if not df_val.empty and 'validation_errors' in df_val.columns:
            # Show the validation verdict as the first column.
            display(df_val[['validation_errors'] + [c for c in df_val.columns if c != 'validation_errors']].head())
            print("✅ Data Extracted and Validated.")
        else:
            print("⚠️ No structured data found (Header/Cover Page).")

    except Exception as e:
        # Keep going with the remaining files; report what went wrong for this one.
        print(f"❌ Error: {e}")

Milestone 1 System Ready. Upload files...


Saving AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png to AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7 (2).png
Saving BDCBC7196_Hematology_Dataset.csv to BDCBC7196_Hematology_Dataset (2).csv
Saving Blood_report_pdf_1.pdf to Blood_report_pdf_1.pdf
Saving Blood_report_pdf_4.pdf to Blood_report_pdf_4 (1).pdf
Saving BLR-0425-PA-0037318_SASHANK P K 0037318 2 OF 2_28-04-2025_1007-19_AM@E.pdf_page_29.png to BLR-0425-PA-0037318_SASHANK P K 0037318 2 OF 2_28-04-2025_1007-19_AM@E.pdf_page_29 (1).png

Processing AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7 (2).png...


Unnamed: 0,validation_errors,Haemoglobin
0,Haemoglobin out of range,910.0


✅ Data Extracted and Validated.

Processing BDCBC7196_Hematology_Dataset (2).csv...


Unnamed: 0,validation_errors,Gender,Age,Haemoglobin,RBC Count,White Blood Cells,Platelets,LYMP,MONO,Packed Cell Volume,Mean Corpuscular Volume,Mean Corpuscular Hemoglobin,Mean Corpuscular Hemoglobin Concentration,RDW,PDW,MPV,PCT,Diagnosis
0,White Blood Cells out of range,0,45,12.1,4.25,12300,404000.0,29.0,4.6,36.2,85.2,28.4,33.4,14.0,13.6,10.2,0.41,Anemia of Chronic Disease
1,White Blood Cells out of range,0,58,12.3,4.34,12000,392000.0,30.0,5.1,37.1,85.5,28.3,33.1,14.0,13.8,10.2,0.39,Anemia of Chronic Disease
2,White Blood Cells out of range,0,49,12.6,4.35,11300,387000.0,23.5,7.0,38.2,87.9,28.9,32.9,14.1,14.9,10.7,0.41,Anemia of Chronic Disease
3,Packed Cell Volume out of range,0,43,12.0,4.3,5000,298000.0,43.1,6.5,35.8,83.4,27.9,33.5,13.7,15.3,8.5,0.254,Anemia of Chronic Disease
4,Haemoglobin out of range; Packed Cell Volume o...,0,29,11.4,4.36,8720,267000.0,31.1,5.9,35.1,80.4,26.1,32.5,14.0,15.6,8.3,0.222,Anemia of Chronic Disease


✅ Data Extracted and Validated.

Processing Blood_report_pdf_1.pdf...


Unnamed: 0,validation_errors,Mean Corpuscular Volume,Mean Corpuscular Hemoglobin,Mean Corpuscular Hemoglobin Concentration
0,Mean Corpuscular Hemoglobin Concentration out ...,88.0,27.9,31.7


✅ Data Extracted and Validated.

Processing Blood_report_pdf_4 (1).pdf...


Unnamed: 0,validation_errors,Mean Corpuscular Volume,Mean Corpuscular Hemoglobin,Mean Corpuscular Hemoglobin Concentration
0,,87.7,29.3,33.4


✅ Data Extracted and Validated.

Processing BLR-0425-PA-0037318_SASHANK P K 0037318 2 OF 2_28-04-2025_1007-19_AM@E.pdf_page_29 (1).png...
⚠️ No structured data found (Header/Cover Page).


In [8]:
# MILESTONE 2: ADVANCED RISK & PATTERN RECOGNITION (WITH SCORING)


class BiomarkerCorrelationEngine:
    """Matches out-of-range biomarkers against multi-marker risk patterns.

    Each pattern lists its markers, the number of markers that must deviate
    (``required``), a per-marker weight for scoring, and the ``direction`` in
    which a deviation counts: ``"high"`` (above the reference maximum),
    ``"low"`` (below the minimum) or ``"any"``. A pattern without a
    ``direction`` key is treated as ``"any"``.
    """

    # A marker counts as a match only when it deviates by more than 5%.
    MATCH_THRESHOLD = 0.05
    # Deviation magnitudes are capped at 200% for scoring.
    DEVIATION_CAP = 2.0

    def __init__(self):
        self.patterns = {
            "metabolic_syndrome": {
                "name": "Metabolic Syndrome",
                "markers": ["Glucose", "Cholesterol"],
                "required": 2,
                "weights": [0.6, 0.4], # Glucose is weighted higher
                "direction": "high",   # only elevated values are evidence
                "significance": "Insulin resistance, cardiovascular risk"
            },
            "anemia": {
                "name": "Anemia (RBC Deficiency)",
                "markers": ["Haemoglobin", "RBC Count", "Packed Cell Volume"],
                "required": 2,
                "weights": [0.5, 0.3, 0.2], # Hb is most important
                "direction": "low",    # only reduced values are evidence
                "significance": "Low oxygen-carrying capacity, fatigue"
            },
            "infection": {
                "name": "Active Infection",
                "markers": ["White Blood Cells", "Neutrophils"],
                "required": 1,
                "weights": [1.0, 0.0],
                "direction": "high",   # only elevated values are evidence
                "significance": "Immune response activation"
            }
        }

    def _signed_deviation(self, value, rule):
        """Relative distance of ``value`` outside ``rule``'s range.

        Negative below ``rule['min']``, positive above ``rule['max']``, 0.0
        inside the range. The magnitude is relative to the violated bound and
        capped at ``DEVIATION_CAP``.
        """
        if value < rule['min']:
            bound, gap, sign = rule['min'], rule['min'] - value, -1.0
        elif value > rule['max']:
            bound, gap, sign = rule['max'], value - rule['max'], 1.0
        else:
            return 0.0
        # A bound of 0 (e.g. Cholesterol min) has no relative scale; use the cap
        # instead of dividing by zero.
        magnitude = gap / bound if bound else self.DEVIATION_CAP
        return sign * min(magnitude, self.DEVIATION_CAP)

    @staticmethod
    def _directional_magnitude(deviation, direction):
        """Portion of a signed deviation that counts for ``direction``."""
        if direction == "high":
            return max(deviation, 0.0)
        if direction == "low":
            return max(-deviation, 0.0)
        return abs(deviation)

    def analyze_risk(self, biomarkers, rules):
        """1. Calculate Deviations, 2. Match Patterns, 3. Calculate Risk Score

        Args:
            biomarkers: Mapping of canonical parameter name to numeric value.
            rules: Mapping of parameter name to a dict with ``min`` and ``max``.
                Biomarkers without a rule are ignored.

        Returns:
            A list with one dict per detected pattern, with keys ``Pattern``,
            ``Significance``, ``Evidence`` (matched markers with a signed
            percentage, ``+`` above range / ``-`` below range), ``Score``
            (0-100) and ``Category`` (``"Low"``, ``"Moderate"`` or ``"High"``).
        """
        deviations = {
            m: self._signed_deviation(v, rules[m])
            for m, v in biomarkers.items() if m in rules
        }

        found_patterns = []
        for cfg in self.patterns.values():
            direction = cfg.get('direction', 'any')
            weights = cfg['weights']

            # 1. Check Matches (only deviations in the pattern's direction count)
            details = []
            score_accum = 0.0
            weight_accum = 0.0
            for i, m in enumerate(cfg['markers']):
                if m not in deviations:
                    continue
                magnitude = self._directional_magnitude(deviations[m], direction)
                if magnitude > self.MATCH_THRESHOLD:
                    sign = '+' if deviations[m] > 0 else '-'
                    details.append(f"{m} ({sign}{magnitude:.1%})")
                # Every measured marker takes part in the weighted average,
                # including those that are within range (magnitude 0).
                w = weights[i] if i < len(weights) else 0.0
                score_accum += magnitude * w
                weight_accum += w

            if len(details) < cfg['required']:
                continue

            # 2. Weighted Risk Score (0-100): deviation of 0.5 (50%) -> Score 50
            final_score = (score_accum / weight_accum) * 100 if weight_accum > 0 else 0
            final_score = min(final_score, 100) # Cap at 100

            # 3. Categorize
            if final_score > 50:
                risk_cat = "High"
            elif final_score > 20:
                risk_cat = "Moderate"
            else:
                risk_cat = "Low"

            found_patterns.append({
                "Pattern": cfg['name'],
                "Significance": cfg['significance'],
                "Evidence": ", ".join(details),
                "Score": final_score,
                "Category": risk_cat
            })
        return found_patterns

# MODEL 3: CONTEXTUAL ANALYSIS (Adjust Rules)
class ContextualAnalysisModel:
    """Personalises reference ranges using the patient's age and gender."""

    def adjust_rules(self, base_rules, age, gender):
        """Returns a NEW set of rules modified for Age/Gender.

        Args:
            base_rules: Mapping of parameter name to a flat rule dict
                (``min``/``max``/``unit``). It is not modified.
            age: Patient age in years, or ``None`` when unknown.
            gender: Free-text gender; values starting with ``f``/``m``
                (any case, surrounding whitespace ignored) select the female /
                male haemoglobin range. Anything else, including ``None``,
                leaves the base range in place.

        Returns:
            A new dict whose rule dicts are copies of those in ``base_rules``.
        """
        # One-level copy per rule; sufficient because rule dicts are flat.
        adj_rules = {k: v.copy() for k, v in base_rules.items()}

        # 1. Gender Adjustments (Haemoglobin)
        gender_key = str(gender or "").strip().lower()
        if 'Haemoglobin' in adj_rules:
            new_hb = adj_rules['Haemoglobin']
            if gender_key.startswith('f'): # Female
                new_hb['min'], new_hb['max'] = 12.0, 15.5
            elif gender_key.startswith('m'): # Male
                new_hb['min'], new_hb['max'] = 13.5, 17.5

        # 2. Age Adjustments (Glucose tolerance decreases with age)
        if 'Glucose' in adj_rules and age is not None and age > 60:
            glucose = adj_rules['Glucose']
            # Elderly allow higher glucose: only ever raise the limit, never
            # lower a base limit that is already above 140.
            # NOTE(review): with DataValidator's default rules the Glucose max is
            # already 140, so this step changes nothing — confirm the intended
            # elderly threshold.
            glucose['max'] = max(glucose.get('max', 140), 140)

        return adj_rules


# EXECUTION
print("Running Milestone 2 Risk Analysis (with Scoring) on previous files...\n")
risk_engine = BiomarkerCorrelationEngine()

# Create the pipeline stages here instead of relying on names that only exist
# if the Milestone 1 loop ran at least once. `uploaded` still comes from the
# Milestone 1 cell.
interpreter = CommonInterpreter()
preprocessor = Preprocessor()
validator = DataValidator()

rules = validator.rules

# Re-process the uploaded files from M1
for filename in uploaded.keys():
    try:
        df = interpreter.read_file(filename)
        df_norm = preprocessor.normalize_column_names(df)
        df_val = validator.check_standard_ranges(df_norm) # Check against standard first

        # Convert DataFrame to Dict for Engine.
        # Only the FIRST row of each file is analysed; non-numeric cells are skipped.
        biomarkers = {}
        for col in df_val.columns:
            if col in rules:
                val = pd.to_numeric(df_val[col], errors='coerce').iloc[0]
                if pd.notna(val):
                    biomarkers[col] = float(val)

        # Files without any recognised biomarker are skipped silently.
        if biomarkers:
            print(f"📊 Analyzing {filename}...")
            risks = risk_engine.analyze_risk(biomarkers, rules)
            if risks:
                for r in risks:
                    print(f"  🚨 DETECTED: {r['Pattern']}")
                    print(f"     Significance: {r['Significance']}")
                    print(f"     Risk Score: {r['Score']:.1f}/100 ({r['Category']})")
                    print(f"     Evidence: {r['Evidence']}")
            else:
                print("  ✅ No significant disease patterns detected.")
            print("-" * 40)

    except Exception as e:
        print(f"Skipping {filename}: {e}")


Running Milestone 2 Risk Analysis (with Scoring) on previous files...

📊 Analyzing AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7 (2).png...
  ✅ No significant disease patterns detected.
----------------------------------------
📊 Analyzing BDCBC7196_Hematology_Dataset (2).csv...
  🚨 DETECTED: Active Infection
     Significance: Immune response activation
     Risk Score: 11.8/100 (Low)
     Evidence: White Blood Cells (+11.8%)
----------------------------------------
📊 Analyzing Blood_report_pdf_1.pdf...
  ✅ No significant disease patterns detected.
----------------------------------------
📊 Analyzing Blood_report_pdf_4 (1).pdf...
  ✅ No significant disease patterns detected.
----------------------------------------


In [13]:
# FINAL EXECUTION: REAL FILE ANALYSIS + PATIENT CONTEXT


# 1. Setup Models
interpreter = CommonInterpreter()
preprocessor = Preprocessor()
validator = DataValidator()
context_model = ContextualAnalysisModel()
risk_engine = BiomarkerCorrelationEngine()

# 2. Get Context (Because the extracted text might not strictly parse Age/Gender yet)
print("--- 🏥 PATIENT CONTEXT SETUP ---")
try:
    p_age = int(input("Enter Patient Age: "))
    p_gender = input("Enter Patient Gender (Male/Female): ").strip()
except Exception:
    # Fall back to defaults on bad or unavailable input. `Exception` (not a bare
    # except) lets KeyboardInterrupt through so the prompt can be cancelled.
    p_age = 30; p_gender = "Female"
    print(f"(Invalid input, defaulting to: {p_age} Years, {p_gender})")

# Adjust Medical Rules based on Context (Model 3), e.g. gender-specific
# Haemoglobin range. The patient context is the same for every file, so this
# is computed once, outside the loop.
active_rules = context_model.adjust_rules(validator.rules, p_age, p_gender)

# 3. Analyze ACTUAL Uploaded Files
print(f"\n--- 📂 ANALYZING REPORTS FOR: {p_age} Year Old {p_gender} ---")
uploaded = files.upload() # Comment this line out to reuse the files uploaded in an earlier cell

for filename in uploaded.keys():
    print(f"\n📄 Processing: {filename}")
    try:
        # Step A: Extract Data from Report (Milestone 1)
        df = interpreter.read_file(filename)
        df = preprocessor.normalize_column_names(df)

        if df.empty:
            print("   ⚠️ No readable clinical data (Header/Cover Page).")
            continue

        # Step B: Interpret Parameters (Model 1)
        # Check the extracted file data against the PERSONALIZED rules
        df_val = validator.check_standard_ranges(df, active_rules)

        # Display Individual Parameter Results.
        # Only the FIRST row of the file is analysed; a parameter is used when
        # that row's value is numeric, regardless of the other rows.
        biomarkers = {}
        for col in df.columns:
            if col not in active_rules:
                continue
            first_value = pd.to_numeric(df[col], errors='coerce').iloc[0]
            if pd.isna(first_value):
                continue
            val = float(first_value)
            biomarkers[col] = val

            # Check status based on rules
            rule = active_rules[col]
            status = "Normal"
            if val < rule['min']: status = f"LOW (<{rule['min']})"
            elif val > rule['max']: status = f"HIGH (>{rule['max']})"

            print(f"   ▪ {col}: {val} -> {status}")

        # Step C: Pattern Recognition (Model 2)
        # Look for combinations like Anemia/Infection
        patterns = risk_engine.analyze_risk(biomarkers, active_rules)
        if patterns:
            for p in patterns:
                print(f"   🚨 RISK PATTERN: {p}")
        else:
            print("   ✅ No complex risk patterns detected.")

    except Exception as e:
        print(f"   ❌ Error: {e}")

--- 🏥 PATIENT CONTEXT SETUP ---
Enter Patient Age: 45
Enter Patient Gender (Male/Female): male

--- 📂 ANALYZING REPORTS FOR: 45 Year Old male ---


Saving BLR-0425-PA-0038965_BIPUL CHAKRABORTY 0038965 2 OF 2_28-04-2025_1014-26_AM.pdf_page_7.png to BLR-0425-PA-0038965_BIPUL CHAKRABORTY 0038965 2 OF 2_28-04-2025_1014-26_AM.pdf_page_7 (2).png

📄 Processing: BLR-0425-PA-0038965_BIPUL CHAKRABORTY 0038965 2 OF 2_28-04-2025_1014-26_AM.pdf_page_7 (2).png
   ▪ Glucose: 126.0 -> Normal
   ✅ No complex risk patterns detected.
