# Milestone 1: Data Ingestion System (Final Testing Phase)



In [6]:
# 1. Install System Dependencies for OCR
!sudo apt-get install tesseract-ocr

# 2. Install Python Packages
!pip install pdfplumber pytesseract pandas kaggle

# 3. Unzip the project code
!unzip -o data_ingestion.zip

import pandas as pd
import os
from data_ingestion.interpreters import CommonInterpreter
from data_ingestion.validators import DataValidator
from data_ingestion.preprocessors import Preprocessor

# Initialize global components
interpreter = CommonInterpreter()
validator = DataValidator()
preprocessor = Preprocessor()

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.
Archive:  data_ingestion.zip
  inflating: data_ingestion/interpreters.py  
  inflating: data_ingestion/preprocessors.py  
  inflating: data_ingestion/validators.py  
 extracting: data_ingestion/__init__.py  
  inflating: data_ingestion/__pycache__/interpreters.cpython-314.pyc  
  inflating: data_ingestion/__pycache__/preprocessors.cpython-314.pyc  
  inflating: data_ingestion/__pycache__/validators.cpython-314.pyc  
  inflating: data_ingestion/__pycache__/__init__.cpython-314.pyc  


## Part 1: Training Phase (Data Download)
We use **Mendeley** and **Bajaj** datasets to define our schema and test OCR capabilities.


In [2]:
# Setup Kaggle for Bajaj (Training Data)
if os.path.exists('kaggle.json'):
    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    # Download Bajaj (Images for OCR training/testing)
    !kaggle datasets download -d dikshaasinghhh/bajaj --unzip -p datasets/bajaj
    print("Bajaj Dataset downloaded.")
else:
    print("INFO: kaggle.json not found. Skipping auto-download of Bajaj dataset.")


Dataset URL: https://www.kaggle.com/datasets/dikshaasinghhh/bajaj
License(s): unknown
Downloading bajaj.zip to datasets/bajaj
 96% 661M/692M [00:06<00:00, 58.9MB/s]
100% 692M/692M [00:06<00:00, 107MB/s] 
Bajaj Dataset downloaded.


## Part 2: Testing Phase
We use **GitHub Lab Data** (CSV) and **Kaggle CBC** (Images) to verify the pipeline.

In [3]:
# 1. Download GitHub Lab Data (CSV)
!wget -O datasets/lab_data.csv https://raw.githubusercontent.com/Bahmni/openmrs-data/master/demo-data/csv/lab_data.csv

# 2. Download Kaggle CBC (if API available)
if os.path.exists('kaggle.json'):
    !kaggle datasets download -d orvile/complete-blood-count-cbc-dataset --unzip -p datasets/cbc
    print("CBC Dataset downloaded.")

--2026-01-10 14:55:19--  https://raw.githubusercontent.com/Bahmni/openmrs-data/master/demo-data/csv/lab_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6265 (6.1K) [text/plain]
Saving to: ‘datasets/lab_data.csv’


2026-01-10 14:55:19 (46.0 MB/s) - ‘datasets/lab_data.csv’ saved [6265/6265]

Dataset URL: https://www.kaggle.com/datasets/orvile/complete-blood-count-cbc-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading complete-blood-count-cbc-dataset.zip to datasets/cbc
  0% 0.00/8.65M [00:00<?, ?B/s]
100% 8.65M/8.65M [00:00<00:00, 1.27GB/s]
CBC Dataset downloaded.


## Part 3: Manual Upload & Verification
Upload your own files (PDF, Image, CSV, JSON) to test the system.

In [7]:
from google.colab import files
import pdfplumber
import pandas as pd
import numpy as np
from collections import defaultdict

class CommonInterpreter:
    def read_file(self, file_path):
        if file_path.lower().endswith('.csv'):
            return self.read_csv(file_path)
        elif file_path.lower().endswith(('.json')):
            return self.read_json(file_path)
        elif file_path.lower().endswith(('.pdf')):
            return self.read_pdf(file_path)
        elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
            return self.read_image(file_path)
        else:
            raise ValueError(f"Unsupported file type: {file_path}")

    def read_csv(self, file_path):
        return pd.read_csv(file_path)

    def read_json(self, file_path):
        return pd.read_json(file_path)

    def read_pdf(self, file_path):
        all_extracted_data = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                tables = page.extract_tables()
                for table in tables:
                    if not table:
                        continue

                    header = []
                    data_rows = []

                    if table and table[0]:
                        potential_header = [h.strip() if isinstance(h, str) else None for h in table[0]]
                        if any(ph for ph in potential_header if ph):
                            header = potential_header
                            data_rows = table[1:]
                        else:
                            header = [f"Col_{i}" for i in range(len(table[0]))] if table[0] else []
                            data_rows = table[:]
                    else:
                        if table:
                            header = [f"Col_{i}" for i in range(len(table[0]))] if table[0] else []
                            data_rows = table[:]
                        else:
                            continue

                    if not data_rows:
                        continue

                    max_col_len = max(len(row) for row in data_rows + [header])
                    header = header + [f"Col_{i}" for i in range(len(header), max_col_len)]
                    data_rows = [row + [None] * (max_col_len - len(row)) for row in data_rows]

                    df = pd.DataFrame(data_rows, columns=header)

                    df = df.dropna(axis=1, how='all')

                    cols = df.columns.tolist()
                    seen = defaultdict(int)
                    new_cols = []
                    for col in cols:
                        base_col = col if col is not None else 'None'
                        if base_col in seen:
                            seen[base_col] += 1
                            new_cols.append(f"{base_col}_{seen[base_col]}")
                        else:
                            new_cols.append(base_col)
                        seen[base_col] += 1 # Increment for the first seen instance too

                    df.columns = new_cols

                    all_extracted_data.append(df)

        if all_extracted_data:
            return pd.concat(all_extracted_data, ignore_index=True)
        return pd.DataFrame()

    def read_image(self, file_path):
        try:
            from PIL import Image
            import pytesseract
            text = pytesseract.image_to_string(Image.open(file_path))
            if text:
                lines = text.strip().split('\n')
                return pd.DataFrame(lines, columns=['ExtractedText'])
            return pd.DataFrame()
        except Exception as e:
            # print(f"Error during OCR for {file_path}: {e}")
            return pd.DataFrame()

# Re-initialize the interpreter with the updated class definition
interpreter = CommonInterpreter()

# --- End of proposed change ---


print("Upload a medical report file (CSV, JSON, PDF, or Image)...")
uploaded = files.upload()

for filename in uploaded.keys():
    print(f"\nProcessing {filename}...")
    try:
        # 1. Interpret
        df = interpreter.read_file(filename)
        if df.empty:
            print("No data extracted (empty).")
            continue

        print("Extracted Data (First 5 rows):")
        print(df.head())

        # 2. Preprocess
        df_norm = preprocessor.normalize_column_names(df)

        # 3. Validate
        df_val = validator.check_standard_ranges(df_norm)

        print("\nValidation Results:")
        # Show only columns that exist in our validation rules + Error column
        rules = validator.get_default_rules()
        cols_to_show = [c for c in df_val.columns if c in rules or c == 'validation_errors']
        if cols_to_show:
            print(df_val[cols_to_show].head())
        else:
            print("No recognized standard columns found to validate.")

    except Exception as e:
        print(f"Error processing file: {e}")

Upload a medical report file (CSV, JSON, PDF, or Image)...


Saving BDCBC7196_Hematology_Dataset.csv to BDCBC7196_Hematology_Dataset (2).csv
Saving blood_report_img_1.jpg to blood_report_img_1 (2).jpg
Saving blood_report_img_2.jpg to blood_report_img_2 (2).jpg
Saving Blood_report_pdf_1.pdf to Blood_report_pdf_1 (2).pdf
Saving Blood_report_pdf_2.pdf to Blood_report_pdf_2 (2).pdf
Saving Blood_report_pdf_3.pdf to Blood_report_pdf_3 (2).pdf
Saving Blood_report_pdf_4.pdf to Blood_report_pdf_4 (2).pdf
Saving Blood_report_pdf_5.pdf to Blood_report_pdf_5 (2).pdf
Saving Blood_report_pdf_6.pdf to Blood_report_pdf_6 (2).pdf

Processing BDCBC7196_Hematology_Dataset (2).csv...
Extracted Data (First 5 rows):
   Gender  Age    Hb   RBC    WBC  PLATELETS  LYMP  MONO   HCT   MCV   MCH  \
0       0   45  12.1  4.25  12300   404000.0  29.0   4.6  36.2  85.2  28.4   
1       0   58  12.3  4.34  12000   392000.0  30.0   5.1  37.1  85.5  28.3   
2       0   49  12.6  4.35  11300   387000.0  23.5   7.0  38.2  87.9  28.9   
3       0   43  12.0  4.30   5000   298000.0 



No data extracted (empty).

Processing Blood_report_pdf_3 (2).pdf...
Extracted Data (First 5 rows):
  Peak Name Calibrated\nArea % Area % Retention\nTime (min) Peak\nArea
0   Unknown                ---    0.1                  1.00       2400
1         F                0.3    ---                  1.08       8321
2   Unknown                ---    1.6                  1.18      44109
3        P2                ---    5.5                  1.30     148807
4        P3                ---    5.2                  1.72     139183

Validation Results:
  validation_errors
0              None
1              None
2              None
3              None
4              None

Processing Blood_report_pdf_4 (2).pdf...
Extracted Data (First 5 rows):
  None                  None_2
0    C  ertificate No: MC-2318
1    C  ertificate No: MC-2318

Validation Results:
  validation_errors
0              None
1              None

Processing Blood_report_pdf_5 (2).pdf...
Extracted Data (First 5 rows):
  COMPLETE BL

In [10]:
from google.colab import files

print("Upload additional medical report files (CSV, JSON, PDF, or Image)...")
uploaded_additional = files.upload()

for filename in uploaded_additional.keys():
    print(f"\nProcessing {filename}...")
    try:
        # 1. Interpret
        df = interpreter.read_file(filename)
        if df.empty:
            print("No data extracted (empty).")
            continue

        print("Extracted Data (Full DataFrame):")
        # Display the entire DataFrame to show all extracted data
        print(df.to_string())

        # 2. Preprocess
        df_norm = preprocessor.normalize_column_names(df)

        # 3. Validate
        df_val = validator.check_standard_ranges(df_norm)

        print("\nValidation Results:")
        # Show only columns that exist in our validation rules + Error column
        rules = validator.get_default_rules()
        cols_to_show = [c for c in df_val.columns if c in rules or c == 'validation_errors']
        if cols_to_show:
            print(df_val[cols_to_show].head())
        else:
            print("No recognized standard columns found to validate.")

    except Exception as e:
        print(f"Error processing file: {e}")

Upload additional medical report files (CSV, JSON, PDF, or Image)...


Saving BLR-0425-PA-0039883_ALL CLAIMS DOCM BHUVANESHWARI VIDAL_0001_27-04-2025_1131-10_AM@E.pdf_page_38.png to BLR-0425-PA-0039883_ALL CLAIMS DOCM BHUVANESHWARI VIDAL_0001_27-04-2025_1131-10_AM@E.pdf_page_38.png
Saving BLR-0425-PA-0038965_BIPUL CHAKRABORTY 0038965 2 OF 2_28-04-2025_1014-26_AM.pdf_page_7.png to BLR-0425-PA-0038965_BIPUL CHAKRABORTY 0038965 2 OF 2_28-04-2025_1014-26_AM.pdf_page_7.png
Saving BLR-0425-PA-0037318_SASHANK P K 0037318 2 OF 2_28-04-2025_1007-19_AM@E.pdf_page_29.png to BLR-0425-PA-0037318_SASHANK P K 0037318 2 OF 2_28-04-2025_1007-19_AM@E.pdf_page_29.png
Saving AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png to AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png

Processing BLR-0425-PA-0039883_ALL CLAIMS DOCM BHUVANESHWARI VIDAL_0001_27-04-2025_1131-10_AM@E.pdf_page_38.png...
Extracted Data (Full DataFrame):
                                                   ExtractedText
0                                                           CF &
1      