In [1]:
import fitz  # PyMuPDF


In [2]:
import camelot
import pandas as pd

In [11]:
import json
from pathlib import Path

In [12]:
pdf_path = Path("../data/raw_pdfs/TCS_2023-24.pdf")
if not pdf_path.exists(): raise FileNotFoundError(f"PDF file not found: {pdf_path}")

In [13]:
# -------------------------------
# 2. Extract text with PyMuPDF
# -------------------------------
print("=== Extracting text with PyMuPDF ===")
doc = fitz.open(pdf_path)

all_pages_text = []
for page_num, page in enumerate(doc, start=1):
    text = page.get_text("text")
    all_pages_text.append({
        "page": page_num,
        "text": text.strip()
    })


=== Extracting text with PyMuPDF ===


In [14]:
# -------------------------------
# 3. Extract tables with Camelot
# -------------------------------
print("\n=== Extracting tables with Camelot ===")
tables = camelot.read_pdf(str(pdf_path), pages="all", flavor="lattice")  

print(f"Found {tables.n} tables in PDF.")


=== Extracting tables with Camelot ===
Found 11 tables in PDF.


In [50]:
all_tables_json = []
for i, table in enumerate(tables, start=1):
    df = table.df  # Pandas DataFrame
    df.columns = df.iloc[0]  # First row as header
    df = df[1:]  # Remove header row
    df.reset_index(drop=True, inplace=True)
    
    # Convert each row to key-value dict based on headers
    rows_as_dicts = df.to_dict(orient="records")
    
    all_tables_json.append({
        "table_number": i,
        "page": table.page,  # Camelot stores page number
        "data": rows_as_dicts
    })

  rows_as_dicts = df.to_dict(orient="records")
  rows_as_dicts = df.to_dict(orient="records")
  rows_as_dicts = df.to_dict(orient="records")
  rows_as_dicts = df.to_dict(orient="records")
  rows_as_dicts = df.to_dict(orient="records")
  rows_as_dicts = df.to_dict(orient="records")
  rows_as_dicts = df.to_dict(orient="records")


In [17]:
# -------------------------------
# 4. Combine into a single JSON
# -------------------------------
result_json = {
    "text_pages": all_pages_text,
    "tables": all_tables_json
}



In [19]:
output_path = "financial_statement.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(result_json, f, ensure_ascii=False, indent=2)

print(f"\n✅ JSON saved to {output_path}")


✅ JSON saved to financial_statement.json


In [42]:
import pdfplumber

pdf_path = "../data/raw_pdfs/TCS_2023-24.pdf"

with pdfplumber.open(pdf_path) as pdf:
    page = pdf.pages[0]

    # Debug: See all words with positions
    words = page.extract_words()
    #print(words[:10])  # sample output

    # Try table extraction with custom settings
    table = page.extract_table({
        "vertical_strategy": "lines",
        "horizontal_strategy": "lines",
        "intersection_tolerance": 5
    })

    if table:
        for row in table:
            print(row)
    else:
        print("No table detected — try OCR next.")


No table detected — try OCR next.


In [59]:
import pytesseract
from PIL import Image
import io

# If on Windows, specify path to Tesseract exe
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

pdf_path = "../data/raw_pdfs/TCS_2023-24.pdf"

doc = fitz.open(pdf_path)


pix = doc[page_num].get_pixmap(dpi=300)
img = Image.open(io.BytesIO(pix.tobytes("png")))

ocr_data = pytesseract.image_to_data(img, output_type=Output.DICT)

rows = {}
n_boxes = len(ocr_data['text'])

for i in range(n_boxes):
    if int(ocr_data['conf'][i]) > 50:  # only keep good confidence
        line_num = ocr_data['line_num'][i]
        word = ocr_data['text'][i].strip()
        if word:
            rows.setdefault(line_num, []).append(word)

# Convert to list of rows
row_list = [" ".join(words) for _, words in sorted(rows.items())]

# Print result row-wise
for r in row_list:
    print(r) 

TATA CONSULTANCY SERVICES LIMITED Audited Consolidated Statement of Financial Results (= crore) Three month period ended Revenue from operations 61,237 60,583 59,162 2,40,893 OTHER COMPREHENSIVE INCOME (OCI) Non-controlling interests Other comprehensive income for the period attributable to: Total comprehensive income for the period attributable to: Paid up equity share capital (Face value: %1 per share) Earnings per equity share:- Basic and diluted (%) Interim dividend on equity shares (%) Final dividend on equity shares Total dividend on equity shares Total equity dividend percentage
Registered Office: Floor, Nirmal Building, Nariman Point, Mumbai 400 021 March 31, December 31, March 31, March 31, March 31, Other income 1,157 86. , 4,422 3,449 Items that will not be reclassified subsequently to profit or loss Shareholders of the Company Shareholders of the Company Total reserves (including Non-controlling interests) Dividend per share (Par value each)
CIN: L22210MH1995PLC084781 2024 

In [49]:
from pypdf import PdfReader
import re

pdf_path = Path("../data/raw_pdfs/TCS_2023-24.pdf")

reader = PdfReader(str(pdf_path))
all_rows = []

for page_num, page in enumerate(reader.pages, start=1):
    text = page.extract_text()
    if not text:
        continue
    lines = text.split("\n")

    for line in lines:
        # Always keep the line, even if only one column
        parts = re.split(r"\s{2,}", line.strip())
        all_rows.append(parts)

# Create JSON-like structure
result_json = {"rows": all_rows}

# Save JSON
output_path = "financial_statement_.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(result_json, f, ensure_ascii=False, indent=2)

# Preview first 10 rows
for r in all_rows[:10]:
    print(r)


['March 31, December 31, March 31, March 31, March 31,']
['2024 2023 2023 2024 2023']
['Revenue from operations', '61,237 60,583 59,162 2,40,893 2,25,458']
['Other income', '1,157 862 1,175 4,422 3,449']
['TOTAL INCOME 62,394 61,445 60,337 2,45,315 2,28,907']
['Expenses']
['Employee benefit expens es', '35,138 34,722 33,687 1,40,131 1,27,522']
['Cos t of equipment and s oftware licences', '1,561 1,173 620 3,702 1,881']
['Finance cos ts', '226 230 272 778 779']
['Depreciation and amortis ation expens e', '1,246 1,233 1,286 4,985 5,022']
