In [18]:
import os
import fitz  # PyMuPDF
from pytesseract import image_to_string  # OCR
from PIL import Image  # Image handling
import io  # Byte stream handling
import pandas as pd  # Data manipulation
import re  # Regular expressions for pattern matching





# Function to read text from a PDF
def read_pdf_text(pdf_path):
    try:
        doc = fitz.open(pdf_path)  # Open the PDF
        full_text = ""  # Initialize an empty string
        for page_num in range(len(doc)):
            page = doc[page_num]
            text = page.get_text()
            if not text.strip():
                pix = page.get_pixmap()
                img = Image.open(io.BytesIO(pix.tobytes()))
                text = image_to_string(img, lang="eng")
            full_text += text + "\n"
        doc.close()
        return full_text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None


# Function to extract the title
def extract_title(text):
    patterns = [
        r"(?i)^(title:?\s*)(.*)",
        r"(?i)^is\s+the\s+.*",
        r"(?i)^[A-Z][^\n]{10,}\n"
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.MULTILINE)
        if match:
            return match.group(2 if pattern == patterns[0] else 0).strip()
    return "Title not found"


# Function to extract authors
def extract_authors(text):
    patterns = [
        r"(?i)(authors?:?\s*)(.*)",
        r"(?i)(^[A-Z][a-z]+,\s+[A-Z][a-z]+(?:\sand\s[A-Z][a-z]+)?)"
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.MULTILINE)
        if match:
            return match.group(2 if pattern == patterns[0] else 1).strip()
    return "Authors not found"


# Function to extract abstract
def extract_abstract(text):
    abstract_match = re.search(
        r"(?i)(abstract[:\s]+)([\s\S]*?)(?=\n\n|keywords:|introduction|1\.)",
        text, re.DOTALL
    )
    return abstract_match.group(2).strip() if abstract_match else "Abstract not found"


# Function to extract figures and tables
def extract_figures_tables(text):
    figures_tables = re.findall(
        r"(?i)(figure|table)\s*(\d+):?\s*([\s\S]*?)(?=\n\n|figure|table|\Z)", text
    )
    unique_figures_tables = []
    seen = set()
    for fig_tab in figures_tables:
        identifier = (fig_tab[0].lower(), fig_tab[1])
        if identifier not in seen:
            seen.add(identifier)
            unique_figures_tables.append({
                "Type": fig_tab[0].capitalize(),
                "Number": fig_tab[1],
                "Description": clean_text(fig_tab[2]) if fig_tab[2].strip() else "No description"
            })
    return unique_figures_tables


# Main script
if __name__ == "__main__":
    pdf_path = r"C:\Users\Nima\Downloads\Simmons (2021) Psychosocial drivers of land management behaviour - How threats, norms, and context influence deforestation intentions.pdf"  # Update file path
    
    # Read the PDF text
    pdf_text = read_pdf_text(pdf_path)
    if pdf_text:
        # Extract and display results
        print("\n--- Extracted Information ---\n")
        
        # Title
        title = extract_title(pdf_text)
        print(f"Title: {title}")
        
        # Authors
        authors = extract_authors(pdf_text)
        print(f"Authors: {authors}")
        
        # Abstract
        abstract = extract_abstract(pdf_text)
        print(f"Abstract: {abstract}")
        
        # Figures and Tables
        figures_tables = extract_figures_tables(pdf_text)
        if figures_tables:
            print("\nFigures and Tables:")
            for item in figures_tables:
                print(f"  {item['Type']} {item['Number']}: {item['Description']}")
        else:
            print("\nNo Figures or Tables found.")



--- Extracted Information ---

Title: is the Theory of Planned Behaviour (Ajzen 1985), which
Authors: ized users.
Abstract: Understanding how private landholders make
deforestation decisions is of paramount importance for
conservation. Behavioural frameworks from the social
sciences have a lot to offer researchers and practitioners,
yet these insights remain underutilised in describing what
drives
landholders’
deforestation
intentions
under
important political, social, and management contexts.
Using survey data of private landholders in Queensland,
Australia,
we
compare
the
ability
of
two
popular
behavioural
models
to
predict
future
deforestation
intentions, and propose a more integrated behavioural
model of deforestation intentions. We found that the
integrated model outperformed other models, revealing the
importance of threat perceptions, attitudes, and social
norms for predicting landholders’ deforestation intentions.
Social capital, policy uncertainty, and years of experience
are

In [20]:
import os
import fitz  # PyMuPDF
from pytesseract import image_to_string  # OCR
from PIL import Image  # Image handling
import io  # Byte stream handling
import pandas as pd  # Data manipulation
import re  # Regular expressions for pattern matching
from openpyxl import Workbook  # For creating Excel files


# Function to clean text
# Function to clean text
def clean_text(text):
    """
    Clean text by removing problematic characters, including illegal Excel characters.
    """
    if text:
        # Remove illegal Excel characters
        illegal_chars = re.compile(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]')
        text = illegal_chars.sub("", text)
        return text.replace("\u2003", " ").replace("\x08", "").strip()
    return text



# Function to read text from a PDF
def read_pdf_text(pdf_path):
    try:
        doc = fitz.open(pdf_path)  # Open the PDF
        full_text = ""  # Initialize an empty string
        for page_num in range(len(doc)):
            page = doc[page_num]
            text = page.get_text()
            if not text.strip():
                pix = page.get_pixmap()
                img = Image.open(io.BytesIO(pix.tobytes()))
                text = image_to_string(img, lang="eng")
            full_text += text + "\n"
        doc.close()
        return full_text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None


# Function to extract the title
def extract_title(text):
    patterns = [
        r"(?i)^(title:?\s*)(.*)",
        r"(?i)^is\s+the\s+.*",
        r"(?i)^[A-Z][^\n]{10,}\n"
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.MULTILINE)
        if match:
            return match.group(2 if pattern == patterns[0] else 0).strip()
    return "Title not found"


# Function to extract authors
def extract_authors(text):
    patterns = [
        r"(?i)(authors?:?\s*)(.*)",
        r"(?i)(^[A-Z][a-z]+,\s+[A-Z][a-z]+(?:\sand\s[A-Z][a-z]+)?)"
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.MULTILINE)
        if match:
            return match.group(2 if pattern == patterns[0] else 1).strip()
    return "Authors not found"


# Function to extract abstract
def extract_abstract(text):
    abstract_match = re.search(
        r"(?i)(abstract[:\s]+)([\s\S]*?)(?=\n\n|keywords:|introduction|1\.)",
        text, re.DOTALL
    )
    return abstract_match.group(2).strip() if abstract_match else "Abstract not found"


# Function to extract figures and tables
def extract_figures_tables(text):
    figures_tables = re.findall(
        r"(?i)(figure|table)\s*(\d+):?\s*([\s\S]*?)(?=\n\n|figure|table|\Z)", text
    )
    unique_figures_tables = []
    seen = set()
    for fig_tab in figures_tables:
        identifier = (fig_tab[0].lower(), fig_tab[1])
        if identifier not in seen:
            seen.add(identifier)
            unique_figures_tables.append({
                "Type": fig_tab[0].capitalize(),
                "Number": fig_tab[1],
                "Description": clean_text(fig_tab[2]) if fig_tab[2].strip() else "No description"
            })
    return unique_figures_tables


# Function to save extracted data to Excel
# Function to save extracted data to Excel
def save_to_excel(title, authors, abstract, figures_tables, output_path):
    # Clean text inputs
    title = clean_text(title)
    authors = clean_text(authors)
    abstract = clean_text(abstract)

    # Clean figures/tables descriptions
    figures_tables = [
        {
            "Type": clean_text(item["Type"]),
            "Number": clean_text(item["Number"]),
            "Description": clean_text(item["Description"])
        }
        for item in figures_tables
    ]

    # Create a Pandas Excel writer
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        # Write title, authors, and abstract to a DataFrame
        metadata_df = pd.DataFrame({
            "Title": [title],
            "Authors": [authors],
            "Abstract": [abstract]
        })
        metadata_df.to_excel(writer, sheet_name="Metadata", index=False)

        # Write figures and tables to another sheet
        if figures_tables:
            figures_tables_df = pd.DataFrame(figures_tables)
            figures_tables_df.to_excel(writer, sheet_name="Figures_Tables", index=False)
        else:
            # Create an empty DataFrame if no figures/tables
            empty_df = pd.DataFrame(columns=["Type", "Number", "Description"])
            empty_df.to_excel(writer, sheet_name="Figures_Tables", index=False)



# Main script
if __name__ == "__main__":
    pdf_path = r"C:\Users\Nima\Downloads\Simmons (2021) Psychosocial drivers of land management behaviour - How threats, norms, and context influence deforestation intentions.pdf"  # Update file path
    output_excel_path = r"C:\Users\Nima\Downloads\Extracted_Data.xlsx"  # Update output path

    # Read the PDF text
    pdf_text = read_pdf_text(pdf_path)
    if pdf_text:
        # Extract data
        title = extract_title(pdf_text)
        authors = extract_authors(pdf_text)
        abstract = extract_abstract(pdf_text)
        figures_tables = extract_figures_tables(pdf_text)

        # Save to Excel
        save_to_excel(title, authors, abstract, figures_tables, output_excel_path)
        print(f"Data successfully saved to {output_excel_path}")
    else:
        print("Failed to extract text from the PDF.")


Data successfully saved to C:\Users\Nima\Downloads\Extracted_Data.xlsx
