In [6]:
import fitz  # PyMuPDF
import pandas as pd
import re

# Function to capitalize the first letter of each word
def title_case(text):
    """Return ``text`` with each whitespace-separated word capitalized.

    The words are re-joined with single spaces, so runs of whitespace
    (newlines included) collapse and leading/trailing whitespace is dropped.
    ``str.capitalize`` also lower-cases the rest of each word.
    """
    return ' '.join(map(str.capitalize, text.split()))

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        One string holding the ``"text"`` extraction of all pages.

    The document is closed even when reading a page raises.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text("text") for page in doc)
    finally:
        doc.close()

# Extract and format fields from text
def extract_fields(text):
    """Pull the labelled bill fields out of ``text``.

    Each field is located with a regular expression keyed on its upper-case
    label (for example ``BILLING DATE``). A field whose label is not found is
    left out of the result.

    Args:
        text: Plain text of the bill.

    Returns:
        dict mapping field name to the extracted value. Date fields are
        returned as they appear in the text; every other field is lower-cased
        and then passed through ``title_case``.
    """
    # The separator after each label stays ``\s+`` so a value on the line
    # after its label is still found.
    patterns = {
        "customer": r"CUSTOMER:?\s+([A-Z\s]+?)(?:\sACCOUNT|\n)",
        "account_number": r"ACCOUNT NUMBER:?\s+([0-9\-]+)",
        # NOTE(review): this class still contains \s, so the capture can span
        # several lines - presumably wanted for multi-line addresses; confirm.
        "service_address": r"SERVICE ADDRESS:?\s+([A-Z0-9\s,.-]+)\n",
        # Dates are limited to one line: a \s inside the class also matches
        # newlines and lets the greedy capture run into the following lines.
        # The value may end at a newline or at the very end of the text.
        "billing_date": r"BILLING DATE:?\s+([A-Za-z0-9, \t]+)\r?(?:\n|\Z)",
        "autopay_date": r"AUTOPAY DATE:?\s+([A-Za-z0-9, \t]+)\r?(?:\n|\Z)",
        # The amount must end on a digit and contains no whitespace, so digits
        # from the next line are not appended to it.
        "amount_due": r"AMOUNT DUE:?\s+\$?\s*([0-9,.]*[0-9])",
    }
    date_fields = {"billing_date", "autopay_date"}

    fields = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, text)
        if not match:
            continue
        extracted_text = match.group(1).strip()
        if field in date_fields:
            fields[field] = extracted_text  # Keep the original date format
        else:
            fields[field] = title_case(extracted_text.lower())

    return fields

# Write extracted fields to CSV
def write_to_csv(fields, csv_path):
    """Write ``fields`` as a single-row CSV file at ``csv_path``.

    Column headers are the dict keys with underscores turned into spaces and
    title-cased (``account_number`` -> ``Account Number``). No index column
    is written.
    """
    def pretty_header(column):
        return column.replace("_", " ").title()

    single_row = pd.DataFrame([fields]).rename(columns=pretty_header)
    single_row.to_csv(csv_path, index=False)

# Main process: read one bill PDF, extract its labelled fields, and save them
# as a one-row CSV next to it.
# NOTE(review): both paths are absolute Windows paths on drive D:, so this
# cell only runs on a machine with that exact folder.
pdf_path = 'D:\\Python Practice files\\bill.pdf'
csv_path = 'D:\\Python Practice files\\bill.csv'

# `text` and `fields` are module-level names and stay in the kernel afterwards.
text = extract_text_from_pdf(pdf_path)
fields = extract_fields(text)
write_to_csv(fields, csv_path)

# Only fields whose label was found are listed; missing ones are simply absent.
print("Extraction and saving to CSV complete. Fields extracted:", list(fields.keys()))


Extraction and saving to CSV complete. Fields extracted: ['customer', 'account_number', 'billing_date', 'autopay_date', 'amount_due']


In [6]:
# Date fields are reformatted as MM/DD/YYYY

import fitz  # PyMuPDF
import pandas as pd
import re
from datetime import datetime

# Function to capitalize the first letter of each word
def title_case(text):
    """Capitalize every word of ``text`` and join the words with single spaces.

    Splitting on whitespace discards leading/trailing whitespace and collapses
    internal runs (including newlines). Each word is capitalized with
    ``str.capitalize``, which also lower-cases its remaining letters.
    """
    capitalized_words = (piece.capitalize() for piece in text.split())
    return ' '.join(capitalized_words)

# Convert date from "Jan 12, 2024" to "MM/DD/YYYY"
def convert_date_format(date_str):
    """Convert a date such as ``"Jan 12, 2024"`` to ``"MM/DD/YYYY"``.

    Only the first line of ``date_str`` is used; anything after the first
    newline is ignored.

    Args:
        date_str: Text whose first line is a date in ``'%b %d, %Y'`` form.

    Returns:
        The date formatted with ``'%m/%d/%Y'``, e.g. ``"01/12/2024"``.

    Raises:
        ValueError: If the first line does not match ``'%b %d, %Y'``.
    """
    # Strip again after taking the first line: spaces or a '\r' left in front
    # of the newline would otherwise make strptime reject the value with
    # "unconverted data remains".
    first_line = date_str.strip().split('\n', 1)[0].strip()
    parsed = datetime.strptime(first_line, '%b %d, %Y')
    return parsed.strftime('%m/%d/%Y')


# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        One string holding the ``"text"`` extraction of all pages.

    The document is closed even when reading a page raises.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text("text") for page in doc)
    finally:
        doc.close()

# Extract and format fields from text
def extract_fields(text):
    """Pull the labelled bill fields out of ``text``.

    Each field is located with a regular expression keyed on its upper-case
    label (for example ``BILLING DATE``). A field whose label is not found is
    left out of the result.

    Args:
        text: Plain text of the bill.

    Returns:
        dict mapping field name to the extracted value. Date fields are passed
        through ``convert_date_format``; every other field is lower-cased and
        then passed through ``title_case``.

    Raises:
        ValueError: Propagated from ``convert_date_format`` when a captured
            date is not in the format that helper expects.
    """
    # The separator after each label stays ``\s+`` so a value on the line
    # after its label is still found.
    patterns = {
        "customer": r"CUSTOMER:?\s+([A-Z\s]+?)(?:\sACCOUNT|\n)",
        "account_number": r"ACCOUNT NUMBER:?\s+([0-9\-]+)",
        # NOTE(review): this class still contains \s, so the capture can span
        # several lines - presumably wanted for multi-line addresses; confirm.
        "service_address": r"SERVICE ADDRESS:?\s+([A-Z0-9\s,.-]+)\n",
        # Dates are limited to one line: a \s inside the class also matches
        # newlines and lets the greedy capture run into the following lines.
        # The value may end at a newline or at the very end of the text.
        "billing_date": r"BILLING DATE:?\s+([A-Za-z0-9, \t]+)\r?(?:\n|\Z)",
        "autopay_date": r"AUTOPAY DATE:?\s+([A-Za-z0-9, \t]+)\r?(?:\n|\Z)",
        # The amount must end on a digit and contains no whitespace, so digits
        # from the next line are not appended to it.
        "amount_due": r"AMOUNT DUE:?\s+\$?\s*([0-9,.]*[0-9])",
    }
    date_fields = {"billing_date", "autopay_date"}

    fields = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, text)
        if not match:
            continue
        extracted_text = match.group(1).strip()
        if field in date_fields:
            fields[field] = convert_date_format(extracted_text)  # Convert date format
        else:
            fields[field] = title_case(extracted_text.lower())

    return fields

# Write extracted fields to CSV
def write_to_csv(fields, csv_path):
    """Save ``fields`` to ``csv_path`` as a CSV with one data row.

    Each key becomes a column header after replacing underscores with spaces
    and title-casing it (``billing_date`` -> ``Billing Date``). The row index
    is not written.
    """
    frame = pd.DataFrame([fields])
    headers = {name: name.replace("_", " ").title() for name in frame.columns}
    frame.rename(columns=headers).to_csv(csv_path, index=False)

# Main process: read one bill PDF, extract its labelled fields (dates are
# reformatted by convert_date_format), and save them as a one-row CSV.
# NOTE(review): both paths are absolute Windows paths on drive D:, so this
# cell only runs on a machine with that exact folder.
pdf_path = 'D:\\Python Practice files\\bill.pdf'
csv_path = 'D:\\Python Practice files\\bill.csv'


# `text` and `fields` are module-level names and stay in the kernel afterwards.
text = extract_text_from_pdf(pdf_path)
fields = extract_fields(text)
write_to_csv(fields, csv_path)

# Only fields whose label was found are listed; missing ones are simply absent.
print("Extraction and saving to CSV complete. Fields extracted:", list(fields.keys()))


Extraction and saving to CSV complete. Fields extracted: ['customer', 'account_number', 'billing_date', 'autopay_date', 'amount_due']


In [None]:
import io  # io.StringIO is used below; no other visible cell imports io

filename = 'Arctic Berk.pdf'
# NOTE(review): the empty path is a placeholder - point it at the real CSV.
df = pd.read_csv('', index_col=0)


def _load_page(pages, page_key):
    """Return ``(csv_text, text_items, table)`` for the row labelled ``page_key``.

    ``csv_text`` is the raw 'csv' cell, ``text_items`` the evaluated 'text'
    cell, and ``table`` the pipe-separated table parsed from everything after
    the first blank line of ``csv_text``, with all-empty columns and rows
    removed.
    """
    page_row = pages.loc[page_key]
    csv_text = page_row['csv']
    # NOTE(review): eval() executes whatever the 'text' cell contains. If the
    # cell only ever holds a Python literal, ast.literal_eval is the safe
    # replacement - confirm the data source before switching.
    text_items = eval(page_row['text'])
    table = pd.read_csv(io.StringIO(csv_text.split('\n\n', 1)[1]), sep='|', on_bad_lines='skip')
    table = table.dropna(axis=1, how='all').dropna(axis=0, how='all')
    return csv_text, text_items, table


# Page keys are the file name without its 4-character extension plus a suffix.
page_stem = filename[:-4]
try:
    csv_page1, text_page1, df1 = _load_page(df, page_stem + '_0001-1.jpg')
except Exception:
    # Fall back to the second page image. The fallback parses its own csv
    # text (csv_page2), not the first page's.
    csv_page2, text_page2, df1 = _load_page(df, page_stem + '_0001-2.jpg')
df1 = df1.fillna('')
df1
        

In [None]:
import pandas as pd
import re

def extract_project_name(text_list):
    """Return the project name from the first entry containing ``Re: Project <name>``.

    The name is the text after ``Re: Project`` up to (not including) the next
    comma or period, with surrounding whitespace removed. Entries are checked
    in order and scanning stops at the first hit. An empty string is returned
    when no entry matches.
    """
    project_pattern = re.compile(r"Re: Project\s+([^\,\.]+)")
    hits = (project_pattern.search(entry) for entry in text_list)
    return next((hit.group(1).strip() for hit in hits if hit), '')

csv_file_path = 'path_to_your_input_csv_file.csv'
output_csv_path = 'path_to_your_output_csv_file.csv'

df = pd.read_csv(csv_file_path)

# Collect one project name per input row and build df_main from the list
# afterwards. This makes the cell self-contained (df_main is created here
# instead of being expected from another cell) and keeps each row's result
# instead of overwriting a whole column on every iteration.
# NOTE(review): assumes the wanted output is one row per input row - confirm.
project_names = []
for index, row in df.iterrows():
    project_name = ''  # rows without a match, or that fail, stay empty
    try:
        # Assuming 'text' field exists and contains relevant data to identify the project
        # NOTE(review): eval() executes whatever the cell contains. If it only
        # holds a Python literal, ast.literal_eval is the safe replacement.
        text_data = eval(row['text']) if isinstance(row['text'], str) else row['text']
        project_name = extract_project_name(text_data)
    except Exception as e:
        print(f"Failed to process row {index}: {e}")
    project_names.append(project_name)

df_main = pd.DataFrame({'Project name': project_names}, index=df.index)
df_main.to_csv(output_csv_path, index=False)


In [None]:
import pandas as pd
import re

def reset_main_fields():
    """Return a new set holding the names of the output columns.

    A fresh set is built on every call. Being a set, it carries no ordering,
    so callers that need a stable column order have to impose one themselves.
    """
    column_names = ('Project name', 'Insurer', 'EV', 'Limit 1', 'Limit 2')
    return set(column_names)

# Convert the set of column names to a list when initializing df_main
# NOTE(review): the list follows the set's iteration order, which is not
# fixed, so the column order of this empty frame can differ between runs.
df_main = pd.DataFrame(columns=list(reset_main_fields()))

# Placeholder paths for CSV files
# (replace both with real locations before running the cells below)
input_csv_path = 'path_to_your_input_csv.csv'
output_csv_path = 'path_to_your_output_csv.csv'

# Function to simulate data extraction, replace with actual logic
def extract_data(text, field):
    """Extract the value of ``field`` from ``text``.

    Only the project-name field has extraction logic; every other field
    yields an empty string.

    Args:
        text: Text to search. Anything that is not a ``str`` (for example a
            NaN read from an empty CSV cell) yields ``''``.
        field: Field name. ``'Project name'`` (the spelling returned by
            ``reset_main_fields``) and ``'Project_Name'`` are both accepted,
            ignoring case.

    Returns:
        The text following ``Re: Project`` up to the next comma or period,
        stripped of surrounding whitespace and trailing apostrophes, or ``''``
        when the field is unsupported or nothing matches.
    """
    # Normalise the name so the underscore and space spellings both select
    # the project-name logic.
    normalized_field = str(field).replace('_', ' ').strip().lower()
    if normalized_field != 'project name':
        return ''
    if not isinstance(text, str):
        return ''
    match = re.search(r"Re: Project\s+([^\,\.]+)", text, re.IGNORECASE)
    return match.group(1).strip().rstrip("'") if match else ''

# Read input CSV
df = pd.read_csv(input_csv_path, index_col=0)

# reset_main_fields() returns a set, whose iteration order is not fixed.
# Sort once so the output columns come out in the same order on every run.
field_names = sorted(reset_main_fields())

# List to store extracted data: one dict per input row
extracted_data_list = []

for index, row in df.iterrows():
    # Assuming 'text' is a column in your CSV. Replace 'text' with actual text data to process.
    text = row['text']  # Placeholder for where your text data is located

    # Populate extracted data for each field
    extracted_data = {field: extract_data(text, field) for field in field_names}

    # Append the populated dictionary to the list
    extracted_data_list.append(extracted_data)

# Convert list of dictionaries into a DataFrame. Passing `columns` keeps the
# header row (and its order) even when the input has no rows at all.
df_main = pd.DataFrame(extracted_data_list, columns=field_names)

# Write DataFrame to CSV
df_main.to_csv(output_csv_path, index=False)


In [None]:
extracted_data_list = []

for index, row in df.iterrows():
    # NOTE(review): in the preceding cell `extracted_data` is bound to a dict,
    # and `state` is not defined in any visible cell, so this call would fail
    # as written. Confirm which extraction function and argument were meant.
    extracted_data_list.append(extracted_data(row, state))

# Build the frame once, after the loop, instead of rebuilding it on every
# iteration; this also defines df_main when the input has no rows.
# reset_main_fields() returns a set, which has no order (and newer pandas
# releases refuse a set for `columns`), so pass a sorted list instead.
df_main = pd.DataFrame(extracted_data_list, columns=sorted(reset_main_fields()))
df_main.to_csv(output_csv_path, index=False)