In [2]:
from pathlib import Path
from pprint import pprint
from typing import List
from collections import defaultdict
import re
import pandas as pd
import re
import csv
from tqdm import tqdm
from datetime import datetime

%reload_ext autoreload
%autoreload 2

In [3]:
def extract(file_contents, regex, post=None):
    """Return the first capture group of the first `regex` match in `file_contents`.

    Args:
        file_contents: Text to search.
        regex: Pattern passed to `re.search`; its first capture group is the
            value returned.
        post: Optional callable applied to the captured text before returning.

    Returns:
        The captured text (passed through `post` when one is given), or an
        empty string when the pattern does not match anywhere.
    """
    found = re.search(regex, file_contents)
    if not found:
        # No match at all: callers get an empty string rather than None.
        return ""

    captured = found.group(1)
    return post(captured) if post else captured

In [33]:
# --- Output column names: contract-level fields ---
FILENAME = "filename"
NUMBER_OF_BIDDERS = "Number_of_Bidders"
BID_OPENING_DATE = "Bid_Opening_Date"
CONTRACT_NUMBER = "Contract_Number"
TOTAL_NUMBER_OF_WORKING_DAYS = "Total_Number_of_Working_Days"
CONTRACT_ITEMS = "Number_of_Contract_Items"
CONTRACT_DESCRIPTION = "Contract_Description"
PERCENT_OVER_EST = "Percent_Est_Over"
PERCENT_UNDER_EST = "Percent_Est_Under"
ENGINEERS_EST = "Engineers_Est"
AMOUNT_OVER = "Amount_Over"
AMOUNT_UNDER = "Amount_Under"
CONTRACT_CODE = "Contract_Code"

# --- Output column names: per-bid fields ---
BID_RANK = "Bid_Rank"
BID_TOTAL = "Bid_Total"
BIDDER_ID = "Bidder_ID"
BIDDER_NAME = "Bidder_Name"
BIDDER_NAME_COND = "Bidder_Name_cond"
CSLB_NUMBER = "CSLB_Number"

# --- Output column names: per-line-item fields ---
ITEM_NUMBER = "Item_Number"
ITEM_CODE = "Item_Code"
ITEM_DESCRIPTION = "Item_Description"
ITEM_DOLLAR_AMOUNT = "Item_Dollar_Amount"

# Moment this cell was run, formatted month-day-year-hour:minute:second.
timestamp = datetime.now().strftime('%m-%d-%Y-%H:%M:%S')

In [46]:
def extract_contract_data(file_contents):
    """Pull the contract-level fields out of one file's text.

    Each field is looked up with `extract`, so a field whose pattern does not
    match is stored as an empty string.

    Args:
        file_contents: Full text of one input file.

    Returns:
        A `defaultdict(str)` keyed by the contract-level column-name constants.
    """
    # (column name, pattern) pairs; the first capture group is the stored value.
    field_patterns = (
        (CONTRACT_NUMBER, r"CONTRACT NUMBER\s+([A-Za-z0-9-]+)"),
        (BID_OPENING_DATE, r"BID OPENING DATE\s+(\d+/\d+/\d+)"),
        (CONTRACT_CODE, r"CONTRACT CODE\s+'([^']+)'"),
        (CONTRACT_ITEMS, r"(\d+)\s+CONTRACT ITEMS"),
        (TOTAL_NUMBER_OF_WORKING_DAYS, r"TOTAL NUMBER OF WORKING DAYS\s+(\d+)"),
        (NUMBER_OF_BIDDERS, r"NUMBER OF BIDDERS\s+(\d+)"),
        (ENGINEERS_EST, r"ENGINEERS EST\s+([\d,]+\.\d{2})"),
        (AMOUNT_OVER, r"AMOUNT OVER\s+([\d,]+\.\d{2})"),
        (AMOUNT_UNDER, r"AMOUNT UNDER\s+([\d,]+\.\d{2})"),
        (PERCENT_OVER_EST, r"PERCENT OVER EST\s+(\d+)"),
        (PERCENT_UNDER_EST, r"PERCENT UNDER EST\s+(\d+)"),
    )

    row = defaultdict(str)
    for column, regex in field_patterns:
        row[column] = extract(file_contents, regex)

    # The description is whatever precedes "FEDERAL AID" on its line; leading
    # whitespace is trimmed from the captured text.
    row[CONTRACT_DESCRIPTION] = extract(file_contents, r"(.*?)\s+FEDERAL AID", post=lambda x: x.lstrip())
    return row


def extract_contract_bid_data(file_contents):
    """Extract one row per bid entry found in a file's text.

    Args:
        file_contents: Full text of one input file.

    Returns:
        A list of `defaultdict(str)` rows, one per regex match, each carrying
        the file's contract number plus the bid fields. The list is empty when
        no bid entry matches.
    """
    contract_number = extract(file_contents, r"CONTRACT NUMBER\s+([A-Za-z0-9-]+)")

    # Groups, in order: rank, optional "A)" marker, bid total, bidder id,
    # bidder name, phone, extra text, text preceding the CSLB number, and the
    # 8-digit CSLB number.
    pattern = re.compile(r"(\d+)\s+(A\))?\s+([\d,]+\.\d{2})\s+(\d+)\s+(.*?)\s+(\d{3} \d{3}-\d{4})\s+(.*?)\s+(.*?)(\d{8})")

    contract_bid_data = []

    for match in pattern.findall(file_contents):
        (rank, optional_a, bid_total, bidder_id,
         name_head, phone, extra_text, name_tail, cslb_number) = match

        row = defaultdict(str)
        row[CONTRACT_NUMBER] = contract_number
        row[BID_RANK] = rank
        row["OPTIONAL_A"] = optional_a
        row[BID_TOTAL] = bid_total
        row[BIDDER_ID] = bidder_id.strip()
        # The text captured just before the CSLB number is appended to the
        # name. Join only the non-empty parts so an empty tail (or head) does
        # not leave a stray leading/trailing space in the name.
        row[BIDDER_NAME] = ' '.join(part for part in (name_head, name_tail.strip()) if part)
        row["BIDDER PHONE"] = phone.strip()
        row["Extra_Text"] = extra_text
        row[CSLB_NUMBER] = cslb_number
        contract_bid_data.append(row)

    return contract_bid_data


def extract_contract_line_item_data(file_contents):
    """Extract one row per contract line item found in a file's text.

    Args:
        file_contents: Full text of one input file.

    Returns:
        A list of `defaultdict(str)` rows, one per regex match, each carrying
        the file's contract number, the item number, item code, description and
        dollar amount. The list is empty when no line item matches.
    """
    contract_number = extract(file_contents, r"CONTRACT NUMBER\s+([A-Za-z0-9-]+)")

    # Groups, in order: the whole first line, item number, item code, a
    # 46-character description field, a 35-character field that is not used
    # below, the dollar amount, and an optional following line that is treated
    # as a continuation of the description.
    pattern = re.compile(r"(^\s*(\d+)\s+(\d+)\s+([\dA-Z\(\)\"\- ]{46})\s(.{35})\s+([\d,]+\.\d{2}))(?:\n\s+([\dA-Z\(\)\"\- $]+)\n)?", re.MULTILINE)

    # between dotted line and total:     ^\s+-+\n([\s\S]+?)\n\s+TOTAL\s+\$?([\d,]+\.\d{2})

    contract_line_item_data = []
    for match in pattern.findall(file_contents):
        _whole_line, item_number, item_code, description, _unused_field, amount, continuation = match

        row = defaultdict(str)
        row[CONTRACT_NUMBER] = contract_number
        row[ITEM_NUMBER] = item_number
        row[ITEM_CODE] = item_code
        # findall yields '' for the optional continuation group when it did not
        # take part in the match; join only non-empty parts so single-line
        # descriptions do not end with a dangling space.
        row[ITEM_DESCRIPTION] = ' '.join(part for part in (description.rstrip(), continuation) if part)
        row[ITEM_DOLLAR_AMOUNT] = amount
        contract_line_item_data.append(row)

    return contract_line_item_data


def write_to_file(data: List, timestamp: str, name: str):
    """Write `data` as a CSV named `<timestamp>_<name>.csv` in the working directory.

    Args:
        data: Records accepted by `pd.DataFrame` (e.g. a list of dict-like rows).
        timestamp: Prefix for the output file name.
        name: Suffix for the output file name (without extension).
    """
    output_path = f'{timestamp}_{name}.csv'
    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(data).to_csv(output_path, index=False)
    

def read_file(filepath: str):
    """Return the entire text content of the file at `filepath`.

    The file is opened in text read mode with the platform's default encoding.
    """
    with open(filepath, 'r') as handle:
        return handle.read()

In [9]:
# Smoke test: run the contract-level extraction on a single sample file.
# NOTE(review): absolute, user-specific path — this cell only runs where that file exists.
filepath = Path('/Users/nenadbozinovic/Documents/ocr/RR Procurement - Raw Data/01-0A0904.pdf_2724.txt')
file_contents = read_file(filepath)
# extract_contract_data returns a single row; wrap it in a list so pandas builds a one-row frame.
contract_data = [extract_contract_data(file_contents)]
df = pd.DataFrame(contract_data)
df.head()

Unnamed: 0,Contract_Number,Bid_Opening_Date,Contract_Code,Number_of_Contract_Items,Total_Number_of_Working_Days,Number_of_Bidders,Engineers_Est,Amount_Over,Amount_Under,Percent_Est_Over,Percent_Est_Under,Contract_Description
0,01-0A0904,03/29/11,H,15,20,6,356785.0,62635.0,,17,,SEAL COAT


In [18]:
# Smoke test: run the per-bid extraction on a single sample file.
# NOTE(review): absolute, user-specific path — this cell only runs where that file exists.
filepath = Path('/Users/nenadbozinovic/Documents/ocr/RR Procurement - Raw Data/01-0A0904.pdf_2724.txt')
file_contents = read_file(filepath)
# extract_contract_bid_data already returns a list of rows, one per matched bid.
contract_bid_data = extract_contract_bid_data(file_contents)
df = pd.DataFrame(contract_bid_data)
df.head()


Unnamed: 0,Contract_Number,Bid_Rank,OPTIONAL_A,Bid_Total,Bidder_ID,Bidder_Name,BIDDER PHONE,Extra_Text,CSLB_Number
0,01-0A0904,1,,419420.0,2,INTERNATIONAL SURFACING SYSTEMS,916 373-2420,SB PREF CLAIMED,736996
1,01-0A0904,2,,428428.0,1,NORTHWEST PAVING,530 246-4388,SB PREF CLAIMED,822126
2,01-0A0904,3,,432845.0,6,ADVANTAGE PAVING AND EXCAVATING INC,530 598-7759,SB PREF CLAIMED,909239
3,01-0A0904,4,,502205.0,4,FRANKLIN CONSTRUCTION INC,530 343-9600,SB PREF CLAIMED,567469
4,01-0A0904,5,,514740.0,5,WINDSOR FUEL COMPANY,925 427-5266,SB PREF CLAIMED,776848


In [47]:
# Smoke test: run the line-item extraction on a single sample file.
# NOTE(review): absolute, user-specific path — this cell only runs where that file exists.
filepath = Path('/Users/nenadbozinovic/Documents/ocr/RR Procurement - Raw Data/01-0A0904.pdf_2724.txt')
file_contents = read_file(filepath)
# bid_subcontractor_data = extract_bid_subcontractor_data(file_contents)
contract_line_item_data = extract_contract_line_item_data(file_contents)
    
# Display the whole frame (not just head) to inspect every matched line item.
df = pd.DataFrame(contract_line_item_data)
df


Unnamed: 0,Contract_Number,Item_Number,Item_Code,Item_Description,Item_Dollar_Amount
0,01-0A0904,1,74016,CONSTRUCTION SITE MANAGEMENT,421.0
1,01-0A0904,2,74017,PREPARE WATER POLLUTION CONTROL PROGRAM,735.0
2,01-0A0904,3,120090,CONSTRUCTION AREA SIGNS,6750.0
3,01-0A0904,4,120100,TRAFFIC CONTROL SYSTEM,50500.0
4,01-0A0904,5,128650,PORTABLE CHANGEABLE MESSAGE SIGN,2500.0
5,01-0A0904,6,141104,REMOVE YELLOW THERMOPLASTIC PAVEMENT MARKING (...,2700.0
6,01-0A0904,7,150715,REMOVE THERMOPLASTIC PAVEMENT MARKING,4750.0
7,01-0A0904,8,190110,LEAD COMPLIANCE PLAN,1200.0
8,01-0A0904,9,365001,SAND COVER,13200.0
9,01-0A0904,10,374002,ASPHALTIC EMULSION (FOG SEAL COAT),17500.0


In [44]:
# Show the full description of row 5, which the frame display truncates with "...".
# Relies on `df` still holding the line-item frame from the line-item test cell.
df.iloc[5][ITEM_DESCRIPTION]

'REMOVE YELLOW THERMOPLASTIC PAVEMENT MARKING (HAZARDOUS WASTE)'

In [None]:

# NOTE(review): unfinished scratch cell. It reads `matches` and `contract_number`,
# neither of which is assigned here, so it depends on kernel state left by other cells.
contract_line_item_data = []

for match in matches:
    row = defaultdict(str)
    row[CONTRACT_NUMBER] = contract_number
    # NOTE(review): `row` is never appended to contract_line_item_data, so the
    # frame built below comes from an empty list.
    
    
    
df = pd.DataFrame(contract_line_item_data)
df.head()


In [21]:
# Inspect the current value of `matches`; it is not assigned in this cell, so the
# result depends on which cell last set it (the recorded output is an empty list).
matches

[]

In [None]:
# Accumulators for each output table. The *_outliers lists are for rows that
# fail a sanity check and need manual investigation. Only the contract-level
# pair is filled in below; the others are placeholders for the TODO steps.
contract_data = []
contract_data_outliers = []

contract_bid_data = []
contract_bid_data_outliers = []

bid_subcontractor_data = []
bid_subcontractor_data_outliers = []

contract_line_item_data = []
contract_line_item_data_outliers = []

# NOTE(review): absolute, user-specific input directory.
RAW_TXT_DIR = Path('/Users/nenadbozinovic/Documents/ocr/RR Procurement - Raw Data/Txt files - lineprinter')

# Temporary cap on the number of files processed while the extraction is developed.
MAX_FILES = 3

# Header row for the per-file bid CSVs.
BID_CSV_HEADER = [CONTRACT_NUMBER, BID_RANK, "OPTIONAL_A", BID_TOTAL, BIDDER_ID, BIDDER_NAME, "BIDDER PHONE", "OPTIONAL TEXT", CSLB_NUMBER, "OPTIONAL_B"]

# Regular expression to match each bid entry. Compiled once: it does not change per file.
pattern = re.compile(r"(\d+)\s+(A\))?\s+([\d,]+\.\d{2})\s+(\d+)\s+(.*?)\s+(\d{3} \d{3}-\d{4})(.*?)\s+(\d+)\s+(B\))?")

i = 0
# sorted() makes the processing order (and therefore which files the cap selects)
# deterministic, and gives tqdm a length so it can show a total.
for i, filepath in enumerate(tqdm(sorted(RAW_TXT_DIR.glob('*.txt'))), start=1):

    file_contents = read_file(filepath)

    # The expected contract number is the part of the file name before ".pdf".
    filename = filepath.stem
    contract_number = filename.split('.pdf')[0]

    contract_data_row = extract_contract_data(file_contents)

    # TODO
    # contract_contract_bid_data_rows = extract_contract_bid_data(file_contents)

    # Find all bid entries in this file
    matches = pattern.findall(file_contents)

    # Prepare CSV data: header first, then one row per matched bid.
    csv_data = [BID_CSV_HEADER]
    for match in matches:
        # Combine the captured groups into a single row, note that some post-processing may be needed depending on the exact format you want
        bid_row = [contract_number, match[0], match[1], match[2], match[3].strip(), match[4], match[5].strip(), match[6], match[7].strip(), match[8]]
        # Rows must go into csv_data: the list previously appended to here was
        # never defined (NameError) and the CSV would have held only the header.
        csv_data.append(bid_row)

    # Write CSV data to file
    with open(f"bids_{filepath.stem}.csv", "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(csv_data)

    # TODO
    # contract_subcontractor_data_rows = extract_subcontractor_data(file_contents)
    # contract_line_item_data_rows = extract_line_item_data(file_contents)

    if contract_number != contract_data_row[CONTRACT_NUMBER]:
        # if contract number doesn't match then something is off that needs investigation
        contract_data_outliers.append(contract_data_row)
    else:
        # write the contract data
        contract_data.append(contract_data_row)

    # this is temporary
    if i == MAX_FILES:
        break

# write_to_file takes (data, timestamp, name); `timestamp` comes from the constants cell.
write_to_file(contract_data, timestamp, "contract_data")
write_to_file(contract_data_outliers, timestamp, "contract_data_outliers")



In [194]:
    
    # Standalone bid export for whichever file `file_contents` / `filepath`
    # currently refer to (both come from earlier cells).

    # Parse the contract number once, the same way the extract_* helpers do.
    # The previous code read it from a `row` variable that was rebound to a
    # list inside the loop, which raised TypeError on the second match and
    # wrote an empty contract number for the first.
    contract_number = extract(file_contents, r"CONTRACT NUMBER\s+([A-Za-z0-9-]+)")

    # Regular expression to match each entry
    pattern = re.compile(r"(\d+)\s+(A\))?\s+([\d,]+\.\d{2})\s+(\d+)\s+(.*?)\s+(\d{3} \d{3}-\d{4})(.*?)\s+(\d+)\s+(B\))?")

    # Find all matches
    matches = pattern.findall(file_contents)

    # Prepare CSV data: header first, then one row per matched bid.
    csv_data = [[CONTRACT_NUMBER, BID_RANK, "OPTIONAL_A", BID_TOTAL, BIDDER_ID, BIDDER_NAME, "BIDDER PHONE", "OPTIONAL TEXT", CSLB_NUMBER, "OPTIONAL_B"]]
    for match in matches:
        # Combine the captured groups into a single row, note that some post-processing may be needed depending on the exact format you want
        bid_row = [contract_number, match[0], match[1], match[2], match[3].strip(), match[4], match[5].strip(), match[6], match[7].strip(), match[8]]
        csv_data.append(bid_row)

    # Write CSV data to file
    with open(f"bids_{filepath.stem}.csv", "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(csv_data)