# Data extraction

The goal of this project is to extract relevant data from text files, previously converted from PDF files. Since the text files are quite structured, the decision is to use regex to do this.

In [1]:
from pathlib import Path
from pprint import pprint
from typing import List, Tuple
from collections import defaultdict
import re
import pandas as pd
import re
from tqdm import tqdm

from datetime import datetime
import shutil

%reload_ext autoreload
%autoreload 2

Define paths:

In [2]:
RAW_DATA_PATH = Path('/Users/nenadbozinovic/Documents/regex/RR Procurement - Raw Data')
RESULTS_PATH = RAW_DATA_PATH.parent / 'results'
RESULTS_PATH.mkdir(exist_ok=True, parents=True)

OUTLIERS_PATH = RESULTS_PATH / 'outliers'

RAW_DATA_PATH_PDF = RAW_DATA_PATH / 'PDFs'
OUTLIERS_PATH_PDF = OUTLIERS_PATH / 'PDFs'
OUTLIERS_PATH_PDF.mkdir(exist_ok=True, parents=True)

RAW_DATA_PATH_LINEPRINTER = RAW_DATA_PATH / 'Txt files - lineprinter'
OUTLIERS_PATH_LINEPRINTER = OUTLIERS_PATH / 'Txt files - lineprinter'
OUTLIERS_PATH_LINEPRINTER.mkdir(exist_ok=True, parents=True)


Define column names so we don't use literals at any point:

In [3]:

IDENTIFIER = "Identifier"
POSTPONED_CONTRACT = "Postponed_Contract"
NUMBER_OF_BIDDERS = "Number_of_Bidders"
BID_OPENING_DATE = "Bid_Opening_Date"
CONTRACT_DATE = "Contract_Date"
CONTRACT_NUMBER = "Contract_Number"
TOTAL_NUMBER_OF_WORKING_DAYS = "Total_Number_of_Working_Days"
CONTRACT_ITEMS = "Number_of_Contract_Items"
CONTRACT_DESCRIPTION = "Contract_Description"
PERCENT_OVER_EST = "Percent_Est_Over"
PERCENT_UNDER_EST = "Percent_Est_Under"
ENGINEERS_EST = "Engineers_Est"
AMOUNT_OVER = "Amount_Over"
AMOUNT_UNDER = "Amount_Under"
CONTRACT_CODE = "Contract_Code"

BID_RANK = "Bid_Rank"
BID_TOTAL = "Bid_Total"   
BIDDER_ID = "Bidder_ID"
BIDDER_NAME = "Bidder_Name"
BIDDER_NAME_COND = "Bidder_Name_cond"
CSLB_NUMBER = "CSLB_Number"

SUBCONTRACTOR_NAME = "Subcontractor_Name"
SUBCONTRACTED_LINE_ITEM = "Subcontracted_Line_Item"

ITEM_NUMBER = "Item_Number"
ITEM_CODE = "Item_Code"
ITEM_DESCRIPTION = "Item_Description"
ITEM_DOLLAR_AMOUNT = "Item_Dollar_Amount"


In [9]:
def get_contract_number_and_tag_from_filename(filename:str) -> Tuple[str, str]:
    pattern = re.compile(r"^(\d{2}-\w+)\.pdf_(\d+)$", re.IGNORECASE)  # IGNORECASE is critical since names might have both PDF and pdf
    match = pattern.search(filename)
    contract_number, tag = match.groups()
    identifier = f"{contract_number}_{tag}"
    return contract_number, tag, identifier


def get_contract_number(file_contents):
    return extract(file_contents, r"CONTRACT NUMBER\s+([A-Za-z0-9-]+)")


def get_dates(file_contents):
    match = re.search(r"BID OPENING DATE\s+(\d+\/\d+\/\d+).+\s+(\d+\/\d+\/\d+)", file_contents)
    return match.group(1), match.group(2)
    


def extract(file_contents, regex, post_processing=None):
    # Search for the pattern in the text
    match = re.search(regex, file_contents)

    if match:
        # Extract first capture group
        if post_processing:
            temp = post_processing(match.group(1))
        else:
            temp = match.group(1)
        return temp
    else:
        return ""
    

def extract_contract_data(file_contents, identifier):
    row = defaultdict(str)
    row[IDENTIFIER] = identifier
    match = extract(file_contents, r"(POSTPONED CONTRACT)")
    row[POSTPONED_CONTRACT] = 1 if match else 0
    row[BID_OPENING_DATE], row[CONTRACT_DATE] = get_dates(file_contents)
    row[CONTRACT_CODE] = extract(file_contents, r"CONTRACT CODE\s+'([^']+)'")
    row[CONTRACT_ITEMS] = extract(file_contents, r"(\d+)\s+CONTRACT ITEMS")
    row[TOTAL_NUMBER_OF_WORKING_DAYS] = extract(file_contents, r"TOTAL NUMBER OF WORKING DAYS\s+(\d+)")
    row[NUMBER_OF_BIDDERS] = extract(file_contents, r"NUMBER OF BIDDERS\s+(\d+)")
    row[ENGINEERS_EST] = extract(file_contents, r"ENGINEERS EST\s+([\d,]+\.\d{2})")
    row[AMOUNT_OVER] = extract(file_contents, r"AMOUNT OVER\s+([\d,]+\.\d{2})")
    row[AMOUNT_UNDER] = extract(file_contents, r"AMOUNT UNDER\s+([\d,]+\.\d{2})")
    row[PERCENT_OVER_EST] = extract(file_contents, r"PERCENT OVER EST\s+(\d+.\d{2})")
    row[PERCENT_UNDER_EST] = extract(file_contents, r"PERCENT UNDER EST\s+(\d+.\d{2})")
    row[CONTRACT_DESCRIPTION] = extract(file_contents, r"(?:\n)?(.*?)FEDERAL AID", post_processing=lambda x: x.lstrip())
    return row


def extract_contract_bid_data(file_contents, identifier):

    # have fixed width for name (37 characters) and CSLB number (8 digits)
    pattern = re.compile(r"(\d+)\s+(A\))?\s+([\d,]+\.\d{2})\s+(\d+)\s+(.{37})\s(\d{3} \d{3}-\d{4})(.*)?$\s+(.*?)(.{37})\s(\d{8})", re.MULTILINE)
    matches = pattern.findall(file_contents)
    
    contract_bid_data = []

    for match in matches:
        row = defaultdict(str)
        row[IDENTIFIER] = identifier
        row[BID_RANK] = match[0]
        row["OPTIONAL_A"] = match[1]
        row[BID_TOTAL] = match[2]
        row[BIDDER_ID] = match[3].strip()
        row[BIDDER_NAME] = match[4].strip()
        row["BIDDER PHONE"] = match[5].strip()
        row["Extra_Text"] = match[6]
        row['Weird_Contract_Notes'] = match[7]
        row[BIDDER_NAME] += ' ' + match[8]
        row[BIDDER_NAME].strip()
        row[CSLB_NUMBER] = match[9] 
        contract_bid_data.append(row)
        
    return contract_bid_data


def reduce_file_contents(file_contents):
    # here we first extract file_contents_relevant from file_contents so we narrow down the search
    pattern = r"(?s)L I S T   O F   S U B C O N T R A C T O R S(.*?)C O N T R A C T   P R O P O S A L   O F   L O W   B I D D E R"
    match = re.search(pattern, file_contents)
    if match:
        file_contents_relevant = match.group()
        return file_contents_relevant
    else:
        return None


def extract_bid_subcontractor_data(file_contents, identifier):

    file_contents_relevant = reduce_file_contents(file_contents)
    if not file_contents_relevant:
        return []
    
    # now we run a standard regex to extract the data
    pattern = re.compile(r"^\s*(\d+)?\s+([A-Z ]+[A-Z])\s+(ITEM.+)$", re.MULTILINE)
    matches = pattern.findall(file_contents_relevant)
    
    bid_subcontractor_data = []

    for match in matches:
        row = defaultdict(str)
        row[IDENTIFIER] = identifier
        row[BIDDER_ID] = match[0]
        row[SUBCONTRACTOR_NAME] = match[1]
        row[SUBCONTRACTED_LINE_ITEM] = match[2]
        bid_subcontractor_data.append(row)
        
    return bid_subcontractor_data


def extract_contract_line_item_data(file_contents, identifier):

    pattern = re.compile(r"(^\s*(\d+)\s+(\d+)\s+([\dA-Z\(\)\"\- ]{46})\s(.{35})\s+([\d,]+\.\d{2}))(?:\n\s+([\dA-Z\(\)\"\- $]+)\n)?", re.MULTILINE)

    # maybe instead of the one above we use approach of column widths, 
    # and extract first text between dotted line and total: 
    # ^\s+-+\n([\s\S]+?)\n\s+TOTAL\s+\$?([\d,]+\.\d{2})
    matches = pattern.findall(file_contents)

    contract_line_item_data = []
    for match in matches:
        row = defaultdict(str)
        row[IDENTIFIER] = identifier
        row[ITEM_NUMBER] = match[1]
        row[ITEM_CODE] = match[2]
        row[ITEM_DESCRIPTION] = match[3].rstrip() + ' ' + match[6]
        row[ITEM_DOLLAR_AMOUNT] = match[5]
        contract_line_item_data.append(row)
        
    return contract_line_item_data


def write_to_results(data: List, name: str):
    df = pd.DataFrame(data)
    df.to_csv(RESULTS_PATH / f'{name}.csv', index=False)
    

def read_file(filepath: str):
    # Open the file in read mode ('r')
    with open(filepath, 'r') as file:
        # Read the contents of the file into a string
        file_contents = file.read()
    return file_contents

# One sample study

In [10]:
# filepath = RAW_DATA_PATH.parent / 'sample' / '01-0A3804.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A3804.pdf_4353.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0904.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A1204.pdf_11468.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0F4304.pdf_12346.txt'  # issue # 11
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K6104.pdf_12731.txt'  # issue # 9

filepath = RAW_DATA_PATH_LINEPRINTER / '01-0H3204.pdf_9871.txt'  # issue # 2


In [11]:
contract_number_from_filename, tag, identifier = get_contract_number_and_tag_from_filename(filepath.stem)

file_contents = read_file(filepath)

In [12]:
contract_number_from_filename, tag, identifier 

('01-0H3204', '9871', '01-0H3204_9871')

Extract contract data:

In [13]:
df_contract_data = pd.DataFrame([extract_contract_data(file_contents, identifier)])
# df_contract_bid_data = pd.DataFrame(extract_contract_bid_data(file_contents, identifier))
# df_bid_subcontractor_data = pd.DataFrame(extract_bid_subcontractor_data(file_contents, identifier))
# df_contract_line_item_data = pd.DataFrame(extract_contract_line_item_data(file_contents, identifier))

In [14]:
df_contract_data

Unnamed: 0,Identifier,Postponed_Contract,Bid_Opening_Date,Contract_Date,Contract_Code,Number_of_Contract_Items,Total_Number_of_Working_Days,Number_of_Bidders,Engineers_Est,Amount_Over,Amount_Under,Percent_Est_Over,Percent_Est_Under,Contract_Description
0,01-0H3204_9871,0,05/17/18,05/21/18,B,45,40,2,4289668.0,1048840.75,,24.45,,OVERLAY


In [85]:
df_contract_bid_data

Unnamed: 0,Identifier,Bid_Opening_Date,Bid_Rank,OPTIONAL_A,Bid_Total,Bidder_ID,Bidder_Name,BIDDER PHONE,Extra_Text,Weird_Contract_Notes,CSLB_Number
0,01-0K6104_12731,05/10/22,1,A),1843000.0,4,AMERICAN CIVIL CONSTRUCTORS WEST COAST LLC ...,707 746-8028,NSB PREF CLAIMED,,1011546
1,01-0K6104_12731,05/10/22,2,A),1914912.0,5,"MYERS & SONS CONSTRUCTION, LLC ...",916 283-9950,CC PREF CLAIMED,,1033752
2,01-0K6104_12731,05/10/22,3,A),1969969.0,3,"TRUESDELL CORPORATION OF CALIFORNIA, INC. ...",602 437-1711,CC PREF CLAIMED,,615058
3,01-0K6104_12731,05/10/22,4,A),2248436.0,2,"GOLDEN STATE BRIDGE, INC. ...",925 372-8000,CC PREF CLAIMED,,851187
4,01-0K6104_12731,05/10/22,5,A),1587105.5,1,"T.P.A. CONSTRUCTION, INC. ...",916 919-1624,SB PREF CLAIMED,BRC REQUESTED A RELIEF OF BID,782044


In [86]:
df_bid_subcontractor_data

Unnamed: 0,Identifier,Bid_Opening_Date,Bidder_ID,Subcontractor_Name,Subcontracted_Line_Item
0,01-0K6104_12731,05/10/22,,DARREN TAYLOR CONSTRUCTION INC,ITEM 11 (100%)
1,01-0K6104_12731,05/10/22,,DARREN TAYLOR CONSTRUCTION INC,ITEM 36 (22%)
2,01-0K6104_12731,05/10/22,,DARREN TAYLOR CONSTRUCTION INC,"ITEMS 11, 12, 13"
3,01-0K6104_12731,05/10/22,,DARREN TAYLOR CONSTRUCTION INC,ITEM 11 (100%)
4,01-0K6104_12731,05/10/22,3.0,DARREN TAYLOR CONSTRUCTION INC,ITEM 12 (100%)
5,01-0K6104_12731,05/10/22,,DARREN TAYLOR CONSTRUCTION INC,ITEM 13 (100%)
6,01-0K6104_12731,05/10/22,,HIGHWAY SPECIALTY COMPANY INC,ITEM 4 (86%)
7,01-0K6104_12731,05/10/22,2.0,APPLY A LINE,ITEM 1 (37%)
8,01-0K6104_12731,05/10/22,,APPLY A LINE,ITEM 22 (100%)
9,01-0K6104_12731,05/10/22,,APPLY A LINE,ITEM 32 (100%)


In [87]:
df_contract_line_item_data

Unnamed: 0,Identifier,Bid_Opening_Date,Item_Number,Item_Code,Item_Description,Item_Dollar_Amount
0,01-0K6104_12731,05/10/22,1,70030,LEAD COMPLIANCE PLAN,3000.0
1,01-0K6104_12731,05/10/22,2,90105,TIME-RELATED OVERHEAD (LS),170000.0
2,01-0K6104_12731,05/10/22,3,120090,CONSTRUCTION AREA SIGNS,22000.0
3,01-0K6104_12731,05/10/22,4,120100,TRAFFIC CONTROL SYSTEM,125000.0
4,01-0K6104_12731,05/10/22,5,13563,PORTABLE RADAR FEEDBACK SIGN SYSTEMS,6000.0
5,01-0K6104_12731,05/10/22,6,128652,PORTABLE CHANGEABLE MESSAGE SIGN (LS),18000.0
6,01-0K6104_12731,05/10/22,7,130100,JOB SITE MANAGEMENT,981.7
7,01-0K6104_12731,05/10/22,8,130200,PREPARE WATER POLLUTION CONTROL PROGRAM,695.0
8,01-0K6104_12731,05/10/22,9,141103,REMOVE YELLOW THERMOPLASTIC TRAFFIC STRIPE (HA...,4680.0
9,01-0K6104_12731,05/10/22,10,146002,CONTRACTOR-SUPPLIED BIOLOGIST (LS),12000.0


# Batch run

Here we run a batch or a single file (for example, if you want to run a specific outlier, make `files` a single element list):

In [15]:
files = (RAW_DATA_PATH/'Txt files - lineprinter').glob('*.txt')
# files = [RAW_DATA_PATH/'Txt files - lineprinter'/'01-0F9204.PDF_12364.txt']

In [16]:
contract_data = []
contract_bid_data = []
bid_subcontractor_data = []
contract_line_item_data = []
outliers = []

for filepath in tqdm(files):
    
    file_contents = read_file(filepath)
        
    filename = filepath.stem
    contract_number_from_filename, tag, identifier = get_contract_number_and_tag_from_filename(filename)
    contract_number_from_contents = get_contract_number(file_contents)
    
    if contract_number_from_filename == contract_number_from_contents:  
        contract_data.append(extract_contract_data(file_contents, identifier))
        contract_bid_data.extend(extract_contract_bid_data(file_contents, identifier))
        bid_subcontractor_data.extend(extract_bid_subcontractor_data(file_contents, identifier))
        contract_line_item_data.extend(extract_contract_line_item_data(file_contents, identifier))
    else:
        # if contract number doesn't match then something is off that needs investigation
        outliers.append({'outlier_filename': filename})
        # let's also copy the pdf to a folder for manual inspection
        source_path = RAW_DATA_PATH_PDF / f'{filename}.pdf'
        destination_path = OUTLIERS_PATH_PDF / f'{filename}.pdf'
        shutil.copy(source_path, destination_path)
        
        source_path = RAW_DATA_PATH_LINEPRINTER / f'{filename}.txt'
        destination_path = OUTLIERS_PATH_LINEPRINTER / f'{filename}.txt'
        shutil.copy(source_path, destination_path)

5it [00:00, 30.11it/s]0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
370it [00:35, 10.54it/s]


# Save files

In [90]:
# # one can add timestamp to the files if needed
# timestamp = datetime.strftime(datetime.now(), '%m-%d-%Y-%H:%M:%S')
write_to_results(contract_data, "contract_data")
write_to_results(contract_bid_data, "contract_bid_data")
write_to_results(bid_subcontractor_data, "bid_subcontractor_data")
write_to_results(contract_line_item_data, "contract_line_item_data")
write_to_results(outliers, "outliers")

# Save to excel

In [91]:
import pandas as pd

# Paths to your CSV files
csv_file_paths = RESULTS_PATH.glob('*.csv')

# Path to the output Excel file
excel_file_path = RESULTS_PATH / 'results.xlsx'

# Create a Pandas Excel writer using openpyxl as the engine
with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
    # Iterate over your CSV files
    for csv_file in csv_file_paths:
        # Use Path from pathlib to work with file paths
        csv_path = Path(csv_file)
        
        # Extract the file name without the extension for the sheet name
        sheet_name = csv_path.stem
        
        # Read each CSV file into a DataFrame
        df = pd.read_csv(csv_file)
        
        # Write the DataFrame to a new sheet in the Excel file using the file name as the sheet name
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f'Merged CSV files into {excel_file_path}')

Merged CSV files into /Users/nenadbozinovic/Documents/regex/results/results.xlsx
