# Data extraction

The goal of this project is to extract relevant data from text files that were previously converted from PDF files. Because the text files are highly structured, we use regular expressions (regex) for the extraction.

In [94]:
from pathlib import Path
from pprint import pprint
from typing import List
from collections import defaultdict
import re
import pandas as pd
import re
from tqdm import tqdm
from datetime import datetime
import shutil

%reload_ext autoreload
%autoreload 2

Define paths:

In [82]:
# Root folder of the raw inputs; results are written to a sibling "results" folder.
RAW_DATA_PATH = Path('/Users/nenadbozinovic/Documents/regex/RR Procurement - Raw Data')
RESULTS_PATH = RAW_DATA_PATH.parent / 'results'
OUTLIERS_PATH = RESULTS_PATH / 'outliers'

# Input folders: the PDFs and their line-printer text versions.
RAW_DATA_PATH_PDF = RAW_DATA_PATH / 'PDFs'
RAW_DATA_PATH_LINEPRINTER = RAW_DATA_PATH / 'Txt files - lineprinter'

# Mirror folders that receive copies of the files flagged as outliers.
OUTLIERS_PATH_PDF = OUTLIERS_PATH / 'PDFs'
OUTLIERS_PATH_LINEPRINTER = OUTLIERS_PATH / 'Txt files - lineprinter'

# Create the outlier folders up front; parents=True also creates RESULTS_PATH,
# which is an ancestor of both.
for outlier_dir in (OUTLIERS_PATH_PDF, OUTLIERS_PATH_LINEPRINTER):
    outlier_dir.mkdir(exist_ok=True, parents=True)


Define column names so we don't use literals at any point:

In [84]:
# --- Contract-level columns (one row per file) ---
FILENAME = "filename"
CONTRACT_NUMBER = "Contract_Number"
CONTRACT_CODE = "Contract_Code"
CONTRACT_DESCRIPTION = "Contract_Description"
CONTRACT_ITEMS = "Number_of_Contract_Items"
BID_OPENING_DATE = "Bid_Opening_Date"
NUMBER_OF_BIDDERS = "Number_of_Bidders"
TOTAL_NUMBER_OF_WORKING_DAYS = "Total_Number_of_Working_Days"
ENGINEERS_EST = "Engineers_Est"
AMOUNT_OVER = "Amount_Over"
AMOUNT_UNDER = "Amount_Under"
# Note: the column labels read "Percent_Est_<direction>", unlike the constant names.
PERCENT_OVER_EST = "Percent_Est_Over"
PERCENT_UNDER_EST = "Percent_Est_Under"

# --- Bid-level columns (one row per bidder) ---
BID_RANK = "Bid_Rank"
BID_TOTAL = "Bid_Total"
BIDDER_ID = "Bidder_ID"
BIDDER_NAME = "Bidder_Name"
BIDDER_NAME_COND = "Bidder_Name_cond"
CSLB_NUMBER = "CSLB_Number"

# --- Line-item columns (one row per contract item) ---
ITEM_NUMBER = "Item_Number"
ITEM_CODE = "Item_Code"
ITEM_DESCRIPTION = "Item_Description"
ITEM_DOLLAR_AMOUNT = "Item_Dollar_Amount"


In [85]:
def get_contract_number(file_contents):
    """Return the token that follows the "CONTRACT NUMBER" label, or "" if it is not found."""
    contract_number_regex = r"CONTRACT NUMBER\s+([A-Za-z0-9-]+)"
    return extract(file_contents, contract_number_regex)


def extract(file_contents, regex, post=None):
    """Return the first capture group of the first match of `regex` in `file_contents`.

    Args:
        file_contents: Text to search.
        regex: Pattern with at least one capture group.
        post: Optional callable applied to the captured text before returning it.

    Returns:
        The captured text (post-processed when `post` is given), or "" when
        the pattern does not match anywhere.
    """
    found = re.search(regex, file_contents)
    if not found:
        return ""

    captured = found.group(1)
    return post(captured) if post else captured
    

def extract_contract_data(file_contents):
    """Build the contract-level row for one file.

    Every field is looked up independently with its own pattern; a field whose
    pattern does not match is stored as "".

    NOTE(review): the two percent patterns capture only a run of digits, so a
    fractional percentage would be cut at the decimal point - confirm that the
    source files never report fractions.

    Returns:
        A `defaultdict(str)` keyed by the contract-level column names.
    """
    # (column, pattern) pairs, listed in the column order of the resulting row.
    labelled_patterns = (
        (BID_OPENING_DATE, r"BID OPENING DATE\s+(\d+/\d+/\d+)"),
        (CONTRACT_CODE, r"CONTRACT CODE\s+'([^']+)'"),
        (CONTRACT_ITEMS, r"(\d+)\s+CONTRACT ITEMS"),
        (TOTAL_NUMBER_OF_WORKING_DAYS, r"TOTAL NUMBER OF WORKING DAYS\s+(\d+)"),
        (NUMBER_OF_BIDDERS, r"NUMBER OF BIDDERS\s+(\d+)"),
        (ENGINEERS_EST, r"ENGINEERS EST\s+([\d,]+\.\d{2})"),
        (AMOUNT_OVER, r"AMOUNT OVER\s+([\d,]+\.\d{2})"),
        (AMOUNT_UNDER, r"AMOUNT UNDER\s+([\d,]+\.\d{2})"),
        (PERCENT_OVER_EST, r"PERCENT OVER EST\s+(\d+)"),
        (PERCENT_UNDER_EST, r"PERCENT UNDER EST\s+(\d+)"),
    )

    row = defaultdict(str)
    row[CONTRACT_NUMBER] = get_contract_number(file_contents)
    for column, regex in labelled_patterns:
        row[column] = extract(file_contents, regex)

    # The description is whatever precedes "FEDERAL AID" on its line, minus leading whitespace.
    row[CONTRACT_DESCRIPTION] = extract(file_contents, r"(.*?)\s+FEDERAL AID", post=str.lstrip)
    return row


def extract_contract_bid_data(file_contents):
    """Extract one row per bid entry found in `file_contents`.

    Each match of the bid pattern becomes a `defaultdict(str)` holding the
    contract number (shared by all rows of the file), the bid rank, the
    optional "A)" marker, the bid total, the bidder id, the bidder name, the
    phone number, the free text that follows the phone number, and the
    eight-digit CSLB number.

    The bidder name is assembled from two captured fragments: the text before
    the phone number and the text captured immediately before the CSLB number.
    Empty fragments are skipped, so a name without a second fragment does not
    end in a dangling space.

    Returns:
        A list of rows; empty when the pattern matches nothing.
    """
    contract_number = get_contract_number(file_contents)

    # Group indexes as returned by findall:
    #   0 rank, 1 optional "A)", 2 bid total, 3 bidder id, 4 name (first fragment),
    #   5 phone, 6 extra text, 7 name (second fragment), 8 CSLB number
    pattern = re.compile(r"(\d+)\s+(A\))?\s+([\d,]+\.\d{2})\s+(\d+)\s+(.*?)\s+(\d{3} \d{3}-\d{4})\s+(.*?)\s+(.*?)(\d{8})")

    contract_bid_data = []
    for match in pattern.findall(file_contents):
        name_fragments = (match[4], match[7].strip())

        row = defaultdict(str)
        row[CONTRACT_NUMBER] = contract_number
        row[BID_RANK] = match[0]
        row["OPTIONAL_A"] = match[1]
        row[BID_TOTAL] = match[2]
        row[BIDDER_ID] = match[3].strip()
        # Join only the non-empty fragments to avoid a leading/trailing separator.
        row[BIDDER_NAME] = ' '.join(fragment for fragment in name_fragments if fragment)
        row["BIDDER PHONE"] = match[5].strip()
        row["Extra_Text"] = match[6]
        row[CSLB_NUMBER] = match[8]
        contract_bid_data.append(row)

    return contract_bid_data


def extract_contract_line_item_data(file_contents):
    """Extract one row per contract line item found in `file_contents`.

    A line item is matched as: item number, item code, a 46-character
    description field, a 35-character field (matched but not stored), and a
    dollar amount, optionally followed by one continuation line whose text is
    appended to the description.

    When there is no continuation line the description is returned on its own,
    without a dangling trailing space; whitespace around the continuation text
    is removed before it is appended.

    Returns:
        A list of `defaultdict(str)` rows; empty when the pattern matches nothing.
    """
    contract_number = get_contract_number(file_contents)

    # Group indexes as returned by findall:
    #   0 whole first line, 1 item number, 2 item code, 3 description (46 chars),
    #   4 unused 35-char field, 5 dollar amount, 6 optional continuation line
    pattern = re.compile(r"(^\s*(\d+)\s+(\d+)\s+([\dA-Z\(\)\"\- ]{46})\s(.{35})\s+([\d,]+\.\d{2}))(?:\n\s+([\dA-Z\(\)\"\- $]+)\n)?", re.MULTILINE)

    # A possible alternative to the pattern above is a column-width approach:
    # first extract the text between the dotted line and the total with
    # ^\s+-+\n([\s\S]+?)\n\s+TOTAL\s+\$?([\d,]+\.\d{2})

    contract_line_item_data = []
    for match in pattern.findall(file_contents):
        description = match[3].rstrip()
        continuation = match[6].strip()
        if continuation:
            description = f"{description} {continuation}"

        row = defaultdict(str)
        row[CONTRACT_NUMBER] = contract_number
        row[ITEM_NUMBER] = match[1]
        row[ITEM_CODE] = match[2]
        row[ITEM_DESCRIPTION] = description
        row[ITEM_DOLLAR_AMOUNT] = match[5]
        contract_line_item_data.append(row)

    return contract_line_item_data


def write_to_results(data: List, name: str):
    """Write `data` as `<name>.csv` inside RESULTS_PATH, without the index column."""
    destination = RESULTS_PATH / f'{name}.csv'
    pd.DataFrame(data).to_csv(destination, index=False)
    

def read_file(filepath: str):
    """Return the entire contents of the file at `filepath` as one string.

    The file is opened in text mode with the platform's default encoding.
    """
    return Path(filepath).read_text()

# One sample study

In [95]:
# Load a single sample file (kept in a "sample" folder next to the raw data)
# to try the extractors on before running the whole batch.
filepath = RAW_DATA_PATH.parent / 'sample/01-0A0904.pdf_2724.txt'
file_contents = read_file(filepath)

Extract contract data:

In [102]:
# Run each extractor on the sample text and wrap the result in a DataFrame for inspection.
# extract_contract_data returns a single row, hence the one-element list.
df_contract_data = pd.DataFrame([extract_contract_data(file_contents)])
df_contract_bid_data = pd.DataFrame(extract_contract_bid_data(file_contents))
# TODO: subcontractor extraction - no extract_bid_subcontractor_data is defined in this notebook yet.
# df_bid_subcontractor_data = pd.DataFrame(extract_bid_subcontractor_data(file_contents))
df_contract_line_item_data = pd.DataFrame(extract_contract_line_item_data(file_contents))

In [99]:
# Contract-level fields extracted from the sample file (a single row).
df_contract_data

Unnamed: 0,Contract_Number,Bid_Opening_Date,Contract_Code,Number_of_Contract_Items,Total_Number_of_Working_Days,Number_of_Bidders,Engineers_Est,Amount_Over,Amount_Under,Percent_Est_Over,Percent_Est_Under,Contract_Description
0,01-0A0904,03/29/11,H,15,20,6,356785.0,62635.0,,17,,SEAL COAT


In [100]:
# Bid rows extracted from the sample file (one row per matched bid entry).
df_contract_bid_data

Unnamed: 0,Contract_Number,Bid_Rank,OPTIONAL_A,Bid_Total,Bidder_ID,Bidder_Name,BIDDER PHONE,Extra_Text,CSLB_Number
0,01-0A0904,1,,419420.0,2,INTERNATIONAL SURFACING SYSTEMS,916 373-2420,SB PREF CLAIMED,736996
1,01-0A0904,2,,428428.0,1,NORTHWEST PAVING,530 246-4388,SB PREF CLAIMED,822126
2,01-0A0904,3,,432845.0,6,ADVANTAGE PAVING AND EXCAVATING INC,530 598-7759,SB PREF CLAIMED,909239
3,01-0A0904,4,,502205.0,4,FRANKLIN CONSTRUCTION INC,530 343-9600,SB PREF CLAIMED,567469
4,01-0A0904,5,,514740.0,5,WINDSOR FUEL COMPANY,925 427-5266,SB PREF CLAIMED,776848
5,01-0A0904,6,,719084.6,3,CALIFORNIA PAVEMENT MAINTENANCE COMPANY INC,916 381-8033,SB PREF CLAIMED,374514


In [103]:
# Line-item rows extracted from the sample file (one row per matched contract item).
df_contract_line_item_data

Unnamed: 0,Contract_Number,Item_Number,Item_Code,Item_Description,Item_Dollar_Amount
0,01-0A0904,1,74016,CONSTRUCTION SITE MANAGEMENT,421.0
1,01-0A0904,2,74017,PREPARE WATER POLLUTION CONTROL PROGRAM,735.0
2,01-0A0904,3,120090,CONSTRUCTION AREA SIGNS,6750.0
3,01-0A0904,4,120100,TRAFFIC CONTROL SYSTEM,50500.0
4,01-0A0904,5,128650,PORTABLE CHANGEABLE MESSAGE SIGN,2500.0
5,01-0A0904,6,141104,REMOVE YELLOW THERMOPLASTIC PAVEMENT MARKING (...,2700.0
6,01-0A0904,7,150715,REMOVE THERMOPLASTIC PAVEMENT MARKING,4750.0
7,01-0A0904,8,190110,LEAD COMPLIANCE PLAN,1200.0
8,01-0A0904,9,365001,SAND COVER,13200.0
9,01-0A0904,10,374002,ASPHALTIC EMULSION (FOG SEAL COAT),17500.0


# Batch run

Here we run the extraction over the whole batch of files. To process a single file instead (for example, to investigate a specific outlier), set `files` to a single-element list:

In [91]:
# Materialise the glob into a sorted list rather than keeping the bare generator:
# a generator is exhausted after one pass, so re-running the batch cell without
# re-running this one would silently process zero files. Sorting also makes the
# row order of the results reproducible, and a list lets tqdm report the total.
files = sorted(RAW_DATA_PATH_LINEPRINTER.glob('*.txt'))
# files = [RAW_DATA_PATH_LINEPRINTER / '01-0F9204.PDF_12364.txt']

In [92]:
# Accumulators for the batch; re-created on every run of this cell.
contract_data = []
contract_bid_data = []
bid_subcontractor_data = []
contract_line_item_data = []
outliers = []

for filepath in tqdm(files):
    file_contents = read_file(filepath)
    filename = filepath.stem

    # The file name starts with the contract number followed by ".pdf" or ".PDF".
    # Lower-casing makes the split case-insensitive; upper-casing restores the
    # contract number's case (a regex match would be a cleaner alternative).
    contract_number_from_filename = filename.lower().split('.pdf')[0].upper()
    contract_number_from_contents = get_contract_number(file_contents)

    if contract_number_from_filename != contract_number_from_contents:
        # A mismatch means something is off with this file and needs investigation:
        # record it and copy the PDF and the text file aside for manual inspection.
        # NOTE(review): the PDF name is built from the text file's full stem (the
        # sample stem is "01-0A0904.pdf_2724") - confirm the PDFs are named that
        # way, since shutil.copy raises if the source does not exist.
        outliers.append({'outlier_filename': filename})
        for source_dir, destination_dir, copy_name in (
            (RAW_DATA_PATH_PDF, OUTLIERS_PATH_PDF, f'{filename}.pdf'),
            (RAW_DATA_PATH_LINEPRINTER, OUTLIERS_PATH_LINEPRINTER, f'{filename}.txt'),
        ):
            shutil.copy(source_dir / copy_name, destination_dir / copy_name)
        continue

    # Contract numbers agree: extract all tables from this file.
    contract_data.append(extract_contract_data(file_contents))
    contract_bid_data.extend(extract_contract_bid_data(file_contents))
    # TODO bid_subcontractor_data.extend(extract_contract_bid_data(file_contents))
    contract_line_item_data.extend(extract_contract_line_item_data(file_contents))

370it [00:22, 16.61it/s]


# Save files

In [93]:
# A timestamp can be added to the file names if needed:
# timestamp = datetime.strftime(datetime.now(), '%m-%d-%Y-%H:%M:%S')

# CSV name -> rows to write; each entry becomes <name>.csv in RESULTS_PATH.
results_by_name = {
    "contract_data": contract_data,
    "contract_bid_data": contract_bid_data,
    # "bid_subcontractor_data": bid_subcontractor_data,
    "contract_line_item_data": contract_line_item_data,
    "outliers": outliers,
}

for name, rows in results_by_name.items():
    write_to_results(rows, name)

# Save to excel

In [105]:
# Merge every CSV in RESULTS_PATH into a single workbook, one sheet per CSV,
# using the CSV's file name (without extension) as the sheet name.
excel_file_path = RESULTS_PATH / 'results.xlsx'

# Sorted so the sheet order is the same on every run (glob order is arbitrary).
csv_file_paths = sorted(RESULTS_PATH.glob('*.csv'))

# Identifier columns must be read back as text: with default type inference
# pandas parses all-digit values as integers, which drops leading zeros
# (for example from the eight-digit CSLB numbers). Columns that a given CSV
# does not contain are simply not affected by this mapping.
text_columns = {CONTRACT_NUMBER: str, ITEM_CODE: str, CSLB_NUMBER: str}

with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
    for csv_path in csv_file_paths:
        try:
            df = pd.read_csv(csv_path, dtype=text_columns)
        except pd.errors.EmptyDataError:
            # A CSV written from an empty list (e.g. outliers.csv when no
            # outliers were found) has no header row; emit an empty sheet
            # instead of aborting the whole merge.
            df = pd.DataFrame()

        df.to_excel(writer, sheet_name=csv_path.stem, index=False)

print(f'Merged CSV files into {excel_file_path}')

Merged CSV files into /Users/nenadbozinovic/Documents/regex/results/results.xlsx
