# Caltrans Data Extraction

The goal of this project is to extract relevant data from text files, previously converted from PDF files. Since the text files are quite structured, the decision is to use regex to do this.

## Setup

Install the following packages if you don't have them yet:

In [222]:
# pip install pandas numpy tqdm ipykernel notebook python-dotenv openpyxl

In [223]:
from pathlib import Path
from pprint import pprint
from typing import List, Tuple
from collections import defaultdict
import re
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import os
from datetime import datetime
import shutil
from dotenv import load_dotenv

%reload_ext autoreload
%autoreload 2

# pd.set_option('display.max_rows', None)  # optional to see all rows in DataFrames

Either change the hard-coded fallback path below, or set `RAW_DATA_PATH` (the path to the raw data) in the `.env` file:

In [224]:
# Pull settings from the .env file into the environment (python-dotenv).
load_dotenv()

# Fallback location used when RAW_DATA_PATH is unset or points to a missing folder.
_DEFAULT_RAW_DATA_PATH = Path('./RR Procurement - Raw Data')

# os.getenv returns None when the variable is not set; Path(None) would raise a
# TypeError before the fallback below could ever be tried.
_raw_data_path_from_env = os.getenv('RAW_DATA_PATH')
RAW_DATA_PATH = Path(_raw_data_path_from_env) if _raw_data_path_from_env else _DEFAULT_RAW_DATA_PATH
if not RAW_DATA_PATH.exists():
    RAW_DATA_PATH = _DEFAULT_RAW_DATA_PATH
    if not RAW_DATA_PATH.exists():
        raise ValueError('Make sure to set a path to raw data in the .env file or copy data into root of the repo')
print(f'Current RAW_DATA_PATH is {RAW_DATA_PATH}')

Current RAW_DATA_PATH is RR Procurement - Raw Data


Rest of the paths:

In [225]:
# Output locations: results sit next to the raw-data folder, outliers inside results.
RESULTS_PATH = RAW_DATA_PATH.parent / 'results'
OUTLIERS_PATH = RESULTS_PATH / 'outliers'

# Input folders and the outlier folders that mirror them.
RAW_DATA_PATH_PDF = RAW_DATA_PATH / 'PDFs'
RAW_DATA_PATH_LINEPRINTER = RAW_DATA_PATH / 'Txt files - lineprinter'
OUTLIERS_PATH_PDF = OUTLIERS_PATH / 'PDFs'
OUTLIERS_PATH_LINEPRINTER = OUTLIERS_PATH / 'Txt files - lineprinter'

# Create the output folders up front; parents=True also creates OUTLIERS_PATH.
RESULTS_PATH.mkdir(parents=True, exist_ok=True)
OUTLIERS_PATH_PDF.mkdir(parents=True, exist_ok=True)
OUTLIERS_PATH_LINEPRINTER.mkdir(parents=True, exist_ok=True)

Define column names so we don't use literals at any point:

In [226]:
# Key shared by every output table: "<contract number>_<tag>" taken from the file name.
IDENTIFIER = "Identifier"

# Contract-level columns (most are populated by extract_contract_data).
POSTPONED_CONTRACT = "Postponed_Contract"
NUMBER_OF_BIDDERS = "Number_of_Bidders"
BID_OPENING_DATE = "Bid_Opening_Date"
CONTRACT_DATE = "Contract_Date"
CONTRACT_NUMBER = "Contract_Number"
TOTAL_NUMBER_OF_WORKING_DAYS = "Total_Number_of_Working_Days"
CONTRACT_ITEMS = "Number_of_Contract_Items"
CONTRACT_DESCRIPTION = "Contract_Description"
PERCENT_OVER_EST = "Percent_Est_Over"
PERCENT_UNDER_EST = "Percent_Est_Under"
ENGINEERS_EST = "Engineers_Est"
AMOUNT_OVER = "Amount_Over"
AMOUNT_UNDER = "Amount_Under"
CONTRACT_CODE = "Contract_Code"

# Bid-level columns (populated by extract_contract_bid_data).
BID_RANK = "Bid_Rank"
A_PLUS_B_INDICATOR = "A_plus_B_indicator"
BID_TOTAL = "Bid_Total"   
BIDDER_ID = "Bidder_ID"
BIDDER_NAME = "Bidder_Name"
CSLB_NUMBER = "CSLB_Number"

# Subcontractor columns (populated by extract_bid_subcontractor_data).
SUBCONTRACTOR_NAME = "Subcontractor_Name"
SUBCONTRACTED_LINE_ITEM = "Subcontracted_Line_Item"

# Line-item columns (populated by extract_contract_line_item_data).
ITEM_NUMBER = "Item_Number"
ITEM_CODE = "Item_Code"
ITEM_DESCRIPTION = "Item_Description"
ITEM_DOLLAR_AMOUNT = "Item_Dollar_Amount"

# Further subcontractor columns, read from the second line of each subcontractor record.
CITY = "City"
SUBCONTRACTOR_LICENSE_NUMBER = "Subcontractor_License_Number"

# Sentinel returned by expand_ranges_in_subcontracted_line_item when a list of items cannot be expanded.
COULD_NOT_PARSE = "COULD NOT PARSE"


## Core functions

The following are the core functions that will be used to extract the data from the text files (maybe move to separate python file):

In [227]:
def get_contract_number_and_tag_from_filename(filename: str) -> Tuple[str, str, str]:
    """Split a file stem such as "01-0A0404.pdf_10165" into its parts.

    Args:
        filename: File name without the final extension, expected to look like
            "<two digits>-<word characters>.pdf_<digits>".

    Returns:
        A tuple (contract_number, tag, identifier) where identifier is
        "<contract_number>_<tag>".

    Raises:
        ValueError: If filename does not follow the expected format.
    """
    # IGNORECASE is critical since names might have both PDF and pdf
    pattern = re.compile(r"^(\d{2}-\w+)\.pdf_(\d+)$", re.IGNORECASE)
    match = pattern.search(filename)
    if match is None:
        # Fail with a message that names the offending file instead of an
        # AttributeError on None.
        raise ValueError(
            f"Filename {filename!r} does not match the expected '<contract>.pdf_<tag>' format"
        )
    contract_number, tag = match.groups()
    identifier = f"{contract_number}_{tag}"
    return contract_number, tag, identifier


def get_contract_number(file_contents):
    """Return the value printed after "CONTRACT NUMBER", or "" when it is absent."""
    contract_number_regex = r"CONTRACT NUMBER\s+([A-Za-z0-9-]+)"
    return extract(file_contents, contract_number_regex)


def get_dates(file_contents):
    """Return the two dates found in the "BID OPENING DATE" header.

    The first date directly follows "BID OPENING DATE"; the second is the next
    date that is preceded by whitespace (the caller stores it as the contract
    date).

    Args:
        file_contents: Full text of one converted contract file.

    Returns:
        A tuple (bid_opening_date, second_date) of strings, or ("", "") when
        the header is not found, mirroring how extract() reports a miss.
    """
    match = re.search(r"BID OPENING DATE\s+(\d+\/\d+\/\d+).+\s+(\d+\/\d+\/\d+)", file_contents)
    if match is None:
        # A file without the header must not abort a whole batch run.
        return "", ""
    return match.group(1), match.group(2)
    

def extract(file_contents, regex):
    """Return the first capture group of the first match of regex, or "" if there is no match."""
    found = re.search(regex, file_contents)
    return found.group(1) if found else ""
    

def extract_contract_data(file_contents, identifier):
    """Build the contract-level record for one file.

    Args:
        file_contents: Full text of one converted contract file.
        identifier: Value stored under IDENTIFIER to link this row to the other tables.

    Returns:
        A defaultdict(str) keyed by the contract-level column constants.
        Fields that are not found in the text are stored as "".
    """
    row = defaultdict(str)
    row[IDENTIFIER] = identifier
    match = extract(file_contents, r"(POSTPONED CONTRACT)")
    row[POSTPONED_CONTRACT] = 1 if match else 0
    row[BID_OPENING_DATE], row[CONTRACT_DATE] = get_dates(file_contents)
    row[CONTRACT_CODE] = extract(file_contents, r"CONTRACT CODE\s+'([^']+)'")  # check
    row[CONTRACT_ITEMS] = extract(file_contents, r"(\d+)\s+CONTRACT ITEMS")
    row[TOTAL_NUMBER_OF_WORKING_DAYS] = extract(file_contents, r"TOTAL NUMBER OF WORKING DAYS\s+(\d+)")
    row[NUMBER_OF_BIDDERS] = extract(file_contents, r"NUMBER OF BIDDERS\s+(\d+)")
    row[ENGINEERS_EST] = extract(file_contents, r"ENGINEERS EST\s+([\d,]+\.\d{2})")
    row[AMOUNT_OVER] = extract(file_contents, r"AMOUNT OVER\s+([\d,]+\.\d{2})")
    row[AMOUNT_UNDER] = extract(file_contents, r"AMOUNT UNDER\s+([\d,]+\.\d{2})")
    # The decimal point is escaped: a bare "." would accept any character
    # (e.g. "1234" would be captured as a percentage).
    row[PERCENT_OVER_EST] = extract(file_contents, r"PERCENT OVER EST\s+(\d+\.\d{2})")
    row[PERCENT_UNDER_EST] = extract(file_contents, r"PERCENT UNDER EST\s+(\d+\.\d{2})")
    # The description is whatever precedes "FEDERAL AID" on the same line.
    row[CONTRACT_DESCRIPTION] = extract(file_contents, r"(?:\n)?(.*?)FEDERAL AID").strip()

    return row


def extract_contract_bid_data(file_contents, identifier):
    """Collect one record per bid listed in the file.

    Each bid spans two lines: the first holds rank, optional "A)" marker, total,
    bidder id, the first part of the name and a phone number; the second holds
    the rest of the name and the CSLB number.

    Returns:
        A list of defaultdict(str) rows keyed by the bid-level column constants.
    """
    # have fixed width for name (37 characters) and CSLB number (8 digits)
    bid_regex = re.compile(r"(\d+)\s+(A\))?\s+([\d,]+\.\d{2})\s+(\d+)\s+(.{37})\s(\d{3} \d{3}-\d{4})(.*)?$\s+(.*?)(.{37})\s(\d{8})", re.MULTILINE)

    bids = []
    for (rank, a_marker, total, bidder_id, name_head, phone,
         extra, notes, name_tail, cslb) in bid_regex.findall(file_contents):
        record = defaultdict(str)
        record[IDENTIFIER] = identifier
        record[BID_RANK] = rank
        record[A_PLUS_B_INDICATOR] = 1 if a_marker else 0
        record[BID_TOTAL] = total
        record[BIDDER_ID] = bidder_id.strip()
        # The name is split over the two lines of the bid entry.
        record[BIDDER_NAME] = (name_head.strip() + ' ' + name_tail).strip()
        record["Bidder_Phone"] = phone.strip()
        record["Extra"] = extra
        record['Weird_Contract_Notes'] = notes
        record[CSLB_NUMBER] = cslb
        bids.append(record)

    # if contract has A+B we need to correct the BID_TOTAL:
    # this will find many A+B) matches but it is reasonable to expect that first A+B) matches are all we need
    a_plus_b_regex = re.compile(r"A\+B\)\s+([\d,]+\.\d{2})", re.MULTILINE)
    # zip stops at the shorter sequence, so surplus A+B) matches are ignored
    for record, corrected_total in zip(bids, a_plus_b_regex.findall(file_contents)):
        record[BID_TOTAL] = corrected_total

    return bids


def extract_bid_subcontractor_data(file_contents, identifier):
    r"""Collect one record per subcontractor entry listed in the file.

    We extract data in two steps.
    1) First we get the relevant sections from the whole contract using a pattern
    of the form "X(.*?)(?=X|Y|Z)": the section must start with X, then comes the
    text we want, and the section ends just before the next X, Y or Z.
    In our case:
    X = BIDDER ID NAME AND ADDRESS LICENSE NUMBER DESCRIPTION OF PORTION OF WORK SUBCONTRACTED
    Y = \f (the form-feed / new-page character, not the literal text "FF")
    Z = CONTINUED ON NEXT PAGE

    The terminator is a lookahead (?=...), so it is not consumed and the next
    section can start right at it; the sections do not overlap.

    2) The second step is to extract the columns from each section, using
    fixed-width columns (58 characters for the name, 38 for the city).

    Args:
        file_contents: Full text of one converted contract file.
        identifier: Value stored under IDENTIFIER to link rows to the other tables.

    Returns:
        A list of defaultdict(str) rows; empty when no section is found.
        BIDDER_ID is "" on rows where the two-digit id is not printed.
    """
    section_pattern = re.compile(r"(?s)BIDDER ID NAME AND ADDRESS\s+LICENSE NUMBER\s+DESCRIPTION OF PORTION OF WORK SUBCONTRACTED(.*?)(?=BIDDER ID NAME AND ADDRESS\s+LICENSE NUMBER\s+DESCRIPTION OF PORTION OF WORK SUBCONTRACTED|\f|CONTINUED ON NEXT PAGE)")
    # Compiled once, outside the loop: the pattern does not depend on the section.
    row_pattern = re.compile(r"(?m)^\s+(\d{2})?\s+(.{58})\s+(.+)\n\s+(.{38})?(.+)")

    bid_subcontractor_data = []
    for section in section_pattern.findall(file_contents):
        for bidder_id, name, line_item, city, license_number in row_pattern.findall(section):
            row = defaultdict(str)
            row[IDENTIFIER] = identifier
            row[BIDDER_ID] = bidder_id
            row[SUBCONTRACTOR_NAME] = name.strip()
            row[SUBCONTRACTED_LINE_ITEM] = line_item
            row[CITY] = city.strip()
            row[SUBCONTRACTOR_LICENSE_NUMBER] = license_number.strip()

            bid_subcontractor_data.append(row)

    return bid_subcontractor_data


def extract_contract_line_item_data(file_contents, identifier):
    """Collect one record per contract line item listed in the file.

    Each item line holds: item number, optional "(F)" marker, item code, a
    45-character description column, a 35-character column (captured but not
    used) and a dollar amount. A following line indented by 26 whitespace
    characters, when present, is treated as a continuation of the description.

    Args:
        file_contents: Full text of one converted contract file.
        identifier: Value stored under IDENTIFIER to link rows to the other tables.

    Returns:
        A list of defaultdict(str) rows keyed by the line-item column constants.
    """
    pattern = re.compile(r"(?m)^\s+(\d+)\s+(\(F\))?\s+(\d+)\s+(.{45})\s+(.{35})\s+([\d,]+\.\d{2})(?:\n\s{26}(.+)\n)?")

    contract_line_item_data = []
    for match in pattern.findall(file_contents):
        row = defaultdict(str)
        row[IDENTIFIER] = identifier
        row[ITEM_NUMBER] = match[0]
        row["Extra"] = match[1]
        row[ITEM_CODE] = match[2]
        # Strip after joining: without a continuation line match[6] is "" and
        # the description would otherwise end with a stray space.
        row[ITEM_DESCRIPTION] = (match[3].strip() + ' ' + match[6]).strip()
        row[ITEM_DOLLAR_AMOUNT] = match[5]
        contract_line_item_data.append(row)

    return contract_line_item_data

def fill_gaps_in_bidder_id(df):
    """Forward-fill blank bidder ids so every row carries the id of the row above.

    The frame is modified in place and also returned so calls can be chained.

    Args:
        df: DataFrame of subcontractor rows; blank ids are stored as "".

    Returns:
        The same DataFrame. A frame without a BIDDER_ID column (for example one
        built from an empty list of rows) is returned unchanged.
    """
    if BIDDER_ID not in df.columns:
        # Nothing to fill; indexing the missing column would raise KeyError.
        return df
    # Blank strings become NaN so that ffill treats them as gaps.
    df[BIDDER_ID] = df[BIDDER_ID].replace('', np.nan).ffill()
    return df

def write_to_results(df: pd.DataFrame | List, name: str, timestamp=None):
    """Write df to RESULTS_PATH as "<name>.csv", or "<timestamp>_<name>.csv" when a timestamp is given.

    A list of row dictionaries is converted to a DataFrame first. The index is not written.
    """
    frame = pd.DataFrame(df) if isinstance(df, list) else df
    filename = f'{timestamp}_{name}.csv' if timestamp else f'{name}.csv'
    frame.to_csv(RESULTS_PATH / filename, index=False)
    

def read_file(filepath: str):
    """Return the whole content of the text file at filepath as one string."""
    # Opens in text mode with the platform default encoding, like open(filepath, 'r').
    return Path(filepath).read_text()


def expand_ranges_in_subcontracted_line_item(line: str) -> str:
    """Expand every "start-end" range in a comma-separated list of item numbers.

    For example: takes a "6-8, 13-15" and converts to "6, 7, 8, 13, 14, 15".

    Args:
        line: Comma-separated item numbers and/or ranges, or a null value (None/NaN).

    Returns:
        The expanded, ", "-joined list of numbers; "" for a null value;
        COULD_NOT_PARSE when any part is not an integer or a two-sided range.
    """
    if pd.isnull(line):
        return ""

    all_numbers = []
    try:
        # Split the string by commas to separate different ranges/groups
        for part in str(line).split(','):
            if '-' in part:
                # int() tolerates surrounding whitespace; anything other than
                # exactly two integers raises ValueError.
                start, end = map(int, part.split('-'))
                # Add all numbers in this range (inclusive) to the list
                all_numbers.extend(range(start, end + 1))
            else:
                # If not a range, just add the single number
                all_numbers.append(int(part.strip()))
    except ValueError:
        # Only parsing failures are reported as unparseable; a bare except
        # would also swallow KeyboardInterrupt and genuine programming errors.
        return COULD_NOT_PARSE

    # Return a comma-separated string of all_numbers
    return ", ".join(map(str, all_numbers))
    

def parse_subcontracted_line_item(df):
    """Split the subcontracted-line-item text into parts and expand its item ranges.

    Adds five columns to df (in place) and returns it:
    - PARSED_1: text before the keyword,
    - PARSED_2: the keyword itself (ITEMS, ITEM NUMBERS or ITEM),
    - PARSED_3: text after the keyword, with THRU replaced by "-" and AND / & by ",",
    - PARSED_4: a trailing parenthesised note, if any,
    - PARSED_5: PARSED_3 with every range expanded, or COULD_NOT_PARSE.

    For example "SOME TEXT ITEMS 6 THRU 8 AND 13 THRU 15 (PARTIALS)" is split into
    "SOME TEXT", "ITEMS", "6 THRU 8 AND 13 THRU 15" and "(PARTIALS)"; the third
    part becomes "6 - 8 , 13 - 15" and is expanded to "6, 7, 8, 13, 14, 15".
    Rows without the keyword get missing values in PARSED_1..PARSED_4 and "" in PARSED_5.

    Args:
        df: DataFrame with a SUBCONTRACTED_LINE_ITEM column.

    Returns:
        The same DataFrame with the PARSED_* columns added.
    """
    split_columns = ['PARSED_1', 'PARSED_2', 'PARSED_3', 'PARSED_4']

    if SUBCONTRACTED_LINE_ITEM not in df.columns:
        # A frame built from zero rows has no columns at all; still add the
        # output columns so callers can filter on them without a KeyError.
        for column in split_columns + ['PARSED_5']:
            df[column] = pd.Series(dtype=object)
        return df

    # split the subcontracted line item into four columns
    df[split_columns] = df[SUBCONTRACTED_LINE_ITEM].str.extract(r"^(.+?)?(ITEMS|ITEM NUMBERS|ITEM)(.+?)(\(.+\))?$")
    # replace the 'THRU' and 'AND' with '-' and ','
    df['PARSED_3'] = df['PARSED_3'].str.replace('THRU', '-', regex=False).str.replace('AND', ',', regex=False).str.replace('&', ',', regex=False)
    # expand all the ranges
    df['PARSED_5'] = df['PARSED_3'].apply(expand_ranges_in_subcontracted_line_item)
    return df


## Test parse_subcontracted_line_item

In [228]:
# Read the example descriptions as a single column (the file has no header row).
# The column is named through the shared constant so this cell keeps working
# if the column name is ever changed.
df = pd.read_csv('subcontracted_line_item_examples.txt', header=None, delimiter="\t", names=[SUBCONTRACTED_LINE_ITEM])
# Adds the PARSED_* columns to df in place.
parse_subcontracted_line_item(df)
df

Unnamed: 0,Subcontracted_Line_Item,Y1,Y2,Y3,Y4,Y5
0,ITEMS 6 THRU 8 AND 13 THRU 15,,ITEMS,"6 - 8 , 13 - 15",,"6, 7, 8, 13, 14, 15"
1,ITEM 12,,ITEM,12,,12
2,ITEMS 1 THRU 3 AND 5,,ITEMS,"1 - 3 , 5",,"1, 2, 3, 5"
3,ITEMS 6 THRU 8 AND 13 THRU 15 (PARTIALS),,ITEMS,"6 - 8 , 13 - 15",(PARTIALS),"6, 7, 8, 13, 14, 15"
4,"ITEMS 6, 7, 8, 13, 14 AND 15",,ITEMS,"6, 7, 8, 13, 14 , 15",,"6, 7, 8, 13, 14, 15"
5,"ITEMS 4, 6 THRU 8, 13 THRU 15",,ITEMS,"4, 6 - 8, 13 - 15",,"4, 6, 7, 8, 13, 14, 15"
6,"ITEMS 6, 7, 8 AND 13 THRU 15",,ITEMS,"6, 7, 8 , 13 - 15",,"6, 7, 8, 13, 14, 15"
7,ITEM 94 (100%),,ITEM,94,(100%),94
8,ITEM LESS GC PROVIDED WATER,,ITEM,LESS GC PROVIDED WATER,,COULD NOT PARSE
9,AS DESCRIBED IN BID ITEM LIST,AS DESCRIBED IN BID,ITEM,LIST,,COULD NOT PARSE


In [229]:
# Keep only the examples whose item list could not be expanded.
df_outlier = df[df['PARSED_5'] == COULD_NOT_PARSE]
df_outlier

Unnamed: 0,Subcontracted_Line_Item,Y1,Y2,Y3,Y4,Y5
8,ITEM LESS GC PROVIDED WATER,,ITEM,LESS GC PROVIDED WATER,,COULD NOT PARSE
9,AS DESCRIBED IN BID ITEM LIST,AS DESCRIBED IN BID,ITEM,LIST,,COULD NOT PARSE


# One sample study

In [230]:
# Choose the single file to study; exactly one assignment should be active.
# The commented alternatives are other sample files, some tagged with the issue number they relate to.
# filepath = RAW_DATA_PATH.parent / 'sample' / '01-0A3804.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A3804.pdf_4353.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0904.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A1204.pdf_11468.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0F4304.pdf_12346.txt'  # issue # 11
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K6104.pdf_12731.txt'  # issue # 9
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K4604.pdf_12040.txt'  # issue # 1
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0H3204.pdf_9871.txt'  # issue # 5
filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0404.pdf_10165.txt'  # issue # 2

In [231]:
# Derive the contract number, tag and identifier from the file stem (name without ".txt").
contract_number_from_filename, tag, identifier = get_contract_number_and_tag_from_filename(filepath.stem)

# Load the whole file as one string for the regex-based extractors.
file_contents = read_file(filepath)

In [232]:
# Show the values parsed from the file name.
contract_number_from_filename, tag, identifier 

('01-0A0404', '10165', '01-0A0404_10165')

Extract contract data:

In [233]:
# One row for the contract itself.
df_contract_data = pd.DataFrame([extract_contract_data(file_contents, identifier)])

# One row per bid.
df_contract_bid_data = pd.DataFrame(extract_contract_bid_data(file_contents, identifier))

# One row per subcontractor entry: fill blank bidder ids, then parse the item lists.
df_bid_subcontractor_data = pd.DataFrame(extract_bid_subcontractor_data(file_contents, identifier))
df_bid_subcontractor_data = fill_gaps_in_bidder_id(df_bid_subcontractor_data)
df_bid_subcontractor_data = parse_subcontracted_line_item(df_bid_subcontractor_data)

# One row per contract line item.
df_contract_line_item_data = pd.DataFrame(extract_contract_line_item_data(file_contents, identifier))

In [234]:
# Show the contract-level row extracted from the sample file.
df_contract_data

Unnamed: 0,Identifier,Postponed_Contract,Bid_Opening_Date,Contract_Date,Contract_Code,Number_of_Contract_Items,Total_Number_of_Working_Days,Number_of_Bidders,Engineers_Est,Amount_Over,Amount_Under,Percent_Est_Over,Percent_Est_Under,Contract_Description
0,01-0A0404_10165,0,09/27/18,09/28/18,A,115,130,7,10366370.0,,2718364.5,,26.22,SHOULDER WIDENING


In [235]:
# Show the bids extracted from the sample file.
df_contract_bid_data

Unnamed: 0,Identifier,Bid_Rank,A_plus_B_indicator,Bid_Total,Bidder_ID,Bidder_Name,Bidder_Phone,Extra,Weird_Contract_Notes,CSLB_Number
0,01-0A0404_10165,1,1,8948005.5,3,GHILOTTI CONSTRUCTION CO. INC.,707 585-1221,,,644515
1,01-0A0404_10165,2,1,9794359.2,6,"O.C. JONES & SONS, INC.",510 526-3424,,,759729
2,01-0A0404_10165,3,1,10220181.75,5,ARGONAUT CONSTRUCTORS,707 542-4862,,,171432
3,01-0A0404_10165,4,1,10837402.25,7,WAHLUND CONSTRUCTION INC.,707 268-0150,,,678993
4,01-0A0404_10165,5,1,11298495.0,4,"STEVE MANNING CONSTRUCTION, INC.",530 222-0810,,,754230
5,01-0A0404_10165,6,1,11436342.95,2,MERCER FRASER COMPANY,707 443-6371,,,105709
6,01-0A0404_10165,7,1,11451749.2,1,GRANITE CONSTRUCTION COMPANY,707 467-4100,,,89


In [236]:
# Show the subcontractor rows, including the PARSED_* columns.
df_bid_subcontractor_data

Unnamed: 0,Identifier,Bidder_ID,Subcontractor_Name,Subcontracted_Line_Item,City,Subcontractor_License_Number,Y1,Y2,Y3,Y4,Y5
0,01-0A0404_10165,03,ABSL CONSTRUCTION,GRINDER RENTAL TO COLD PLANE EXISTING RDWY SEC...,HAYWARD CA,621781,,,,,
1,01-0A0404_10165,03,ABSL CONSTRUCTION,GRINDER RENTAL TO CUT & LOAD EXISTING RDWY SEC...,HAYWARD CA,621781,,,,,
2,01-0A0404_10165,03,ABSL CONSTRUCTION,GRINDER RENTAL TO OBLITERATE EXISTING RDWY SUR...,HAYWARD CA,621781,,,,,
3,01-0A0404_10165,03,ABSL CONSTRUCTION,ITEM 34 (8%),HAYWARD CA,621781,,ITEM,34,(8%),34
4,01-0A0404_10165,03,ABSL CONSTRUCTION,ITEM 51 (75%),HAYWARD CA,621781,,ITEM,51,(75%),51
...,...,...,...,...,...,...,...,...,...,...,...
369,01-0A0404_10165,01,TULLY CONSULTING GROUP,PREPARE SWPPP,DIXON CA,,,,,,
370,01-0A0404_10165,01,TULLY CONSULTING GROUP,STORMWATER SAMPLING/ANALYSIS DAY,DIXON CA,,,,,,
371,01-0A0404_10165,01,TULLY CONSULTING GROUP,WATER QUALITY SAMPLING & ANALYSIS DAY,DIXON CA,,,,,,
372,01-0A0404_10165,01,TYRELL RESOURCES INC.,CLEARING,REDDING CA,938998,,,,,


In [237]:
# Show the line items extracted from the sample file.
# Optional: uncomment the option_context block (and indent the last line under it)
# to display every row and column without truncation.
# with pd.option_context('display.max_rows', None, 
#                        'display.max_columns', None, 
#                        'display.width', None, 
#                        'display.max_colwidth', None):
df_contract_line_item_data

Unnamed: 0,Identifier,Item_Number,Extra,Item_Code,Item_Description,Item_Dollar_Amount
0,01-0A0404_10165,1,,070030,LEAD COMPLIANCE PLAN,765.00
1,01-0A0404_10165,2,,080050,PROGRESS SCHEDULE (CRITICAL PATH METHOD),6000.00
2,01-0A0404_10165,3,,090105,TIME-RELATED OVERHEAD (LS),130000.00
3,01-0A0404_10165,4,,100100,DEVELOP WATER SUPPLY,5000.00
4,01-0A0404_10165,5,,120090,CONSTRUCTION AREA SIGNS,5900.00
...,...,...,...,...,...,...
110,01-0A0404_10165,111,,036442,ABANDON WELL,6000.00
111,01-0A0404_10165,112,,000003,ITEM DELETED PER ADDENDUM,0.00
112,01-0A0404_10165,113,,870600,TRAFFIC MONITORING STATION SYSTEM,86425.00
113,01-0A0404_10165,114,,872140,REMOVING EXISTING ELECTRICAL SYSTEM,2500.00


# Batch run

Here we run a batch or a single file (for example, if you want to run a specific outlier, make `files` a single element list):

In [238]:
# Materialise and sort the listing. A bare glob() generator is exhausted after one
# pass (re-running the batch cell would silently process nothing), gives tqdm no
# total to show, and yields files in no guaranteed order.
files = sorted(RAW_DATA_PATH_LINEPRINTER.glob('*.txt'))
# files = [RAW_DATA_PATH_LINEPRINTER / '01-0F9204.PDF_12364.txt']

In [239]:
# Accumulators for the four output tables, plus the list of files set aside as outliers.
contract_data = []
contract_bid_data = []
bid_subcontractor_data = []
contract_line_item_data = []
other_format = []

for filepath in tqdm(files):

    file_contents = read_file(filepath)

    filename = filepath.stem
    contract_number_from_filename, tag, identifier = get_contract_number_and_tag_from_filename(filename)
    contract_number_from_contents = get_contract_number(file_contents)

    if contract_number_from_filename != contract_number_from_contents:
        # if contract number doesn't match then something is off that needs investigation
        other_format.append({'other_format_filename': filename})

        # let's also copy the pdf to a folder for manual inspection
        source_path = RAW_DATA_PATH_PDF / f'{filename}.pdf'
        destination_path = OUTLIERS_PATH_PDF / f'{filename}.pdf'
        shutil.copy(source_path, destination_path)

        # ... and the text file alongside it
        source_path = RAW_DATA_PATH_LINEPRINTER / f'{filename}.txt'
        destination_path = OUTLIERS_PATH_LINEPRINTER / f'{filename}.txt'
        shutil.copy(source_path, destination_path)
        continue

    # The file name and the file contents agree: extract all four tables.
    contract_data.append(extract_contract_data(file_contents, identifier))
    contract_bid_data.extend(extract_contract_bid_data(file_contents, identifier))
    bid_subcontractor_data.extend(extract_bid_subcontractor_data(file_contents, identifier))
    contract_line_item_data.extend(extract_contract_line_item_data(file_contents, identifier))

370it [00:08, 44.82it/s]


# Save files

Uncomment the timestamp line if you want to save all files with a timestamp prefix:

In [240]:
# Optional prefix for the result file names; None writes plain "<name>.csv" files.
# NOTE(review): the format below contains ':' characters, which are not valid in
# Windows file names - confirm before enabling it there.
# timestamp = datetime.strftime(datetime.now(), '%m-%d-%Y-%H:%M:%S')
timestamp = None

In [241]:
write_to_results(contract_data, "contract_data", timestamp=timestamp)
write_to_results(contract_bid_data, "contract_bid_data", timestamp=timestamp)

# Fill blank bidder ids and parse the item lists before saving the subcontractor table.
df_bid_subcontractor_data = parse_subcontracted_line_item(fill_gaps_in_bidder_id(pd.DataFrame(bid_subcontractor_data)))
# parse_subcontracted_line_item stores the expanded item list in 'PARSED_5'
# (there is no 'Y5' column), so the outliers must be selected on that column.
df_bid_subcontractor_data_could_not_parse = df_bid_subcontractor_data[df_bid_subcontractor_data['PARSED_5'] == COULD_NOT_PARSE]

write_to_results(df_bid_subcontractor_data, "bid_subcontractor_data", timestamp=timestamp)
write_to_results(df_bid_subcontractor_data_could_not_parse, "bid_subcontractor_outliers", timestamp=timestamp)

write_to_results(contract_line_item_data, "contract_line_item_data", timestamp=timestamp)
write_to_results(other_format, "other_format", timestamp=timestamp)

# Save to excel

In [242]:
# Paths to your CSV files
csv_file_paths = RESULTS_PATH.glob('*.csv')

# Path to the output Excel file
excel_file_path = RESULTS_PATH / 'results.xlsx'

# Create a Pandas Excel writer using openpyxl as the engine
with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
    # Iterate over your CSV files
    for csv_file in csv_file_paths:
        # Use Path from pathlib to work with file paths
        csv_path = Path(csv_file)
        
        # Extract the file name without the extension for the sheet name
        sheet_name = csv_path.stem
        
        # Read each CSV file into a DataFrame
        df = pd.read_csv(csv_file)
        
        # Write the DataFrame to a new sheet in the Excel file using the file name as the sheet name
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f'Merged CSV files into {excel_file_path}')

Merged CSV files into results/results.xlsx
