# Caltrans Data Extraction

The goal of this project is to extract the relevant data from text files that were previously converted from PDF files. Because the text files are highly structured, the extraction is done with regular expressions (regex).

## Setup

Install the following packages if you don't have them yet:

In [2]:
# pip install pandas numpy tqdm ipykernel notebook python-dotenv openpyxl

In [3]:
from pathlib import Path
from pprint import pprint
from typing import List, Tuple
from collections import defaultdict
import re
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import os
from datetime import datetime
import shutil
from dotenv import load_dotenv

from utils import *

%reload_ext autoreload
%autoreload 2

# pd.set_option('display.max_rows', None)  # optional to see all rows in DataFrames

# One sample study

In [4]:
# Select the single sample file inspected in this section: exactly one
# `filepath = ...` line should be active; the others are kept as commented-out
# alternatives to switch to.
# NOTE(review): the "issue # N" remarks presumably point at tracker issues that
# the corresponding file reproduces — confirm.
# filepath = RAW_DATA_PATH.parent / 'sample' / '01-0A3804.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A3804.pdf_4353.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0904.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A1204.pdf_11468.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0F4304.pdf_12346.txt'  # issue # 11
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K6104.pdf_12731.txt'  # issue # 9
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K4604.pdf_12040.txt'  # issue # 1
filepath = RAW_DATA_PATH_LINEPRINTER / '01-0H3204.pdf_9871.txt'  # issue # 5
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0404.pdf_10165.txt'  # different format
# filepath = RAW_DATA_PATH_LINEPRINTER / '04-4G6404.pdf_7310.txt'


In [5]:
# Derive three values from the file stem: the contract number, the tag, and an
# identifier (for this sample: '01-0H3204', '9871', '01-0H3204_9871').
contract_number_from_filename, tag, identifier = get_contract_number_and_tag_from_filename(filepath.stem)
# NOTE(review): read_file comes from utils; presumably it returns the file's
# text as a single string — confirm.
file_contents = read_file(filepath)

In [6]:
# Bare last expression: the notebook renders the three parsed values as a tuple.
contract_number_from_filename, tag, identifier 

('01-0H3204', '9871', '01-0H3204_9871')

Extract contract data:

In [7]:
# Contract-level fields: the extractor's single result is wrapped in a list so
# the frame has exactly one row.
df_contract_data = pd.DataFrame(
    [extract_contract_data(file_contents, identifier)]
)

# Bid records for this contract.
df_contract_bid_data = pd.DataFrame(
    extract_contract_bid_data(file_contents, identifier)
)

# Subcontractor records: build the raw frame, fill the gaps in the bidder ID,
# then parse the subcontracted line items, which yields a (data, outliers) pair.
df_bid_subcontractor_data, df_bid_subcontractor_data_outliers = (
    pd.DataFrame(extract_bid_subcontractor_data(file_contents, identifier))
    .pipe(fill_gaps_in_bidder_id)
    .pipe(parse_subcontracted_line_item)
)

# Line-item records for this contract.
df_contract_line_item_data = pd.DataFrame(
    extract_contract_line_item_data(file_contents, identifier)
)

In [8]:
# Display the one-row contract-level frame for the sample file.
df_contract_data

Unnamed: 0,Identifier,Postponed_Contract,Bid_Opening_Date,Contract_Date,Contract_Code,Number_of_Contract_Items,Total_Number_of_Working_Days,Number_of_Bidders,Engineers_Est,Amount_Over,Amount_Under,Percent_Est_Over,Percent_Est_Under,Contract_Description
0,01-0H3204_9871,0,05/17/18,05/21/18,B,45,40,2,4289668.0,1048840.75,,24.45,,OVERLAY


In [9]:
# Display the bid records extracted from the sample file.
df_contract_bid_data

Unnamed: 0,Identifier,Bid_Rank,A_plus_B_indicator,Bid_Total,Bidder_ID,Bidder_Name,Bidder_Phone,Extra,Weird_Contract_Notes,CSLB_Number
0,01-0H3204_9871,1,0,5338508.75,1,MERCER FRASER COMPANY,707 443-6371,,,105709
1,01-0H3204_9871,2,0,6466644.0,2,"TELFER PAVEMENT TECHNOLOGIES, LLC",916 383-1756,,,1005314


In [10]:
# Temporarily lift every pandas display limit (rows, columns, total width and
# per-cell width) so the line-item frame is rendered in full; the previous
# settings are restored when the `with` block exits.
unrestricted_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)
with pd.option_context(*unrestricted_display):
    display(df_contract_line_item_data)

Unnamed: 0,Identifier,Item_Number,Extra,Item_Code,Item_Description,Extra1,Item_Dollar_Amount
0,01-0H3204_9871,1,,70030,LEAD COMPLIANCE PLAN,"LS LUMP SUM 9,500.00",9500.0
1,01-0H3204_9871,2,,120090,CONSTRUCTION AREA SIGNS,"LS LUMP SUM 25,000.00",25000.0
2,01-0H3204_9871,3,,120100,TRAFFIC CONTROL SYSTEM,"LS LUMP SUM 450,000.00",450000.0
3,01-0H3204_9871,4,,128652,PORTABLE CHANGEABLE MESSAGE SIGN (LS),"LS LUMP SUM 35,000.00",35000.0
4,01-0H3204_9871,5,,130100,JOB SITE MANAGEMENT,"LS LUMP SUM 80,000.00",80000.0
5,01-0H3204_9871,6,,130200,PREPARE WATER POLLUTION CONTROL PROGRAM,"LS LUMP SUM 9,500.00",9500.0
6,01-0H3204_9871,7,,130900,TEMPORARY CONCRETE WASHOUT,"LS LUMP SUM 2,500.00",2500.0
7,01-0H3204_9871,8,,141120,TREATED WOOD WASTE,"LB 247,000 .30",74100.0
8,01-0H3204_9871,9,,190185,SHOULDER BACKING,TON 180 100.00,18000.0
9,01-0H3204_9871,10,,198011,IMPORTED BORROW (TON),TON 410 88.00,36080.0


# Batch run

In [151]:
# Collect every converted text file for the batch run.
# The glob is materialized into a sorted list on purpose: a bare `glob()` result
# is a one-shot generator, so re-running the batch cell without re-running this
# one would silently iterate over nothing. A list can also be iterated
# repeatedly, has a length, and gives a deterministic processing order.
files = sorted(RAW_DATA_PATH_LINEPRINTER.glob('*.txt'))
# To debug a single file, pass a one-element list instead:
# files = [RAW_DATA_PATH_LINEPRINTER/'01-0F9204.PDF_12364.txt']

In [152]:
# Run the batch extraction over everything in `files`.
# NOTE(review): run_batch is imported from utils; presumably it writes the CSV
# files that the "Save to excel" section later reads from RESULTS_PATH — confirm.
run_batch(files)

8977it [03:25, 43.60it/s]


# Save to excel

In [153]:
# Merge every CSV in RESULTS_PATH into one Excel workbook, one sheet per CSV.

# Materialize and sort the glob so the sheet order is deterministic and the
# progress bar knows how many files there are.
csv_file_paths = sorted(RESULTS_PATH.glob('*.csv'))

# Fail early with a clear message: if no sheet is ever written, saving the
# openpyxl workbook on exit of the `with` block fails with an unhelpful error.
if not csv_file_paths:
    raise FileNotFoundError(f'No CSV files found in {RESULTS_PATH}')

# Path to the output Excel file
excel_file_path = RESULTS_PATH / 'results.xlsx'

# Create a Pandas Excel writer using openpyxl as the engine; the context manager
# saves and closes the workbook when the block exits.
with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
    for csv_file in tqdm(csv_file_paths):
        # The sheet is named after the CSV file name without its extension.
        # NOTE(review): Excel limits sheet names to 31 characters — confirm the
        # CSV file names stay within that.
        sheet_name = csv_file.stem

        # Read the CSV and write it to its own sheet, without the index column.
        df = pd.read_csv(csv_file)
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f'Merged CSV files into {excel_file_path}')

7it [00:53,  7.60s/it]


Merged CSV files into results/results.xlsx
