# Caltrans Data Extraction

The goal of this project is to extract relevant data from text files that were previously converted from PDF files. Because the text files are highly structured, the extraction is done with regular expressions (regex).

## Setup

Install the following packages if you don't have them yet:

In [96]:
# pip install pandas numpy tqdm ipykernel notebook python-dotenv openpyxl

In [97]:
from pathlib import Path
from pprint import pprint
from typing import List, Tuple
from collections import defaultdict
import re
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import os
from datetime import datetime
import shutil
from dotenv import load_dotenv

from utils import *

%reload_ext autoreload
%autoreload 2

# pd.set_option('display.max_rows', None)  # optional to see all rows in DataFrames

## Test `parse_subcontracted_line_item`

In [98]:
# Load the example line-item strings (tab-delimited, no header row) into a
# single-column frame, then run them through the line-item parser.
# The parser hands back two frames: the parsed rows and the rows it treats
# as outliers (displayed in the next cell).
examples_raw = pd.read_csv(
    'subcontracted_line_item_examples.txt',
    header=None,
    delimiter="\t",
    names=['Subcontracted_Line_Item'],
)
df, df_outlier = parse_subcontracted_line_item(examples_raw)
df

Unnamed: 0,Subcontracted_Line_Item,PARSED_1,PARSED_2,PARSED_3,PARSED_4,PARSED_5
0,ITEMS 6 THRU 8 AND 13 THRU 15,,ITEMS,"6 - 8 , 13 - 15",,"6, 7, 8, 13, 14, 15"
1,ITEM 12,,ITEM,12,,12
2,ITEMS 1 THRU 3 AND 5,,ITEMS,"1 - 3 , 5",,"1, 2, 3, 5"
3,ITEMS 6 THRU 8 AND 13 THRU 15 (PARTIALS),,ITEMS,"6 - 8 , 13 - 15",(PARTIALS),"6, 7, 8, 13, 14, 15"
4,"ITEMS 6, 7, 8, 13, 14 AND 15",,ITEMS,"6, 7, 8, 13, 14 , 15",,"6, 7, 8, 13, 14, 15"
5,"ITEMS 4, 6 THRU 8, 13 THRU 15",,ITEMS,"4, 6 - 8, 13 - 15",,"4, 6, 7, 8, 13, 14, 15"
6,"ITEMS 6, 7, 8 AND 13 THRU 15",,ITEMS,"6, 7, 8 , 13 - 15",,"6, 7, 8, 13, 14, 15"
7,ITEM 94 (100%),,ITEM,94,(100%),94
8,ITEM LESS GC PROVIDED WATER,,ITEM,LESS GC PROVIDED WATER,,COULD NOT PARSE
9,"WORK AS DESCRIBED BY BID ITEM NUMBERS 11,14,18",WORK AS DESCRIBED BY BID,ITEM NUMBERS,111418,,"11, 14, 18"


In [99]:
# Show the outlier frame returned by the parser for the example strings above.
df_outlier

Unnamed: 0,Subcontracted_Line_Item,PARSED_1,PARSED_2,PARSED_3,PARSED_4,PARSED_5
8,ITEM LESS GC PROVIDED WATER,,ITEM,LESS GC PROVIDED WATER,,COULD NOT PARSE
10,BID ITEMS 108-110 ALL 100%,BID,ITEMS,108-110 ALL 100%,,COULD NOT PARSE
11,WORK AS DESCRIBED BY BID ITEMS LISTED,WORK AS DESCRIBED BY BID,ITEMS,LISTED,,COULD NOT PARSE
12,"BID ITEMS 23-25,27-31,45,68-79 ALL 100%",BID,ITEMS,"23-25,27-31,45,68-79 ALL 100%",,COULD NOT PARSE
14,"BID ITEMS 20-29, EACH 100%",BID,ITEMS,"20-29, EACH 100%",,COULD NOT PARSE
15,"ITEMS 91,92,93,94,95,98 - 100%",,ITEMS,"91,92,93,94,95,98 - 100%",,COULD NOT PARSE
16,"ITEMS 10, 18 & -23",,ITEMS,"10, 18 , -23",,COULD NOT PARSE
17,"ITEMS 2, 3, 4 6 & 5",,ITEMS,"2, 3, 4 6 , 5",,COULD NOT PARSE


# One sample study

In [100]:
# Select the single input file studied in this section. Earlier candidates are
# kept commented out so they can be switched back in quickly; only the last,
# uncommented assignment takes effect.
# NOTE(review): the "issue # N" tags presumably refer to the project's issue
# tracker -- confirm.
# filepath = RAW_DATA_PATH.parent / 'sample' / '01-0A3804.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A3804.pdf_4353.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0904.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A1204.pdf_11468.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0F4304.pdf_12346.txt'  # issue # 11
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K6104.pdf_12731.txt'  # issue # 9
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K4604.pdf_12040.txt'  # issue # 1
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0H3204.pdf_9871.txt'  # issue # 5
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0404.pdf_10165.txt'  # different format
filepath = RAW_DATA_PATH_LINEPRINTER / '04-4G6404.pdf_7310.txt'


In [101]:
# Display the resolved path of the selected sample file.
filepath

PosixPath('RR Procurement - Raw Data-2/Txt files - lineprinter - population/04-4G6404.pdf_7310.txt')

In [102]:
# Keep a copy of the selected sample under test_data/.
# The destination name is derived from `filepath` rather than hard-coded, so
# choosing a different sample above cannot store one file's contents under
# another file's name. The directory is created if it does not exist yet.
os.makedirs('test_data', exist_ok=True)
shutil.copy(filepath, f'test_data/{filepath.name}')

'test_data/04-4G6404.pdf_7310.txt'

In [103]:
# Load the raw text of the selected file, then split the file stem into the
# three naming parts used below. Neither call uses the other's result.
file_contents = read_file(filepath)
(
    contract_number_from_filename,
    tag,
    identifier,
) = get_contract_number_and_tag_from_filename(filepath.stem)

In [104]:
# Inspect the three values derived from the file stem.
contract_number_from_filename, tag, identifier 

('04-4G6404', '7310', '04-4G6404_7310')

Extract contract data:

In [120]:
# Run every extractor over the same file text and wrap each result in a DataFrame.

# Contract-level data: the extractor's single result is wrapped in a list,
# giving a one-row frame.
contract_record = extract_contract_data(file_contents, identifier)
df_contract_data = pd.DataFrame([contract_record])

# Bid-level data.
contract_bid_records = extract_contract_bid_data(file_contents, identifier)
df_contract_bid_data = pd.DataFrame(contract_bid_records)

# Subcontractor data, in three steps:
# build the frame -> fill gaps in bidder id -> parse the line-item text.
# The parser returns the parsed frame and a separate frame of outliers.
subcontractor_frame_raw = pd.DataFrame(extract_bid_subcontractor_data(file_contents, identifier))
subcontractor_frame_filled = fill_gaps_in_bidder_id(subcontractor_frame_raw)
df_bid_subcontractor_data, df_bid_subcontractor_data_outliers = parse_subcontracted_line_item(
    subcontractor_frame_filled
)

# Line-item data.
contract_line_item_records = extract_contract_line_item_data(file_contents, identifier)
df_contract_line_item_data = pd.DataFrame(contract_line_item_records)

In [106]:
# Save the extracted line items under test_data/, named after the input file.
# The stem comes from `filepath` instead of a hard-coded literal, so the CSV
# always matches whichever sample is currently selected.
df_contract_line_item_data.to_csv(f'test_data/{filepath.stem}.csv', index=False)

In [107]:
# Display the one-row frame of contract-level data for the sample file.
df_contract_data

Unnamed: 0,Identifier,Postponed_Contract,Bid_Opening_Date,Contract_Date,Contract_Code,Number_of_Contract_Items,Total_Number_of_Working_Days,Number_of_Bidders,Engineers_Est,Amount_Over,Amount_Under,Percent_Est_Over,Percent_Est_Under,Contract_Description
0,04-4G6404_7310,0,11/03/15,11/05/15,D,121,90,8,4178805.0,,791668.0,,18.94,ROUTES 84/280 SEPARATION CONSTRUCT


In [108]:
# Display the bid-level frame extracted from the sample file.
df_contract_bid_data

Unnamed: 0,Identifier,Bid_Rank,A_plus_B_indicator,Bid_Total,Bidder_ID,Bidder_Name,Bidder_Phone,Extra,Weird_Contract_Notes,CSLB_Number
0,04-4G6404_7310,1,1,3657137.0,3,GHILOTTI CONSTRUCTION CO. INC.,707 585-1221,,,644515
1,04-4G6404_7310,2,1,4039158.0,8,"GORDON N. BALL, INC.",925 838-5675,,,710807
2,04-4G6404_7310,3,1,4189774.0,4,"RGW CONSTRUCTION, INC.",925 606-2400,,,591940
3,04-4G6404_7310,4,1,4298995.26,1,GRANITE CONSTRUCTION,408 327-7013,,,89
4,04-4G6404_7310,5,1,4519443.0,5,GRANITE ROCK COMPANY,408 574-1400,,,22
5,04-4G6404_7310,6,1,4609762.0,2,"DISNEY CONSTRUCTION, INC.",650 259-9545,,,866974
6,04-4G6404_7310,7,1,4768691.0,6,VALENTINE CORPORATION,415 453-3732,,,229225
7,04-4G6404_7310,8,1,5255322.0,7,BUGLER CONSTRUCTION,925 416-0700,,,740863


In [123]:
# Save the subcontractor outlier rows under test_data/, named after the input
# file. The stem comes from `filepath` instead of a hard-coded literal, so the
# CSV always matches whichever sample is currently selected.
df_bid_subcontractor_data_outliers.to_csv(
    f'test_data/{filepath.stem}_subcontractor_data_outliers.csv',
    index=False,
)

In [110]:
# Temporarily lift pandas' display limits (rows, columns, total width, column
# width) so the whole line-item frame is rendered; the previous settings are
# restored when the `with` block exits.
unlimited_display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)
with pd.option_context(*unlimited_display_options):
    display(df_contract_line_item_data)

Unnamed: 0,Identifier,Item_Number,Extra,Item_Code,Item_Description,Extra1,Item_Dollar_Amount
0,04-4G6404_7310,1,,70030,LEAD COMPLIANCE PLAN,"LS LUMP SUM 1,500.00",1500.0
1,04-4G6404_7310,2,,120090,CONSTRUCTION AREA SIGNS,"LS LUMP SUM 5,000.00",5000.0
2,04-4G6404_7310,3,,120100,TRAFFIC CONTROL SYSTEM,"LS LUMP SUM 35,000.00",35000.0
3,04-4G6404_7310,4,,120159,TEMPORARY TRAFFIC STRIPE (PAINT),LF 670 1.80,1206.0
4,04-4G6404_7310,5,,120165,CHANNELIZER (SURFACE MOUNTED),EA 12 45.00,540.0
5,04-4G6404_7310,6,,128652,PORTABLE CHANGEABLE MESSAGE SIGN (LS),"LS LUMP SUM 14,000.00",14000.0
6,04-4G6404_7310,7,,129000,TEMPORARY RAILING (TYPE K),LF 700 30.00,21000.0
7,04-4G6404_7310,8,,29897,TEMPORARY ALTERNATIVE CRASH CUSHION,"EA 1 5,000.00",5000.0
8,04-4G6404_7310,9,,129150,TEMPORARY TRAFFIC SCREEN,LF 700 8.00,5600.0
9,04-4G6404_7310,10,,130100,JOB SITE MANAGEMENT,"LS LUMP SUM 10,000.00",10000.0


# Batch run

In [111]:
# Collect every .txt file to process in the batch run.
# `glob` returns a one-shot generator: once consumed, running `run_batch(files)`
# again without re-running this cell would silently iterate over nothing.
# `sorted` turns it into a list, which can be iterated repeatedly, has a known
# length, and gives a reproducible processing order.
files = sorted(RAW_DATA_PATH_LINEPRINTER.glob('*.txt'))
# files = [RAW_DATA_PATH_LINEPRINTER/'01-0F9204.PDF_12364.txt']

In [112]:
# Process every collected file.
# NOTE(review): `run_batch` is not defined in this notebook; presumably it runs
# the extractors on each file and writes the results out -- confirm where.
run_batch(files)

8977it [03:43, 40.13it/s]


In [113]:
# Sample "subcontracted line item" strings, one per line, covering regular and
# irregular phrasings. Leading/trailing whitespace and repeated spaces inside
# some lines are kept deliberately as part of the literal -- do not reformat.
raw_item_text = """
    ITEMS 6 THRU 8 AND 13 THRU 15
ITEM 12
ITEMS 1 THRU 3 AND 5 
ITEMS 6 THRU 8 AND 13 THRU 15 (PARTIALS)
ITEMS 6, 7, 8, 13, 14 AND 15
ITEMS 4, 6 THRU 8, 13 THRU 15
ITEMS 6, 7, 8 AND 13 THRU 15
ITEM     94   (100%)
ITEM LESS GC PROVIDED WATER
WORK AS DESCRIBED BY BID ITEM NUMBERS 11,14,18
BID ITEMS 108-110 ALL 100%
WORK AS DESCRIBED BY BID ITEMS LISTED
BID ITEMS 23-25,27-31,45,68-79 ALL 100%
WORK AS DESCRIBED IN BID ITEM(S): 24, 25, 26, 27, 28
BID ITEMS 20-29, EACH 100%
ITEMS 91,92,93,94,95,98 - 100%
ITEMS 10, 18 & -23
ITEMS 2, 3, 4  6 & 5
QUICK SAND 
SOMETHING ELSE
"""

In [114]:
# Split the sample text into one string per line. Because the literal starts
# and ends with a newline, the first and last elements are empty strings, and
# each line keeps its original surrounding whitespace.
raw_items = raw_item_text.split('\n')

In [115]:
# Display the resulting list of sample strings.
raw_items


['',
 '    ITEMS 6 THRU 8 AND 13 THRU 15',
 'ITEM 12',
 'ITEMS 1 THRU 3 AND 5 ',
 'ITEMS 6 THRU 8 AND 13 THRU 15 (PARTIALS)',
 'ITEMS 6, 7, 8, 13, 14 AND 15',
 'ITEMS 4, 6 THRU 8, 13 THRU 15',
 'ITEMS 6, 7, 8 AND 13 THRU 15',
 'ITEM     94   (100%)',
 'ITEM LESS GC PROVIDED WATER',
 'WORK AS DESCRIBED BY BID ITEM NUMBERS 11,14,18',
 'BID ITEMS 108-110 ALL 100%',
 'WORK AS DESCRIBED BY BID ITEMS LISTED',
 'BID ITEMS 23-25,27-31,45,68-79 ALL 100%',
 'WORK AS DESCRIBED IN BID ITEM(S): 24, 25, 26, 27, 28',
 'BID ITEMS 20-29, EACH 100%',
 'ITEMS 91,92,93,94,95,98 - 100%',
 'ITEMS 10, 18 & -23',
 'ITEMS 2, 3, 4  6 & 5',
 'QUICK SAND ',
 'SOMETHING ELSE',
 '']

# Save to Excel

In [17]:
# Merge every CSV in RESULTS_PATH into a single Excel workbook, one sheet per CSV.

# `glob` yields a one-shot generator in arbitrary order; `sorted` materialises
# it so the sheet order is reproducible and tqdm can show a total.
csv_file_paths = sorted(RESULTS_PATH.glob('*.csv'))

# Path to the output Excel file
excel_file_path = RESULTS_PATH / 'results.xlsx'

# Create a Pandas Excel writer using openpyxl as the engine; the context
# manager saves and closes the workbook on exit.
with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
    for csv_path in tqdm(csv_file_paths):
        # `glob` already yields Path objects, so no re-wrapping is needed.
        # The sheet is named after the file name without its extension.
        # NOTE(review): Excel limits sheet names to 31 characters -- confirm
        # that all CSV stems stay within that limit.
        sheet_name = csv_path.stem

        # A dedicated name is used for the frame so this loop does not
        # overwrite the `df` defined earlier in the notebook.
        sheet_frame = pd.read_csv(csv_path)

        # Write the frame to its own sheet, without the index column.
        sheet_frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f'Merged CSV files into {excel_file_path}')

7it [00:54,  7.75s/it]


Merged CSV files into results/results.xlsx
