# Caltrans Data Extraction

The goal of this project is to extract relevant data from text files, previously converted from PDF files. Since the text files are quite structured, the decision is to use regex to do this.

## Setup

Install the following packages if you don't have them yet:

In [1]:
# pip install pandas numpy tqdm ipykernel notebook python-dotenv openpyxl

In [16]:
from pathlib import Path
from pprint import pprint
from typing import List, Tuple
from collections import defaultdict
import re
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import os
from datetime import datetime
import shutil
from dotenv import load_dotenv

from utils1 import *

%reload_ext autoreload
%autoreload 2

# pd.set_option('display.max_rows', None)  # optional to see all rows in DataFrames

## Test parse_subcontracted_line_item

In [17]:
df = pd.read_csv('sample/subcontracted_line_item_examples.txt', header=None, delimiter="\t", names=['Subcontracted_Line_Item'])
df, df_outlier = parse_subcontracted_line_item(df)
df

FileNotFoundError: [Errno 2] No such file or directory: 'sample/subcontracted_line_item_examples.txt'

In [3]:
df_outlier

Unnamed: 0,Subcontracted_Line_Item,PARSED_1,PARSED_2,PARSED_3,PARSED_4,PARSED_5
8,ITEM LESS GC PROVIDED WATER,,ITEM,LESS GC PROVIDED WATER,,COULD NOT PARSE
10,BID ITEMS 108-110 ALL 100%,BID,ITEMS,108-110 ALL 100%,,COULD NOT PARSE
11,WORK AS DESCRIBED BY BID ITEMS LISTED,WORK AS DESCRIBED BY BID,ITEMS,LISTED,,COULD NOT PARSE
12,"BID ITEMS 23-25,27-31,45,68-79 ALL 100%",BID,ITEMS,"23-25,27-31,45,68-79 ALL 100%",,COULD NOT PARSE
14,"BID ITEMS 20-29, EACH 100%",BID,ITEMS,"20-29, EACH 100%",,COULD NOT PARSE
15,"ITEMS 91,92,93,94,95,98 - 100%",,ITEMS,"91,92,93,94,95,98 - 100%",,COULD NOT PARSE
16,"ITEMS 10, 18 & -23",,ITEMS,"10, 18 , -23",,COULD NOT PARSE
17,"ITEMS 2, 3, 4 6 & 5",,ITEMS,"2, 3, 4 6 , 5",,COULD NOT PARSE


# One sample study

In [11]:
# filepath = RAW_DATA_PATH.parent / 'sample' / '01-0A3804.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A3804.pdf_4353.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0904.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A1204.pdf_11468.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0F4304.pdf_12346.txt'  # issue # 11
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K6104.pdf_12731.txt'  # issue # 9
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K4604.pdf_12040.txt'  # issue # 1
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0H3204.pdf_9871.txt'  # issue # 5
filepath = RAW_DATA_PATH_TABLE / '01-0E7904.pdf_12975.txt'  # issue # 2

In [12]:
contract_number_from_filename, tag, identifier = get_contract_number_and_tag_from_filename(filepath.stem)
file_contents = read_file(filepath)

In [13]:
contract_number_from_filename, tag, identifier 

('01-0E7904', '12975', '01-0E7904_12975')

Extract contract data:

In [18]:
get_contract_number_and_date(file_contents)

('01-0E7904', '11/16/2022')

In [14]:
df_contract_data = pd.DataFrame([extract_contract_data(file_contents, identifier)])
df_contract_bid_data = pd.DataFrame(extract_contract_bid_data(file_contents, identifier))
df_bid_subcontractor_data = parse_subcontracted_line_item(fill_gaps_in_bidder_id(pd.DataFrame(extract_bid_subcontractor_data(file_contents, identifier))))
df_contract_line_item_data = pd.DataFrame(extract_contract_line_item_data(file_contents, identifier))

AttributeError: 'NoneType' object has no attribute 'group'

In [8]:
df_contract_data

Unnamed: 0,Identifier,Postponed_Contract,Bid_Opening_Date,Contract_Date,Contract_Code,Number_of_Contract_Items,Total_Number_of_Working_Days,Number_of_Bidders,Engineers_Est,Amount_Over,Amount_Under,Percent_Est_Over,Percent_Est_Under,Contract_Description
0,01-0A0404_10165,0,09/27/18,09/28/18,A,115,130,7,10366370.0,,2718364.5,,26.22,SHOULDER WIDENING


In [9]:
df_contract_bid_data

Unnamed: 0,Identifier,Bid_Rank,A_plus_B_indicator,Bid_Total,Bidder_ID,Bidder_Name,Bidder_Phone,Extra,Weird_Contract_Notes,CSLB_Number
0,01-0A0404_10165,1,1,8948005.5,3,GHILOTTI CONSTRUCTION CO. INC.,707 585-1221,,,644515
1,01-0A0404_10165,2,1,9794359.2,6,"O.C. JONES & SONS, INC.",510 526-3424,,,759729
2,01-0A0404_10165,3,1,10220181.75,5,ARGONAUT CONSTRUCTORS,707 542-4862,,,171432
3,01-0A0404_10165,4,1,10837402.25,7,WAHLUND CONSTRUCTION INC.,707 268-0150,,,678993
4,01-0A0404_10165,5,1,11298495.0,4,"STEVE MANNING CONSTRUCTION, INC.",530 222-0810,,,754230
5,01-0A0404_10165,6,1,11436342.95,2,MERCER FRASER COMPANY,707 443-6371,,,105709
6,01-0A0404_10165,7,1,11451749.2,1,GRANITE CONSTRUCTION COMPANY,707 467-4100,,,89


In [10]:
df_bid_subcontractor_data

(          Identifier Bidder_ID  ... PARSED_4 PARSED_5
 0    01-0A0404_10165        03  ...      NaN         
 1    01-0A0404_10165        03  ...      NaN         
 2    01-0A0404_10165        03  ...      NaN         
 3    01-0A0404_10165        03  ...     (8%)       34
 4    01-0A0404_10165        03  ...    (75%)       51
 ..               ...       ...  ...      ...      ...
 369  01-0A0404_10165        01  ...      NaN         
 370  01-0A0404_10165        01  ...      NaN         
 371  01-0A0404_10165        01  ...      NaN         
 372  01-0A0404_10165        01  ...      NaN         
 373  01-0A0404_10165        01  ...   (100%)       33
 
 [374 rows x 11 columns],
 Empty DataFrame
 Columns: [Identifier, Bidder_ID, Subcontractor_Name, Subcontracted_Line_Item, City, Subcontractor_License_Number, PARSED_1, PARSED_2, PARSED_3, PARSED_4, PARSED_5]
 Index: [])

In [11]:
# with pd.option_context('display.max_rows', None, 
#                        'display.max_columns', None, 
#                        'display.width', None, 
#                        'display.max_colwidth', None):
df_contract_line_item_data

Unnamed: 0,Identifier,Item_Number,Extra,Item_Code,Item_Description,Item_Dollar_Amount
0,01-0A0404_10165,1,,070030,LEAD COMPLIANCE PLAN,765.00
1,01-0A0404_10165,2,,080050,PROGRESS SCHEDULE (CRITICAL PATH METHOD),6000.00
2,01-0A0404_10165,3,,090105,TIME-RELATED OVERHEAD (LS),130000.00
3,01-0A0404_10165,4,,100100,DEVELOP WATER SUPPLY,5000.00
4,01-0A0404_10165,5,,120090,CONSTRUCTION AREA SIGNS,5900.00
...,...,...,...,...,...,...
110,01-0A0404_10165,111,,036442,ABANDON WELL,6000.00
111,01-0A0404_10165,112,,000003,ITEM DELETED PER ADDENDUM,0.00
112,01-0A0404_10165,113,,870600,TRAFFIC MONITORING STATION SYSTEM,86425.00
113,01-0A0404_10165,114,,872140,REMOVING EXISTING ELECTRICAL SYSTEM,2500.00


# Batch run

Here we run a batch or a single file (for example, if you want to run a specific outlier, make `files` a single element list):

In [None]:
files = (RAW_DATA_PATH/'Txt files - lineprinter').glob('*.txt')
# files = [RAW_DATA_PATH/'Txt files - lineprinter'/'01-0F9204.PDF_12364.txt']

In [None]:
contract_data = []
contract_bid_data = []
bid_subcontractor_data = []
contract_line_item_data = []
other_format = []

for filepath in tqdm(files):
    
    file_contents = read_file(filepath)
        
    filename = filepath.stem
    contract_number_from_filename, tag, identifier = get_contract_number_and_tag_from_filename(filename)
    contract_number_from_contents = get_contract_number(file_contents)
    
    if contract_number_from_filename == contract_number_from_contents:  
        contract_data.append(extract_contract_data(file_contents, identifier))
        contract_bid_data.extend(extract_contract_bid_data(file_contents, identifier))
        bid_subcontractor_data.extend(extract_bid_subcontractor_data(file_contents, identifier))
        contract_line_item_data.extend(extract_contract_line_item_data(file_contents, identifier))
    else:
        # if contract number doesn't match then something is off that needs investigation
        other_format.append({'other_format_filename': filename})
        # let's also copy the pdf to a folder for manual inspection
        source_path = RAW_DATA_PATH_PDF / f'{filename}.pdf'
        destination_path = OUTLIERS_PATH_PDF / f'{filename}.pdf'
        shutil.copy(source_path, destination_path)
        
        source_path = RAW_DATA_PATH_LINEPRINTER / f'{filename}.txt'
        destination_path = OUTLIERS_PATH_LINEPRINTER / f'{filename}.txt'
        shutil.copy(source_path, destination_path)

# Save files

Uncomment the timestamp line if you want to save all files with the timstampt prefix:

In [None]:
# timestamp = datetime.strftime(datetime.now(), '%m-%d-%Y-%H:%M:%S')
timestamp = None

In [None]:
write_to_results(contract_data, "contract_data", timestamp=timestamp)
write_to_results(contract_bid_data, "contract_bid_data", timestamp=timestamp)

df_bid_subcontractor_data, df_bid_subcontractor_data_could_not_parse = parse_subcontracted_line_item(
    fill_gaps_in_bidder_id(pd.DataFrame(bid_subcontractor_data)))

write_to_results(df_bid_subcontractor_data, "bid_subcontractor_data", timestamp=timestamp)
write_to_results(df_bid_subcontractor_data_could_not_parse, "bid_subcontractor_outliers", timestamp=timestamp)

write_to_results(contract_line_item_data, "contract_line_item_data", timestamp=timestamp)
write_to_results(other_format, "other_format", timestamp=timestamp)

# Save to excel

In [None]:
# Paths to your CSV files
csv_file_paths = RESULTS_PATH.glob('*.csv')

# Path to the output Excel file
excel_file_path = RESULTS_PATH / 'results.xlsx'

# Create a Pandas Excel writer using openpyxl as the engine
with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
    # Iterate over your CSV files
    for csv_file in csv_file_paths:
        # Use Path from pathlib to work with file paths
        csv_path = Path(csv_file)
        
        # Extract the file name without the extension for the sheet name
        sheet_name = csv_path.stem
        
        # Read each CSV file into a DataFrame
        df = pd.read_csv(csv_file)
        
        # Write the DataFrame to a new sheet in the Excel file using the file name as the sheet name
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f'Merged CSV files into {excel_file_path}')