# Caltrans Data Extraction

The goal of this project is to extract relevant data from text files that were previously converted from PDF files. Because the text files are highly structured, the extraction is done with regular expressions.

## Setup

Install the following packages if you don't have them yet:

In [None]:
# pip install pandas numpy tqdm ipykernel notebook python-dotenv openpyxl

In [None]:
from pathlib import Path
from pprint import pprint
from typing import List, Tuple
from collections import defaultdict
import re
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import os
from datetime import datetime
import shutil
from dotenv import load_dotenv

from utils import *

%reload_ext autoreload
%autoreload 2

# pd.set_option('display.max_rows', None)  # optional to see all rows in DataFrames

## Test parse_subcontracted_line_item

In [None]:
# Load the example line items: tab-separated, no header row, a single column.
raw_examples = pd.read_csv(
    'subcontracted_line_item_examples.txt',
    sep="\t",
    header=None,
    names=['Subcontracted_Line_Item'],
)

# The parser hands back two frames; the second one is displayed in the next cell.
df, df_outlier = parse_subcontracted_line_item(raw_examples)
df

In [None]:
df_outlier

# One sample study

In [None]:
# Choose the single file to study. Keep exactly one `filepath = ...` line
# uncommented; the commented alternatives are other samples, some of them
# tagged with the issue number they relate to.
# filepath = RAW_DATA_PATH.parent / 'sample' / '01-0A3804.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A3804.pdf_4353.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0904.pdf_2724.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A1204.pdf_11468.txt'
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0F4304.pdf_12346.txt'  # issue # 11
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K6104.pdf_12731.txt'  # issue # 9
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0K4604.pdf_12040.txt'  # issue # 1
# filepath = RAW_DATA_PATH_LINEPRINTER / '01-0H3204.pdf_9871.txt'  # issue # 5
filepath = RAW_DATA_PATH_LINEPRINTER / '01-0A0404.pdf_10165.txt'  # different format

In [None]:
# Derive three values from the file name (without its extension); `identifier`
# is passed to every extract_* call in the cells below.
contract_number_from_filename, tag, identifier = get_contract_number_and_tag_from_filename(filepath.stem)
# Read the sample file; presumably returns its text content (see utils.read_file).
file_contents = read_file(filepath)

In [None]:
contract_number_from_filename, tag, identifier 

Extract contract data:

In [None]:
# Build one DataFrame per output table for the sample file.
# extract_contract_data yields a single record, hence the wrapping list.
df_contract_data = pd.DataFrame([extract_contract_data(file_contents, identifier)])
df_contract_bid_data = pd.DataFrame(extract_contract_bid_data(file_contents, identifier))

# parse_subcontracted_line_item returns a pair (parsed rows, rows it could not
# parse), as the test and save cells of this notebook show. Unpack both so that
# df_bid_subcontractor_data is a DataFrame rather than a tuple.
df_bid_subcontractor_data, df_bid_subcontractor_outliers = parse_subcontracted_line_item(
    fill_gaps_in_bidder_id(pd.DataFrame(extract_bid_subcontractor_data(file_contents, identifier))))

df_contract_line_item_data = pd.DataFrame(extract_contract_line_item_data(file_contents, identifier))

In [None]:
df_contract_data

In [None]:
df_contract_bid_data

In [None]:
df_bid_subcontractor_data

In [None]:
# with pd.option_context('display.max_rows', None, 
#                        'display.max_columns', None, 
#                        'display.width', None, 
#                        'display.max_colwidth', None):
df_contract_line_item_data

# Batch run

Here we run either the whole batch or a single file. For example, to re-run one specific outlier, make `files` a single-element list:

In [None]:
# Path.glob returns a one-shot generator. Materialise it as a sorted list so
# that the batch cell can be re-run without re-running this cell, tqdm can
# show a total, and files are processed in a reproducible order.
files = sorted((RAW_DATA_PATH/'Txt files - lineprinter').glob('*.txt'))
# files = [RAW_DATA_PATH/'Txt files - lineprinter'/'01-0F9204.PDF_12364.txt']

In [None]:
# Accumulators for everything extracted during the batch run.
contract_data, contract_bid_data = [], []
bid_subcontractor_data, contract_line_item_data = [], []
other_format = []  # files whose contract number does not match their name

for filepath in tqdm(files):
    file_contents = read_file(filepath)

    filename = filepath.stem
    contract_number_from_filename, tag, identifier = get_contract_number_and_tag_from_filename(filename)
    contract_number_from_contents = get_contract_number(file_contents)

    if contract_number_from_filename != contract_number_from_contents:
        # A mismatch means the file needs investigation: record it and copy
        # the PDF plus both text renderings to the outlier folders for manual
        # inspection.
        other_format.append({'other_format_filename': filename})
        outlier_copies = (
            (RAW_DATA_PATH_PDF, OUTLIERS_PATH_PDF, f'{filename}.pdf'),
            (RAW_DATA_PATH_LINEPRINTER, OUTLIERS_PATH_LINEPRINTER, f'{filename}.txt'),
            (RAW_DATA_PATH_TABLE, OUTLIERS_PATH_TABLE, f'{filename}.txt'),
        )
        for source_dir, outlier_dir, copy_name in outlier_copies:
            shutil.copy(source_dir / copy_name, outlier_dir / copy_name)
        continue

    # Contract numbers agree: extract all four tables from this file.
    contract_data.append(extract_contract_data(file_contents, identifier))
    contract_bid_data.extend(extract_contract_bid_data(file_contents, identifier))
    bid_subcontractor_data.extend(extract_bid_subcontractor_data(file_contents, identifier))
    contract_line_item_data.extend(extract_contract_line_item_data(file_contents, identifier))
        
        
        

# Save files

Uncomment the timestamp line if you want to save all files with a timestamp prefix:

In [None]:
# Value passed as `timestamp=` to every write_to_results call in the next cell.
# Keep it as None for the default naming, or uncomment the line below to use
# the current date and time instead.
# NOTE(review): the format below contains ':' characters, which are not valid
# in Windows file names; if the value ends up in a file name, a format such as
# '%m-%d-%Y-%H-%M-%S' would be safer. Confirm how write_to_results uses it.
# timestamp = datetime.strftime(datetime.now(), '%m-%d-%Y-%H:%M:%S')
timestamp = None

In [None]:
# Tables that need no post-processing are written first.
for result_rows, result_name in (
    (contract_data, "contract_data"),
    (contract_bid_data, "contract_bid_data"),
):
    write_to_results(result_rows, result_name, timestamp=timestamp)

# Subcontractor rows go through gap filling and line-item parsing, which
# splits them into the parsed rows and the rows that could not be parsed.
raw_subcontractors = pd.DataFrame(bid_subcontractor_data)
gap_filled_subcontractors = fill_gaps_in_bidder_id(raw_subcontractors)
df_bid_subcontractor_data, df_bid_subcontractor_data_could_not_parse = parse_subcontracted_line_item(
    gap_filled_subcontractors)

# Remaining tables, written in the same order as before.
for result_rows, result_name in (
    (df_bid_subcontractor_data, "bid_subcontractor_data"),
    (df_bid_subcontractor_data_could_not_parse, "bid_subcontractor_outliers"),
    (contract_line_item_data, "contract_line_item_data"),
    (other_format, "other_format"),
):
    write_to_results(result_rows, result_name, timestamp=timestamp)

# Save to excel

In [None]:
# Paths to your CSV files
csv_file_paths = RESULTS_PATH.glob('*.csv')

# Path to the output Excel file
excel_file_path = RESULTS_PATH / 'results.xlsx'

# Create a Pandas Excel writer using openpyxl as the engine
with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
    # Iterate over your CSV files
    for csv_file in csv_file_paths:
        # Use Path from pathlib to work with file paths
        csv_path = Path(csv_file)
        
        # Extract the file name without the extension for the sheet name
        sheet_name = csv_path.stem
        
        # Read each CSV file into a DataFrame
        df = pd.read_csv(csv_file)
        
        # Write the DataFrame to a new sheet in the Excel file using the file name as the sheet name
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f'Merged CSV files into {excel_file_path}')