## Type 3 contracts

Some contracts are merged from several contracts (let's call these type 3 contracts), for example 07-338004_11638 is merged from 5 contracts:

In [14]:
from contract import *
from experiment import *

import pyperclip

%reload_ext autoreload
%autoreload 2

In [15]:
c = Contract('07-338004_11638')
contract_number_regex = re.compile(r"CONTRACT NUMBER\s+([A-Za-z0-9-]+)")
print(re.findall(contract_number_regex, c.file_contents))
# pyperclip.copy(c.file_contents)

['07-338004', '01-0G4304', '05-1G9904', '05-1H0604', '07-338004']


So we can look into header and split all the contracts:

In [16]:
filepaths = get_some_contracts(num_contracts=None)
multiples = []
for filepath in tqdm(filepaths):
    regex = r'STATE OF CALIFORNIA\s+B I D   S U M M A R Y\s+DEPARTMENT OF TRANSPORTATION'  # this is very fast regex
    contract = Contract(filepath)
    matches = re.findall(regex, contract.file_contents, re.MULTILINE)
    if len(matches) > 1:
        multiples.append(contract.identifier)
len(multiples)
    

  0%|          | 0/8809 [00:00<?, ?it/s]

100%|██████████| 8809/8809 [00:01<00:00, 4800.18it/s]


34

And there are 34 contracts with multiple contracts. Let's write a method to parse them:

In [30]:
def split_contract(identifier, output_path):
    """
    Uses phrase in the header to split the contract into multiple partial_texts, then saves those into separate files where new 
    name is the contract number + tag + index.
    """
    c = Contract(identifier)
    pattern = re.compile(r'[^\n]*STATE OF CALIFORNIA\s+B I D   S U M M A R Y\s+DEPARTMENT OF TRANSPORTATION')
    matches = re.finditer(pattern, c.file_contents)

    # Extract and print starting positions
    positions = [match.start() for match in matches] + [None]

    new_identifiers = []
    for i in range(len(positions) - 1):
        text_portion = '\n\n\n' + c.file_contents[positions[i]:positions[i+1]]  # add some newlines at the beginning
        # read the contract from the header:
        contract_number_regex = re.compile(r"CONTRACT NUMBER\s+([A-Za-z0-9-]+)")
        match = re.search(contract_number_regex, text_portion)
        
        new_identifier = match.group(1) + '.pdf_' + c.tag
        
        if new_identifier in new_identifiers:
            print(f"Found duplicate new identifier when parsing: {identifier}")
            continue
        
        output_file_path = output_path / f"{new_identifier}.txt"
        with open(output_file_path, 'w') as output_file:
            output_file.write(text_portion)
            
        new_identifiers.append(new_identifier)

    return new_identifiers


and let's now write them into separate files:

In [32]:
SPLITS_PATH = RAW_DATA_PATH / 'splits'
SPLITS_PATH.mkdir(exist_ok=True)

notes = []
for identifier in multiples:
    try:
        new_identifiers = split_contract(identifier, SPLITS_PATH)
        for new_identifier in new_identifiers:
            notes.append({'original_identifier': identifier, 'new_identifier': new_identifier})
    except Exception as e:
        print(f'Error processing {identifier}: {e}')
        
pd.DataFrame(notes).to_csv(RAW_DATA_PATH / 'type_3_files.csv', index=False)

Error processing 06-0L8404_3005: 'NoneType' object has no attribute 'group'
Found duplicate new identifier when parsing: 07-338004_11638


(06-0L8404_3005 and 07-338004_11638 have errors already reported)