#### Loads all OpenSecrets bulk datasets ####
(With the exception of crp_ids.xlsx, which has many sheets)

In [None]:
# Progress banner: printed when this module's cells start executing.
print("Loading data module...")

In [35]:
def process_data(filepath, nrows=1e100, headers=None, n_expected_fields=None, show_errs=True, err_verbose=False):
    """Convert a pipe-quoted, comma-delimited text file into a CSV file.

    Every ``|text|`` span of each input line is rewritten as ``"text"`` and
    the result is saved next to the input file as ``<name>.csv`` (UTF-8).
    Lines whose field count differs from the expected one are reported on
    stdout but are still written to the output.

    Args:
        filepath: Path of the input file; it is decoded as ISO-8859-1.
        nrows: Maximum number of input lines to convert. The default is
            effectively unlimited.
        headers: Optional column names. When given (and non-empty), a header
            row is written in which each name is suffixed with ``__`` and the
            input file name without its extension.
        n_expected_fields: Number of fields a well-formed line must have.
            When omitted it defaults to ``len(headers)``.
        show_errs: Print ``Bad line <n>`` for every line with an unexpected
            field count.
        err_verbose: Additionally print the text of each bad line (only has
            an effect when ``show_errs`` is true).

    Raises:
        ValueError: If neither ``headers`` nor ``n_expected_fields`` is given.
    """
    # Imported locally so the function works even when the surrounding
    # notebook/module has not imported these names itself.
    import os
    import re

    # Must have a way to determine the expected number of fields.
    if n_expected_fields is None:
        if headers is None:
            raise ValueError("Invalid arguments: provide either headers or n_expected_fields, or both.")
        n_expected_fields = len(headers)

    # Iterate the file object instead of using str.splitlines(): splitlines()
    # also breaks on characters such as \x0b, \x0c and \x85, which can occur
    # inside a field of ISO-8859-1 data and would cut a record in two.
    with open(filepath, 'r', encoding='ISO-8859-1') as file:
        lines = [raw.rstrip('\n') for raw in file]

    pipe_quoted = re.compile(r'\|([^|]*)\|')

    def to_csv_field(match):
        # Double any embedded quote so the emitted field is valid CSV.
        return '"' + match.group(1).replace('"', '""') + '"'

    # Process all the lines.
    total = len(lines)
    # At least 1, so files with fewer than five lines do not divide by zero.
    progress_interval = max(1, total // 5)
    processed_lines = []
    for i, line in enumerate(lines, start=1):
        if i > nrows:
            break

        if i % progress_interval == 0:
            print(f"Reading line {i} of {total}...")

        # Check for bad lines. Commas inside |...| belong to the field's
        # text, so quoted spans are removed before counting separators.
        field_count = pipe_quoted.sub('', line).count(',') + 1
        if field_count != n_expected_fields and show_errs:
            print(f"Bad line {i}")
            if err_verbose:
                print(f"{line}")

        # Format as csv.
        processed_lines.append(pipe_quoted.sub(to_csv_field, line))

    directory, filename_w_ext = os.path.split(filepath)
    filename_wo_ext = os.path.splitext(filename_w_ext)[0]
    # os.path.join keeps the output beside the input even when filepath has
    # no directory part (plain '/' concatenation would target the root).
    output_filepath = os.path.join(directory, filename_wo_ext + '.csv')

    # Save as a csv file.
    with open(output_filepath, 'w', encoding='utf-8') as file:
        if headers:
            # Append data source to headers, separated by double underscore.
            headers_processed = [str(header) + '__' + filename_wo_ext for header in headers]
            file.write(','.join(headers_processed) + '\n')
        file.write('\n'.join(processed_lines))

    print(f"Processed data saved as {output_filepath}")

In [None]:
# Completion banner: printed once every cell above has run.
print("...data module loaded.")