#### Loads all OpenSecrets bulk datasets ####
(With the exception of crp_ids.xlsx, which has many sheets)

In [None]:
# Progress message emitted before the loader function below is defined.
print("Loading data module...")

In [None]:
def _strip_pipes(value):
    """Return *value* with every '|' removed if it is a string, else unchanged."""
    return value.replace('|', '') if isinstance(value, str) else value


def _int_or_raise(df, column):
    """Cast *column* with ``int()``; any value ``int()`` rejects raises."""
    return df.assign(**{column: df[column].apply(int)})


def _int_dropping_invalid(df, column):
    """Cast *column* to int, dropping rows whose value is not numeric."""
    numeric = pd.to_numeric(df[column], errors='coerce')
    # assign/astype return new frames, so no chained-assignment warning is
    # raised after rows have been filtered out.
    cleaned = df.assign(**{column: numeric}).dropna(subset=[column])
    return cleaned.astype({column: int})


def _microfilm_to_int(value):
    """Convert one microfilm value to int; anything that is not a whole number becomes 0."""
    if isinstance(value, float):
        # A numeric column containing blanks is parsed as float; keep whole
        # values such as 123.0 instead of zeroing them.
        return int(value) if value.is_integer() else 0
    text = str(value)
    # isdecimal() only accepts characters int() can parse (isdigit() also
    # accepts e.g. superscript digits, which int() rejects).
    return int(text) if text.isdecimal() else 0


def _clean_microfilm(df, column):
    """Replace missing/non-integer microfilm values with 0 and cast the rest to int."""
    return df.assign(**{column: df[column].fillna(0).apply(_microfilm_to_int)})


def _clean_dates(df, filename):
    """Parse the 'date' column as %m/%d/%Y; unparseable values become NaT."""
    dates = df['date']
    if filename == 'indivs22.csv':
        # NOTE(review): the previous code did df['date'][:9], which sliced the
        # first nine ROWS and blanked every later date. Truncating each string
        # to the 10 characters of MM/DD/YYYY is the presumed intent - confirm.
        dates = dates.astype(str).str[:10]
    df = df.assign(date=pd.to_datetime(dates, format='%m/%d/%Y', errors='coerce'))
    if filename == 'pac_other22.csv':
        df = df.dropna(subset=['date'])
    return df


def create_dataframe(filepath, columns, nrows=None):
    """Load a headerless OpenSecrets csv file and clean it up consistently.

    Args:
        filepath: Path of the csv file. Its base name selects the
            file-specific date handling ('indivs22.csv', 'pac_other22.csv').
        columns: Column names, in file order, applied to the parsed columns.
        nrows: Maximum number of rows to read; ``None`` (the default) reads
            the whole file. Values of 2**63 - 1 or more (such as the former
            ``1e100`` default) are also treated as "no limit".

    Returns:
        A DataFrame with '|' characters stripped from string cells and the
        known columns converted. Rows with a non-numeric value in agencyid,
        amount, congno, formid, id, schaid, total, year, ytd or zip are
        dropped.

    Raises:
        ValueError, TypeError: if active, cycle, fecrecno, fectransid or
            foreign contains a value that ``int()`` cannot convert.
    """
    print("Creating dataframe...")

    # Older callers may still pass the float "unlimited" sentinel explicitly.
    if nrows is not None and nrows >= 2 ** 63 - 1:
        nrows = None

    column_names = dict(enumerate(columns))
    # NOTE(review): fields look pipe-quoted; quotechar='|' might parse fields
    # containing commas correctly instead of skipping those lines - confirm
    # against the data before changing.
    df = pd.read_csv(filepath, nrows=nrows, on_bad_lines='skip', sep=',', header=None,
                     na_values=['N/A', 'NA'], encoding='ISO-8859-1')
    df = df.rename(columns=column_names)

    # Remove pipes surrounding data. DataFrame.applymap was renamed to
    # DataFrame.map in pandas 2.1; use whichever this pandas provides.
    if hasattr(pd.DataFrame, 'map'):
        df = df.map(_strip_pipes)
    else:
        df = df.applymap(_strip_pipes)

    filename = os.path.basename(filepath)

    def parse_dates(frame, _column):
        return _clean_dates(frame, filename)

    # Order matters: a row dropped by an earlier column never reaches the
    # strict int() conversions that follow.
    cleaners = (
        ('active', _int_or_raise),
        ('agencyid', _int_dropping_invalid),
        ('amount', _int_dropping_invalid),
        ('congno', _int_dropping_invalid),
        ('cycle', _int_or_raise),
        ('date', parse_dates),
        ('fecrecno', _int_or_raise),
        ('fectransid', _int_or_raise),
        ('foreign', _int_or_raise),
        ('formid', _int_dropping_invalid),
        ('id', _int_dropping_invalid),
        ('microfilm', _clean_microfilm),
        ('schaid', _int_dropping_invalid),
        ('total', _int_dropping_invalid),
        ('year', _int_dropping_invalid),
        ('ytd', _int_dropping_invalid),
        ('zip', _int_dropping_invalid),
    )
    for column, cleaner in cleaners:
        if column in df.columns:
            df = cleaner(df, column)

    print("dataframe created...")

    return df

In [None]:
# Progress message emitted after the loader function above has been defined.
print("...data module loaded.")