# Loading Cached Data

This file loads the locally cached data sets and sanitizes them.

In [31]:
import pandas as pd
import glob

# show every column when a DataFrame is displayed (None = no column limit)
pd.options.display.max_columns = None

In [32]:
# host variables: select which cached data set is used to build file paths
host, year = "opensource", "2023"

# date range boundaries as ISO-8601 timestamp strings ('Z' suffix)
dateFrom = "2022-12-31T22:59:59.999Z"
dateTo = "2023-12-31T22:59:59.999Z"

## Loading data sets

In [33]:
def _load_bank_csvs(folder):
    """Combine every CSV file in *folder* into a single DataFrame.

    Files are read in sorted path order so the resulting row order (and the
    fresh 0..n-1 index produced by ``ignore_index=True``) is the same on every
    run; ``glob.glob`` alone returns paths in arbitrary order.

    Columns that are empty in every row are dropped.

    Raises:
        FileNotFoundError: if *folder* contains no ``*.csv`` files.
    """
    csv_paths = sorted(glob.glob(f'{folder}/*.csv'))
    if not csv_paths:
        # pd.concat([]) would otherwise fail with an unhelpful
        # "No objects to concatenate" error.
        raise FileNotFoundError(f'no CSV files found in {folder!r}')
    frames = [pd.read_csv(path) for path in csv_paths]
    combined = pd.concat(frames, ignore_index=True)
    return combined.dropna(axis=1, how='all')


# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.

# load open collective data
df_platform = pd.read_pickle(f'data/{host}/{year}/df_{host}_{year}_all_platform_transactions.pkl')

# load stripe data
df_stripe = pd.read_pickle(f'data/{host}/{year}/df_{host}_{year}_stripe_transactions.pkl')

# load wise data
df_wise = pd.read_pickle(f'data/{host}/{year}/df_{host}_{year}_wise_transactions.pkl')

# load bank data (one folder of CSV exports per bank account)
df_bank_1 = _load_bank_csvs(f'data/{host}/{year}/oscbank_1')
df_bank_2 = _load_bank_csvs(f'data/{host}/{year}/oscbank_2')

## Sanitizing Data

In [34]:
# Fix Column Names
# Strip leading/trailing whitespace from each bank frame's OWN headers.
# (Previously df_bank_2 was assigned df_bank_1's headers, which either
# mislabelled its columns or failed on a column-count mismatch.)
df_bank_1.columns = df_bank_1.columns.str.strip()
df_bank_2.columns = df_bank_2.columns.str.strip()

# Dates
# Parse the timestamp columns into pandas datetimes.
# NOTE(review): the platform and Stripe columns are parsed without utc=True
# while Wise and bank dates are forced to UTC -- confirm this is intentional
# before comparing across sources.
df_platform['createdAt'] = pd.to_datetime(df_platform['createdAt'])
df_stripe['Created date (UTC)'] = pd.to_datetime(df_stripe['Created date (UTC)'])
df_wise['created'] = pd.to_datetime(df_wise['created'], utc=True)
df_bank_1['Post Date'] = pd.to_datetime(df_bank_1['Post Date'], utc=True)
df_bank_2['Post Date'] = pd.to_datetime(df_bank_2['Post Date'], utc=True)

# Renaming Columns
# Index Wise rows by their id (keeping 'id' as a column too), clear the index
# name, and prefix every column with 'wise.'.
df_wise = df_wise.set_index('id', drop=False).rename_axis(None).add_prefix('wise.')

## Filtering out Data

In [35]:
# keep only Stripe rows whose status is neither Failed nor Pending
df_stripe = df_stripe.loc[~df_stripe['Status'].isin(['Failed', 'Pending'])]

# keep only Wise rows that were not cancelled
df_wise = df_wise.loc[df_wise['wise.status'].ne('cancelled')]