# Loading Cached Data

This file loads the locally cached data sets and sanitizes them.

In [None]:
import pandas as pd
import glob

# Passing None as 'display.max_columns' removes the column limit, so wide
# frames are displayed with every column instead of being truncated.
pd.set_option('display.max_columns', None)

In [None]:
# host variables
# `host` and `year` select the cache directory that is read below:
# data/<host>/<year>/...
year = '2023'
host = 'opensource'

# ISO-8601 bounds (UTC, "Z" suffix). They are not referenced in this section.
# NOTE(review): presumably the reporting window for `year`; confirm against
# the cells that consume them.
dateFrom = '2022-12-31T22:59:59.999Z'
dateTo = '2023-12-31T22:59:59.999Z'

## Loading data sets

In [None]:
def _read_csv_dir(directory):
    """Load every ``*.csv`` file in *directory* into a single DataFrame.

    Files are read in sorted path order so that the row order of the result
    is reproducible (``glob.glob`` returns matches in arbitrary order).
    Columns that are empty in every row are dropped.

    Args:
        directory: Path of the directory that holds the CSV exports.

    Returns:
        One DataFrame with the rows of all files and a fresh 0..n-1 index.

    Raises:
        ValueError: If *directory* contains no ``*.csv`` file.
    """
    csv_paths = sorted(glob.glob(f'{directory}/*.csv'))
    if not csv_paths:
        # pd.concat would raise a ValueError here as well, but without saying
        # which directory was empty.
        raise ValueError(f'No CSV files found in {directory}')
    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]
    return pd.concat(frames, ignore_index=True).dropna(axis=1, how='all')


# NOTE: read_pickle can execute arbitrary code embedded in the file; only load
# cache files that were produced by this project.

# load open collective data
df_platform = pd.read_pickle(f'data/{host}/{year}/df_{host}_{year}_all_platform_transactions.pkl')

# load stripe data
df_stripe = pd.read_pickle(f'data/{host}/{year}/df_{host}_{year}_stripe_transactions.pkl')

# load wise data
df_wise = pd.read_pickle(f'data/{host}/{year}/df_{host}_{year}_wise_transactions.pkl')

# NOTE(review): `dir` shadows the builtin of the same name. The variable is
# kept because cells outside this section may read it; rename it once that is
# confirmed not to be the case.

# load paypal data
dir = f'data/{host}/{year}/paypal'
df_paypal = _read_csv_dir(dir)

# load bank data
dir = f'data/{host}/{year}/oscbank_1'
df_bank_1 = _read_csv_dir(dir)

dir = f'data/{host}/{year}/oscbank_2'
df_bank_2 = _read_csv_dir(dir)


## Sanitizing Data

In [None]:
# Fix Column Names
# Strip surrounding whitespace from the bank CSV headers. Each frame is
# cleaned from its OWN header: assigning df_bank_1's header to df_bank_2
# would mislabel its columns or fail when the column counts differ.
df_bank_1.columns = df_bank_1.columns.str.strip()
df_bank_2.columns = df_bank_2.columns.str.strip()

# Dates
# Parse every source as UTC and then drop the timezone, so all date columns
# end up as timezone-naive timestamps expressed in UTC.
df_platform['createdAt'] = pd.to_datetime(df_platform['createdAt'], utc=True).dt.tz_convert(None)
df_stripe['Created date (UTC)'] = pd.to_datetime(df_stripe['Created date (UTC)'], utc=True).dt.tz_convert(None)
df_wise['created'] = pd.to_datetime(df_wise['created'], utc=True).dt.tz_convert(None)
df_bank_1['Post Date'] = pd.to_datetime(df_bank_1['Post Date'], utc=True).dt.tz_convert(None)
df_bank_2['Post Date'] = pd.to_datetime(df_bank_2['Post Date'], utc=True).dt.tz_convert(None)

# Renaming Columns
# Wise: index the rows by 'id' (keeping the column), clear the index name and
# prefix every column with 'wise.'.
df_wise = df_wise.set_index('id', drop=False).rename_axis(None).add_prefix('wise.')
# Stripe: only the 'id' column is renamed.
df_stripe.rename(columns={'id': 'stripe.id'}, inplace=True)

# PayPal amounts: remove commas (presumably thousands separators) so the
# values can be cast to float.
for amount_col in ('Net', 'Gross', 'Fee', 'Balance'):
    df_paypal[amount_col] = df_paypal[amount_col].replace(',', '', regex=True).astype(dtype=float)

# PayPal notes: remove commas from the notes that are present; rows without a
# note stay NaN because the assignment aligns on the index.
df_paypal['Note'] = df_paypal.loc[df_paypal['Note'].notna(), 'Note'].str.replace(',', '')

In [None]:
# Not used by this cell any more; kept in case other cells rely on the import.
import datetime

# UTC offset, in hours, for each label handled in the 'TimeZone' column.
tz_offset = {
    'PST': -8,
    'PDT': -7,
}

# Look up the offset of every row at once. A label without an entry in
# `tz_offset` (or a missing label) must not silently turn into NaT, so fail
# with a KeyError, as a plain dictionary lookup would.
offset_hours = df_paypal['TimeZone'].map(tz_offset)
unknown_tz = df_paypal.loc[offset_hours.isna(), 'TimeZone'].unique()
if len(unknown_tz):
    raise KeyError(f'Unsupported PayPal TimeZone value(s): {list(unknown_tz)}')

# 'Date' + 'Time' is the wall-clock time in the row's own timezone. Parse it,
# drop the placeholder UTC marker while keeping the wall-clock value, and
# subtract the UTC offset to obtain a timezone-naive UTC timestamp
# (e.g. 10:00 PST, offset -8 -> 18:00 UTC).
# This is done column-wise instead of with DataFrame.apply(axis=1), which
# would rebuild the whole frame row by row and re-infer every column's dtype.
local_wall_time = pd.to_datetime(df_paypal['Date'] + ' ' + df_paypal['Time'], utc=True).dt.tz_localize(None)
df_paypal['Datetime'] = local_wall_time - pd.to_timedelta(offset_hours, unit='h')
df_paypal.info()

In [None]:
# Derive the decimal amount from the cent amount (cents / 100).
df_platform['amountInHostCurrency.value'] = df_platform['amountInHostCurrency.valueInCents'].div(100)

## Filtering out Data

In [None]:
# Stripe: drop every transaction whose status is 'Failed' or 'Pending'.
df_stripe = df_stripe[~df_stripe['Status'].isin(['Failed', 'Pending'])]

# PayPal: drop the rows whose 'Balance Impact' is 'Memo'.
# A status-based filter is currently disabled:
# df_paypal = df_paypal[df_paypal['Status'].isin(['Completed', 'Processed'])]
df_paypal = df_paypal[df_paypal['Balance Impact'].ne('Memo')]

# Wise: drop cancelled transfers.
df_wise = df_wise[df_wise['wise.status'].ne('cancelled')]

## Combining

In [None]:
# Stack both bank exports into one frame. reset_index keeps each row's
# previous label in a new 'index' column (drop=False) and renumbers the rows.
df_banks = pd.concat((df_bank_1, df_bank_2)).reset_index(drop=False)