In [None]:
# Generate the members table.
import pandas as pd

# Load the member snapshot export, keeping only the columns used downstream.
raw_members = pd.read_csv(
    'input/users-snapshot.csv',
    # Only include extra information that is required on sign-up ('Age') or widely populated ('Gender')
    usecols=(
        # A handful of users have string Member IDs (see the explicit dtype below)
        'Member ID',
        # Gender is filled for about 1/3rd of users, but isn't required anymore
        'Sex',
        # Age is required now, and filled for 2/3rds of users
        'Age',
        # Include Race columns - even though they're not frequently filled in, glean what we can
        'American Indian or Alaskan Native', 'Asian', 'Black or African American', 'Hispanic', 'White', 'Native Hawaiian or Pacific Islander', 'Other',
        'Confirmed?', 'Member Created (M/D/YYYY)', 'Start of first full membership (M/D/YYYY)',
        'Current Membership Type', 'Latest Membership Change (request, upgrade, renewal, cancellation...) (M/D/YYYY)',
        'Current Membership Expiration (M/D/YYYY)', 'User Note', 'User Warning',
    ),
    parse_dates=[
        'Member Created (M/D/YYYY)', 'Start of first full membership (M/D/YYYY)',
        'Latest Membership Change (request, upgrade, renewal, cancellation...) (M/D/YYYY)',
        'Current Membership Expiration (M/D/YYYY)',
    ],
    dtype={
        # Read IDs as strings rather than letting pandas infer the type: some
        # IDs are not numeric, and the outstanding-balances table is keyed by a
        # 'string' ID, so inference (int64, or a mixed int/str object column)
        # would make the later joins fail or silently miss rows.
        'Member ID': 'string',
        'Current Membership Type': 'category', 'Confirmed?': 'boolean',
        # Nullable integer, because Age is missing for some users
        'Age': 'Int64',
    },
)

# Preview a few rows only; the full table contains personal member data.
raw_members.head()

In [None]:
# Map the verbose export headers onto short, readable column names.
friendly_names = {
    # For consistency with the loans column name
    'Member ID': 'Membership ID',
    'Member Created (M/D/YYYY)': 'Created',
    'Start of first full membership (M/D/YYYY)': 'First Membership Started',
    'Latest Membership Change (request, upgrade, renewal, cancellation...) (M/D/YYYY)': 'Last Changed',
    'Current Membership Expiration (M/D/YYYY)': 'Expiration',
}

# rename() returns a new frame, so raw_members keeps its original headers.
better_cols = raw_members.rename(columns=friendly_names)

# Summarize dtypes and non-null counts of the renamed table.
better_cols.info()

In [None]:
# Label missing 'Sex' values as 'unknown'.
#
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form acts on an intermediate object, which emits a
# FutureWarning on pandas 2.x and leaves better_cols unchanged when
# Copy-on-Write is enabled.
better_cols['Sex'] = better_cols['Sex'].fillna('unknown')

In [None]:
# Add outstanding balance (overdue loan balance + any outstanding balance) to the member info
#
# This is spread between "Loans" > "Overdue Only" in MyTurn, and the "Outstanding Balances"
# report in MyTurn.  Combining it in one place helps identify people with heavy outstanding balances.
import pandas as pd

from chtl_helpers import balance_to_float

balances = pd.read_csv(
    'input/outstanding-balances.csv',
    # Skip duplicative personal info, just ID and balance is enough.
    usecols=('ID', 'Amount'),
    dtype={
        # Make the 'Member ID' a string for compatibility with other tables.
        # It is occasionally set to something other than a number.
        'ID': 'string',
    },
    # Convert "$0.00" strings to actual numbers
    converters={'Amount': balance_to_float},
).rename(columns={'ID': 'Membership ID'}).set_index('Membership ID')

# NOTE(review): presumably this pickle is written by the loans notebook in this
# project; only unpickle files from a trusted source.
loans = pd.read_pickle('output/loans.pkl')

# Total late fees per member. Aggregate only the late-fee column: summing the
# whole loans table would also try to sum any non-numeric columns, which
# raises for datetime columns on pandas 2.x and concatenates string columns.
# The double brackets keep late_fees a DataFrame.
late_fees = loans.groupby('Membership ID')[['Late Fees To Date']].sum()

# Look up each member's totals by 'Membership ID'; members with no matching
# row in a table get 0.0.
better_cols['Late Fees'] = better_cols.join(late_fees['Late Fees To Date'], on='Membership ID')['Late Fees To Date'].fillna(0.0)
better_cols['Balance'] = better_cols.join(balances['Amount'], on='Membership ID')['Amount'].fillna(0.0)
better_cols['Balance Including Late Fees'] = better_cols['Late Fees'] + better_cols['Balance']

# Histogram of the combined balance, restricted to members who owe something.
better_cols['Balance Including Late Fees'][better_cols['Balance Including Late Fees'] > 0].plot.hist()

In [None]:
# Persist the enriched members table twice: as CSV, and as a pickle
# alongside it under output/.
better_cols.to_csv(path_or_buf='output/members.csv')
better_cols.to_pickle(path='output/members.pkl')