In [None]:
# Generate the members table.
import pandas as pd

# Date-valued columns of the export (their headers carry the M/D/YYYY hint).
# Defined once so the same names feed both `usecols` and `parse_dates`.
member_date_cols = [
    'Member Created (M/D/YYYY)',
    'Start of first full membership (M/D/YYYY)',
    'Latest Membership Change (request, upgrade, renewal, cancellation...) (M/D/YYYY)',
    'Current Membership Expiration (M/D/YYYY)',
]

# Race columns: not frequently filled in, but kept to glean what we can.
member_race_cols = [
    'American Indian or Alaskan Native',
    'Asian',
    'Black or African American',
    'Hispanic',
    'White',
    'Native Hawaiian or Pacific Islander',
    'Other',
]

# Load only the columns needed for the members table. Extra demographic
# information is limited to what is required on sign-up ('Age') or widely
# populated ('Sex').
raw_members = pd.read_csv(
    'data/input/users-snapshot.csv',
    usecols=[
        'Member ID',  # a handful of users have string Member IDs
        'Sex',        # gender: filled for about 1/3rd of users, no longer required
        'Age',        # required now, and filled for 2/3rds of users
        *member_race_cols,
        'Confirmed?',
        'Current Membership Type',
        *member_date_cols,
    ],
    parse_dates=member_date_cols,
    # Nullable extension dtypes for 'Age' and 'Confirmed?' so missing entries
    # stay missing instead of forcing a float/object column.
    dtype={
        'Age': 'Int64',
        'Confirmed?': 'boolean',
        'Current Membership Type': 'category',
    },
)

raw_members

In [None]:
# Replace the verbose export headers with short names; columns not listed
# here keep their original header.
better_cols = raw_members.rename(
    {
        'Member ID': 'Membership ID',  # consistent with the loans column name
        'Member Created (M/D/YYYY)': 'Created',
        'Start of first full membership (M/D/YYYY)': 'First Membership Started',
        'Latest Membership Change (request, upgrade, renewal, cancellation...) (M/D/YYYY)': 'Last Changed',
        'Current Membership Expiration (M/D/YYYY)': 'Expiration',
    },
    axis='columns',
)
# Print dtypes and non-null counts to sanity-check the renamed frame.
better_cols.info()

In [None]:
# Label missing 'Sex' values explicitly as 'unknown'.
# The filled column is assigned back instead of calling
# `better_cols['Sex'].fillna(..., inplace=True)`: that chained in-place call
# operates on an intermediate Series, which pandas 2.x warns about and which
# leaves `better_cols` unchanged when Copy-on-Write is enabled.
better_cols['Sex'] = better_cols['Sex'].fillna('unknown')

In [None]:
# Persist the members table twice: as CSV (written with pandas' default
# options, so the index is included) and as a pickle of the same frame.
better_cols.to_csv(path_or_buf='data/output/members.csv')
better_cols.to_pickle(path='data/output/members.pkl')