In [1]:
# Generate the members table.
import pandas as pd

# Date-valued columns; pandas parses these into datetimes while reading.
date_columns = [
    'Member Created (M/D/YYYY)',
    'Start of first full membership (M/D/YYYY)',
    'Latest Membership Change (request, upgrade, renewal, cancellation...) (M/D/YYYY)',
    'Current Membership Expiration (M/D/YYYY)',
]

# Race columns - even though they're not frequently filled in, glean what we can.
race_columns = [
    'American Indian or Alaskan Native',
    'Asian',
    'Black or African American',
    'Hispanic',
    'White',
    'Native Hawaiian or Pacific Islander',
    'Other',
]

# Nullable dtypes so that missing ages / confirmations survive as <NA>
# instead of forcing a float or object column.
column_dtypes = {
    'Confirmed?': 'boolean',
    'Current Membership Type': 'category',
    'Age': 'Int64',
}

# Load the members snapshot, keeping only the columns used downstream.
# Beyond the membership bookkeeping columns, only include extra information
# that is required on sign-up ('Age') or widely populated ('Gender').
raw_members = pd.read_csv(
    'data/input/users-snapshot.csv',
    usecols=[
        # A handful of users have string Member IDs.
        'Member ID',
        # Gender is filled for about 1/3rd of users, but isn't required anymore.
        'Sex',
        # Age is required now, and filled for 2/3rds of users.
        # NOTE(review): the saved info() output in this notebook shows 'Age'
        # non-null for only 460 of 5357 rows - confirm the 2/3rds figure.
        'Age',
        'Confirmed?',
        'Current Membership Type',
        *race_columns,
        *date_columns,
    ],
    parse_dates=date_columns,
    dtype=column_dtypes,
)

raw_members

Unnamed: 0,Member ID,Confirmed?,Sex,Age,Member Created (M/D/YYYY),Start of first full membership (M/D/YYYY),Current Membership Type,"Latest Membership Change (request, upgrade, renewal, cancellation...) (M/D/YYYY)",Current Membership Expiration (M/D/YYYY),American Indian or Alaskan Native,Asian,Black or African American,Hispanic,White,Native Hawaiian or Pacific Islander,Other
0,34,True,,,2015-07-13,NaT,Request_CheckIDAddressPhoneDOB,2015-07-12,NaT,,,,,,,
1,51,True,,,2015-07-22,2015-07-21,regular,2023-08-19,2024-07-21,,,,,,,
2,62,False,,,2015-10-19,NaT,Request_CheckIDAddressPhoneDOB,2015-10-19,NaT,,,,,,,
3,82,False,,,2016-01-01,2018-05-27,regular,2019-02-13,2020-02-13,,,,,,,
4,347,False,,,2016-10-13,2016-10-12,regular,2016-10-12,2017-10-12,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5352,531,False,,,2017-03-02,2017-03-08,regular,2017-03-08,2018-03-08,,,,,,,
5353,1788,True,male,,2018-10-28,2018-10-28,regular,2021-05-08,2022-05-08,,,,,Y,,
5354,2445,True,,,2019-09-11,NaT,Request_CheckIDAddressPhoneDOB,2019-09-10,NaT,,,,,,,
5355,2330,False,,,2019-07-23,NaT,Request_CheckIDAddressPhoneDOB,2019-07-22,NaT,,,,,,,


In [2]:
# Shorter, friendlier names for the verbose export headers.
column_renames = {
    # For consistency with the loans column name.
    'Member ID': 'Membership ID',
    # Drop the '(M/D/YYYY)' format hints from the date columns.
    'Member Created (M/D/YYYY)': 'Created',
    'Start of first full membership (M/D/YYYY)': 'First Membership Started',
    'Latest Membership Change (request, upgrade, renewal, cancellation...) (M/D/YYYY)': 'Last Changed',
    'Current Membership Expiration (M/D/YYYY)': 'Expiration',
}

# rename() returns a new frame, so raw_members keeps its original headers.
better_cols = raw_members.rename(columns=column_renames)

# Summarise dtypes and non-null counts of the renamed frame.
better_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5357 entries, 0 to 5356
Data columns (total 16 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   Membership ID                        5357 non-null   object        
 1   Confirmed?                           5357 non-null   boolean       
 2   Sex                                  1469 non-null   object        
 3   Age                                  460 non-null    Int64         
 4   Created                              5348 non-null   datetime64[ns]
 5   First Membership Started             3724 non-null   datetime64[ns]
 6   Current Membership Type              5357 non-null   category      
 7   Last Changed                         5357 non-null   datetime64[ns]
 8   Expiration                           3652 non-null   datetime64[ns]
 9   American Indian or Alaskan Native    10 non-null     object        
 10  Asian       

In [3]:
# Label missing 'Sex' values explicitly as 'unknown'.
#
# The filled column is assigned back to the frame instead of calling
# `.fillna(..., inplace=True)` on `better_cols['Sex']`: an in-place method on a
# column selected this way is chained assignment, which pandas 2.x warns about
# and which never updates the parent frame once Copy-on-Write is active (the
# default from pandas 3.0). Assigning the result works on every version and
# gives the same outcome when the cell is re-run.
better_cols['Sex'] = better_cols['Sex'].fillna('unknown')

In [4]:
# Persist the cleaned members table in two formats: CSV for portability and
# pickle to round-trip the pandas dtypes exactly.
members_csv_path = 'data/output/members.csv'
members_pickle_path = 'data/output/members.pkl'

# to_csv is called with its defaults, so the frame's index is written as the
# first (unnamed) column of the CSV.
better_cols.to_csv(members_csv_path)
better_cols.to_pickle(members_pickle_path)