In [1]:
# imports
import pandas as pd
from pandas.api.types import CategoricalDtype

In [2]:
# load aarp
AARP = pd.read_csv('Data/aarp.csv', low_memory=False)

In [3]:
# list of vars
interesting_variables = ['AGE4', 'INCOME', 'D6', 'D8', 'D9C', 'D18', 'D26', 'MARITAL', 'EDUC4', 'RACETHNICITY', 'GENDER', 'HHSIZE']
                         
# d6: how is your health?
# d8: How often do you have contact with family, friends, or neighbors who do not live with you?
# D9C How often you feel isolated?
# D18 What is your current employment status?
# D26. Which best describes the community where you live?
# MARITAL. Marital status
# EDUC4. 4-level education

AARP = AARP[interesting_variables]

### Checking Nulls

In [4]:
# how many nulls?
# Loop through each column
# and check the amount of nulls.
# Only print if there are more than 0.

for col in AARP.columns:
    if AARP[col].isnull().sum() > 0:
        print(f'{round(AARP[col].isnull().sum()/AARP.shape[0],3)}%\t{AARP[col].isnull().sum()} null values in {col}')

In [5]:
# list all unique values
for col in AARP.columns:
    print(f'{col}:\t{AARP[col].unique()}')

AGE4:	['60+' '30-44' '45-59' '18-29']
INCOME:	['$25,000 to $29,999' '$40,000 to $49,999' '$100,000 to $124,999'
 '$200,000 or more' '$150,000 to $174,999' '$50,000 to $59,999'
 '$125,000 to $149,999' '$60,000 to $74,999' '$85,000 to $99,999'
 '$5,000 to $9,999' '$20,000 to $24,999' '$75,000 to $84,999'
 '$35,000 to $39,999' '$30,000 to $34,999' '$15,000 to $19,999'
 '$10,000 to $14,999' 'Less than $5,000' '$175,000 to $199,999']
D6:	['Good' 'Very good' 'Excellent' 'Fair' 'Poor' 'SKIPPED ON WEB']
D8:	['Everyday' 'Several times a week' 'Once every 2 or 3 weeks' 'Once a week'
 'Less than monthly' 'Once a month' 'Never' 'SKIPPED ON WEB']
D9C:	['Never' 'Rarely' 'Sometimes' 'Often' 'SKIPPED ON WEB']
D18:	['Self-employed part-time' 'Employed full-time'
 'Unemployed and looking for work' 'Self-employed full-time'
 'Employed part-time' 'Retired and not working at all'
 'Or are you not in the labor force for other reasons' 'SKIPPED ON WEB']
D26:	['Rural area' 'Suburban with a mix of offices, apa

In [6]:
AARP.shape

(1947, 12)

In [7]:
# okay... no nulls because we have 'SKIPPED ON WEB'
# let's turn those into nulls and then remove na's
AARP = AARP.replace(['SKIPPED ON WEB', "DON’T KNOW"], pd.NA)

In [8]:
# how many nulls?
# Loop through each column
# and check the amount of nulls.
# Only print if there are more than 0.

for col in AARP.columns:
    if AARP[col].isnull().sum() > 0:
        print(f'{round(AARP[col].isnull().sum()/AARP.shape[0],3)}%\t{AARP[col].isnull().sum()} null values in {col}')

0.002%	4 null values in D6
0.003%	6 null values in D8
0.006%	11 null values in D9C
0.004%	8 null values in D18
0.008%	15 null values in D26


In [9]:
# seriously? Fewer than like 30 individuals skipped these or didn't know?
AARP = AARP.dropna()
AARP.shape

(1921, 12)

### Recoding

In [10]:
# let's see these unique values again and then start recoding
# list all unique values
for col in AARP.columns:
    print(f'{col}:\t{AARP[col].unique()}')

AGE4:	['60+' '30-44' '45-59' '18-29']
INCOME:	['$25,000 to $29,999' '$40,000 to $49,999' '$100,000 to $124,999'
 '$200,000 or more' '$150,000 to $174,999' '$50,000 to $59,999'
 '$125,000 to $149,999' '$60,000 to $74,999' '$85,000 to $99,999'
 '$5,000 to $9,999' '$20,000 to $24,999' '$75,000 to $84,999'
 '$35,000 to $39,999' '$30,000 to $34,999' '$15,000 to $19,999'
 '$10,000 to $14,999' 'Less than $5,000' '$175,000 to $199,999']
D6:	['Good' 'Very good' 'Excellent' 'Fair' 'Poor']
D8:	['Everyday' 'Several times a week' 'Once every 2 or 3 weeks' 'Once a week'
 'Less than monthly' 'Once a month' 'Never']
D9C:	['Never' 'Rarely' 'Sometimes' 'Often']
D18:	['Self-employed part-time' 'Employed full-time'
 'Unemployed and looking for work' 'Self-employed full-time'
 'Employed part-time' 'Retired and not working at all'
 'Or are you not in the labor force for other reasons']
D26:	['Rural area' 'Suburban with a mix of offices, apartments and shops'
 'Urban near mix of offices, apartments and shops

In [11]:
# o AGE4 ['60+' '30-44' '45-59' '18-29'] -> [4,2,3,1] 
# o D6: ['Good' 'Very good' 'Excellent' 'Fair' 'Poor'] -> [3,4,5,2,1]

# o D8:['Everyday' 'Several times a week' 'Once every 2 or 3 weeks' 'Once a week' 'Less than monthly' 'Once a month' 'Never'] -> [7, 6, 4, 5, 2, 3, 1]

# o D9C: ['Never' 'Rarely' 'Sometimes' 'Often'] -> [1, 2, 3, 4]

# c D18: ['Self-employed part-time' 'Employed full-time'
 # 'Unemployed and looking for work' 'Self-employed full-time'
 # 'Employed part-time' 'Retired and not working at all'
 # 'Or are you not in the labor force for other reasons'] -> use pd categorize
    
# c D26: ['Rural area' 'Suburban with a mix of offices, apartments and shops'
 # 'Urban near mix of offices, apartments and shops'
 # 'Suburban and mostly residential' 'Urban and mostly residential'
 # 'Small town'] -> Rural & small town -> Rural [1] else not rural [2]

# c MARITAL: ['Widowed' 'Never married' 'Married' 'Living with partner' 'Separated' 'Divorced'] -> use pd categorize

# o EDUC4: ['BA or above' 'HS graduate or equivalent' 'Some college' 'No HS diploma'] -> [4, 2, 3, 1]
# c RACETHNICITY: ['White, non-Hispanic' 'Hispanic' '2+, non-Hispanic' 'Black, non-Hispanic' 'Asian, non-Hispanic' 'Other, non-Hispanic'] -> use pd categorize
# c GENDER: ['Female' 'Male'] -> [0, 1]
# o HHSIZE: [1. 6. 4. 5. 2. 3.] as is (maybe convert to int)


# recode age ordinal
AARP['AGE4_recode'] = AARP['AGE4'].apply(lambda x: 4 if x == '60+' else 2 if x == '30-44' else 3 if x == '45-59' else 1 if x == '18-29' else -1)

# recode D6 ordinal
AARP['D6_recode'] = AARP['D6'].apply(lambda x: 3 if x == 'Good' else 4 if x == 'Very good' else 5 if x == 'Excellent' else 2 if x == 'Fair' else 1 if x == 'Poor' else -1)

# recode D8 ordinal
AARP['D8_recode'] = AARP['D8'].apply(lambda x: 1 if x == 'Never' else 2 if x == 'Less than monthly' else 3 if x == 'Once a month' else 4 if x == 'Once every 2 or 3 weeks' else 5 if x == 'Once a week' else 6 if x == 'Several times a week' else 7 if x == 'Everyday' else -1)

# recode D9C ordinal
AARP['D9C_recode'] = AARP['D9C'].apply(lambda x: 1 if x == 'Never' else 2 if x == 'Rarely' else 3 if x == 'Sometimes' else 4 if x == 'Often' else -1)

# recode D18 categorical
# recode like usual, convert to categorical dtype
AARP['D18_recode'] = AARP['D18'].apply(lambda x: 1 if x == 'Self-employed part-time' else 2 if x == 'Employed full-time' else 3 if x == 'Employed part-time' else 4 if x == 'Retired and not working at all' else 5 if x == 'Or are you not in the labor force for other reasons' else 6 if x == 'Unemployed and looking for work' else 7 if x == 'Self-employed full-time' else -1)
AARP['D18_recode'] = pd.Categorical(AARP['D18_recode'])

# recode D26 categorical
AARP['D26_recode'] = AARP['D26'].apply(lambda x: 1 if x == 'Rural area' else 1 if x == 'Small town' else 2)
AARP['D26_recode'] = pd.Categorical(AARP['D26_recode'])

# recode MARITAL categoical
AARP['MARITAL_recode'] = AARP['MARITAL'].apply(lambda x: 1 if x == 'Widowed' else 2 if x == 'Never married' else 3 if x == 'Married' else 4 if x == 'Living with partner' else 5 if x == 'Separated' else 6 if x == 'Divorced' else -1)
AARP['MARITAL_recode'] = pd.Categorical(AARP['MARITAL_recode'])

# recode education ordinal
AARP['EDUC4_recode'] = AARP['EDUC4'].apply(lambda x: 1 if x == 'No HS diploma' else 2 if x == 'HS graduate or equivalent' else 3 if x == 'Some college' else 4 if x == 'BA or above' else -1)

# recode RACETHNICITY categorical
AARP['RACETHNICITY_recode'] = AARP['RACETHNICITY'].apply(lambda x: 1 if x == 'White, non-Hispanic' else 2 if x == 'Hispanic' else 3 if x == '2+, non-Hispanic' else 4 if x == 'Black, non-Hispanic' else 5 if x == 'Asian, non-Hispanic' else 6 if x == 'Other, non-Hispanic' else -1)
AARP['RACETHNICITY_recode'] = pd.Categorical(AARP['RACETHNICITY_recode'])

AARP['HHSIZE'] = AARP['HHSIZE'].astype(pd.Int32Dtype())

### save cleaned csv

In [12]:
AARP.to_csv('Data/clean_AARP.csv')