In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import pyreadstat
import os
from ydata_profiling import ProfileReport
import datetime as dt
import matplotlib.pyplot as plt

%matplotlib inline

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

import scipy.stats as stats

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised by pandas,
# statsmodels or scikit-learn — consider narrowing it before fitting models.
warnings.filterwarnings('ignore')

# pandas display options for wide survey frames.
# (The former `pd.options.display.max_columns = None` was removed: it was
# immediately overridden by the explicit 500-column limit set below.)
pd.set_option('display.expand_frame_repr', False)  # keep wide frames on one block
pd.set_option('display.max_rows', None)            # no row limit when displaying
pd.set_option('display.max_columns', 500)          # show up to 500 columns
pd.set_option('display.width', 1000)               # display width in characters

In [2]:
# Show the kernel's working directory; the profile reports saved by later
# cells use relative file names, so this is where they are written
# (as long as the working directory is not changed in between).
cwd = os.getcwd()
print(f"{cwd}")

c:\Users\Nigel\Git\paidleave_mh


#### Data Exploration

Standard

In [3]:
# Load the PRAMS Phase 8 ARF file (2016-2021 per the file name) from SAS7BDAT.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where this OneDrive folder exists — consider a configurable data directory.
filepath = r"C:\Users\Nigel\OneDrive\1_GradSchool\4_Dissertation\1_paidleave_mhdisparities\PRAMS  Phase 8 PRAMS ARF May 22 2024\PRAMS ARF\phase8_arf_2016_2021.sas7bdat"

# read_sas7bdat returns a (DataFrame, metadata) pair. The metadata is stored
# under its own name because the later load cells also assign `meta`, which
# would otherwise overwrite this file's metadata.
df_s, meta_s = pyreadstat.read_sas7bdat(filepath)
meta = meta_s  # backward-compatible alias for any cell that still reads `meta`

In [4]:
# Build a profiling report for the ARF frame (minimal=True is passed through
# to ProfileReport) and save it as s_profile.html.
s_profile = ProfileReport(
    df_s,
    minimal=True,
    title="Profiling Report",
)
s_profile.to_file("s_profile.html")

In [None]:
# Column names to treat as covariates.
cov = ['HISPANIC', 'STATE']

Summarize dataset: 100%|██████████| 490/490 [00:08<00:00, 57.77it/s, Completed]                             
Generate report structure: 100%|██████████| 1/1 [01:04<00:00, 64.03s/it]
Render HTML: 100%|██████████| 1/1 [00:08<00:00,  8.01s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 25.62it/s]


In [27]:
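# TODO: flag observations that have a survey year. notnull() is True where
# TOD_YR4 is present; negate the mask with ~ to get the observations without one.
#yr_filt = df_s['TOD_YR4'].notnull()

# TODO: flag observations that have at least one covariate of interest.
# 'XXX' appears to be a placeholder for the covariate column names.
#covariate_filt = ((df_s['XXX'].notnull()) | (df_s['XXX'].notnull()))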

Occupational Status

Maternity Leave
 - C4. At any time during your most recent pregnancy, did you work at a job for pay?
 - C7. Have you returned to the job you had during your most recent pregnancy? Check ONE answer

NOTE: C8 requires C7 and C4. If a site adds a site-specific option to C8, insert “I took…” for options such as Family Medical Leave and “I took leave and used…” for options such as Temporary/Short-term Disability Insurance.

- C8. Did you take leave from work after your new baby was born?
- C9. How did you feel about the amount of time you were able to take off after the birth of your new baby?
- C10. Did any of the following things affect your decision about taking leave from work after your new baby was born?
- C14. How many weeks or months of leave, in total, did you take or will you take?

In [6]:
# Load the PRAMS Phase 8 Standard "Occupational Status and Work Place Leave"
# file (2016-2021 per the file name) from SAS7BDAT.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where this OneDrive folder exists — consider a configurable data directory.
filepath = r"C:\Users\Nigel\OneDrive\1_GradSchool\4_Dissertation\1_paidleave_mhdisparities\PRAMS  Phase 8 Standard May 22 2024\Occupational Status and Work Place Leave\phase8_2016_2021_std_c.sas7bdat"

# read_sas7bdat returns a (DataFrame, metadata) pair. The metadata is stored
# under its own name because the other load cells also assign `meta`, so
# `meta` alone only ever reflects whichever load ran last.
df_c, meta_c = pyreadstat.read_sas7bdat(filepath)
meta = meta_c  # backward-compatible alias for any cell that still reads `meta`

In [7]:
# Build a profiling report for the occupational-status frame (minimal=True is
# passed through to ProfileReport) and save it as c_profile.html.
c_profile = ProfileReport(
    df_c,
    minimal=True,
    title="Profiling Report",
)
c_profile.to_file("c_profile.html")

Summarize dataset: 100%|██████████| 82/82 [00:01<00:00, 64.40it/s, Completed]                        
Generate report structure: 100%|██████████| 1/1 [00:11<00:00, 11.63s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 105.10it/s]


In [9]:
# Upper-case every column label so later cells can select columns by their
# upper-case names.
df_c.columns = df_c.columns.map(str.upper)

In [10]:
# Preview the first five rows of the occupational-status frame.
df_c.head(5)

Unnamed: 0,ID,WRK_PREG,WRK_TITL_RAW,WRK_DUTY_RAW,WRKRETRN_RAW,WRKPDLV_RAW,WRKUPDLV_RAW,WRKNOLV_RAW,LVAFFORD_RAW,LVAFRAID_RAW,LVWORKLD_RAW,LVUNPAID_RAW,LVNOFLEX_RAW,LVENOUGH_RAW,WRK_TITL,WRK_DUTY,WRKRETRN,WRKPDLV,WRKUPDLV,WRKNOLV,LVAFFORD,LVAFRAID,LVWORKLD,LVUNPAID,LVNOFLEX,LVENOUGH,C_WRKSCH_RAW,CC_WHO6_RAW,CC_OTH6_RAW,CC_FEEL_RAW,LV_AMTU_RAW,LV_AMT_RAW,WRKFEEL_RAW,C_WRKSCH,CC_OTH6,CC_WHO6,CC_FEEL,LV_AMTU,LV_AMT,WRKFEEL,WRKFMLV_RAW,WRKFMLV,WRK_TYPE_RAW,WRK_IDK_RAW,WRK_TYPE,WRK_IDK,WRKYCTDI_RAW,WRKYCTDI,DADLEAVE,DADLEAVE_DK,WRKSCHED_RAW,WRKRETRN_C6_RAW,WRKPDLV_C6_RAW,WRKUPDLV_C6_RAW,WRKNOLV_C6_RAW,LV_AMTU_C6_RAW,LV_AMT_C6_RAW,LVAFFORD_C6_RAW,LVAFRAID_C6_RAW,LVWORKLD_C6_RAW,LVUNPAID_C6_RAW,LVNOFLEX_C6_RAW,LVENOUGH_C6_RAW,WRKSCHED,WRKRETRN_C6,WRKPDLV_C6,WRKUPDLV_C6,WRKNOLV_C6,LV_AMTU_C6,LV_AMT_C6,LVAFFORD_C6,LVAFRAID_C6,LVWORKLD_C6,LVUNPAID_C6,LVNOFLEX_C6,LVENOUGH_C6
0,2016LA232001,2.0,CMA/smoke monitor,Made sure patients was wearing smoke smocks an...,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,CMA/smoke monitor,Made sure patients was wearing smoke smocks an...,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2016LA232002,2.0,Teacher,High school basic courses,3.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Teacher,High school basic courses,3.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2016LA232003,2.0,Accountant,"Typing, research on computer, auditing, payrol...",3.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,Accountant,"Typing, research on computer, auditing, payrol...",3.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2016LA232004,1.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2016LA232006,2.0,Teller,Money transactions lifting 25 lbs. or more daily,3.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,Teller,Money transactions lifting 25 lbs. or more daily,3.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [20]:
# C8 - did you take leave from work after your new baby was born?
# Preview the first ten rows of the ID column with the three leave indicators:
# WRKPDLV (paid), WRKUPDLV (unpaid), WRKNOLV (no leave).
df_c.head(10)[['ID', 'WRKPDLV', 'WRKUPDLV', 'WRKNOLV']]

Unnamed: 0,ID,WRKPDLV,WRKUPDLV,WRKNOLV
0,2016LA232001,1.0,2.0,1.0
1,2016LA232002,1.0,2.0,1.0
2,2016LA232003,2.0,2.0,1.0
3,2016LA232004,,,
4,2016LA232006,2.0,1.0,1.0
5,2016LA232007,2.0,2.0,1.0
6,2016LA232008,,,
7,2016LA232010,1.0,2.0,1.0
8,2016LA232011,2.0,2.0,1.0
9,2016LA232012,2.0,1.0,1.0


In [25]:
# Paid leave (1=NO, 2=YES): number of distinct IDs per response code.
df_c.groupby('WRKPDLV')['ID'].nunique()

WRKPDLV
1.0    18186
2.0    17984
Name: ID, dtype: int64

In [22]:
# Unpaid leave (1=NO, 2=YES): number of distinct IDs per response code.
df_c.groupby('WRKUPDLV')['ID'].nunique()

WRKUPDLV
1.0    16430
2.0    19736
Name: ID, dtype: int64

In [23]:
# No leave (1=NO, 2=YES): number of distinct IDs per response code.
df_c.groupby('WRKNOLV')['ID'].nunique()

WRKNOLV
1.0    34495
2.0     1642
Name: ID, dtype: int64

Mental Health

In [8]:
# Load the PRAMS Phase 8 Standard "Mental Health" file (2016-2021 per the
# file name) from SAS7BDAT.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where this OneDrive folder exists — consider a configurable data directory.
filepath = r"C:\Users\Nigel\OneDrive\1_GradSchool\4_Dissertation\1_paidleave_mhdisparities\PRAMS  Phase 8 Standard May 22 2024\Mental Health\phase8_2016_2021_std_m.sas7bdat"

# read_sas7bdat returns a (DataFrame, metadata) pair. The metadata is stored
# under its own name because the other load cells also assign `meta`, so
# `meta` alone only ever reflects whichever load ran last.
df_m, meta_m = pyreadstat.read_sas7bdat(filepath)
meta = meta_m  # backward-compatible alias for any cell that still reads `meta`

In [11]:
# Build a profiling report for the mental-health frame (minimal=True is
# passed through to ProfileReport) and save it as m_profile.html.
m_profile = ProfileReport(
    df_m,
    minimal=True,
    title="Profiling Report",
)
m_profile.to_file("m_profile.html")

Summarize dataset: 100%|██████████| 31/31 [00:03<00:00,  8.77it/s, Completed]                      
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.11s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.87it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 500.75it/s]


In [12]:
# Preview the first five rows of the mental-health frame.
df_m.head(5)

Unnamed: 0,ID,MH_PPDX,MH_PPRX_RAW,MH_PPRX,MH_PPHLP,MH_PPPSY_RAW,MH_PPPSY,PP_PANIC,PP_NREST,MH_PGRX8_RAW,MH_PGRX8,MH_PGHP8_RAW,MH_PGHP8,MH_HDANX,MH_RXANX_RAW,MH_RXANX,DEPR_TLK,MH_PGPSY8_RAW,MH_PREG,MH_PGPSY8,MH_ANXHP,MH_PGANX,MH_ANX,MH_ANXCN_RAW,MH_ANXCN
0,2016CO224001,1.0,,,,,,,,,,,,,,,,,,,,,,,
1,2016CO224003,1.0,,,,,,,,,,,,,,,,,,,,,,,
2,2016CO224006,1.0,,,,,,,,,,,,,,,,,,,,,,,
3,2016CO224007,1.0,,,,,,,,,,,,,,,,,,,,,,,
4,2016CO224009,1.0,,,,,,,,,,,,,,,,,,,,,,,
