In [318]:
import os
import glob
import pandas as pd
import numpy as np

# Read the two csv files
data1 = pd.read_csv('DPQ_I.csv')
data2 = pd.read_csv('DEMO_I.csv')

# Merge the two files with the SEQN column
merged_data = pd.merge(data1, data2 , how='left', on='SEQN')

# Only include the columns below in the DEMO_I spreadsheet
# Race, Gender, Age, Education, Marital Status, and PIR
# WTMEC2YR - Full sample 2 year interview weight (Depression doc: "should be used to analyze 2015-16")
# WTINT2YR - Full sample 2 year MEC exam weight
merged_data = merged_data[['SEQN', 'RIDRETH1', 'RIAGENDR', 'RIDAGEYR', 'DMDEDUC3', 'DMDEDUC2', 'DMDMARTL', 'INDFMPIR',
                           'WTMEC2YR', 
                           'WTINT2YR',
                           'DPQ010','DPQ020','DPQ030','DPQ040','DPQ050','DPQ060','DPQ070','DPQ080','DPQ090']]

# Explore the dataset
print(merged_data.info(), '\n')
merged_data.shape[0]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5735 entries, 0 to 5734
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      5735 non-null   float64
 1   RIDRETH1  5735 non-null   float64
 2   RIAGENDR  5735 non-null   float64
 3   RIDAGEYR  5735 non-null   float64
 4   DMDEDUC3  261 non-null    float64
 5   DMDEDUC2  5474 non-null   float64
 6   DMDMARTL  5474 non-null   float64
 7   INDFMPIR  5134 non-null   float64
 8   WTMEC2YR  5735 non-null   float64
 9   WTINT2YR  5735 non-null   float64
 10  DPQ010    5164 non-null   float64
 11  DPQ020    5164 non-null   float64
 12  DPQ030    5164 non-null   float64
 13  DPQ040    5162 non-null   float64
 14  DPQ050    5162 non-null   float64
 15  DPQ060    5162 non-null   float64
 16  DPQ070    5161 non-null   float64
 17  DPQ080    5161 non-null   float64
 18  DPQ090    5161 non-null   float64
dtypes: float64(19)
memory usage: 896.1 KB
None 



5735

In [319]:
# Data cleaning for the depression columns 

# Drop the rows that have nan for questions 1-9
merged_data = merged_data[merged_data['DPQ010'].notna()]

# Add the values of each row in the depression spreadsheet (DPQI_)
merged_data['SCORE'] = merged_data['DPQ010'] + merged_data['DPQ020'] + merged_data['DPQ030'] + merged_data['DPQ040'] + merged_data['DPQ050'] + merged_data['DPQ060'] + merged_data['DPQ070'] + merged_data['DPQ080'] + merged_data['DPQ090']

# If a participant had total PHQ-9 ≥ 10, the person is considered having depression.
for index, row in merged_data.iterrows():
    if merged_data.at[index,'SCORE'] >= 10:
        merged_data.at[index,'DEPRESSED'] = 1
    else:
        merged_data.at[index,'DEPRESSED'] = 0

In [320]:
# Data cleaning for demographics spreadsheet (DEMO_I)

# Only include participants with an age 18 or older 
merged_data = merged_data[merged_data['RIDAGEYR'] >= 18]  

# Only include participants with an education level known for DMDEDUC2
merged_data = merged_data[merged_data['DMDEDUC2'] != 9] 

for index, row in merged_data.iterrows():
    
    # Divide the data into two age groups
    # 1.) Younger: 18-55 
    # 2.) Older: 56+
    if merged_data.at[index,'RIDAGEYR'] <= 55:
        merged_data.at[index,'AGEGRP'] = 1
    else:
        merged_data.at[index,'AGEGRP'] = 2

    # Divide the data into maritial status
    # 1.) Married: 1
    # 2.) Widowed/Divorced/Separated: 2, 3, 4 (combined)
    # 3.) Never Married: 5
    # 4.) Living with Partner: 6
    if merged_data.at[index,'DMDMARTL'] == 1:
        merged_data.at[index,'MARITALSTATUS'] = 1
    elif merged_data.at[index,'DMDMARTL'] == 2 or merged_data.at[index,'DMDMARTL'] == 3 or merged_data.at[index,'DMDMARTL'] == 4:
        merged_data.at[index,'MARITALSTATUS'] = 2
    elif merged_data.at[index,'DMDMARTL'] == 5:
        merged_data.at[index,'MARITALSTATUS'] = 3
    else:
        merged_data.at[index,'MARITALSTATUS'] = 4

    # Divide the data into PIR (SES)
    # 1.) Low: 0-1.36
    # 2.) Medium: 1.37-4.99
    # 3.) High: 5.00+
    if merged_data.at[index,'INDFMPIR'] >= 5:
        merged_data.at[index,'PIR'] = 3
    elif merged_data.at[index,'INDFMPIR'] <= 4.99 and merged_data.at[index,'INDFMPIR'] >= 1.37:
        merged_data.at[index,'PIR'] = 2
    else:
        merged_data.at[index,'PIR'] = 1

    # Divide the data by education level
    # 1.) Up to 11th grade: DMDEDUC3 = 0-11 & 66; DMDEDUC2 = 1-2
    # 2.) High School or GED: DMDEDUC3 = 12-14; DMDEDUC2 = 3
    # 3.) Some College or Associate Degree: DMDEDUC3 = 15; DMDEDUC2 = 4
    # 4.) College Graduate or Above: DMDEDUC2 = 5
    if merged_data.at[index,'DMDEDUC3'] <= 11 or merged_data.at[index,'DMDEDUC3'] == 66 or merged_data.at[index,'DMDEDUC2'] <= 2:
        merged_data.at[index,'EDUCATION'] = 1
    elif (merged_data.at[index,'DMDEDUC3'] >= 12 and merged_data.at[index,'DMDEDUC3'] <= 14) or merged_data.at[index,'DMDEDUC2'] == 3:
        merged_data.at[index,'EDUCATION'] = 2
    elif merged_data.at[index,'DMDEDUC3'] == 15 or merged_data.at[index,'DMDEDUC2'] == 4:
        merged_data.at[index,'EDUCATION'] = 3
    else:
        merged_data.at[index,'EDUCATION'] = 4
    
    # Divide the data by race/ethnicity
    # 1.) Mexican American: 1-2
    # 2.) White: 3
    # 3.) African-American: 4
    # 4.) Other Race: 5
    if merged_data.at[index,'RIDRETH1'] == 1 or merged_data.at[index,'RIDRETH1'] == 2:
        merged_data.at[index,'RACE'] = 1
    elif merged_data.at[index,'RIDRETH1'] == 3:
        merged_data.at[index,'RACE'] = 2
    elif merged_data.at[index,'RIDRETH1'] == 4:
        merged_data.at[index,'RACE'] = 3
    else:
        merged_data.at[index,'RACE'] = 4

In [321]:
# pandas count distinct values in column
merged_data = merged_data.rename(columns={'RIAGENDR': 'GENDER'})

print(merged_data['DEPRESSED'].value_counts())
print(merged_data['AGEGRP'].value_counts())
print(merged_data['MARITALSTATUS'].value_counts())
print(merged_data['PIR'].value_counts())
print(merged_data['GENDER'].value_counts())
print(merged_data['EDUCATION'].value_counts())
print(merged_data['RACE'].value_counts())
# print(merged_data['WTMEC2YR'].value_counts())
# print(merged_data['WTINT2YR'].value_counts())

print('\nVariables, n=', merged_data.shape[0])

0.0    4730
1.0     432
Name: DEPRESSED, dtype: int64
1.0    3185
2.0    1977
Name: AGEGRP, dtype: int64
1.0    2474
2.0    1063
3.0     902
4.0     723
Name: MARITALSTATUS, dtype: int64
2.0    2300
1.0    2122
3.0     740
Name: PIR, dtype: int64
2.0    2639
1.0    2523
Name: GENDER, dtype: int64
3.0    1530
4.0    1217
2.0    1215
1.0    1200
Name: EDUCATION, dtype: int64
2.0    1717
1.0    1596
3.0    1105
4.0     744
Name: RACE, dtype: int64

Variables, n= 5162


In [322]:
# Export the merged files to a csv
header = ['SEQN', 'DEPRESSED', 'AGEGRP', 'MARITALSTATUS', 'PIR', 'GENDER', 'RACE', 'EDUCATION', 'WTMEC2YR', 'WTINT2YR']
merged_data.to_csv('./mergeddata.csv', columns=header)

In [323]:
merged_data.shape[0]

5162

In [324]:
# Get the weighted frquencies 
weightRaceHis, weightRaceAA, weightRaceWht, weightRaceOth = (0, 0, 0, 0)
weightGenderM, weightGenderF = (0, 0)
weightAgeGroupYoung, weightAgeGroupOld = (0, 0)
weightEducation11, weightEducationHS, weightEducationCollege, weightEducationGrad = (0, 0, 0, 0)
weightMaritalMarried, weightMaritalWDS, weightMaritalNever, weightMaritalPart = (0, 0, 0, 0)
weightPIRLow, weightPIRMed, weightPIRHigh = (0, 0, 0)
weightDepressed, weightNotDepressed = (0, 0)

for index, row in merged_data.iterrows():

    # Weighted Frequency for Race groups 
    if merged_data.at[index,'RACE'] == 1:
        weightRaceHis += merged_data.at[index,'WTMEC2YR']
    elif merged_data.at[index,'RACE'] == 2:
        weightRaceWht += merged_data.at[index,'WTMEC2YR']
    elif merged_data.at[index,'RACE'] == 3:
        weightRaceAA += merged_data.at[index,'WTMEC2YR']
    else:
        weightRaceOth += merged_data.at[index,'WTMEC2YR']

    # Weighted Frequency for Gender groups 
    if merged_data.at[index,'GENDER'] == 1:
        weightGenderM += merged_data.at[index,'WTMEC2YR']
    else:
        weightGenderF += merged_data.at[index,'WTMEC2YR']

    # Weighted Frequency for Age groups 
    if merged_data.at[index,'AGEGRP'] == 1: 
        weightAgeGroupYoung += merged_data.at[index,'WTMEC2YR']
    else:
        weightAgeGroupOld += merged_data.at[index,'WTMEC2YR']
    
    # Weighted Frequency for Education groups 
    if merged_data.at[index,'EDUCATION'] == 1:
        weightEducation11 += merged_data.at[index,'WTMEC2YR']
    elif merged_data.at[index,'EDUCATION'] == 2:
        weightEducationHS += merged_data.at[index,'WTMEC2YR']
    elif merged_data.at[index,'EDUCATION'] == 3:
        weightEducationCollege += merged_data.at[index,'WTMEC2YR']
    else:
        weightEducationGrad += merged_data.at[index,'WTMEC2YR']

    # Weighted Frequency for Marital Status groups 
    if merged_data.at[index,'MARITALSTATUS'] == 1:
        weightMaritalMarried += merged_data.at[index,'WTMEC2YR']
    elif merged_data.at[index,'MARITALSTATUS'] == 2:
        weightMaritalWDS += merged_data.at[index,'WTMEC2YR']
    elif merged_data.at[index,'MARITALSTATUS'] == 3:
        weightMaritalNever += merged_data.at[index,'WTMEC2YR']
    else:
        weightMaritalPart += merged_data.at[index,'WTMEC2YR']
    
    # Weighted Frequency for PIR groups 
    if merged_data.at[index,'PIR'] == 1:
        weightPIRLow += merged_data.at[index,'WTMEC2YR']
    elif merged_data.at[index,'PIR'] == 2:
        weightPIRMed += merged_data.at[index,'WTMEC2YR']
    else:
        weightPIRHigh += merged_data.at[index,'WTMEC2YR']

    # Weighted Frequency for Depression groups
    if merged_data.at[index,'DEPRESSED'] == 1:
        weightDepressed += merged_data.at[index,'WTMEC2YR']
    else:
        weightNotDepressed += merged_data.at[index,'WTMEC2YR']

# Calculate the Weighted Percent 
weightRaceTotal = sum([weightRaceHis, weightRaceAA, weightRaceWht, weightRaceOth])
weightRaceHisPercent = (weightRaceHis / weightRaceTotal) * 100
weightRaceWhtPercent = (weightRaceWht / weightRaceTotal) * 100
weightRaceAAPercent = (weightRaceAA / weightRaceTotal) * 100
weightRaceOthPercent = (weightRaceOth / weightRaceTotal) * 100

weightGenderTotal = sum([weightGenderM, weightGenderF])
weightGenderMPercent = (weightGenderM / weightGenderTotal) * 100
weightGenderFPercent = (weightGenderF / weightGenderTotal) * 100

weightAgeGroupTotal = sum([weightAgeGroupYoung, weightAgeGroupOld])
weightAgeGroupYoungPercent = (weightAgeGroupYoung / weightAgeGroupTotal) * 100
weightAgeGroupOldPercent = (weightAgeGroupOld / weightAgeGroupTotal) * 100

weightEducationTotal = sum([weightEducation11, weightEducationHS, weightEducationCollege, weightEducationGrad])
weightEducation11Percent = (weightEducation11 / weightEducationTotal) * 100
weightEducationHSPercent = (weightEducationHS / weightEducationTotal) * 100
weightEducationCollegePercent = (weightEducationCollege / weightEducationTotal) * 100
weightEducationGradPercent = (weightEducationGrad / weightEducationTotal) * 100

weightMaritalTotal = sum([weightMaritalMarried, weightMaritalWDS, weightMaritalNever, weightMaritalPart])
weightMaritalMarriedPercent = (weightMaritalMarried / weightMaritalTotal) * 100
weightMaritalWDSPercent = (weightMaritalWDS / weightMaritalTotal) * 100
weightMaritalNeverPercent = (weightMaritalNever / weightMaritalTotal) * 100
weightMaritalPartPercent = (weightMaritalPart / weightMaritalTotal) * 100

weightPIRTotal = sum([weightPIRLow, weightPIRMed, weightPIRHigh])
weightPIRLowPercent = (weightPIRLow / weightPIRTotal) * 100
weightPIRMedPercent = (weightPIRMed / weightPIRTotal) * 100
weightPIRHighPercent = (weightPIRHigh / weightPIRTotal) * 100

weightDepressedTotal = sum([weightDepressed, weightNotDepressed])
weightDepressedPercent = (weightDepressed / weightDepressedTotal) * 100
weightNotDepressedPercent = (weightNotDepressed / weightDepressedTotal) * 100

# Display the results 
print('Race')
print('Hispanic: \t' + str(round(weightRaceHis)) + '\t\t' + str(round(weightRaceHisPercent, 2)))
print('White: \t\t' + str(round(weightRaceWht)) + '\t\t' + str(round(weightRaceWhtPercent, 2)))
print('AA: \t\t' + str(round(weightRaceAA)) + '\t\t' + str(round(weightRaceAAPercent, 2)))
print('Other: \t\t' + str(round(weightRaceOth)) + '\t\t' + str(round(weightRaceOthPercent, 2)))

print('\nGender')
print('Male: \t\t' + str(round(weightGenderM)) + '\t\t' + str(round(weightGenderMPercent, 2)))
print('Female: \t' + str(round(weightGenderF)) + '\t\t' + str(round(weightGenderFPercent, 2)))

print('\nAge')
print('Younger: \t' + str(round(weightAgeGroupYoung)) + '\t\t' + str(round(weightAgeGroupYoungPercent, 2)))
print('Older: \t\t' + str(round(weightAgeGroupOld)) + '\t\t' + str(round(weightAgeGroupOldPercent, 2)))

print('\nEducation')
print('Up to 11th: \t' + str(round(weightEducation11)) + '\t\t' + str(round(weightEducation11Percent, 2)))
print('High School: \t' + str(round(weightEducationHS)) + '\t\t' + str(round(weightEducationHSPercent, 2)))
print('College: \t' + str(round(weightEducationCollege)) + '\t\t' + str(round(weightEducationCollegePercent, 2)))
print('Graduate: \t' + str(round(weightEducationGrad)) + '\t\t' + str(round(weightEducationGradPercent, 2)))

print('\nMarital Status')
print('Married: \t' + str(round(weightMaritalMarried)) + '\t\t' + str(round(weightMaritalMarriedPercent, 2)))
print('Wid/Div/Sep: \t' + str(round(weightMaritalWDS)) + '\t\t' + str(round(weightMaritalWDSPercent, 2)))
print('Partner \t' + str(round(weightMaritalPart)) + '\t\t' + str(round(weightMaritalPartPercent, 2)))

print('\nPIR')
print('Low: \t\t' + str(round(weightPIRLow)) + '\t\t' + str(round(weightPIRLowPercent, 2)))
print('Medium: \t' + str(round(weightPIRMed)) + '\t\t' + str(round(weightPIRMedPercent, 2)))
print('High \t\t' + str(round(weightPIRHigh)) + '\t\t' + str(round(weightPIRHighPercent, 2)))

print('\nDepressed')
print('Not Depressed: \t\t' + str(round(weightNotDepressed)) + '\t\t' + str(round(weightNotDepressedPercent, 2)))
print('Depressed: \t\t' + str(round(weightDepressed)) + '\t\t' + str(round(weightDepressedPercent, 2)))


Race
Hispanic: 	33456024		15.23
White: 		142184815		64.72
AA: 		24689015		11.24
Other: 		19361848		8.81

Gender
Male: 		106900628		48.66
Female: 	112791074		51.34

Age
Younger: 	142978450		65.08
Older: 		76713253		34.92

Education
Up to 11th: 	31205208		14.2
High School: 	47762926		21.74
College: 	71924626		32.74
Graduate: 	68798943		31.32

Marital Status
Married: 	116726249		53.13
Wid/Div/Sep: 	38720030		17.62
Partner 	26516928		12.07

PIR
Low: 		61990645		28.22
Medium: 	103519218		47.12
High 		54181840		24.66

Depressed
Not Depressed: 		203059281		92.43
Depressed: 		16632422		7.57
