In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Function to calculate age given two date strings
def calculate_age(birthdate_str, measure_date_str):
    """Return the age in completed years on a given date.

    Parameters
    ----------
    birthdate_str : str
        Date of birth formatted as ``YYYY-MM-DD``.
    measure_date_str : str
        Date on which the age is evaluated, formatted as ``YYYY-MM-DD``.

    Returns
    -------
    int
        Difference in calendar years, reduced by one when the birthday has
        not yet been reached in the year of ``measure_date_str``.

    Raises
    ------
    ValueError
        If either string does not match the ``%Y-%m-%d`` format.
    """
    date_format = "%Y-%m-%d"
    born = datetime.strptime(birthdate_str, date_format)
    measured = datetime.strptime(measure_date_str, date_format)

    year_gap = measured.year - born.year
    # Comparing (month, day) tuples tells us whether the birthday is still
    # ahead in the measurement year; if so, the last year is not complete.
    birthday_pending = (measured.month, measured.day) < (born.month, born.day)
    if birthday_pending:
        return year_gap - 1
    return year_gap

In [None]:
measure = pd.read_csv('pheno_raw/measurements.tsv', sep='\t')

In [None]:
demo = pd.read_csv('Demographics.tsv', sep='\t')

In [None]:
survey = pd.read_csv('Survey.tsv', sep='\t')

### Height

In [None]:
height = measure[measure['standard_concept_name']== 'Body height']

In [None]:
height = height.merge(demo, on='person_id')

#### Calculate age at measurement

In [None]:
# Keep the text before the first space of each timestamp string; calculate_age
# expects that part to be a plain YYYY-MM-DD date.
height['date_of_birth'] = height['date_of_birth'].str.split(' ').str[0]
height['measurement_datetime'] = height['measurement_datetime'].str.split(' ').str[0]
# Age in completed years at the time of each measurement, computed row by row.
height['age_at_measure'] = height['date_of_birth'].combine(height['measurement_datetime'], calculate_age)

In [None]:
height = height[(height['age_at_measure'] >= 18) & (height['age_at_measure'] <= 50)]

In [None]:
### check sample size change for 20 or 18

In [None]:
height = height.dropna(subset=['value_as_number'])

#### Take mean height if multiple records

In [None]:
height = height[(height['value_as_number'] >= 140) & (height['value_as_number'] <= 210)]

In [None]:
height_value = height.groupby("person_id", as_index=False)['value_as_number'].mean()

In [None]:
### We keep age as latest measured
height = height.loc[height.groupby('person_id').apply(lambda x: x['age_at_measure'].idxmax())][['person_id', 'standard_concept_name', 'gender', 'race', 'ethnicity', 'sex_at_birth', 'age_at_measure']]

In [None]:
height = height.merge(height_value, on='person_id')

In [None]:
height.to_csv('pheno_cleaned/Height.tsv', sep='\t', index=False, header=True)

#### Distributions

In [None]:
height = pd.read_csv('pheno_cleaned/Height.tsv', sep='\t')

In [None]:
height = height[height['sex_at_birth'].isin(['Female', 'Male'])]
height = height[height['race'].isin(['White', 'Black or African American'])]
height = height[(height['age_at_measure'] >= 20) & (height['age_at_measure'] <= 50)]

In [None]:
height.value_as_number.max()

In [None]:
height.value_as_number.min()

In [None]:
pd.crosstab(height['sex_at_birth'], height['race'])

### BMI

In [None]:
### ask if drug taking affects BMI, standard correction available?
bmi = measure[measure['standard_concept_name'] == 'Body mass index (BMI) [Ratio]']

In [None]:
bmi = bmi.merge(demo, on='person_id')
bmi['date_of_birth'] = bmi['date_of_birth'].str.split(' ').apply(lambda x: x[0])
bmi['measurement_datetime'] = bmi['measurement_datetime'].str.split(' ').apply(lambda x: x[0])
bmi['age_at_measure'] = bmi['date_of_birth'].combine(bmi['measurement_datetime'], calculate_age)
bmi = bmi[(bmi['age_at_measure'] >= 18) & (bmi['age_at_measure'] <= 50)]
bmi = bmi[(bmi['value_as_number'] >= 15) & (bmi['value_as_number'] <= 60)]

In [None]:
bmi = bmi.dropna(subset=['value_as_number'])

In [None]:
bmi_value = bmi.groupby("person_id", as_index=False)['value_as_number'].mean()

In [None]:
bmi = bmi.loc[bmi.groupby('person_id').apply(lambda x: x['age_at_measure'].idxmax())][['person_id', 'standard_concept_name', 'gender', 'race', 'ethnicity', 'sex_at_birth', 'age_at_measure']]

In [None]:
bmi = bmi.merge(bmi_value, on='person_id')

In [None]:
bmi.to_csv('pheno_cleaned/BMI.tsv', sep='\t', index=False, header=True)

In [None]:
bmi = bmi[bmi['sex_at_birth'].isin(['Female', 'Male'])]
bmi = bmi[bmi['race'].isin(['White', 'Black or African American'])]

In [None]:
pd.crosstab(bmi['sex_at_birth'], bmi['race'])

### Waist Circumference, mean of closest 2 measures

In [None]:
wc = measure[measure['standard_concept_name'].str.contains('waist')]

In [None]:
wc = wc.merge(demo, on='person_id')
wc['date_of_birth'] = wc['date_of_birth'].str.split(' ').apply(lambda x: x[0])
wc['measurement_datetime'] = wc['measurement_datetime'].str.split(' ').apply(lambda x: x[0])
wc['age_at_measure'] = wc['date_of_birth'].combine(wc['measurement_datetime'], calculate_age)
wc = wc[(wc['age_at_measure'] >= 18) & (wc['age_at_measure'] <= 50)]
wc = wc.dropna(subset=['value_as_number'])
wc = wc[(wc['value_as_number'] >= 50) & (wc['value_as_number'] <= 160)]

In [None]:
wc_value = wc.groupby("person_id", as_index=False)['value_as_number'].mean()

In [None]:
wc = wc.loc[wc.groupby('person_id').apply(lambda x: x['age_at_measure'].idxmax())][['person_id', 'standard_concept_name', 'gender', 'race', 'ethnicity', 'sex_at_birth', 'age_at_measure']]

In [None]:
wc = wc.merge(wc_value, on='person_id')

In [None]:
wc.to_csv('pheno_cleaned/WC.tsv', sep='\t', index=False, header=True)

In [None]:
wc = pd.read_csv('pheno_cleaned/WC.tsv', sep='\t')
wc = wc[wc['sex_at_birth'].isin(['Female', 'Male'])]
wc = wc[wc['race'].isin(['White', 'Black or African American'])]

In [None]:
pd.crosstab(wc['sex_at_birth'], wc['race'])

### Educational Attainment

In [None]:
# Survey responses to the highest-grade education question, joined with the
# demographic columns (default inner join on person_id).
edu = survey.loc[survey['question'].eq('Education Level: Highest Grade')]
edu = pd.merge(edu, demo, on='person_id')
# Keep the text before the first space of each timestamp string; calculate_age
# expects that part to be a plain YYYY-MM-DD date.
edu['date_of_birth'] = edu['date_of_birth'].str.split(' ').str[0]
edu['survey_datetime'] = edu['survey_datetime'].str.split(' ').str[0]
# Age in completed years when the survey was answered.
edu['age_at_measure'] = edu['date_of_birth'].combine(edu['survey_datetime'], calculate_age)

In [None]:
edu = edu.loc[edu.groupby('person_id').apply(lambda x: x['age_at_measure'].idxmax())][['person_id', 'question', 'gender', 'race', 'ethnicity', 'sex_at_birth', 'age_at_measure', 'answer']]

In [None]:
edu = edu[~edu['answer'].str.contains('PMI')]
edu = edu[edu['age_at_measure'] >= 30]

In [None]:
education_years_dict = {
    'Highest Grade: Never Attended': 0,       # No formal education
    'Highest Grade: One Through Four': 4,     # Up to 4th grade
    'Highest Grade: Five Through Eight': 8,   # Up to 8th grade (middle school)
    'Highest Grade: Nine Through Eleven': 11, # Up to 11th grade (high school incomplete)
    'Highest Grade: Twelve Or GED': 12,       # High school graduate or GED
    'Highest Grade: College One to Three': 14, # Some college (no degree)
    'Highest Grade: College Graduate': 16,    # Bachelor's degree
    'Highest Grade: Advanced Degree': 20      # Graduate degree (Master's, PhD)
}

In [None]:
edu['value_as_number'] = edu['answer'].apply(lambda x: education_years_dict.get(x))

In [None]:
edu.to_csv('pheno_cleaned/EA.tsv', sep='\t', index=False, header=True)

In [None]:
edu = edu[edu['sex_at_birth'].isin(['Female', 'Male'])]
edu = edu[edu['race'].isin(['White', 'Black or African American'])]

In [None]:
pd.crosstab(edu['sex_at_birth'], edu['race'])