In [2]:
import pandas as pd
import numpy as np
from HEI_2015_Scoring import hei2015

In [3]:
# Load and clean the Japan FFQ export.
# sample_ID is read as text so the IDs are never parsed as numbers.
japan_data = (
    pd.read_excel(
        '/projects/thdmi/metadata/nutrient_ffq_data/ffq_japan_output.xlsx',
        dtype={'sample_ID': str},
    )
    .set_index('sample_ID')
)
# Prefix every sample ID with '10317.' (presumably to match the metadata index format -- confirm).
japan_data.index = ['10317.' + sample_id for sample_id in japan_data.index]
# Remove stray leading/trailing whitespace from the column headers.
japan_data.columns = [header.strip() for header in japan_data.columns]

# Load the sample metadata (tab-separated, first column used as the index).
# NOTE(review): the index dtype is inferred by read_csv; if every ID looked numeric it would
# be parsed as a number and the intersection below would come back empty -- confirm.
md = pd.read_csv(
    '/home/lakhatib/thdmi/pangenome_filtered/data/thdmi_metadata_valid_covariates.tsv',
    sep='\t',
    index_col=0,
    low_memory=False,
)

# Keep only the FFQ rows whose sample ID also appears in the metadata.
japan_data = japan_data.loc[japan_data.index.intersection(md.index), :]

In [4]:
# Load the MPED conversion table.
# sheet_name=1 selects the second worksheet (integer sheet positions are zero-based).
MPED_conversions = pd.read_excel(
    '/home/lakhatib/thdmi/pangenome_filtered/data/Japan HEI-2015 equivalents.xlsx',
    sheet_name=1,
)
MPED_conversions.columns = [header.strip() for header in MPED_conversions.columns]

# Index the table by the 'Shukei_o_AggregatedResult_Original_English' column and keep only
# the columns whose label contains 'MPED_'.
mped_mapping = (
    MPED_conversions
    .set_index('Shukei_o_AggregatedResult_Original_English')
    .filter(like='MPED_')
)
# Cast the row labels to str, then strip surrounding whitespace.
mped_mapping.index = [label.strip() for label in mped_mapping.index.astype(str)]

In [5]:
# Rescale the mapping from per-100-gram values to per-1-gram values.
# NOTE: mped_mapping is reassigned from itself, so re-running this cell without
# re-running the load cell divides by 100 a second time.
mped_mapping = mped_mapping.div(100)

In [6]:
# Align the MPED mapping with the FFQ data: keep only the foods that appear both as an
# FFQ column and as a row label of the mapping. Foods found in only one table are dropped.
common_foods = japan_data.columns.intersection(mped_mapping.index)

# Fail loudly when no food names match. Otherwise the later matrix product would run over
# zero foods and silently give every participant a value of zero for every component.
if common_foods.empty:
    raise ValueError(
        'No food names are shared between the FFQ columns and the MPED mapping index; '
        'check for spelling or whitespace differences between the two tables.'
    )

japan_data_filtered = japan_data[common_foods]
mped_mapping_filtered = mped_mapping.loc[common_foods]

In [7]:
# Keep only the first mapping row for each food label; later rows that repeat a label are dropped.
mped_mapping_filtered = mped_mapping_filtered.loc[
    ~mped_mapping_filtered.index.duplicated(keep='first')
]

In [8]:
# Ensure numeric values: any entry that cannot be parsed as a number becomes NaN.
japan_data_filtered = japan_data_filtered.apply(pd.to_numeric, errors='coerce')
mped_mapping_filtered = mped_mapping_filtered.apply(pd.to_numeric, errors='coerce')

# The product below runs on bare arrays, so intake columns are paired with mapping rows
# purely by position. Verify that the food labels line up (same foods, same order) before
# the labels are discarded; this also catches stale state from cells run out of order.
if list(japan_data_filtered.columns) != list(mped_mapping_filtered.index):
    raise ValueError(
        'Food labels are misaligned: the columns of japan_data_filtered must equal the '
        'index of mped_mapping_filtered, in the same order.'
    )

# Convert dataframes to NumPy arrays
mped_mapping_array = mped_mapping_filtered.values  # Shape: (n_foods, n_components)
japan_data_array = japan_data_filtered.values  # Shape: (n_participants, n_foods)

# Matrix product -> shape (n_participants, n_components).
# A NaN produced by the coercion above propagates into every score that is computed from
# the affected intake row or mapping column.
mped_scores = np.dot(japan_data_array, mped_mapping_array)

# Re-attach labels: participants as rows, MPED components as columns.
mped_df = pd.DataFrame(
    mped_scores,
    index=japan_data_filtered.index,
    columns=mped_mapping_filtered.columns,
)

In [9]:
# 'monopoly' = monounsaturated + polyunsaturated fatty acids.
# Both inputs come straight from japan_data; no numeric coercion is applied to them here.
mped_df['monopoly'] = japan_data['monounsaturated fatty acid'].add(
    japan_data['polyunsaturated fatty acids']
)

In [10]:
# Add legumes to the total-vegetable and dark-green-vegetable amounts.
mped_df['VTOTALLEG'] = mped_df['MPED_V_TOTAL'].add(mped_df['MPED_V_LEGUMES'])
mped_df['VDRKGRLEG'] = mped_df['MPED_V_DRKGR'].add(mped_df['MPED_V_LEGUMES'])

In [11]:
# Express legumes as protein-food ounce equivalents: cup equivalents times 4.
mped_df['protlegumes'] = mped_df['MPED_V_LEGUMES'].mul(4)

In [12]:
# Total protein foods: the MPED protein total plus the legume ounce equivalents.
mped_df['PFALLPROTLEG'] = mped_df['MPED_PF_TOTAL'].add(mped_df['protlegumes'])

# Seafood and plant proteins: both seafood columns, soy, nuts/seeds, and legumes.
mped_df['PFSEAPLANTLEG'] = (
    mped_df['MPED_PF_SEAFD_HI']
    .add(mped_df['MPED_PF_SEAFD_LOW'])
    .add(mped_df['MPED_PF_SOY'])
    .add(mped_df['MPED_PF_NUTSDS'])
    .add(mped_df['protlegumes'])
)

In [13]:
# Calculate HEI-2015 scores. Every argument is positional; the expected order is defined by
# hei2015 in HEI_2015_Scoring (not visible here) -- confirm the order against that module.
# NOTE(review): the slot after total fruit receives only MPED_F_CITMLB; if that slot is meant
# to be whole fruit, confirm whether other whole-fruit columns should be added to it.
# NOTE(review): energy, saturated fat and sodium are taken from japan_data as loaded; confirm
# their units match what hei2015 expects.
hei_mped_df = hei2015(
    mped_df,
    japan_data['energy'],
    mped_df['VTOTALLEG'],        # total vegetables + legumes
    mped_df['VDRKGRLEG'],        # dark-green vegetables + legumes
    mped_df['MPED_F_TOTAL'],
    mped_df['MPED_F_CITMLB'],
    mped_df['MPED_G_WHOLE'],
    mped_df['MPED_D_TOTAL'],
    mped_df['PFALLPROTLEG'],     # protein total + legumes
    mped_df['PFSEAPLANTLEG'],    # seafood + soy + nuts/seeds + legumes
    mped_df['monopoly'],         # mono- + polyunsaturated fatty acids
    japan_data['saturated fatty acid'],
    japan_data['sodium'],
    mped_df['MPED_G_REFINED'],
    mped_df['MPED_ADD_SUGARS'],
)

In [14]:
# Keep only the columns whose label contains 'HEI' (component scores and the total score).
hei_df = hei_mped_df.filter(like='HEI', axis='columns')

In [15]:
# Summary statistics for every HEI column; as the cell's last expression it is displayed.
hei_df.describe()

Unnamed: 0,HEI2015C1_TOTALVEG,HEI2015C2_GREEN_AND_BEAN,HEI2015C3_TOTALFRUIT,HEI2015C4_WHOLEFRUIT,HEI2015C5_WHOLEGRAIN,HEI2015C6_TOTALDAIRY,HEI2015C7_TOTPROT,HEI2015C8_SEAPLANT_PROT,HEI2015C9_FATTYACID,HEI2015C10_SODIUM,HEI2015C11_REFINEDGRAIN,HEI2015C12_SFAT,HEI2015C13_ADDSUG,HEI2015_TOTAL_SCORE
count,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0
mean,3.767072,2.879015,2.13335,1.572987,3.976767,6.309898,4.543474,4.667401,5.743596,0.025268,5.120105,8.975055,8.80574,58.519729
std,1.296387,1.597415,1.590841,1.468224,2.974413,2.966478,0.893032,0.904506,1.425447,0.190872,3.624075,1.402782,2.240697,8.711086
min,0.214797,0.0,0.0,0.0,0.0,0.085066,0.14155,0.004486,0.0,0.0,0.0,0.0,0.0,28.637364
25%,2.712952,1.522544,0.752226,0.44846,1.608809,3.817187,4.522483,5.0,5.286641,0.0,1.728276,8.743421,8.559667,52.708513
50%,4.171612,2.727829,1.834801,1.086287,3.398779,6.222899,5.0,5.0,6.098584,0.0,5.095222,9.399947,10.0,58.362973
75%,5.0,4.629555,3.253765,2.380857,5.63386,10.0,5.0,5.0,6.599613,0.0,8.846715,9.81396,10.0,63.963941
max,5.0,5.0,5.0,5.0,10.0,10.0,5.0,5.0,8.504479,2.339658,10.0,10.0,10.0,82.945471


In [16]:
# Write the HEI scores to CSV; the sample-ID index is written as the first column.
hei_df.to_csv(
    path_or_buf='/home/lakhatib/thdmi/pangenome_filtered/data/japan_HEI_df.csv',
)