# Creating Master Dataframe for Modeling

In [1]:
import pandas as pd
import re

## Load Data

In [4]:
# Read the two raw inputs: the patient survey table and the Dexcom CGM export.
# Paths are relative to the notebook's working directory.
patient_df, dexcom_cgm_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/patient.csv', './data/dexcom_cgm.csv')
)

# Report (rows, columns) for each frame so the two sizes can be compared
# before deciding how to combine them.
for label, frame in (("Patient Data:", patient_df), ("Dexcom CGM Data:", dexcom_cgm_df)):
    print(label, frame.shape)

Patient Data: (33568, 22)
Dexcom CGM Data: (17118, 2)


## Merging Patient Data

In [176]:
# Give the patient key columns a uniform integer dtype.
# Note: astype(int) raises if a column holds missing or non-numeric values.
for column in ('Age', 'Sex'):
    patient_df[column] = patient_df[column].astype(int)

In [177]:
# A notebook cell only renders its final expression, so the column names are
# printed explicitly; the shape stays last so it is still the cell's output.
print(patient_df.columns.tolist())
patient_df.shape

(33568, 22)

In [178]:
# A notebook cell only renders its final expression, so the column names are
# printed explicitly; the shape stays last so it is still the cell's output.
print(dexcom_cgm_df.columns.tolist())
dexcom_cgm_df.shape

(17118, 2)

In [179]:
# Build a *simulated* table with one row per Dexcom CGM reading.
# Patient rows are cycled (repeated end-to-end) until there are enough of them,
# then paired with glucose readings purely by row position -- there is no shared
# key between the two sources, so the pairing carries no clinical meaning.
n_readings = len(dexcom_cgm_df)
n_patients = len(patient_df)

# Without this guard an empty patient table surfaces as a bare ZeroDivisionError.
if n_patients == 0:
    raise ValueError("patient_df is empty; cannot pair patient rows with glucose readings")

# Ceiling division gives the exact number of copies needed to cover every reading;
# max(1, ...) keeps pd.concat from receiving an empty list when there are no readings.
num_repeats = max(1, -(-n_readings // n_patients))
expanded_patient_df = pd.concat([patient_df] * num_repeats, ignore_index=True)
expanded_patient_df = expanded_patient_df.iloc[:n_readings]  # Truncate to match glucose data length

# Combine patient data with Dexcom glucose data side by side (positional alignment).
# Both indexes are reset so axis=1 concat lines rows up 0..n-1 instead of by label.
simulated_df = pd.concat(
    [expanded_patient_df.reset_index(drop=True), dexcom_cgm_df.reset_index(drop=True)],
    axis=1,
)

# Expose the reading timestamp as 'Time Checked' (appended as the last column),
# then drop the original 'displayTime' column from the exported frame.
simulated_df['Time Checked'] = simulated_df['displayTime']
patient = simulated_df.drop(columns=['displayTime'])

# Invariant: exactly one output row per glucose reading.
assert len(patient) == n_readings, (
    f"Expected {n_readings} combined records, got {len(patient)}"
)

# Display the simulated dataframe
patient.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Glucose Value,Time Checked
0,2.0,1,0.0,1,28.0,0.0,0.0,1.0,0,1,...,2.0,0.0,0.0,0.0,0,11,4.0,3.0,117,2024-11-05T14:18:32
1,2.0,1,1.0,1,33.0,0.0,0.0,0.0,1,1,...,2.0,10.0,0.0,0.0,0,9,4.0,7.0,115,2024-11-05T14:13:32
2,2.0,0,1.0,1,29.0,0.0,1.0,1.0,1,1,...,5.0,0.0,30.0,1.0,1,12,3.0,4.0,114,2024-11-05T14:08:32
3,2.0,0,0.0,1,24.0,0.0,0.0,1.0,0,0,...,4.0,0.0,0.0,0.0,1,12,6.0,7.0,115,2024-11-05T14:03:33
4,2.0,0,0.0,1,33.0,1.0,0.0,0.0,1,0,...,4.0,0.0,0.0,0.0,1,6,5.0,2.0,115,2024-11-05T13:58:32


In [180]:
# Validate record count: the combined frame must hold exactly one row per
# Dexcom CGM reading. Fail loudly here rather than exporting a short/long file.
assert len(patient) == len(dexcom_cgm_df), (
    f"Expected {len(dexcom_cgm_df)} records, got {len(patient)}"
)
patient.shape

(17118, 24)

In [181]:
# Persist the combined patient/CGM frame as CSV. index=False leaves the
# DataFrame index out of the written file.
output_path = './data/patient_records.csv'
patient.to_csv(path_or_buf=output_path, index=False)

# Confirm where the file was written.
print(f"Final combined data exported to: {output_path}")

Final combined data exported to: ./data/patient_records.csv
