In [None]:
import os
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from google.colab import drive
# Make Google Drive visible to the Colab runtime before touching any path below.
drive.mount('/content/drive')

# Photo directory, and the spreadsheet that lives inside it.
image_folder = "/content/drive/MyDrive/BMI Photos/"
excel_path = "/content/drive/MyDrive/BMI Photos/SMA-Harvard-Capstone_Data Extract.xlsx"

# Load the spreadsheet, then cast every header to str and strip surrounding
# whitespace so the name-based column lookups below match.
df = pd.read_excel(excel_path)
df.columns = list(map(lambda header: str(header).strip(), df.columns))

# Columns consumed when the sample list is built.
id_col, week_col = "Participant ID", "Timepoint"
# The BMI column is selected by position (zero-based index 5), not by name.
bmi_col = df.columns[5]

def _parse_bmi(value):
    """Convert one spreadsheet cell to a BMI float.

    Returns None when the cell is missing or cannot be read as a number
    (e.g. "N/A" or a blank string), so one bad cell skips its row instead of
    aborting the whole extraction.
    """
    if pd.isna(value):
        return None
    number = pd.to_numeric(value, errors="coerce")
    return float(number) if pd.notna(number) else None


def _find_image(folder, prefix, idx):
    """Return the path of image number ``idx`` for ``prefix``, or None.

    Looks for "<prefix> (<idx>).jpg" first and falls back to
    "<prefix> (<idx>).png"; None means neither file exists.
    """
    for ext in (".jpg", ".png"):
        candidate = os.path.join(folder, f"{prefix} ({idx}){ext}")
        if os.path.exists(candidate):
            return candidate
    return None


samples = []
for _, row in df.iterrows():
    # NOTE(review): if pandas reads the ID / timepoint columns as floats, str()
    # yields e.g. "101.0" and no file name will match — confirm the dtypes.
    subject = str(row[id_col]).strip()
    week = str(row[week_col]).strip()
    bmi = _parse_bmi(row[bmi_col])
    img_prefix = f"{subject}_{week}"

    # Collect whichever of the three numbered images (1..3) exist on disk.
    found = (_find_image(image_folder, img_prefix, idx) for idx in range(1, 4))
    image_paths = [path for path in found if path is not None]

    # Only retain samples with exactly 3 images and a valid BMI
    if len(image_paths) == 3 and bmi is not None:
        samples.append({
            "subject": subject,
            "week": week,
            "bmi": bmi,
            "img_paths": image_paths
        })

# Split on 'subject' rather than on individual samples so that all samples of
# one person remain in the same split.
unique_subjects = sorted({s["subject"] for s in samples})
if len(unique_subjects) < 2:
    # train_test_split would fail anyway; say why in terms of this notebook.
    raise ValueError(
        f"Need at least 2 subjects with 3 images and a valid BMI to split, "
        f"found {len(unique_subjects)}; check the image folder and file naming."
    )
train_ids, val_ids = train_test_split(unique_subjects, test_size=0.2, random_state=42)

# Set lookup keeps the labelling pass linear in the number of samples.
train_id_set = set(train_ids)
for s in samples:
    # Anything not drawn into the training subjects is labelled as validation.
    s["split"] = "Training" if s["subject"] in train_id_set else "Validation"

# Persist the annotated sample list to Drive as one JSON document.
output_json = "/content/drive/MyDrive/sample_list.json"
serialized = json.dumps(samples)
with open(output_json, "w") as out_file:
    out_file.write(serialized)

# Preview only: show the first five records as a table. This has no effect on
# the JSON file written above.
pd.DataFrame(samples[:5])


In [None]:
import os
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive

drive.mount('/content/drive')

# Load BMI data from the JSON sample list
with open("/content/drive/MyDrive/sample_list.json", "r") as f:
    samples = json.load(f)
bmi_json = [s["bmi"] for s in samples]
df_json = pd.DataFrame({"BMI": bmi_json, "Source": "JSON"})

# Load BMI data from the original Excel file. The column is taken by position:
# zero-based index 5, i.e. the sixth column of the sheet.
excel_path = "/content/drive/MyDrive/BMI Photos/SMA-Harvard-Capstone_Data Extract.xlsx"
df_excel_raw = pd.read_excel(excel_path)
# Coerce first so non-numeric cells become NaN and are dropped together with
# the empty ones, instead of raising during the float conversion.
bmi_excel = (
    pd.to_numeric(df_excel_raw.iloc[:, 5], errors="coerce")
    .dropna()
    .astype(float)
    .tolist()
)
df_excel = pd.DataFrame({"BMI": bmi_excel, "Source": "Excel"})

# Stack both sources into one long frame; the "Source" column tells rows apart.
df_all = pd.concat([df_json, df_excel], ignore_index=True)

# Summary statistics for each source, including the 90th and 95th percentiles.
for heading, frame in (("JSON BMI Stats:", df_json), ("\nExcel BMI Stats:", df_excel)):
    print(heading)
    print(frame["BMI"].describe(percentiles=[.25, .5, .75, .9, .95]))

# Overlay the two BMI histograms (with KDE curves) on a single set of axes.
# common_norm=False scales each source's density curve on its own.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=df_all,
    x="BMI",
    hue="Source",
    bins=30,
    kde=True,
    element="step",
    common_norm=False,
    ax=ax,
)
ax.set_title("BMI Distribution: JSON vs Excel")
ax.set_xlabel("BMI")
ax.set_ylabel("Count")
ax.grid(True)
fig.tight_layout()
plt.show()
