<a href="https://colab.research.google.com/github/mikad98/mastersthesis_ibd_wearable/blob/main/Data_Statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports


In [None]:
# --- Standard library ---
import ast
import warnings
from ast import literal_eval
from datetime import date

# --- Third-party ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# --- Colab runtime ---
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all start with that prefix.
drive.mount('/content/drive')

# Suppress every warning raised from this point on (no category filter given).
warnings.filterwarnings('ignore')

# Passing None removes pandas' display limit, so frames are shown in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

Mounted at /content/drive


### Functions

In [None]:
def _print_overview(df: pd.DataFrame, title: str) -> None:
    """Print the report banner, the row count and the number of distinct users."""
    print(f"========== {title} ==========")
    print(f"\nTotal datapoints: {len(df)}")
    print(f" - Unique users: {df['user_id'].nunique()}")


def _print_user_level_shares(df: pd.DataFrame, column: str, label: str) -> None:
    """Print the share of users per value of ``column``.

    Every user counts once, represented by the first non-null value found in
    their rows, so users with many rows do not dominate the percentages.
    Users without any value are left out. Nothing is printed when the column
    is missing from ``df``.
    """
    if column not in df.columns:
        return

    per_user = df.groupby("user_id")[column].first().dropna()
    counts = per_user.value_counts()
    # Same arithmetic as value_counts(normalize=True) * 100, computed once.
    shares = counts / counts.sum() * 100
    parts = [
        f"{value}: {share:.2f}% ({count})"
        for value, count, share in zip(counts.index, counts, shares)
    ]
    print(f" - {label}: " + ", ".join(parts))


def _print_age_summary(df: pd.DataFrame) -> None:
    """Print mean and standard deviation of ``age``, one value per user."""
    if "age" not in df.columns:
        return

    age_per_user = df.groupby("user_id")["age"].first().dropna()
    print(f" - Average age: {age_per_user.mean():.2f} ± {age_per_user.std():.2f} years")


def _print_demographics(df: pd.DataFrame) -> None:
    """Print the per-user gender, diagnosis and age lines (each optional)."""
    _print_user_level_shares(df, "gender", "Gender")
    _print_user_level_shares(df, "diagnosis", "Diagnosis")
    _print_age_summary(df)


def _print_entries_per_user(df: pd.DataFrame) -> None:
    """Print mean, median, minimum and maximum number of rows per user."""
    entries = df.groupby("user_id").size()
    print(
        f" - Entries per user: avg = {entries.mean():.2f}, "
        f"med = {entries.median():.2f}, "
        f"min = {entries.min()}, max = {entries.max()}"
    )


def _print_completeness(df: pd.DataFrame) -> None:
    """Print the non-null percentage and count of every column.

    Columns sharing the same rounded percentage and count are listed on one
    line; lines are ordered from most to least complete.
    """
    print("\nData completeness:")
    present = df.notna()
    pct_by_column = present.mean() * 100
    count_by_column = present.sum()

    # label -> (rounded percentage used for sorting, column names on that line)
    groups = {}
    # Iterate positionally so duplicate or non-string labels need no lookup.
    for column, pct, count in zip(df.columns, pct_by_column, count_by_column):
        rounded = round(pct, 2)
        label = f"{rounded:.2f}% ({count})"
        # str() keeps the join below working for integer/tuple column labels.
        groups.setdefault(label, (rounded, []))[1].append(str(column))

    # sorted() is stable, so lines with equal percentages keep column order.
    ordered = sorted(groups.items(), key=lambda item: item[1][0], reverse=True)
    for label, (_, columns) in ordered:
        print(f" - {label}: {', '.join(columns)}")


def _print_value_distribution(df: pd.DataFrame, column: str, heading: str, *,
                              sort_by_value: bool = False, total=None) -> None:
    """Print count and percentage of each distinct non-null value of ``column``.

    Args:
        df: Frame to summarise.
        column: Column to tabulate; nothing is printed when it is missing.
        heading: Line printed before the distribution.
        sort_by_value: Order lines by the value itself instead of by
            descending frequency.
        total: Denominator for the percentages. Defaults to the number of
            non-null values in the column.
    """
    if column not in df.columns:
        return

    print(heading)
    counts = df[column].value_counts()
    if sort_by_value:
        counts = counts.sort_index()
    denominator = counts.sum() if total is None else total

    for value, count in counts.items():
        pct = count / denominator * 100
        if pct > 0:
            print(f" - {value}: {pct:.2f}% ({count})")
        else:
            # Only reachable for zero-count entries (e.g. unused categories).
            print(f" - {value}: {pct:.2f}%")


def subjective_statistics(df: pd.DataFrame) -> None:
    """Print a text summary of the subjective dataset.

    Requires a ``user_id`` column. The ``gender``, ``diagnosis``, ``age``,
    ``symptom_deg`` and ``rate_as_flare`` sections are skipped when the
    corresponding column is absent. Returns nothing; the report goes to stdout.
    """
    _print_overview(df, "SUBJECTIVE DATA STATISTICS")
    _print_demographics(df)
    _print_entries_per_user(df)
    _print_completeness(df)
    _print_value_distribution(df, "symptom_deg", "\nDistribution of 'symptom_deg':",
                              sort_by_value=True)
    _print_value_distribution(df, "rate_as_flare", "\nDistribution of 'rate_as_flare':")


def objective_statistics(df: pd.DataFrame) -> None:
    """Print a text summary of the objective dataset.

    Requires a ``user_id`` column. The provider section is skipped when there
    is no ``provider`` column. Returns nothing; the report goes to stdout.
    """
    _print_overview(df, "OBJECTIVE DATA STATISTICS")
    _print_entries_per_user(df)
    _print_completeness(df)
    # Provider shares are relative to all rows, including rows without a provider.
    _print_value_distribution(df, "provider", "\nWearable Provider:", total=len(df))


def merged_statistics(df: pd.DataFrame) -> None:
    """Print a text summary of the merged dataset.

    Same sections as ``subjective_statistics``; only the banner and the
    heading of the ``symptom_deg`` section differ. Requires a ``user_id``
    column. Returns nothing; the report goes to stdout.
    """
    _print_overview(df, "MERGED DATA STATISTICS")
    _print_demographics(df)
    _print_entries_per_user(df)
    _print_completeness(df)
    _print_value_distribution(df, "symptom_deg",
                              "\nSymptom severity distribution (symptom_deg):",
                              sort_by_value=True)
    _print_value_distribution(df, "rate_as_flare", "\nDistribution of 'rate_as_flare':")

### Load Data

In [None]:
# Read the three CSV files from the mounted Drive folder into DataFrames.
# Each name is later passed to the matching *_statistics() function.
subjective = pd.read_csv('/content/drive/My Drive/coreway_ml/Thesis - Mika/subjective.csv')
objective = pd.read_csv('/content/drive/My Drive/coreway_ml/Thesis - Mika/objective.csv')
merged = pd.read_csv('/content/drive/My Drive/coreway_ml/Thesis - Mika/merged.csv')

In [None]:
# Show the first rows of the subjective frame (DataFrame.head defaults to 5 rows).
subjective.head()

Unnamed: 0,user_id,date,gender,age,diagnosis,alcohol_last_24h,on_period,hrv,provider,sleep,stress,activity_dur,activity_deg,symptom_deg,rate_as_flare,yearly_flares
0,004fa825-9b87-4da1-991c-44e9d621df83,2025-06-28,F,32,UC,1.0,0.0,,,1,1,3.0,0.0,0,No,
1,004fa825-9b87-4da1-991c-44e9d621df83,2025-06-30,F,32,UC,1.0,0.0,,,2,2,4.0,0.0,1,Unsure,
2,004fa825-9b87-4da1-991c-44e9d621df83,2025-07-01,F,32,UC,0.0,0.0,,,3,2,4.0,0.0,1,Unsure,
3,006c57da-1344-4d4f-8def-69b4ab9504ad,2025-09-07,F,34,CD,0.0,1.0,,,2,3,4.0,0.0,2,Unsure,
4,00s2nDOMhaYihE7WFZ3Q9yMBYFN2,2024-12-19,M,32,UC,0.0,0.0,50.0,APPLE,1,3,,2.0,3,No,4.0


### Subjective Statistics

In [None]:
# Print the text report for the `subjective` frame loaded above.
subjective_statistics(subjective)


Total datapoints: 21966
 - Unique users: 744
 - Gender: F: 77.02% (573), M: 22.98% (171)
 - Diagnosis: UC: 55.24% (411), CD: 44.76% (333)
 - Average age: 30.64 ± 9.13 years
 - Entries per user: avg = 29.52, med = 5.00, min = 1, max = 461

Data completeness:
 - 100.00% (21966): user_id, date, gender, age, diagnosis, sleep, stress, symptom_deg
 - 99.95% (21956): alcohol_last_24h
 - 94.10% (20670): activity_deg
 - 68.75% (15102): provider
 - 66.02% (14501): rate_as_flare
 - 64.91% (14258): on_period
 - 56.65% (12444): hrv
 - 42.18% (9265): activity_dur
 - 36.69% (8059): yearly_flares

Distribution of 'symptom_deg':
 - 0: 40.01% (8788)
 - 1: 31.08% (6827)
 - 2: 19.11% (4198)
 - 3: 6.52% (1432)
 - 4: 2.99% (657)
 - 5: 0.29% (64)

Distribution of 'rate_as_flare':
 - No: 58.42% (8472)
 - Unsure: 24.62% (3570)
 - Yes: 16.96% (2459)


### Objective Statistics

In [None]:
# Print the text report for the `objective` frame loaded above.
objective_statistics(objective)


Total datapoints: 16892
 - Unique users: 194
 - Entries per user: avg = 87.07, med = 42.00, min = 1, max = 746

Data completeness:
 - 100.00% (16892): user_id, provider, date, start, end, length, dur_asleep, dur_REM, REM_pct, dur_deep, deep_pct, dur_light, light_pct
 - 99.14% (16747): dur_awake
 - 88.99% (15032): avg_breaths
 - 84.77% (14320): avg_bpm
 - 84.68% (14304): bpm
 - 83.51% (14106): sleep_eff
 - 77.37% (13069): rhr
 - 69.77% (11785): breaths
 - 61.64% (10412): avg_hrv_rmssd
 - 44.87% (7580): hrv_rmssd, std_rmssd, cv_rmssd, min_rmssd, max_rmssd, range_rmssd, slope_rmssd
 - 39.89% (6738): avg_SpO2
 - 39.88% (6736): SpO2
 - 16.10% (2719): hrv_sdnn, avg_hrv_sdnn

Wearable Provider:
 - GARMIN: 56.46% (9537)
 - FITBIT: 21.26% (3591)
 - APPLE: 16.49% (2786)
 - OURA: 4.91% (829)
 - POLAR: 0.56% (95)
 - GOOGLE: 0.32% (54)


### Merged Statistics

In [None]:
# Print the text report for the `merged` frame loaded above.
merged_statistics(merged)


Total datapoints: 3578
 - Unique users: 130
 - Gender: F: 76.15% (99), M: 23.85% (31)
 - Diagnosis: UC: 53.08% (69), CD: 46.92% (61)
 - Average age: 34.40 ± 9.92 years
 - Entries per user: avg = 27.52, med = 7.50, min = 1, max = 269

Data completeness:
 - 100.00% (3578): user_id, date, gender, age, diagnosis, sleep, stress, symptom_deg, provider, start, end, length, dur_asleep, dur_REM, REM_pct, dur_deep, deep_pct, dur_light, light_pct
 - 99.89% (3574): alcohol_last_24h
 - 99.19% (3549): dur_awake
 - 98.99% (3542): rate_as_flare
 - 92.82% (3321): activity_deg
 - 91.62% (3278): avg_breaths
 - 88.26% (3158): on_period
 - 86.95% (3111): bpm, avg_bpm
 - 74.01% (2648): rhr
 - 72.28% (2586): sleep_eff
 - 71.44% (2556): activity_dur
 - 65.23% (2334): breaths
 - 64.70% (2315): avg_hrv_rmssd
 - 51.79% (1853): avg_SpO2
 - 51.73% (1851): SpO2
 - 41.81% (1496): hrv_rmssd, std_rmssd, cv_rmssd, min_rmssd, max_rmssd, range_rmssd, slope_rmssd
 - 27.25% (975): avg_hrv_sdnn
 - 27.22% (974): hrv_sdnn
 -