### Imports


In [None]:
# Mount Google Drive (Colab-only API); the CSV files read later in this
# notebook are addressed through paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): matplotlib, numpy, ast/literal_eval and date are not referenced
# by any cell visible in this notebook export - confirm before removing them.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast
from ast import literal_eval
from datetime import date
import warnings

# Suppress every warning for the rest of the session (this also hides
# deprecation warnings raised by pandas).
warnings.filterwarnings('ignore')
# None lifts pandas' display truncation so all columns / rows are shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Mounted at /content/drive


### Functions

In [None]:
def _print_header(df: pd.DataFrame, title: str) -> None:
    """Print the report banner, the row count and the number of distinct users."""
    print(f"========== {title} DATA STATISTICS ==========")
    print(f"\nTotal datapoints: {len(df)}")
    print(f" - Unique users: {df['user_id'].nunique()}")


def _print_per_user_shares(df: pd.DataFrame, column: str, label: str) -> None:
    """Print how the users are distributed over the values of ``column``.

    Each user contributes once, with the first non-null value recorded for
    them; users without any value are left out of the percentages.
    """
    per_user = df.groupby("user_id")[column].first().dropna()
    counts = per_user.value_counts()
    shares = counts / counts.sum() * 100
    parts = [
        f"{value}: {share:.2f}% ({count})"
        for (value, count), share in zip(counts.items(), shares)
    ]
    print(f" - {label}: " + ", ".join(parts))


def _print_age_summary(df: pd.DataFrame) -> None:
    """Print mean and sample standard deviation of age, one value per user."""
    ages = df.groupby("user_id")["age"].first().dropna()
    print(f" - Average age: {ages.mean():.2f} ± {ages.std():.2f} years")


def _print_entries_per_user(df: pd.DataFrame) -> None:
    """Print mean, median, minimum and maximum number of rows per user."""
    entries = df.groupby("user_id").size()
    print(
        f" - Entries per user: avg = {entries.mean():.2f}, "
        f"med = {entries.median():.2f}, min = {entries.min()}, max = {entries.max()}"
    )


def _print_completeness(df: pd.DataFrame) -> None:
    """Print the share of non-null cells per column, most complete first.

    Columns with the same percentage and count are listed on one line.
    """
    print("\nData completeness:")
    present = df.notna()
    # Series.round keeps NumPy's rounding, so the printed percentages do not
    # shift on values that sit on a rounding boundary.
    shares = (present.mean() * 100).round(2)
    valid_counts = present.sum()

    groups = {}
    # Walk the columns positionally: label lookups break on duplicated
    # column names, positional iteration does not.
    for column, share, valid in zip(df.columns, shares, valid_counts):
        label = f"{share:.2f}% ({valid})"
        # str() lets non-string column labels (e.g. integers) be joined below.
        groups.setdefault(label, (share, []))[1].append(str(column))

    ranked = sorted(groups.items(), key=lambda item: item[1][0], reverse=True)
    for label, (_, columns) in ranked:
        print(f" - {label}: {', '.join(columns)}")


def _print_distribution(
    values: pd.Series,
    heading: str,
    *,
    sort_index: bool = False,
    share_of_all_rows: bool = False,
) -> None:
    """Print count and percentage of every distinct non-null value.

    Args:
        values: Column to summarise.
        heading: Line printed (after a blank line) above the distribution.
        sort_index: Order by value instead of by descending frequency.
        share_of_all_rows: Use the total row count (nulls included) as the
            denominator instead of the number of non-null values.
    """
    print(f"\n{heading}")
    counts = values.value_counts()
    if sort_index:
        counts = counts.sort_index()
    total = len(values) if share_of_all_rows else counts.sum()
    shares = counts / total * 100
    for (value, count), share in zip(counts.items(), shares):
        if share > 0:
            print(f" - {value}: {share:.2f}% ({count})")
        else:
            # Reached for categorical columns, where value_counts also lists
            # categories that never occur; the count is omitted for those.
            print(f" - {value}: {share:.2f}%")


def _print_patient_report(df: pd.DataFrame, title: str, symptom_heading: str) -> None:
    """Print the report layout shared by the subjective and merged datasets."""
    _print_header(df, title)

    for column, label in (("gender", "Gender"), ("diagnosis", "Diagnosis")):
        if column in df.columns:
            _print_per_user_shares(df, column, label)
    if "age" in df.columns:
        _print_age_summary(df)

    _print_entries_per_user(df)
    _print_completeness(df)

    if "symptom_deg" in df.columns:
        _print_distribution(df["symptom_deg"], symptom_heading, sort_index=True)
    if "rate_as_flare" in df.columns:
        _print_distribution(df["rate_as_flare"], "Distribution of 'rate_as_flare':")


def subjective_statistics(df: pd.DataFrame) -> None:
    """Print summary statistics for the subjective (self-reported) dataset.

    Reports row and user counts, per-user gender / diagnosis / age summaries,
    entries per user, per-column completeness and the distributions of
    ``symptom_deg`` and ``rate_as_flare``. Sections whose column is absent
    are skipped.

    Args:
        df: One row per entry; must contain a ``user_id`` column.

    Raises:
        KeyError: If ``df`` has no ``user_id`` column.
    """
    _print_patient_report(df, "SUBJECTIVE", "Distribution of 'symptom_deg':")


def objective_statistics(df: pd.DataFrame) -> None:
    """Print summary statistics for the objective dataset.

    Reports row and user counts, entries per user, per-column completeness
    and, when a ``provider`` column exists, the share of all rows per provider.

    Args:
        df: One row per entry; must contain a ``user_id`` column.

    Raises:
        KeyError: If ``df`` has no ``user_id`` column.
    """
    _print_header(df, "OBJECTIVE")
    _print_entries_per_user(df)
    _print_completeness(df)

    if "provider" in df.columns:
        # Provider shares are relative to all rows, so rows without a
        # provider lower every percentage instead of being ignored.
        _print_distribution(df["provider"], "Wearable Provider:", share_of_all_rows=True)


def merged_statistics(df: pd.DataFrame) -> None:
    """Print summary statistics for the merged dataset.

    Same sections as ``subjective_statistics``; only the banner and the
    heading of the ``symptom_deg`` section differ.

    Args:
        df: One row per entry; must contain a ``user_id`` column.

    Raises:
        KeyError: If ``df`` has no ``user_id`` column.
    """
    _print_patient_report(df, "MERGED", "Symptom severity distribution (symptom_deg):")

### Load Data

In [None]:
# All three exports live in the same Drive folder; naming the folder once
# means a relocated dataset only needs a single edit.
DATA_DIR = '/content/drive/My Drive/coreway_ml/Thesis - Mika'

subjective = pd.read_csv(f'{DATA_DIR}/subjective.csv')
objective = pd.read_csv(f'{DATA_DIR}/objective.csv')
merged = pd.read_csv(f'{DATA_DIR}/merged.csv')

### Subjective Statistics

In [None]:
# Print the summary report for the frame read from subjective.csv.
subjective_statistics(subjective)

### Objective Statistics

In [None]:
# Print the summary report for the frame read from objective.csv.
objective_statistics(objective)

### Merged Statistics

In [None]:
# Print the summary report for the frame read from merged.csv.
merged_statistics(merged)