In [146]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import pandas as pd
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Data exploration
['AS14.01', 'AS14.02', 'AS14.03', 'AS14.05', 'AS14.06', 'AS14.07',
       'AS14.08', 'AS14.09', 'AS14.12', 'AS14.13', 'AS14.14', 'AS14.15',
       'AS14.16', 'AS14.17', 'AS14.19', 'AS14.20', 'AS14.23', 'AS14.24',
       'AS14.25', 'AS14.26', 'AS14.27', 'AS14.28', 'AS14.29', 'AS14.30',
       'AS14.31', 'AS14.32', 'AS14.33']

In [147]:
df = pd.read_csv('dataset_mood_smartphone.csv')
df.head()

df.isna().sum()

Unnamed: 0      0
id              0
time            0
variable        0
value         202
dtype: int64

In [148]:
df["id_num"] = df["id"].apply(lambda x: int(x.split(".")[1]))
print(df["id_num"].unique())
df["id"].unique()





[ 1  2  3  5  6  7  8  9 12 13 14 15 16 17 19 20 23 24 25 26 27 28 29 30
 31 32 33]


array(['AS14.01', 'AS14.02', 'AS14.03', 'AS14.05', 'AS14.06', 'AS14.07',
       'AS14.08', 'AS14.09', 'AS14.12', 'AS14.13', 'AS14.14', 'AS14.15',
       'AS14.16', 'AS14.17', 'AS14.19', 'AS14.20', 'AS14.23', 'AS14.24',
       'AS14.25', 'AS14.26', 'AS14.27', 'AS14.28', 'AS14.29', 'AS14.30',
       'AS14.31', 'AS14.32', 'AS14.33'], dtype=object)

In [149]:
# convert time to datetime


df["time"] = pd.to_datetime(df["time"])



In [150]:
for person in df["id_num"].unique():
    print("PERSON", person)
    df_person = df[df["id_num"] == person]
    # print(df_person["variable"].unique())
    # print(df_person["variable"].value_counts())
    # print(df_person["value"].describe())
    # print(df_person["time"].describe())
    # print(df_person["time"].max() - df_person["time"].min())
    # print(df_person["time"].max())
    # print(df_person["time"].min())

    for var in df_person["variable"].unique():
        print(var)
        df_var = df_person[df_person["variable"] == var]
        print(df_var["value"].describe())
        print(df_var["time"].describe())
        print(f"DIFFERENCE FOR {var}", df_var["time"].max() - df_var["time"].min())
        print(df_var["time"].max())
        print(df_var["time"].min())


PERSON 1
mood
count    222.000000
mean       7.067568
std        0.845500
min        4.000000
25%        7.000000
50%        7.000000
75%        8.000000
max        9.000000
Name: value, dtype: float64
count                              222
mean     2014-04-11 06:06:45.405405440
min                2014-02-26 13:00:00
25%                2014-03-31 09:45:00
50%                2014-04-11 20:00:00
75%                2014-04-23 14:00:00
max                2014-05-04 21:00:00
Name: time, dtype: object
DIFFERENCE FOR mood 67 days 08:00:00
2014-05-04 21:00:00
2014-02-26 13:00:00
circumplex.arousal
count    218.000000
mean      -0.243119
std        0.905983
min       -2.000000
25%       -1.000000
50%        0.000000
75%        0.000000
max        2.000000
Name: value, dtype: float64
count                              223
mean     2014-04-11 07:53:48.699551744
min                2014-02-26 13:00:00
25%                2014-03-31 10:30:00
50%                2014-04-11 22:00:00
75%                2

In [151]:
df_person_1 = df[df["id_num"] == 1].copy()

# df_person_1["time"].nunique()

# count which number occurs twice

# split the values into variable columns with unique time values

# count unique days in time
# print(df_person_1["time"].dt.date.nunique())

# rename time into date-time
df_person_1 = df_person_1.rename(columns={"time": "datetime"})
df_person_1["date"] = df_person_1["datetime"].dt.date
df_person_1["hour"] = df_person_1["datetime"].dt.hour
df_person_1["minute"] = df_person_1["datetime"].dt.minute
df_person_1



Unnamed: 0.1,Unnamed: 0,id,datetime,variable,value,id_num,date,hour,minute
0,1,AS14.01,2014-02-26 13:00:00.000,mood,6.000,1,2014-02-26,13,0
1,2,AS14.01,2014-02-26 15:00:00.000,mood,6.000,1,2014-02-26,15,0
2,3,AS14.01,2014-02-26 18:00:00.000,mood,6.000,1,2014-02-26,18,0
3,4,AS14.01,2014-02-26 21:00:00.000,mood,7.000,1,2014-02-26,21,0
4,5,AS14.01,2014-02-27 09:00:00.000,mood,6.000,1,2014-02-27,9,0
...,...,...,...,...,...,...,...,...,...
374684,2348657,AS14.01,2014-05-05 00:15:26.313,appCat.utilities,164.567,1,2014-05-05,0,15
374685,2348659,AS14.01,2014-05-05 00:18:14.953,appCat.utilities,48.138,1,2014-05-05,0,18
374686,2348660,AS14.01,2014-05-05 00:19:42.322,appCat.utilities,48.156,1,2014-05-05,0,19
374687,2348691,AS14.01,2014-05-05 09:41:03.888,appCat.utilities,3.470,1,2014-05-05,9,41


In [152]:
def get_time_descriptives(df, participant: int):
    """
    Get time descriptives for a given participant.
    """
    # Filter for the participant
    df_person = df[df["id_num"] == participant].copy()

    # rename time into datetime
    # print(df_person.columns) 
    
    # Group by variable and compute descriptive statistics
    stats = df_person.groupby("variable")["time"].agg(
        count_non_missing = "count",
        min_time = "min",
        max_time = "max"
    )
    # Count missing values per variable
    stats["n_missing"] = df_person.groupby("variable")["time"].apply(lambda x: x.isna().sum())
    # Calculate the time range (difference between max and min)
    stats["time_range"] = stats["max_time"] - stats["min_time"]
    
    # Optionally, reset the index for a nicer display
    stats = stats.reset_index()
    return stats


def plot_time_ranges(df, participant, save_fig=True, show_plot=True):
    """
    Plot the time range for each variable for a given participant.
    """
    # Get descriptive stats for plotting
    stats = get_time_descriptives(df, participant)
    
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 0.3 * len(stats)))  # Adjust height based on number of variables
    
    # Loop over each variable to draw its time range as a horizontal line
    earliest = stats["min_time"].min()
    latest = stats["max_time"].max()
    for idx, row in stats.iterrows():
        ax.hlines(y=idx, xmin=row["min_time"], xmax=row["max_time"], color='blue', lw=2)

        # Optionally, mark the endpoints
        if row["min_time"] == earliest:
            ax.plot(row["min_time"], idx, "o", color='purple')
        else:
            ax.plot(row["min_time"], idx, "o", color='green')
        if row["max_time"] == latest:
            ax.plot(row["max_time"], idx, "o", color='darkred')
        else:
            ax.plot(row["max_time"], idx, "o", color='red')

    # higlight background between the earliest and latest time of mood
    earliest_mood = stats[stats["variable"] == "mood"]["min_time"].values[0]
    latest_mood = stats[stats["variable"] == "mood"]["max_time"].values[0]
    ax.axvspan(earliest_mood, latest_mood, alpha=0.1, color='yellow')


    # get the earliest and latest time and write it to the plot

    plt.text(earliest, 5, earliest, color='black', fontsize=12)
    plt.text(latest, 5, latest, color='black', fontsize=12)
    
    # line for earliest and latest time
    ax.axvline(earliest, color='black', linestyle='--', lw=1)
    ax.axvline(latest, color='black', linestyle='--', lw=1)
    
    # Customize y-axis with variable names
    ax.set_yticks(range(len(stats)))
    ax.set_yticklabels(stats["variable"])
    ax.set_xlabel("Time")
    ax.set_title("Time Range per Variable for Participant {}".format(participant))


    plt.tight_layout()


    # save figure
    if save_fig:
        if not os.path.exists("figures/time_ranges_per_variable"):
            os.makedirs("figures/time_ranges_per_variable")
        plt.savefig(f"figures/time_ranges_per_variable/time_range_plot_p{participant}.png")
    
    # add one legend for all lines

    if show_plot:
        plt.show()
    else:
        plt.close()



for person in df["id_num"].unique():
    # print(f"Participant {person}")
    stats = get_time_descriptives(df, person)
    # print(stats)
    plot_time_ranges(df, person, save_fig=True, show_plot=False)




## Create daily pivot for particiapnt per day

In [169]:

def create_daily_pivot(df, participant="all", return_dict=False, counts=True):
    """
    Create a daily pivot table for a given participant or all participants or list of participants.
    
    Each row corresponds to a day (from the earliest to the latest day the participant has data),
    each column corresponds to a variable, and if multiple datapoints occur on a given day,
    the value is aggregated as the mean. Days with no data for a variable are represented as NaN.
    
    Parameters:
        df (pd.DataFrame): DataFrame containing at least the columns 'id_num', 'time', 
                           'variable', and 'value'. The 'time' column should be in datetime format.
        participant: Either "all" (to process all participants), a single participant id, or a list of participant ids.
        return_dict (bool): If True, returns a dictionary of pivot tables (keyed by participant).
                            Otherwise, returns a single DataFrame with 'id_num' and 'day' as columns.
        counts (bool): If True, includes a count of the number of values for each variable per day.
    
    Returns:
        Either a dict mapping participant IDs to their daily pivot table or a combined DataFrame.
    """
    # Determine the list of participant IDs to process
    if participant == "all":
        participants = df["id_num"].unique()
    elif isinstance(participant, list):
        participants = participant
    else:
        participants = [participant]
    
    # Ensure the time column is in datetime format
    df_copy = df.copy()
    if not pd.api.types.is_datetime64_any_dtype(df_copy["time"]):
        df_copy["time"] = pd.to_datetime(df_copy["time"])
    
    # Create a column with just the day (flooring the datetime to day)
    df_copy["day"] = df_copy["time"].dt.floor("D")
    
    pivot_dict = {}
    pivot_list = []
    
    # Process each participant separately
    for part in participants:
        df_part = df_copy[df_copy["id_num"] == part].copy()

        # get a count of the number of values of variable per day
        df_part["comprising_of"] = df_part.groupby(["day", "variable"])["value"].transform("count")

        # select the mean aggregation columns, time values should be aggregated by sum
        mean_agg = ["mood", "circumplex.valence", "circumplex.arousal", "activity"] # not sure about activity
        sum_agg = [col for col in df_part["variable"].unique() if col not in mean_agg + ["id_num", "time", "day"]]
        
        # Create a new DataFrame with the selected columns
        df_part_sum = df_part[df_part["variable"].isin(sum_agg)].copy()


    
        pivot_sum = df_part_sum.pivot_table(index="day",
                                            columns="variable", 
                                            values="value", 
                                            aggfunc="sum")

        df_part_mean = df_part[df_part["variable"].isin(mean_agg)].copy()
        pivot_mean = df_part_mean.pivot_table(index="day",
                                              columns="variable", 
                                              values="value", 
                                              aggfunc="mean")
    
        # get the count of the number of values comprising the variable per day
        pivot_count = df_part.pivot_table(index="day",
                                    columns="variable",
                                    values="value",
                                    aggfunc="count")
        #nans are 0
        pivot_count = pivot_count.fillna(0)
        pivot_count = pivot_count.add_suffix("_count")

        # Combine the two pivot tables
        pivot = pd.concat([pivot_sum, pivot_mean], axis=1)

        if counts:
            pivot = pd.concat([pivot, pivot_count], axis=1)
                                            
        # Create a complete date range from the earliest to the latest day for this participant
        full_range = pd.date_range(start=df_part["day"].min(), end=df_part["day"].max(), freq="D")
        pivot = pivot.reindex(full_range)
        pivot.index.name = "day"
        
        # Convert index to a column and add participant id
        pivot = pivot.reset_index()
        pivot["id_num"] = part

        # reorder the columns
        desired_order = ["id_num", "day", "mood", "screen", "activity", "circumplex.valence", "circumplex.arousal", "call", "sms"]
        if counts:
            desired_order = ["id_num", "day", "mood", "mood_count", "screen", "screen_count", "activity", "activity_count", "circumplex.valence","circumplex.valence_count", "circumplex.arousal","circumplex.arousal_count", "call", "call_count", "sms", "sms_count"]

        other_columns = [p for p in pivot.columns if p not in desired_order]
        new_order = desired_order + other_columns
        pivot = pivot[new_order]

        # rearrange the columns to have the id_num, day, mood, screen, activity, circumplex.valence, circumplex.arousal then the rest

        if return_dict:
            pivot_dict[part] = pivot
        else:
            pivot_list.append(pivot)
    
    if return_dict:
        return pivot_dict
    else:
        # Concatenate the list of dataframes without setting a multi-index so that
        # both 'day' and 'id_num' remain as regular columns
        combined = pd.concat(pivot_list, ignore_index=True)
        # sort by id_num and day
        combined = combined.sort_values(by=["id_num", "day"])
        # save the combined dataframe to a csv file
        if not os.path.exists("tables/pivot_tables_daily"):
            os.makedirs("tables/pivot_tables_daily")
        combined.to_csv(f"tables/pivot_tables_daily/daily_pivot_table_{participant}.csv", index=False)
        return combined

# For a single participant:
single_pivot = create_daily_pivot(df, participant=1)
single_pivot.describe()

# For multiple participants combined into one DataFrame:
combined_pivot = create_daily_pivot(df, participant="all", return_dict=False)

# For a dictionary of separate pivot tables:
pivot_dict = create_daily_pivot(df, participant=[1,2,3], return_dict=True)
# print(pivot_dict.values())

combined_pivot.head()


variable,id_num,day,mood,mood_count,screen,screen_count,activity,activity_count,circumplex.valence,circumplex.valence_count,...,appCat.entertainment_count,appCat.finance_count,appCat.game_count,appCat.office_count,appCat.other_count,appCat.social_count,appCat.travel_count,appCat.unknown_count,appCat.utilities_count,appCat.weather_count
0,1,2014-02-17,,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2014-02-18,,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2014-02-19,,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2014-02-20,,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2014-02-21,,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Plotting daily pivots (histogram and timeseries)

In [154]:
def plotly_all_participants_histograms(df, save_html=True, show_plot=True):
    """
    Plot histograms for all participants per variable, with toggle to show specific participants.
    """
    df["date"] = pd.to_datetime(df["timestamp"]).dt.date
    grouped = df.groupby(["participant", "date"]).mean().reset_index()
    variables = [col for col in grouped.columns if col not in ["participant", "date"]]
    participants = grouped["participant"].unique()

    fig = make_subplots(rows=len(variables), cols=1, subplot_titles=variables)

    visibility_map = []

    for row_idx, var in enumerate(variables, 1):
        for pid in participants:
            pid_data = grouped[grouped["participant"] == pid]
            trace = go.Histogram(
                x=pid_data[var],
                name=str(pid),
                legendgroup=str(pid),
                opacity=0.6,
                showlegend=(row_idx == 1),
                visible=True if row_idx == 1 else False
            )
            fig.add_trace(trace, row=row_idx, col=1)
            visibility_map.append((row_idx, pid))

    # Dropdown toggle
    buttons = []
    for pid in participants:
        visible = []
        for (row, p) in visibility_map:
            visible.append(p == pid)
        buttons.append(dict(
            label=f"Participant {pid}",
            method="update",
            args=[{"visible": visible},
                  {"title": f"Histograms - Participant {pid}"}]
        ))

    buttons.insert(0, dict(
        label="Show All",
        method="update",
        args=[{"visible": [True] * len(visibility_map)},
              {"title": "Histograms - All Participants"}]
    ))

    fig.update_layout(
        height=300 * len(variables),
        title="Histograms - All Participants",
        barmode="overlay",
        updatemenus=[{
            "buttons": buttons,
            "direction": "down",
            "showactive": True,
            "x": 1.02,
            "xanchor": "left",
            "y": 1,
            "yanchor": "top"
        }]
    )

    if save_html:
        os.makedirs("figures/daily_pivot", exist_ok=True)
        fig.write_html("figures/daily_pivot/all_participants_histograms.html")

    if show_plot:
        fig.show()

## Plotting daily pivot table with Plotly (interactive html graphs)

In [170]:
def plotly_all_participants_timeseries(df, save_html=True, show_plot=True):
    """
    Plot time series for all participants, one line per participant per variable,
    with a dropdown toggle to show specific participants.
    
    The function expects a daily pivot table DataFrame with a column "day" (date)
    and "id_num" (participant identifier) along with other numeric variable columns.
    
    Parameters:
        df (pd.DataFrame): DataFrame that will be used by create_daily_pivot.
        save_html (bool): Whether to save the figure as an HTML file.
        show_plot (bool): Whether to display the plot.
    """
    # Get the daily pivot table for all participants.
    # This function should return a DataFrame with columns "day", "id_num", and variable columns.
    df_plot = create_daily_pivot(df, participant="all", return_dict=False)
    
    # Make sure "day" is a column; if it's not, throw an error.
    if "day" not in df_plot.columns:
        raise ValueError("Expected a 'day' column in the DataFrame.")
    
    # Convert "day" to a proper date format and rename as "date" for clarity
    df_plot["date"] = pd.to_datetime(df_plot["day"]).dt.date

    # Check that 'id_num' is present.
    if "id_num" not in df_plot.columns:
        raise ValueError("Expected a column 'id_num' to identify participants.")
    
    # Identify numeric columns (i.e. variables)
    numeric_cols = df_plot.select_dtypes(include="number").columns.tolist()
    # Remove group-by columns if present
    group_cols = ["id_num", "day", "date"]
    variables = [col for col in df_plot.columns if col not in group_cols]
    
    # Group by participant and date (if needed, here our pivot table should already be daily)
    # In case there are multiple entries per date for a participant (unlikely after pivoting),
    # we aggregate them by mean.
    grouped = df_plot.groupby(["id_num", "date"])[variables].mean(numeric_only=True).reset_index()
    
    participants = grouped["id_num"].unique()

    # Create a subplot for each variable, sharing the same x-axis.
    fig = make_subplots(
        rows=len(variables), cols=1, shared_xaxes=True,
        vertical_spacing=0.03, subplot_titles=variables
    )

    # This list will help map each trace to its participant for the dropdown.
    visibility_map = []

    # Create one trace per participant per variable.
    for row_idx, var in enumerate(variables, 1):
        for pid in participants:
            pid_data = grouped[grouped["id_num"] == pid]
            fig.add_trace(
                go.Scatter(
                    x=pid_data["date"],
                    y=pid_data[var],
                    mode="lines+markers",
                    name=f"Participant {pid}",
                    legendgroup=str(pid),
                    visible=True if row_idx == 1 else False,  # only show all for first subplot initially
                    showlegend=(row_idx == 1)
                ),
                row=row_idx, col=1
            )
            visibility_map.append((row_idx, pid))

    # Create dropdown buttons to toggle traces by participant.
    buttons = []
    for pid in participants:
        # Build visibility list: each trace is visible only if its participant matches pid.
        visible = []
        for (row, p) in visibility_map:
            visible.append(p == pid)
        buttons.append(dict(
            label=f"Participant {pid}",
            method="update",
            args=[{"visible": visible},
                  {"title": f"Time Series - Participant {pid}"}]
        ))

    # Add a button to show all participants.
    buttons.insert(0, dict(
        label="Show All",
        method="update",
        args=[{"visible": [True] * len(visibility_map)},
              {"title": "Time Series - All Participants"}]
    ))

    fig.update_layout(
        height=300 * len(variables),
        title="Time Series - All Participants",
        updatemenus=[{
            "buttons": buttons,
            "direction": "down",
            "showactive": True,
            "x": 1.02,
            "xanchor": "left",
            "y": 1,
            "yanchor": "top"
        }],
        hovermode="x unified"
    )

    # Optionally save as HTML.
    if save_html:
        outdir = "figures/plotly/all_participants"
        os.makedirs(outdir, exist_ok=True)
        fig.write_html(os.path.join(outdir, "all_participants_timeseries.html"))

    # Show the figure if requested.
    if show_plot:
        fig.show()

# Example usage:
plotly_all_participants_timeseries(df, save_html=True, show_plot=True)


ValueError: Vertical spacing cannot be greater than (1 / (rows - 1)) = 0.027027.
The resulting plot would have 38 rows (rows=38).

## Interactive histograms

In [171]:
def plotly_all_participants_histograms(df, save_html=True, show_plot=True):
    """
    Plot histograms for all participants, one histogram per variable,
    with a dropdown toggle to show specific participants.
    
    The function expects a daily pivot table DataFrame with a column "day" (date)
    and "id_num" (participant identifier) along with other numeric variable columns.
    
    Parameters:
        df (pd.DataFrame): DataFrame that will be used by create_daily_pivot.
        save_html (bool): Whether to save the figure as an HTML file.
        show_plot (bool): Whether to display the plot.
    """
    # Get the daily pivot table for all participants.
    # This function should return a DataFrame with columns "day", "id_num", and variable columns.
    df_plot = create_daily_pivot(df, participant="all", return_dict=False)
    
    # Make sure "day" is a column; if it's not, throw an error.
    if "day" not in df_plot.columns:
        raise ValueError("Expected a 'day' column in the DataFrame.")
    
    # Convert "day" to a proper date format and also create a 'date' column (for clarity)
    df_plot["date"] = pd.to_datetime(df_plot["day"]).dt.date

    # Check that 'id_num' is present.
    if "id_num" not in df_plot.columns:
        raise ValueError("Expected a column 'id_num' to identify participants.")
    
    # Identify variable columns by excluding group-by columns.
    group_cols = ["id_num", "day", "date"]
    variables = [col for col in df_plot.columns if col not in group_cols]
    
    # Group by participant and date (this should be daily data, but we aggregate if needed)
    grouped = df_plot.groupby(["id_num", "date"])[variables].mean(numeric_only=True).reset_index()
    
    participants = grouped["id_num"].unique()

    # Create a subplot for each variable.
    fig = make_subplots(
        rows=len(variables), cols=1, shared_xaxes=False,
        vertical_spacing=0.03, subplot_titles=variables
    )

    # Map each trace to its participant for the dropdown toggle.
    visibility_map = []

    # Create one histogram trace per participant per variable.
    for row_idx, var in enumerate(variables, 1):
        for pid in participants:
            pid_data = grouped[grouped["id_num"] == pid]
            fig.add_trace(
                go.Histogram(
                    x=pid_data[var],
                    name=f"Participant {pid}",
                    legendgroup=str(pid),
                    visible=True if row_idx == 1 else False,  # show all traces for the first subplot initially
                    showlegend=(row_idx == 1)
                ),
                row=row_idx, col=1
            )
            visibility_map.append((row_idx, pid))
    
    # Create dropdown buttons to toggle traces by participant.
    buttons = []
    for pid in participants:
        # Build visibility list: each trace is visible only if its participant matches pid.
        visible = []
        for (row, p) in visibility_map:
            visible.append(p == pid)
        buttons.append(dict(
            label=f"Participant {pid}",
            method="update",
            args=[{"visible": visible},
                  {"title": f"Histograms - Participant {pid}"}]
        ))
    
    # Add a button to show all participants.
    buttons.insert(0, dict(
        label="Show All",
        method="update",
        args=[{"visible": [True] * len(visibility_map)},
              {"title": "Histograms - All Participants"}]
    ))
    
    fig.update_layout(
        height=300 * len(variables),
        title="Histograms - All Participants",
        updatemenus=[{
            "buttons": buttons,
            "direction": "down",
            "showactive": True,
            "x": 1.02,
            "xanchor": "left",
            "y": 1,
            "yanchor": "top"
        }],
        barmode="overlay",
        hovermode="x unified"
    )
    
    # Optionally save the figure as HTML.
    if save_html:
        outdir = "figures/plotly/all_participants"
        os.makedirs(outdir, exist_ok=True)
        fig.write_html(os.path.join(outdir, "all_participants_histograms.html"))
    
    # Show the figure if requested.
    if show_plot:
        fig.show()

# Example usage:
plotly_all_participants_histograms(df, save_html=True, show_plot=False)

ValueError: Vertical spacing cannot be greater than (1 / (rows - 1)) = 0.027027.
The resulting plot would have 38 rows (rows=38).

In [157]:
import os
import itertools
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plotly_all_participants_correlations(df, save_html=True, show_plot=True):
    """
    Create an interactive correlation analysis figure.
    
    For each participant option (either all participants combined or individual participants)
    and for each scatter plot variable pair (chosen from the numeric variables), this function creates:
    
    1. A heatmap correlation matrix (top row) computed on all numeric variables available for that participant.
    2. A scatter plot (bottom row) for the selected variable pair.
    
    A single combined update menu (dropdown) allows the user to select the (participant, variable pair)
    combination to display.
    
    Parameters:
      df (pd.DataFrame): DataFrame that will be used by create_daily_pivot.
                         Expected to contain at least 'id_num', 'day', and numeric variable columns.
      save_html (bool): Whether to save the figure as an HTML file.
      show_plot (bool): Whether to display the plot.
      
    Notes:
      - This function handles lots of NaNs by relying on pandas’ .corr() (which computes pairwise correlations).
      - It creates separate traces for each combination, and uses an update menu to toggle visibility.
    """
    # Create the daily pivot table for all participants.
    # Use return_dict=True to get separate DataFrames for each participant.
    pivot_dict = create_daily_pivot(df, participant="all", return_dict=True)
    
    # Also create an "all" option by concatenating all participants' data.
    df_all = pd.concat(pivot_dict.values(), ignore_index=True)
    pivot_dict["all"] = df_all
    
    # Identify participant options. They will be the keys of pivot_dict.
    participant_options = list(pivot_dict.keys())
    
    # Assume the numeric variables are those not in the grouping columns:
    group_cols = ["id_num", "day"]
    sample_df = pivot_dict[participant_options[0]]
    all_vars = list(sample_df.columns)
    numeric_vars = [col for col in all_vars if col not in group_cols and col != "date"]
    
    # Optionally force a specific order for some variables.
    desired_order = ["mood", "screen", "activity", "circumplex.valence", "circumplex.arousal"]
    ordered_vars = [v for v in desired_order if v in numeric_vars] + [v for v in numeric_vars if v not in desired_order]
    numeric_vars = ordered_vars

    # Create a list of scatter plot pairs. Here we take all unordered pairs.
    scatter_pairs = list(itertools.combinations(numeric_vars, 2))
    if not scatter_pairs:
        raise ValueError("Not enough numeric variables to form scatter plot pairs.")

    # Build a figure with 2 rows: row 1 for the heatmap, row 2 for the scatter plot.
    fig = make_subplots(
        rows=2, cols=1,
        row_heights=[0.5, 0.5],
        vertical_spacing=0.1,
        subplot_titles=("Correlation Matrix", "Scatter Plot")
    )
    
    # This list will be used to later update trace visibility.
    trace_visibility_defaults = []
    # Map (participant, scatter_pair index) to a "visible" vector.
    visibility_dict = {}
    
    # --- Create heatmap traces for each participant option.
    heatmap_traces = []
    for p_idx, p in enumerate(participant_options):
        df_p = pivot_dict[p]
        # Use only the numeric variables that are actually in df_p.
        available_vars = [col for col in numeric_vars if col in df_p.columns]
        if available_vars:
            df_corr = df_p[available_vars].corr()
            heat_trace = go.Heatmap(
                z = df_corr.values,
                x = df_corr.columns.tolist(),
                y = df_corr.index.tolist(),
                colorbar=dict(title="r"),
                visible=False  # update later
            )
        else:
            # If no variables exist for this participant, create an empty trace.
            heat_trace = go.Heatmap(z=[], x=[], y=[], visible=False)
        heatmap_traces.append(heat_trace)
        fig.add_trace(heat_trace, row=1, col=1)
        trace_visibility_defaults.append(False)
    
    # --- Create scatter plot traces for each participant and each scatter pair.
    scatter_traces = []
    for p_idx, p in enumerate(participant_options):
        df_p = pivot_dict[p]
        for sp_idx, (var_x, var_y) in enumerate(scatter_pairs):
            # Only use data if both variables exist for the participant.
            if var_x in df_p.columns and var_y in df_p.columns:
                scatter_trace = go.Scatter(
                    x = df_p[var_x],
                    y = df_p[var_y],
                    mode = "markers",
                    marker = dict(size=8, opacity=0.7),
                    name = f"{p} - {var_x} vs {var_y}",
                    visible = False  # update later
                )
            else:
                # Create an empty trace if one or both variables are missing.
                scatter_trace = go.Scatter(
                    x = [],
                    y = [],
                    mode = "markers",
                    marker = dict(size=8, opacity=0.7),
                    name = f"{p} - {var_x} vs {var_y}",
                    visible = False
                )
            scatter_traces.append(scatter_trace)
            fig.add_trace(scatter_trace, row=2, col=1)
            trace_visibility_defaults.append(False)
    
    total_traces = len(heatmap_traces) + len(scatter_traces)
    
    # Build mapping from (participant, scatter_pair index) to a full "visible" vector.
    for p_idx, p in enumerate(participant_options):
        for sp_idx in range(len(scatter_pairs)):
            # Create a boolean list (one per trace) initialized to False.
            visible = [False] * total_traces
            # For the heatmap: only the trace for participant p should be visible.
            visible[p_idx] = True
            # For scatter traces, they are arranged in blocks per participant.
            scatter_trace_index = len(heatmap_traces) + p_idx * len(scatter_pairs) + sp_idx
            visible[scatter_trace_index] = True
            visibility_dict[(p, sp_idx)] = visible

    # Set the default selection: use participant "all" and the first scatter pair.
    default_key = ("all", 0)
    default_visible = visibility_dict[default_key]
    for i, vis in enumerate(default_visible):
        fig.data[i].visible = vis

    # --- Create an update menu with one button per (participant, scatter pair) combination.
    menu_buttons = []
    for p in participant_options:
        for sp_idx, (var_x, var_y) in enumerate(scatter_pairs):
            label = f"{'All' if p=='all' else 'Participant '+str(p)}: {var_x} vs {var_y}"
            visible = visibility_dict[(p, sp_idx)]
            button = dict(
                label = label,
                method = "update",
                args = [
                    {"visible": visible},
                    {"title": f"Correlation Analysis - {'All' if p=='all' else 'Participant '+str(p)}: {var_x} vs {var_y}"}
                ]
            )
            menu_buttons.append(button)
    
    # Update the layout with the dropdown menu.
    fig.update_layout(
        updatemenus=[{
            "buttons": menu_buttons,
            "direction": "down",
            "showactive": True,
            "x": 1.05,
            "xanchor": "left",
            "y": 1,
            "yanchor": "top"
        }],
        height=800,
        title="Correlation Analysis"
    )
    
    # Optionally save as HTML.
    if save_html:
        outdir = "figures/plotly/correlations"
        os.makedirs(outdir, exist_ok=True)
        fig.write_html(os.path.join(outdir, "correlations.html"))
    
    if show_plot:
        fig.show()

plotly_all_participants_correlations(df, save_html=True, show_plot=False)

0         4
1         4
2         4
3         4
4         3
         ..
374684    4
374685    4
374686    4
374687    4
376657    1
Name: comprising_of, Length: 21999, dtype: int64
length of comprising_of 21999
length of df_part 21999
222       3
223       3
224       3
225       4
226       4
         ..
374711    9
374712    9
374713    9
374714    9
374715    9
Name: comprising_of, Length: 14581, dtype: int64
length of comprising_of 14581
length of df_part 14581
381       2
382       2
383       5
384       5
385       5
         ..
374745    1
374746    1
376658    1
376659    2
376660    2
Name: comprising_of, Length: 14425, dtype: int64
length of comprising_of 14425
length of df_part 14425
602       1
603       5
604       5
605       5
606       5
         ..
374886    4
374887    4
374888    4
374889    1
374890    1
Name: comprising_of, Length: 15745, dtype: int64
length of comprising_of 15745
length of df_part 15745
843       1
844       5
845       5
846       5
847       5


## count NANs per variable per participant

In [158]:
def nan_exploration(df):
    """Create a pd dataframe with the percentage of NaN values for each variable per person, along with the count of unique values per variable"""

    unique_participants = df["id_num"].unique()

    # create a pivot
    pivot = create_daily_pivot(df, participant="all", return_dict=False)

    # get the percentage of NaN values for each variable per person
    df_nans_list = []

    for participant in unique_participants:
        # get the pivot for the participant
        pivot_participant = pivot[pivot["id_num"] == participant]
        # get the percentage of NaN values for each variable
        nan_percentage = round(pivot_participant.isna().mean(),3)

        nan_count = pivot_participant.isna().sum()
        # get the count of unique values per variable
        unique_values = pivot_participant.nunique()
        # create a dataframe with the results
        df_nan = pd.DataFrame({"nan_percentage": nan_percentage,"nan_count": nan_count, "unique_values": unique_values})
        df_nan["participant"] = participant
        df_nan = df_nan.reset_index()
        df_nans_list.append(df_nan)

    # concatenate the dataframes
    df_nans = pd.concat(df_nans_list, ignore_index=True)

    # save the dataframe
    if not os.path.exists("tables/nan_exploration"):
        os.makedirs("tables/nan_exploration")
    df_nans.to_csv("tables/nan_exploration/nan_exploration.csv", index=False)
    return df_nans
    

df_nans = nan_exploration(df)

# sort df_nans by nan_percentage
df_nans = df_nans.sort_values(by=["nan_percentage"], ascending=False)





0         4
1         4
2         4
3         4
4         3
         ..
374684    4
374685    4
374686    4
374687    4
376657    1
Name: comprising_of, Length: 21999, dtype: int64
length of comprising_of 21999
length of df_part 21999
222       3
223       3
224       3
225       4
226       4
         ..
374711    9
374712    9
374713    9
374714    9
374715    9
Name: comprising_of, Length: 14581, dtype: int64
length of comprising_of 14581
length of df_part 14581
381       2
382       2
383       5
384       5
385       5
         ..
374745    1
374746    1
376658    1
376659    2
376660    2
Name: comprising_of, Length: 14425, dtype: int64
length of comprising_of 14425
length of df_part 14425
602       1
603       5
604       5
605       5
606       5
         ..
374886    4
374887    4
374888    4
374889    1
374890    1
Name: comprising_of, Length: 15745, dtype: int64
length of comprising_of 15745
length of df_part 15745
843       1
844       5
845       5
846       5
847       5


## unique values

In [168]:
# get unique values per variable

pivot = create_daily_pivot(df, participant="all", return_dict=False)
# get the unique values per variable
unique_values = pivot.nunique()
print(unique_values)

variable
id_num                          27
day                            113
mood                            59
mood_count                       7
screen                        1207
screen_count                   217
activity                      1161
activity_count                  25
circumplex.valence              30
circumplex.valence_count         7
circumplex.arousal              42
circumplex.arousal_count         7
call                            25
call_count                      26
sms                             16
sms_count                       17
appCat.builtin                1195
appCat.communication          1183
appCat.entertainment           849
appCat.finance                 207
appCat.game                    191
appCat.office                  275
appCat.other                  1116
appCat.social                  988
appCat.travel                  428
appCat.unknown                 264
appCat.utilities               424
appCat.weather                 112
appCat.buil