In [1]:
import pandas as pd
import numpy as np
import os
import glob

# Collect every CSV export found in the "data" folder of the current working
# directory. glob returns matches in a filesystem-dependent order, so the list
# is sorted to keep the processing order (and therefore the row order of the
# merged output) reproducible across machines and runs.
path = os.getcwd()
csv_files = sorted(glob.glob(os.path.join(path, "data/*.csv")))

In [12]:
# Length of one study week in seconds; used to bucket events into weeks.
SECONDS_PER_WEEK = 7 * 24 * 60 * 60
# Datasets containing events later than this week number are left out.
MAX_STUDY_WEEKS = 6

# One prepared data frame per participant file that passes the checks below.
dataframes = []

for f in csv_files:

    # read the csv file
    df = pd.read_csv(f)

    # skip setup files
    if len(df) < 2:
        continue

    # add UUID as column (the last 36 characters of the file name without extension)
    df["participantUuid"] = os.path.splitext(f)[0][-36:]

    # convert timestamps to datetime
    df["newTimestamp"] = pd.to_datetime(df["timestamp"])

    # Choose the reference point for relative times.
    is_new_user = "didUpdateDailyBudget" not in df.columns
    if is_new_user:
        # no budget-update column in this file: measure from the first event
        study_start = df["newTimestamp"].min()
    else:
        # get first daily budget update, which indicates study setup
        updates = df[df["didUpdateDailyBudget"].notna()]
        study_start = updates["newTimestamp"].min()
    df["isNewUser"] = is_new_user

    # convert timestamps to relative time span (whole seconds; NaN where the
    # timestamp or the reference point is missing)
    seconds = (df["newTimestamp"] - study_start) // pd.Timedelta('1s')

    # Week numbering has no week 0: (0, 1 week] -> 1, (1 week, 2 weeks] -> 2, ...
    # and (-1 week, 0] -> -1, (-2 weeks, -1 week] -> -2, ...
    # Hence every non-positive ceil() result is shifted down by one.
    weeks = np.ceil(seconds / SECONDS_PER_WEEK)
    weeks = weeks.where(weeks > 0, weeks - 1)

    # nullable integers keep missing values while avoiding float formatting
    df["secondsSinceStudySetup"] = seconds.astype("Int64")
    df["weeksSinceStudySetup"] = weeks.astype("Int64")

    last_week = df["weeksSinceStudySetup"].max()
    if pd.isna(last_week):
        # No row could be placed relative to study setup (e.g. the budget-update
        # column exists but is empty, or no timestamp could be parsed). Comparing
        # a missing value with the limit is meaningless, so report it explicitly.
        print(f"Dataset {f} was excluded because no event could be timed relative to study setup")
    elif last_week <= MAX_STUDY_WEEKS:
        # append to array
        dataframes.append(df)
    else:
        print(f"Dataset {f} was exluded because max weeksSinceStudySetup is greater {MAX_STUDY_WEEKS}")
        

Dataset /Users/philipp/Developer/donatus-ma/data/study-donatus-master-thesis-one-sec-early-2023-donatus-master-thesis-one-sec-early-2023-4926C34D-8F67-4608-92A0-D907B2E94692.csv was exluded because max weeksSinceStudySetup is greater 6
Dataset /Users/philipp/Developer/donatus-ma/data/study-donatus-master-thesis-one-sec-early-2023-donatus-master-thesis-one-sec-early-2023-06C81CEA-594B-4B6F-A0C5-7F6512A7B68D.csv was exluded because max weeksSinceStudySetup is greater 6
Dataset /Users/philipp/Developer/donatus-ma/data/study-donatus-master-thesis-one-sec-early-2023-donatus-master-thesis-one-sec-early-2023-63A35AB3-E72C-4970-BCB9-B282BD102F53.csv was exluded because max weeksSinceStudySetup is greater 6
Dataset /Users/philipp/Developer/donatus-ma/data/study-donatus-master-thesis-one-sec-early-2023-donatus-master-thesis-one-sec-early-2023-8E4CD33F-820D-484B-85F1-3B23EDA74314.csv was exluded because max weeksSinceStudySetup is greater 6
Dataset /Users/philipp/Developer/donatus-ma/data/study-d

In [13]:
# Stack all participant frames on top of each other into one table.
# Row labels are deliberately left as they are in each frame (not renumbered),
# and columns missing in a frame are filled with missing values.
merged = pd.concat(dataframes, axis=0, ignore_index=False)

# rich preview of the combined table
display(merged)

Unnamed: 0,pre_study_showUsageStats,pre_study_healthyAlternativesEnabled,startOfWeek,pre_study_interventionsSelected,app,resolution,timestamp,interventionDuration,interventionType,purpose,didUpdateDailyBudget,hot_fix_study_group,participantUuid,newTimestamp,isNewUser,secondsSinceStudySetup,weeksSinceStudySetup,terminationReason,studyGroup
0,1.0,0.0,Monday,['breathingExercise'],,,,,,,,,C13F4AB6-6D48-4767-8997-33B562856A36,NaT,False,,,,
1,,,,,instagram,openedApp,2023-02-23 15:22:38.000000000,6.0,breathingExercise,,,,C13F4AB6-6D48-4767-8997-33B562856A36,2023-02-23 15:22:38.000000000,False,-1389724,-3,,
2,,,,,instagram,dismissedAppOpening,2023-02-23 16:33:44.000000000,6.0,breathingExercise,,,,C13F4AB6-6D48-4767-8997-33B562856A36,2023-02-23 16:33:44.000000000,False,-1385458,-3,,
3,,,,,instagram,openedApp,2023-02-23 16:46:24.000000000,6.0,breathingExercise,Arbeit,,,C13F4AB6-6D48-4767-8997-33B562856A36,2023-02-23 16:46:24.000000000,False,-1384698,-3,,
4,,,,,instagram,openedApp,2023-02-23 19:50:22.000000000,6.0,breathingExercise,Langeweile,,,C13F4AB6-6D48-4767-8997-33B562856A36,2023-02-23 19:50:22.000000000,False,-1373660,-3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,,,,,instagram,dismissedAppOpening,2023-04-01 01:33:47.191160064,6.0,minimalBreathingExercise,,,,DEFAC6B4-BA34-43B5-A533-451AE011C457,2023-04-01 01:33:47.191160064,False,2952719,5,,
857,,,,,instagram,openedApp,2023-04-01 01:34:07.028591104,6.0,minimalBreathingExercise,Watch Reel with lady,,,DEFAC6B4-BA34-43B5-A533-451AE011C457,2023-04-01 01:34:07.028591104,False,2952739,5,,
858,,,,,instagram,closedApp,2023-04-01 01:41:11.392625920,,,,,,DEFAC6B4-BA34-43B5-A533-451AE011C457,2023-04-01 01:41:11.392625920,False,2953163,5,,
859,,,,,instagram,openedApp,2023-04-01 02:40:03.744712704,6.0,minimalBreathingExercise,Watch Reel with lady,,,DEFAC6B4-BA34-43B5-A533-451AE011C457,2023-04-01 02:40:03.744712704,False,2956695,5,,


In [14]:
# Write the combined table to "merged.csv" in the current working directory;
# the row labels are written as the first (unnamed) column.
merged.to_csv(path_or_buf="merged.csv", index=True)