In [30]:
import pandas as pd
import numpy as np 
import csv 
from datetime import datetime
import itertools

dataset = "dataset_mood_smartphone.csv" 
df = pd.read_csv(dataset)

In [31]:
def create_daily_df(df): 
    """
    This function aggregates the variable values per day, where it takes the average of mood, arousal and valence. 
    It takes the sum of the other variables. 
    """

    # Add date column to dataframe 
    df["date"] = df["time"].str[:10]

    # Define the variables of which we want the mean per day - all other variables are aggregated in a sum. 
    mean_vars = ["mood", "circumplex.arousal", "circumplex.valence", "activity"]

    # Create the dataframe with the means of variables per day  
    df_means = df[df["variable"].isin(mean_vars)]
    df_means =  df_means.groupby(["id", "date", "variable"], as_index=False)[["value"]].mean()

    # Create the datafram with the sums of variables per day
    df_sums = df[~df["variable"].isin(mean_vars)]
    df_sums = df_sums.groupby(["id", "date", "variable"], as_index=False)[["value"]].sum()

    # Concatenate the two dataframes into the df with daily values
    df_daily = pd.concat([df_sums, df_means])
    df_daily = df_daily.sort_values(by=["id", "date"])

    return df_daily

In [32]:
df_daily = create_daily_df(df)
df_daily

Unnamed: 0,id,date,variable,value
0,AS14.01,2014-02-17,call,2.0
1,AS14.01,2014-02-18,call,1.0
2,AS14.01,2014-02-19,call,7.0
3,AS14.01,2014-02-19,sms,2.0
4,AS14.01,2014-02-20,call,2.0
...,...,...,...,...
4987,AS14.33,2014-05-30,circumplex.valence,-0.4
4988,AS14.33,2014-05-30,mood,6.8
4989,AS14.33,2014-05-31,circumplex.arousal,-2.0
4990,AS14.33,2014-05-31,circumplex.valence,1.0


In [33]:
def create_date_list(df_daily): 

    # Create a list of patient ids 
    ids = df_daily["id"].drop_duplicates().tolist()

    date_instances = []
    date_targets = []

    # Loop through the patient ids and create a list of lists of dates which will later correspond to a data instance. 
    for patient in ids: 
        
        date_instances_id = []
        date_targets_id = []
        
        # Get list of all dates that have a mood recording 
        date_strs = df_daily[(df_daily["variable"]=="mood") & (df_daily["id"] == patient)]["date"].tolist()
        
        # convert dates list to actual times
        dates = [datetime.strptime(d, "%Y-%m-%d") for d in date_strs]
        
        # range through all dates of of a patient 
        for i in range(len(dates)-5):
            diff_time = dates[i+5] - dates[i]
            if diff_time.days != 5: 
                # To ensure there are 5 consecutive days 
                continue 
            date_instances_id.append(date_strs[i:i+4])
            date_targets_id.append(date_strs[i+4])
        
        date_instances.append(date_instances_id)
        date_targets.append(date_targets_id)

    return date_instances, date_targets 

In [38]:
def create_dataframe(df): 
    df_daily = create_daily_df(df)
    date_instances, date_targets = create_date_list(df_daily)
    ids = df_daily["id"].drop_duplicates().tolist()    
    variables = df_daily["variable"].drop_duplicates().tolist() 

    # Initialize 
    x_no = 0
    list_of_rows = []

    for (idx, id_dates, id_targets) in zip(ids, date_instances, date_targets): 
        for (dates, target_date) in zip(id_dates, id_targets): 
            x_no +=1
            df_dates_id = df_daily[(df_daily["date"].isin(dates)) & (df_daily["id"] == idx)]
            for t, date in enumerate(dates): 
                
                # Create dataframe based on the patient id and 
                df_t = df_dates_id[df_dates_id["date"]==date]
                
                # Get a list of variable names 
                names = df_t[df_t["variable"].isin(variables)]['variable'].tolist()
                names_missing = list(set(variables)-set(names))
                names_all = names + names_missing
                
                # Get list of variable values
                values = df_t[df_t["variable"].isin(variables)]['value'].tolist()
                print(idx, names_missing)
                if "screen" in names_missing: 
                    
                    # the average for that person 
                    imputations = []
                    for var in names_missing: 
                        print(var)
                        imputation = float(df_daily[(df_daily["id"]==idx) & (df_daily["variable"]==var)].mean())
                        imputations.append(imputation)
                    values_all = values + imputations
                    
                else:
                    values_all = values + len(names_missing) * [0]
                

                
                # Sort values based on the names_all list 
                values_sorted = [val for name,val in sorted(zip(names_all,values_all))]
                target = float(df_daily[(df_daily["date"]==target_date) & (df_daily["id"]==idx) & (df_daily["variable"]=="mood")]["value"])
                target = round(target)
                
                # Create list with: [data_instance_no, target, t, activity, appCat.builtin, ...]
                row = [x_no, target, t] + values_sorted 
                list_of_rows.append(row)

    columns = ["no", "target", "t"] + sorted(variables)
    dataframe = pd.DataFrame(list_of_rows, columns=columns)

    return dataframe 

# Read dataset as pandas df
dataset = "dataset_mood_smartphone.csv" 
df = pd.read_csv(dataset)

# Create new dataframe 
dataframe = create_dataframe(df)
dataframe

AS14.01 ['appCat.weather', 'appCat.unknown', 'sms', 'appCat.game']
AS14.01 ['appCat.weather', 'appCat.unknown', 'appCat.office', 'appCat.game']
AS14.01 ['sms', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'call', 'appCat.office']
AS14.01 ['appCat.weather', 'appCat.unknown', 'sms', 'appCat.game']
AS14.01 ['appCat.weather', 'appCat.unknown', 'appCat.office', 'appCat.game']
AS14.01 ['sms', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'call', 'appCat.office']
AS14.01 ['appCat.weather', 'appCat.unknown', 'sms', 'appCat.game']
AS14.01 ['appCat.weather', 'appCat.game', 'appCat.travel', 'call', 'appCat.office']
AS14.01 ['sms', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'call', 'appCat.office']
AS14.01 ['appCat.weather', 'appCat.unknown', 'sms', 'appCat.game']
AS14.01 ['appCat.weather', 'appCat.game', 'appCat.travel', 'call', 'appCat.office']
AS14.01 ['appCat.weather', 'sms', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'call', 'appCat.office']
AS14.01 ['appCat.weather', '

AS14.01 ['appCat.weather', 'sms', 'appCat.game', 'appCat.unknown', 'appCat.office']
AS14.01 ['appCat.weather', 'appCat.unknown', 'appCat.office', 'sms']
AS14.01 ['appCat.weather', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.office']
AS14.01 ['appCat.weather', 'sms', 'appCat.game', 'appCat.unknown', 'appCat.office']
AS14.01 ['appCat.weather', 'appCat.unknown', 'appCat.office', 'sms']
AS14.01 ['appCat.weather', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.office']
AS14.01 ['appCat.weather', 'sms', 'appCat.game', 'appCat.unknown', 'appCat.office']
AS14.01 ['appCat.weather', 'sms', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.office']
AS14.01 ['appCat.weather', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.office']
AS14.01 ['appCat.weather', 'sms', 'appCat.game', 'appCat.unknown', 'appCat.office']
AS14.01 ['appCat.weather', 'sms', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.office']
AS14.01 ['appCat.weather', 'sms', 'appCat.

AS14.02 ['appCat.weather', 'appCat.finance', 'sms', 'appCat.game', 'appCat.unknown', 'appCat.utilities', 'appCat.office']
AS14.02 ['appCat.weather', 'appCat.finance', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.office']
AS14.02 ['appCat.weather', 'appCat.finance', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.office']
AS14.02 ['appCat.weather', 'appCat.finance', 'appCat.game', 'appCat.unknown', 'appCat.utilities', 'appCat.office']
AS14.02 ['appCat.weather', 'appCat.finance', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.office']
AS14.02 ['appCat.weather', 'appCat.finance', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.office']
AS14.02 ['appCat.weather', 'appCat.finance', 'appCat.game', 'appCat.unknown', 'appCat.utilities', 'appCat.office']
AS14.02 ['appCat.weather', 'appCat.finance', 'sms', 'appCat.game', 'appCat.travel', 'appCat.unknown', 'appCat.utilities'

TypeError: cannot convert the series to <class 'float'>

In [9]:
util_apps = ["appCat.builtin", 'appCat.finance', 'appCat.office', 'appCat.social', 
       'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather']
com_other_apps = ['appCat.communication', 'appCat.other']
entertain_apps = ['appCat.entertainment', 'appCat.game']

In [11]:
dataframe[dataframe["screen"].isna()]

Unnamed: 0,no,target,t,activity,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,...,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,circumplex.arousal,circumplex.valence,mood,screen,sms
283,71,6,3,,,,,,,,...,,,,,3.0,0.000000,0.0,6.500000,,
286,72,5,2,,,,,,,,...,,,,,3.0,0.000000,0.0,6.500000,,
289,73,6,1,,,,,,,,...,,,,,3.0,0.000000,0.0,6.500000,,
291,73,6,3,,,,,,,,...,,,,,9.0,-0.666667,0.0,5.000000,,3.0
292,74,6,0,,,,,,,,...,,,,,3.0,0.000000,0.0,6.500000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,999,7,3,,,,,,,,...,,,,,,1.000000,1.0,7.000000,,
3996,1000,7,0,,,,,,,,...,,,,,,1.000000,1.0,7.333333,,
3997,1000,7,1,,,,,,,,...,,,,,,0.400000,1.0,7.000000,,
3998,1000,7,2,,,,,,,,...,,,,,,1.000000,1.0,7.000000,,


In [40]:
df_daily = create_daily_df(df)
names_missing = ['screen', 'appCat.weather', 'appCat.finance', 'sms', 'activity', 'appCat.communication', 'appCat.travel', 'appCat.other', 'appCat.entertainment', 'appCat.game', 'appCat.unknown', 'appCat.builtin', 'appCat.utilities', 'appCat.social', 'appCat.office']
df_id = df_daily[(df_daily["id"] == "AS14.02")]
missing = []
for var in names_missing: 
    miss = df_id[df_id["variable"]==var].mean()
    print(miss)
    missing.append(float(df_id[df_id["variable"]==var].mean()))
print(missing) 

value    4743.092514
dtype: float64
id          NaN
date        NaN
variable    NaN
value       NaN
dtype: object


TypeError: cannot convert the series to <class 'float'>