# Assignment 1 - DMT 
Notebook for the first assignment

### Import dataset 

In [1]:
# Import 
import pandas as pd
import numpy as np 
import csv 

In [2]:
# Import dataset and read as pandas dataframe 

dataset = "dataset_mood_smartphone.csv" 
df = pd.read_csv(dataset)

# Add date column to dataframe 
df["date"] = df["time"].str[:10]

# Show example of dataset 
df.head()

Unnamed: 0.1,Unnamed: 0,id,time,variable,value,date
0,1,AS14.01,2014-02-26 13:00:00.000,mood,6.0,2014-02-26
1,2,AS14.01,2014-02-26 15:00:00.000,mood,6.0,2014-02-26
2,3,AS14.01,2014-02-26 18:00:00.000,mood,6.0,2014-02-26
3,4,AS14.01,2014-02-26 21:00:00.000,mood,7.0,2014-02-26
4,5,AS14.01,2014-02-27 09:00:00.000,mood,6.0,2014-02-27


In [3]:
# Define the variables of which we want the mean per day - all other variables are aggregated in a sum. 
mean_vars = ["mood", "circumplex.arousal", "circumplex.valence"]

# Create the dataframe with the means of variables per day  
df_means = df[df["variable"].isin(mean_vars)]
df_means =  df_means.groupby(["id", "date", "variable"], as_index=False)[["value"]].mean()

# Create the datafram with the sums of variables per day
df_sums = df[~df["variable"].isin(mean_vars)]
df_sums = df_sums.groupby(["id", "date", "variable"], as_index=False)[["value"]].sum()

# Concatenate the two dataframes into the df with daily values
df_daily = pd.concat([df_sums, df_means])
df_daily = df_daily.sort_values(by=["id", "date"])

# Print preview df_daily
df_daily


Unnamed: 0,id,date,variable,value
0,AS14.01,2014-02-17,call,2.0
1,AS14.01,2014-02-18,call,1.0
2,AS14.01,2014-02-19,call,7.0
3,AS14.01,2014-02-19,sms,2.0
4,AS14.01,2014-02-20,call,2.0
...,...,...,...,...
3799,AS14.33,2014-05-30,circumplex.valence,-0.4
3800,AS14.33,2014-05-30,mood,6.8
3801,AS14.33,2014-05-31,circumplex.arousal,-2.0
3802,AS14.33,2014-05-31,circumplex.valence,1.0


In [4]:
# Get 
ids = df_daily[df_daily["variable"] == "mood"]["id"].tolist()

from datetime import datetime

date_instances = []
date_targets = []

for patient in ids: 
    
    date_instances_id = []
    date_targets_id = []
    
    # Get list of all dates that have a mood recording 
    date_strs = df_daily[(df_daily["variable"]=="mood") & (df_daily["id"] == patient)]["date"].tolist()
    
    # convert dates list to actual times
    dates = [datetime.strptime(d, "%Y-%m-%d") for d in date_strs]
    
    # range through all dates of of a patient 
    for i in range(len(dates)-5):
        diff_time = dates[i+5] - dates[i]
        if diff_time.days != 5: 
            # To ensure there are 5 consecutive days 
            continue 
        date_instances_id.append(date_strs[i:i+4])
        date_targets_id.append(date_strs[i+5])
    
    date_instances.append(date_instances_id)
    date_targets.append(date_targets_id)
        
# This creates a list of lists, where the first item corresponds to the first patient id, and this list is: 
# date_instances = [[t1, t2, t3, t4], [t2, t3, t4, t5], ...]
# date_targets = [t5, t6, ...]
print(date_instances[0])

[['2014-03-21', '2014-03-22', '2014-03-23', '2014-03-24'], ['2014-03-22', '2014-03-23', '2014-03-24', '2014-03-25'], ['2014-03-23', '2014-03-24', '2014-03-25', '2014-03-26'], ['2014-03-24', '2014-03-25', '2014-03-26', '2014-03-27'], ['2014-03-25', '2014-03-26', '2014-03-27', '2014-03-28'], ['2014-03-26', '2014-03-27', '2014-03-28', '2014-03-29'], ['2014-03-27', '2014-03-28', '2014-03-29', '2014-03-30'], ['2014-03-28', '2014-03-29', '2014-03-30', '2014-03-31'], ['2014-03-29', '2014-03-30', '2014-03-31', '2014-04-01'], ['2014-03-30', '2014-03-31', '2014-04-01', '2014-04-02'], ['2014-03-31', '2014-04-01', '2014-04-02', '2014-04-03'], ['2014-04-01', '2014-04-02', '2014-04-03', '2014-04-04'], ['2014-04-02', '2014-04-03', '2014-04-04', '2014-04-05'], ['2014-04-03', '2014-04-04', '2014-04-05', '2014-04-06'], ['2014-04-04', '2014-04-05', '2014-04-06', '2014-04-07'], ['2014-04-05', '2014-04-06', '2014-04-07', '2014-04-08'], ['2014-04-06', '2014-04-07', '2014-04-08', '2014-04-09'], ['2014-04-07'

In [6]:
print(date_instances[0])

[['2014-03-21', '2014-03-22', '2014-03-23', '2014-03-24'], ['2014-03-22', '2014-03-23', '2014-03-24', '2014-03-25'], ['2014-03-23', '2014-03-24', '2014-03-25', '2014-03-26'], ['2014-03-24', '2014-03-25', '2014-03-26', '2014-03-27'], ['2014-03-25', '2014-03-26', '2014-03-27', '2014-03-28'], ['2014-03-26', '2014-03-27', '2014-03-28', '2014-03-29'], ['2014-03-27', '2014-03-28', '2014-03-29', '2014-03-30'], ['2014-03-28', '2014-03-29', '2014-03-30', '2014-03-31'], ['2014-03-29', '2014-03-30', '2014-03-31', '2014-04-01'], ['2014-03-30', '2014-03-31', '2014-04-01', '2014-04-02'], ['2014-03-31', '2014-04-01', '2014-04-02', '2014-04-03'], ['2014-04-01', '2014-04-02', '2014-04-03', '2014-04-04'], ['2014-04-02', '2014-04-03', '2014-04-04', '2014-04-05'], ['2014-04-03', '2014-04-04', '2014-04-05', '2014-04-06'], ['2014-04-04', '2014-04-05', '2014-04-06', '2014-04-07'], ['2014-04-05', '2014-04-06', '2014-04-07', '2014-04-08'], ['2014-04-06', '2014-04-07', '2014-04-08', '2014-04-09'], ['2014-04-07'

In [7]:
d = date_instances[0][0]
# df_daily[(df_daily["date"].isin(d)) & (df_daily["id"] == "AS14.01")]
test_inst = df_daily[(df_daily["date"].isin(d)) & (df_daily["id"] == "AS14.01")]
l = test_inst[(test_inst["date"]=="2014-03-21") & (test_inst["variable"]=="appCat.builtin")]["value"]
# int(l)
print(type(l))
# print(test_inst)
print(l)
t1 = test_inst[test_inst["date"]=="2014-03-21"]
print(t1)
variables = df_daily["variable"].drop_duplicates().tolist()
print(variables)
t1[t1["variable"]]

<class 'pandas.core.series.Series'>
42    3139.218
Name: value, dtype: float64
         id        date              variable         value
41  AS14.01  2014-03-21              activity      3.083152
42  AS14.01  2014-03-21        appCat.builtin   3139.218000
43  AS14.01  2014-03-21  appCat.communication   6280.890000
44  AS14.01  2014-03-21  appCat.entertainment   1007.456000
45  AS14.01  2014-03-21        appCat.finance     49.544000
46  AS14.01  2014-03-21         appCat.office    172.206000
47  AS14.01  2014-03-21          appCat.other    239.751000
48  AS14.01  2014-03-21         appCat.social   4508.500000
49  AS14.01  2014-03-21         appCat.travel    915.445000
50  AS14.01  2014-03-21      appCat.utilities    598.754000
51  AS14.01  2014-03-21                  call      6.000000
52  AS14.01  2014-03-21                screen  17978.907000
6   AS14.01  2014-03-21    circumplex.arousal      0.200000
7   AS14.01  2014-03-21    circumplex.valence      0.200000
8   AS14.01  2014-03-

KeyError: "None of [Index(['activity', 'appCat.builtin', 'appCat.communication',\n       'appCat.entertainment', 'appCat.finance', 'appCat.office',\n       'appCat.other', 'appCat.social', 'appCat.travel', 'appCat.utilities',\n       'call', 'screen', 'circumplex.arousal', 'circumplex.valence', 'mood'],\n      dtype='object')] are in the [columns]"