# EXAMPLE: Combined QS and Personal Tracking Data

This notebook provides an example of how to combine multiple data sources into a single dataframe. In this example, we combine manual exports with data downloaded from specific services' APIs, including RescueTime, Apple Health, Last.fm, and others. 

NOTE: This is still a work-in-progress. 

-----

## Dependencies and Libraries

In [1]:
from datetime import date, datetime as dt, timedelta as td
import pytz
import numpy as np
import pandas as pd

In [2]:
# Helpers that label a pandas Timestamp as UTC, convert it to Shanghai local
# time, and extract individual date/time elements from the converted value.
def convert_tz(x):
    """Return ``x`` as a timezone-aware ``datetime`` in Asia/Shanghai.

    ``x`` must expose ``to_pydatetime()`` (e.g. a pandas ``Timestamp``).
    Its wall-clock value is labelled as UTC -- any tzinfo it already carries
    is overwritten, not converted -- and then shifted to Shanghai time.
    """
    utc_moment = x.to_pydatetime().replace(tzinfo=pytz.utc)
    return utc_moment.astimezone(pytz.timezone('Asia/Shanghai'))


def get_year(x):
    """Return the Shanghai-local calendar year of ``x`` as an int."""
    return convert_tz(x).year


def get_month(x):
    """Return the Shanghai-local month of ``x`` formatted as 'YYYY-MM'."""
    local = convert_tz(x)  # convert once, reuse for both fields
    return '{}-{:02}'.format(local.year, local.month)


def get_date(x):
    """Return the Shanghai-local date of ``x`` formatted as 'YYYY-MM-DD'."""
    local = convert_tz(x)  # convert once, reuse for all three fields
    return '{}-{:02}-{:02}'.format(local.year, local.month, local.day)


def get_day(x):
    """Return the Shanghai-local day of the month of ``x`` as an int."""
    return convert_tz(x).day


def get_hour(x):
    """Return the Shanghai-local hour (0-23) of ``x`` as an int."""
    return convert_tz(x).hour


def get_day_of_week(x):
    """Return the Shanghai-local weekday of ``x`` (Monday=0 ... Sunday=6)."""
    return convert_tz(x).weekday()

-----

## Import Data

### Steps

In [3]:
# Daily step counts.
# Alternative sources that are not wired in:
#   - fitbit: TODO
#   - apple health:
#       steps = pd.read_csv("apple_health/data/steps_per_day.csv", names=['Date', 'Steps'])

# Source in use: Gyroscope steps export. Column names are supplied here, so the
# file is read as headerless rows; 'Service' is discarded straight away.
steps = pd.read_csv(
    "/Users/markkoester/Dropbox/tracking/gyroscope/gyroscope-Mark-steps-export.csv",
    header=None,
    names=["Date", "Steps", "Service"],
).drop(columns=["Service"])

# steps.head()

### Tasks

In [4]:
# Todoist: completed-task counts, read as headerless (Date, Tasks) rows
tasks = pd.read_csv(
    filepath_or_buffer="todoist/data/todoist-daily-completed.csv",
    header=None,
    names=["Date", "Tasks"],
)

# tasks.tail()

### Time

In [5]:
# Toggl: project time, read as headerless (Date, ProjectSeconds) rows
toggl = (
    pd.read_csv(
        'toggl/data/daily_project_time.csv',
        header=None,
        names=["Date", "ProjectSeconds"],
    )
    # Keep the time in minutes only. Other scalings that were tried:
    #   hours:        ProjectSeconds / 60 / 60
    #   share of day: round(ProjectSeconds / 60 / 60 / 24, 3)
    .assign(ProjectTime=lambda frame: frame['ProjectSeconds'] / 60)
    .drop(columns=['ProjectSeconds'])
)

# toggl.tail()

In [6]:
# RescueTime: neutral / productive / distracting time, read as headerless rows
rescuetime = pd.read_csv(
    filepath_or_buffer="rescuetime/data/days_productive_time.csv",
    header=None,
    names=['Date', 'NeutralTime', 'ProductiveTime', 'DistractingTime'],
)

# rescuetime.tail()

### HRV and Other Morning Data Points

In [7]:
# HRV4Training export
hrv_data = pd.read_csv("/Users/markkoester/Dropbox/tracking/hrv4training/hr4training.csv")

# Drop the training, tag and sport-specific columns that are not carried into
# the combined frame. Several header names in the export contain leading or
# trailing spaces; they are spelled here exactly as they appear in the file.
hrv_data = hrv_data.drop(['test_duration',
       'training', 'training_performance', 'training_type', 'training_phase',
       'physical_condition', 'trainingRPE', ' trainingTSS', ' suffer_score',
       ' trainingMotivation', ' trainingDistance', 'mental_energy',
       'muscle_soreness', 'fatigue', 'advice', 'note', 'signal_quality',
       ' supplements', ' diet', ' custom_tag_1_name', ' custom_tag_2_name',
       ' custom_tag_3_name', ' custom_tag_1_value', ' custom_tag_2_value',
       ' custom_tag_3_value', ' menstrual_cycle', ' trainingTime',
       ' current_lifestyle', ' run_distance', ' run_time', ' run_pace',
       ' run_hr', ' run_elevation', ' bike_distance', ' bike_time',
       ' bike_speed', ' bike_elevation', ' bike_hr', ' bike_power',
       ' swim_distance', ' swim_time', ' swim_speed', ' swim_hr',
       ' daily_message  '], axis=1)

# Normalise the date to a 'YYYY-MM-DD' string so it can be used as a merge key.
# The vectorised .dt accessor replaces a per-row Python strftime call and
# yields a missing value (instead of raising) for dates that parse to NaT.
hrv_data['date'] = pd.to_datetime(hrv_data['date']).dt.strftime('%Y-%m-%d')

# Positional rename of the columns that remain after the drop above: the list
# must stay in the same order as the export's columns. 'date' becomes 'Date',
# the key the other daily frames use.
hrv_data.columns = ['Date', 'timestamp_measurement', 'HR', 'AVNN', 'SDNN', 'rMSSD', 'pNN50',
       'LF', 'HF', 'LFHF', 'HRV4T_Recovery_Points', 'sleep_quality',
       'sleep_time', 'sleep_tobed', 'sleep_awake', 'traveling', 'sickness',
       'alcohol', 'baseline', 'location', 'vo2max', 'latitude',
       'longitude', 'altitude', 'temperature', 'humidity']

# hrv_data.tail()

In [8]:
# TODO: convert sleep duration text to minutes

In [9]:
# Save the trimmed HRV frame without the row index. `index` is a boolean
# parameter, so pass False explicitly rather than relying on None being falsy.
hrv_data.to_csv("data/hrv_data.csv", index=False)

### Running

In [10]:
# Running activities.
# strava (still to be processed -- TODO):
#   strava = pd.read_csv('strava/data/strava-activities-raw.csv')

# Gyroscope running export, without the 'service' and 'id' columns
running = pd.read_csv(
    filepath_or_buffer="/Users/markkoester/Dropbox/tracking/gyroscope/gyroscope-Mark-running-export.csv"
).drop(columns=['service', 'id'])

# parse both activity timestamps
for _stamp_col in ('start_time', 'end_time'):
    running[_stamp_col] = pd.to_datetime(running[_stamp_col])

# length of each run, in seconds and in minutes
running['seconds'] = running['end_time'].sub(running['start_time']).dt.total_seconds()
running['minutes'] = running['seconds'].div(60)

# date and time elements of the start, expressed in Shanghai time
for _target, _extract in (
    ('year', get_year),
    ('month', get_month),
    ('Date', get_date),
    ('day', get_day),
    ('hour', get_hour),
    ('dow', get_day_of_week),
):
    running[_target] = running['start_time'].map(_extract)

# running.head()

In [11]:
# combine to daily number
# how many days did I run?
# Sum minutes, meters and climb per 'Date'. The columns are selected with a
# list: indexing a groupby with a bare tuple of names was deprecated in
# pandas 1.0 and raises in pandas 2.0.
daily_running = (
    running.groupby(['Date'])[['minutes', 'meters', 'climb']]
    .sum()
    # rename by name rather than by position so a reordering cannot mislabel
    .rename(columns={
        'minutes': 'RunningMinutes',
        'meters': 'RunningMeters',
        'climb': 'RunningClimb',
    })
)
print('{:,} total days of running'.format(len(daily_running)))

# 'Date' is the index at this point, so it is written as the first CSV column
daily_running.to_csv('data/daily_running.csv')

# daily_running.tail(5)

463 total days of running


In [12]:
# Reload the daily running totals from the CSV written in the previous cell, so
# that 'Date' comes back as an ordinary column instead of the index.
daily_running = pd.read_csv(filepath_or_buffer='data/daily_running.csv')

### Reading

In [13]:
# Books: Goodreads
# Earlier exports, kept for reference:
#   books = pd.read_csv("goodreads/data/books.csv")
#   books_read = pd.read_csv("goodreads/data/books_read.csv")
#   books_read.tail()
#   len(books_read)

# Export in use (header row is taken from the file)
books_read_with_date = pd.read_csv(filepath_or_buffer="goodreads/data/books-read.csv")

# books_read_with_date.tail()

In [14]:
# books_read_with_date.columns

In [15]:
# combine to daily number
# how many days did I finish a book?
# daily_books = books_read_with_date.groupby(['date'])['num_page', 'meters', 'climb'].sum()
#daily_books.columns = ['RunningSeconds', 'RunningMeters', 'RunningClimb']
#print('{:,} total days of running'.format(len(daily_running)))
#daily_running.to_csv('data/daily_running.csv')
#daily_running.tail(5)

In [16]:
# Kindle: highlight counts, read as headerless (Date, KindleHighlights) rows
highlights_count = pd.read_csv(
    filepath_or_buffer="kindle/data/daily_count_kindle_clippings.csv",
    header=None,
    names=["Date", "KindleHighlights"],
)

# highlights_count.tail()

In [17]:
# Articles Read
# TODO: Pocket Articles
# TODO: Instapaper Articles

### Photos

In [18]:
# Instagram photo posts, taken from the Gyroscope photos export
instagram = pd.read_csv(
    filepath_or_buffer="/Users/markkoester/Dropbox/tracking/gyroscope/gyroscope-Mark-photos-export.csv"
)
instagram['Time'] = pd.to_datetime(instagram['Time'])

# date and time elements of each post in Shanghai time
# (the date column is lowercase 'date' in this frame)
for _target, _extract in (
    ('year', get_year),
    ('month', get_month),
    ('date', get_date),
    ('day', get_day),
    ('hour', get_hour),
    ('dow', get_day_of_week),
):
    instagram[_target] = instagram['Time'].map(_extract)

# instagram.head()

In [19]:
# how many days did I post a photo on social media?
# Count posts per Shanghai date and order the result chronologically.
daily_social_photos = instagram['date'].value_counts().sort_index()
print('{:,} total days when I posted a photo on social media'.format(len(daily_social_photos)))

# header=False is spelled out because Series.to_csv writes a header row by
# default since pandas 1.0. The file is read back as bare (Date, Photos) rows,
# so a header line would be ingested as a bogus data row.
daily_social_photos.to_csv('data/social_photos_count.csv', header=False)

# daily_social_photos.tail(10)

536 total days when I posted a photo on social media


In [20]:
# Read the per-day photo counts back as a two-column frame; the file is treated
# as headerless rows and the columns are named here.
daily_social_photos = pd.read_csv(
    filepath_or_buffer='data/social_photos_count.csv',
    header=None,
    names=['Date', 'Photos'],
)

In [21]:
# Device Photo Taking from PhotoStats
# TODO

In [22]:
# Podcast Listening
# TODO

### Music Listening

In [23]:
# Last.fm: listen counts, read as headerless (Date, Songs) rows
music = pd.read_csv(
    filepath_or_buffer="last_fm/data/lastfm_daily_listens.csv",
    header=None,
    names=["Date", "Songs"],
)

# music.tail()

## Stats

In [24]:
# Row count of every source frame, one summary line per source
for _template, _frame in (
    ('{:,} total computer days', rescuetime),
    ('{:,} total toggl project time days', toggl),
    ('{:,} total days with steps', steps),
    ('{:,} total days with tasks completed', tasks),
    ('{:,} total days with HRV data', hrv_data),
    ('{:,} total days when I posted a photo on social media', daily_social_photos),
    ('{:,} total days with songs listened to', music),
    ('{:,} total days when I did a run', daily_running),
):
    print(_template.format(len(_frame)))

2,171 total computer days
1,883 total toggl project time days
826 total days with steps
630 total days with tasks completed
597 total days with HRV data
536 total days when I posted a photo on social media
540 total days with songs listened to
463 total days when I did a run


----

## Combine and Merge into Unified Dataframe

In [25]:
from functools import reduce

In [26]:
# Frames to combine. The first one (rescuetime) is the base: every later frame
# is left-joined onto it by 'Date', so only dates present in the base survive.
dfs = [rescuetime, toggl, steps, tasks, highlights_count, hrv_data, daily_social_photos, music, daily_running]

data = dfs[0]
for _frame in dfs[1:]:
    data = data.merge(_frame, on='Date', how='left')

In [27]:
# number of rows (days) in the combined frame
print('{:,} total number of data days'.format(data.shape[0]))

2,171 total number of data days


In [28]:
# number of columns in the combined frame, followed by their names
print('{:,} total number of columns'.format(data.shape[1]))
data.columns

38 total number of columns


Index(['Date', 'NeutralTime', 'ProductiveTime', 'DistractingTime',
       'ProjectTime', 'Steps', 'Tasks', 'KindleHighlights',
       'timestamp_measurement', 'HR', 'AVNN', 'SDNN', 'rMSSD', 'pNN50', 'LF',
       'HF', 'LFHF', 'HRV4T_Recovery_Points', 'sleep_quality', 'sleep_time',
       'sleep_tobed', 'sleep_awake', 'traveling', 'sickness', 'alcohol',
       'baseline', 'location', 'vo2max', 'latitude', 'longitude', 'altitude',
       'temperature', 'humidity', 'Photos', 'Songs', 'RunningMinutes',
       'RunningMeters', 'RunningClimb'],
      dtype='object')

In [29]:
# Save the combined frame; the row index is written too (to_csv default).
data.to_csv(path_or_buf="data/combined_personal_data.csv")