In [266]:
import json
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import glob
import os
import seaborn as sns
import matplotlib.ticker as ticker
from datetime import datetime
from collections import defaultdict

# This notebook imports and explores data from Google browsing history, Fitbit, Strava, Instagram, and LinkedIn so the sources can later be combined and correlated.
# A later step looks at how these signals change during different life events.
# This specific file only loads every source and merges it all into one CSV file.

## Google Browsing Data

In [267]:
def get_google_data(type_data):
    """Load one Chrome history export, report on it, and return daily counts.

    Reads ``./Takeout_{type_data}/Chrome_{type_data}/History.json``, prints
    overview statistics, shows two plots (searches vs. other browsing and
    visits per day) and returns the per-day counts plus a one-row summary.

    Parameters
    ----------
    type_data : str
        Label of the export (e.g. ``"personal"``). It is used in the input
        path, in printed messages / plot titles and as the suffix of the
        count columns in the returned daily table.

    Returns
    -------
    daily : pandas.DataFrame
        One row per day that has at least one history entry, with columns
        ``date``, ``total_sites_{type_data}`` and
        ``google_searches_{type_data}``.
    summary : pandas.DataFrame
        Single row with the totals and the per-day mean / standard deviation
        of all visits, Google searches and non-search visits.

    Raises
    ------
    ValueError
        If the file contains no "Browser History" entries.
    """
    file_in = f'./Takeout_{type_data}/Chrome_{type_data}/History.json'
    with open(file_in, "r", encoding="utf-8") as f:
        google_data = json.load(f)

    browser_history = google_data.get("Browser History", [])
    if not browser_history:
        # An empty frame has no "time_usec" column and would fail below with
        # an opaque KeyError; report the offending file instead.
        raise ValueError(f"No 'Browser History' entries found in {file_in}")

    # Convert JSON to DataFrame
    google_df = pd.DataFrame(browser_history)

    # time_usec is treated as microseconds since the epoch; whole seconds are
    # enough here. NOTE: an epoch-based conversion yields naive UTC times, so
    # the per-day grouping below uses UTC calendar days, not local ones.
    google_df["datetime"] = pd.to_datetime(google_df["time_usec"] // 1_000_000, unit="s")
    google_df = google_df[["datetime", "title", "url"]].copy()
    print(google_df.head(2))

    # Find the first and last date
    first_date = google_df["datetime"].min()
    last_date = google_df["datetime"].max()

    print(f"First recorded browsing activity for {type_data}: {first_date}")
    print(f"Last recorded browsing activity for {type_data}: {last_date}")

    # A visit counts as a Google search when its page title carries the
    # "- Google Search" suffix; entries without a title count as "other".
    google_df["is_google_search"] = google_df["title"].str.contains("- Google Search", na=False)

    # Count searches vs. other browsing
    search_count = google_df["is_google_search"].sum()
    other_count = len(google_df) - search_count

    print(f"Total Google Searches for {type_data}: {search_count}")
    print(f"Total Other Browsing for {type_data}: {other_count}")

    # Visualization
    counts = [search_count, other_count]
    labels = ["Google Searches", "Other Browsing"]

    plt.figure(figsize=(6, 4))
    plt.bar(labels, counts)
    plt.xlabel("Activity Type")
    plt.ylabel("Count")
    plt.title(f"Google Searches vs. Other Browsing for {type_data}")
    plt.show()

    # Per-day counts. All three series share the same index (every day with
    # at least one visit), so a day with visits but no searches contributes a
    # 0 to the search statistics instead of being silently left out. This
    # keeps the summary consistent with the zero-filled daily table returned
    # below.
    visit_date = google_df["datetime"].dt.date
    daily_site_visits = google_df.groupby(visit_date).size()
    daily_google_searches = google_df["is_google_search"].astype(int).groupby(visit_date).sum()
    daily_non_google_visits = daily_site_visits - daily_google_searches

    # Plot the time series of daily site visits
    plt.figure(figsize=(12, 5))
    plt.plot(daily_site_visits.index, daily_site_visits.values, marker="o", linestyle="-", label="Sites Visited")

    plt.xlabel("Date")
    plt.ylabel("Number of Sites Visited")
    plt.title(f"Daily Number of Sites Visited Over Time for {type_data}")
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()
    plt.show()

    # Calc statistics
    avg_sites_per_day = daily_site_visits.mean()
    std_sites_per_day = daily_site_visits.std()
    avg_google_searches_per_day = daily_google_searches.mean()
    std_google_searches_per_day = daily_google_searches.std()

    # Print the results
    print(f"Average number of sites visited per day for {type_data}: {avg_sites_per_day:.2f}")
    print(f"Standard deviation of sites visited per day for {type_data}: {std_sites_per_day:.2f}")
    print(f"Average number of Google searches per day for {type_data}: {avg_google_searches_per_day:.2f}")
    print(f"Standard deviation of Google searches per day for {type_data}: {std_google_searches_per_day:.2f}")

    avg_non_google_sites_per_day = daily_non_google_visits.mean()
    std_non_google_sites_per_day = daily_non_google_visits.std()

    print(f"Average number of non-Google search sites visited per day for {type_data}: {avg_non_google_sites_per_day:.2f}")
    print(f"Standard deviation of non-Google search sites visited per day for {type_data}: {std_non_google_sites_per_day:.2f}")

    # Daily time series
    daily = pd.DataFrame({
        "date": daily_site_visits.index,
        f"total_sites_{type_data}": daily_site_visits.values,
        f"google_searches_{type_data}": daily_google_searches.values,
    })

    # Summary stats
    summary = {
        "type": type_data,
        "total_searches": search_count,
        "total_other": other_count,
        "avg_sites_per_day": avg_sites_per_day,
        "std_sites_per_day": std_sites_per_day,
        "avg_searches_per_day": avg_google_searches_per_day,
        "std_searches_per_day": std_google_searches_per_day,
        "avg_non_searches_per_day": avg_non_google_sites_per_day,
        "std_non_searches_per_day": std_non_google_sites_per_day,
    }

    return daily, pd.DataFrame([summary])



In [None]:
# Build the daily table and summary row for each Chrome export.
# Each call also prints statistics and shows two plots.
personal_daily, personal_summary = get_google_data("personal")
school_daily, school_summary = get_google_data("UVA")

In [None]:
# Outer-join the two daily tables on date; a day that is missing from one
# export gets 0 for that export's columns.
merged_daily = personal_daily.merge(school_daily, on="date", how="outer").fillna(0)
merged_daily["date"] = pd.to_datetime(merged_daily["date"])

# Both browsing series on one set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
for visits_column, series_label in (
    ("total_sites_personal", "Personal Browsing"),
    ("total_sites_UVA", "UVA Browsing"),
):
    ax.plot(merged_daily["date"], merged_daily[visits_column], label=series_label)
ax.set_xlabel("Date")
ax.set_ylabel("Sites Visited")
ax.set_title("Personal vs. UVA Browsing Over Time")
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
plt.show()

# Stack the two one-row summaries for side-by-side reading.
combined_summary = pd.concat([personal_summary, school_summary])
print(combined_summary)

# Totals across both exports
total_searches = combined_summary["total_searches"].sum()
total_other = combined_summary["total_other"].sum()
print(
    f"Total Google Searches (combined): {total_searches}\n"
    f"Total Other Browsing (combined): {total_other}"
)

In [None]:
# Total browsing per day = personal visits + UVA visits.
personal_visits = merged_daily["total_sites_personal"]
uva_visits = merged_daily["total_sites_UVA"]
merged_daily["total_sites_visited"] = personal_visits.add(uva_visits)

# Alias only: both names refer to the same DataFrame object.
google_browser_data = merged_daily
google_browser_data

In [None]:
# Reduce the browsing table to the date and the combined visit count.
browser_columns = ["date", "total_sites_visited"]
use_google_browser_data = google_browser_data.loc[:, browser_columns]
use_google_browser_data

### From now on, we use use_google_browser_data to access total browsing activity!

## Fitbit Data

### Sleep

In [None]:
# Collect start/end times of every Fitbit sleep session inside the study window.
start_date = pd.to_datetime("2024-05-14").date()
end_date = pd.to_datetime("2025-04-15").date()

# Every export file whose name starts with "sleep"
sleep_files = glob.glob("./Takeout_personal/Fitbit/Global_Export_Data/sleep*.json")

print("Found sleep files:", sleep_files)

records = []

for file in sleep_files:
    print(f"\nProcessing file: {file}") 
    with open(file, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"Error reading JSON file: {file}")
            continue

    # Each file is expected to hold a list of session dicts.
    if not isinstance(data, list):
        print(f"Unexpected data format in {file}: {type(data)}. Skipping.")
        continue

    for session in data:
        # One malformed session must not abort the rest of the file.
        try:
            date_str = session.get("dateOfSleep", None)
            if not date_str:
                print(f"Missing dateOfSleep in {file}, skipping entry.")
                continue

            session_date = pd.to_datetime(date_str, errors="coerce").date()
            print(f"Extracted sleep date: {session_date}")

            # Sessions outside the window are dropped without a message.
            if not (start_date <= session_date <= end_date):
                continue

            sleep_start = pd.to_datetime(session.get("startTime", None), errors="coerce")
            wake_time = pd.to_datetime(session.get("endTime", None), errors="coerce")

            if pd.isnull(sleep_start) or pd.isnull(wake_time):
                print(f"Invalid sleep times for {session_date}, skipping!")
                continue

            records.append({
                "date": session_date,
                "startTime": sleep_start,
                "endTime": wake_time
            })
        except Exception as e:
            print(f"Error processing session in {file}: {e}")

sleep_df = pd.DataFrame(records)

if sleep_df.empty or "date" not in sleep_df.columns:
    print("No valid sleep records found in the date range.")
else:
    sleep_df = sleep_df.sort_values("date").reset_index(drop=True)


In [None]:
# Display the current sleep_df table for inspection.
sleep_df

In [270]:
# Sleep duration per session, as a timedelta and as fractional hours.
for time_column in ("startTime", "endTime"):
    sleep_df[time_column] = pd.to_datetime(sleep_df[time_column])

sleep_df["sleep_duration"] = sleep_df["endTime"].sub(sleep_df["startTime"])
sleep_df["sleep_hours"] = sleep_df["sleep_duration"].dt.total_seconds().div(3600)


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of hours slept per session
sleep_df["sleep_hours"].describe()

In [None]:
# Sleep duration over time: raw series first, then with a smoothed overlay.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(sleep_df["date"], sleep_df["sleep_hours"], linestyle='-', linewidth=2, color='steelblue', label="Sleep Duration")
ax.set_title("Daily Sleep Duration Over Time", fontsize=16)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Hours of Sleep", fontsize=12)
plt.xticks(rotation=45)
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend()
fig.tight_layout()
plt.show()

# Rolling mean over 7 consecutive rows. The window counts rows (sessions),
# so it spans exactly 7 days only when there is one session per day.
# The first 6 rows are NaN.
sleep_df["rolling_7day_avg"] = sleep_df["sleep_hours"].rolling(window=7).mean()

# Raw series (faded) with the rolling average on top
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(sleep_df["date"], sleep_df["sleep_hours"], alpha=0.5, label="Daily Sleep")
ax.plot(sleep_df["date"], sleep_df["rolling_7day_avg"], color='red', linewidth=2, label="7-Day Rolling Avg")
ax.set_title("Sleep Duration with 7-Day Rolling Average")
ax.set_xlabel("Date")
ax.set_ylabel("Hours of Sleep")
plt.xticks(rotation=45)
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()



In [None]:
# Distribution of sleep durations
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(sleep_df["sleep_hours"], bins=20, color="skyblue", edgecolor="black")
ax.set_title("Distribution of Sleep Durations")
ax.set_xlabel("Hours of Sleep")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

# Bucket sessions by length: short (<6 h), good (6-9 h inclusive), long (>9 h).
hours_slept = sleep_df["sleep_hours"]
poor_sleep = sleep_df[hours_slept < 6]
good_sleep = sleep_df[hours_slept.between(6, 9)]
long_sleep = sleep_df[hours_slept > 9]

print(f"Number of short sleep nights (<6 hrs): {len(poor_sleep)}")
print(f"Number of good sleep nights (6-9 hrs): {len(good_sleep)}")
print(f"Number of long sleep nights (>9 hrs): {len(long_sleep)}")

# Weekly mean of sleep hours; resampling needs a real datetime column.
sleep_df["date"] = pd.to_datetime(sleep_df["date"]) 
weekly_avg = sleep_df.resample("W", on="date")["sleep_hours"].mean()


#### Now add sleep score too

In [193]:
# Load the Fitbit sleep scores and keep one (date, overall_score) pair per row.
score_df = pd.read_csv("./Takeout_personal/Fitbit/Sleep_Score/sleep_score.csv")

# The calendar date is taken from the "timestamp" column.
score_dates = pd.to_datetime(score_df["timestamp"]).dt.date
score_df = score_df.assign(date=score_dates)[["date", "overall_score"]]

In [None]:
# Display the current score_df table for inspection.
score_df

In [195]:
# Normalise both date columns to midnight datetime64 values so the join keys
# compare equal.
sleep_df["date"] = pd.to_datetime(sleep_df["date"]).dt.normalize()
score_df["date"] = pd.to_datetime(score_df["date"]).dt.normalize()

# Drop a score column left behind by an earlier run of this cell. Without
# this, re-running would merge a second time and produce overall_score_x /
# overall_score_y, which breaks every later cell that reads "overall_score".
sleep_df = sleep_df.drop(columns=["overall_score"], errors="ignore")

# Left join: every sleep session is kept; sessions without a score get NaN.
sleep_df = pd.merge(sleep_df, score_df, on="date", how="left")


In [None]:
# Display sleep_df again, now including the merged overall_score column.
sleep_df

In [None]:
# Sleep duration (left axis) and sleep score (right axis) on a shared date axis.

# Rolling mean over 7 consecutive rows (sessions); the first 6 rows are NaN.
sleep_df["rolling_7day_avg"] = sleep_df["sleep_hours"].rolling(window=7).mean()

# Set up dual-axis plot
fig, ax1 = plt.subplots(figsize=(12, 5))

# Left Y-axis: Sleep hours
ax1.plot(sleep_df["date"], sleep_df["sleep_hours"], color='lightblue', alpha=0.5, label="Daily Sleep")
ax1.plot(sleep_df["date"], sleep_df["rolling_7day_avg"], color='red', linewidth=2, label="7-Day Rolling Avg")
ax1.set_xlabel("Date")
ax1.set_ylabel("Sleep Duration (hrs)", color='black')
ax1.tick_params(axis='y')
ax1.grid(True, linestyle='--', alpha=0.5)

# Right Y-axis: Sleep score
ax2 = ax1.twinx()
ax2.plot(sleep_df["date"], sleep_df["overall_score"], color='green', linestyle='--', label="Sleep Score")
ax2.set_ylabel("Sleep Score", color='green')
ax2.tick_params(axis='y', labelcolor='green')

# Combine legends from both axes into a single box
lines_1, labels_1 = ax1.get_legend_handles_labels()
lines_2, labels_2 = ax2.get_legend_handles_labels()
ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left")

# Title and x tick rotation are set on ax1 explicitly. After twinx() the
# "current" axes is ax2, whose x-axis is hidden, so plt.xticks(rotation=45)
# would rotate labels that are never drawn and leave the visible ones flat.
ax1.set_title("Sleep Duration and Sleep Score Over Time")
ax1.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Select the sleep columns used later and give them self-describing names.
# rename() without inplace returns a new frame, so this neither touches
# sleep_df nor raises SettingWithCopyWarning (which an in-place rename on a
# column slice of sleep_df does).
use_sleep_data = sleep_df[["date", "startTime", "endTime", "sleep_hours", "overall_score"]].rename(
    columns={
        "startTime": "sleep_start_time",
        "endTime": "sleep_end_time",
        "overall_score": "sleep_score",
    }
)

In [None]:
# Display the renamed sleep table.
use_sleep_data

### Fitbit Heart Rate data

In [None]:
# Average heart rate per day, one Fitbit export file per day.
start_date = pd.to_datetime("2024-05-14").date()
end_date = pd.to_datetime("2025-04-15").date()

# All heart rate JSON files
hr_files = glob.glob("./Takeout_personal/Fitbit/Global_Export_Data/heart_rate-*.json")

daily_averages = []

for file_path in hr_files:
    # The day is taken from the file name: heart_rate-<date>.json
    filename = os.path.basename(file_path)
    date_str = filename.replace("heart_rate-", "").replace(".json", "")
    parsed_date = pd.to_datetime(date_str, errors="coerce")
    if pd.isnull(parsed_date):
        # A stray file matching the glob must not abort the whole loop.
        print(f"Skipping {file_path}: cannot parse a date from the file name")
        continue
    file_date = parsed_date.date()

    # Skip if out of date range
    if not (start_date <= file_date <= end_date):
        continue

    # utf-8 matches how the other JSON exports in this notebook are read.
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)

            # Keep only entries shaped like {"value": {"bpm": ...}}.
            bpm_values = [
                entry["value"]["bpm"]
                for entry in data
                if isinstance(entry, dict)
                and isinstance(entry.get("value"), dict)
                and "bpm" in entry["value"]
            ]

            # Days without any usable sample produce no row.
            if bpm_values:
                avg_bpm = sum(bpm_values) / len(bpm_values)
                daily_averages.append({"date": file_date, "avg_bpm": avg_bpm})

        except Exception as e:
            print(f"Skipping {file_path} due to error: {e}")

# Final DataFrame. Explicit columns keep "date" present even when nothing was
# loaded, so the sort below cannot fail with a KeyError.
heart_rate_daily_avg_df = pd.DataFrame(daily_averages, columns=["date", "avg_bpm"])
heart_rate_daily_avg_df = heart_rate_daily_avg_df.sort_values("date")

print(heart_rate_daily_avg_df.head())


In [None]:
# Use the column name the rest of the notebook expects.
heart_rate_daily_avg_df = heart_rate_daily_avg_df.rename(columns={"avg_bpm": "avg_heart_rate"})
heart_rate_daily_avg_df

In [None]:
# Heart rate: raw daily series, smoothed series, distribution, then summary numbers.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(heart_rate_daily_avg_df["date"], heart_rate_daily_avg_df["avg_heart_rate"], color="salmon", linewidth=2)
ax.set_title("Daily Average Heart Rate Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Avg Heart Rate (BPM)")
ax.grid(True, linestyle="--", alpha=0.6)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Rolling mean over 7 consecutive rows (days that produced a row); first 6 are NaN.
heart_rate_daily_avg_df["rolling_avg"] = heart_rate_daily_avg_df["avg_heart_rate"].rolling(window=7).mean()

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(heart_rate_daily_avg_df["date"], heart_rate_daily_avg_df["avg_heart_rate"], color="lightcoral", alpha=0.5, label="Daily Avg")
ax.plot(heart_rate_daily_avg_df["date"], heart_rate_daily_avg_df["rolling_avg"], color="darkred", linewidth=2, label="7-Day Rolling Avg")
ax.set_title("Heart Rate with 7-Day Rolling Average")
ax.set_xlabel("Date")
ax.set_ylabel("Avg BPM")
ax.grid(True, linestyle="--", alpha=0.6)
ax.legend()
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(heart_rate_daily_avg_df["avg_heart_rate"], bins=20, color="salmon", edgecolor="black")
ax.set_title("Distribution of Daily Average Heart Rate")
ax.set_xlabel("Avg BPM")
ax.set_ylabel("Frequency")
ax.grid(True, linestyle="--", alpha=0.5)
fig.tight_layout()
plt.show()

# Summary statistics of the daily averages
daily_avg_hr = heart_rate_daily_avg_df["avg_heart_rate"]
max_hr = daily_avg_hr.max()
min_hr = daily_avg_hr.min()
avg_hr = daily_avg_hr.mean()
std_hr = daily_avg_hr.std()

print(
    f"Max avg hr in a day: {max_hr}\n"
    f"Min avg hr in a day: {min_hr}\n"
    f"Average avg hr per day: {avg_hr:.2f}\n"
    f"Standard deviation: {std_hr:.2f}"
)



### Fitbit active level

In [None]:
# Daily minutes per Fitbit activity level, one column per level.
from functools import reduce

# Set date range
start_date = pd.to_datetime("2024-05-14").date()
end_date = pd.to_datetime("2025-04-15").date()

# Mapping from export file prefix to output column name
activity_types = {
    "very_active_minutes": "mins_very_active",
    "moderately_active_minutes": "mins_moderately_active",
    "lightly_active_minutes": "mins_lightly_active",
    "sedentary_minutes": "mins_sedentary"
}

# One daily DataFrame per activity type
all_dfs = []

for prefix, col_name in activity_types.items():
    files = glob.glob(f"./Takeout_personal/Fitbit/Global_Export_Data/{prefix}-*.json")
    rows = []

    for file_path in files:
        # utf-8 matches how the other JSON exports in this notebook are read.
        with open(file_path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
                for entry in data:
                    if "dateTime" in entry and "value" in entry:
                        dt = pd.to_datetime(entry["dateTime"]).date()
                        if start_date <= dt <= end_date:
                            value = int(entry["value"])
                            rows.append({"date": dt, col_name: value})
            except Exception as e:
                # Rows read before the error are kept; the rest of the file is skipped.
                print(f"Skipping {file_path} due to error: {e}")

    # Explicit columns and dtype keep this working when a type has no rows in
    # the window (a bare DataFrame([]) has no "date" column to group by).
    df = (
        pd.DataFrame(rows, columns=["date", col_name])
        .astype({col_name: "int64"})
        .groupby("date")
        .sum()
        .reset_index()
    )
    all_dfs.append(df)

# Outer-join all activity types on date so no day is lost
activity_df = reduce(lambda left, right: pd.merge(left, right, on="date", how="outer"), all_dfs)

# A day missing for one activity type counts as 0 minutes of that type
activity_df = activity_df.fillna(0).sort_values("date")

activity_df

In [None]:
# Stacked bar chart: minutes per activity level for every day.
activity_df = activity_df.sort_values("date").reset_index(drop=True)

# Bars are placed at integer positions; the dates are only used as tick labels.
x = activity_df.index
labels = activity_df["date"].astype(str)

fig, ax = plt.subplots(figsize=(20, 6))

# (column, legend label, colour), listed from the bottom of the stack upward
stacked_layers = [
    ("mins_sedentary", "Sedentary", "#8c8c8c"),
    ("mins_lightly_active", "Lightly Active", "#a1d99b"),
    ("mins_moderately_active", "Moderately Active", "#fc8d62"),
    ("mins_very_active", "Very Active", "#1f78b4"),
]
layer_base = 0
for layer_column, layer_label, layer_color in stacked_layers:
    ax.bar(x, activity_df[layer_column], bottom=layer_base, label=layer_label, color=layer_color)
    layer_base = layer_base + activity_df[layer_column]

# Label roughly 30 evenly spaced days to keep the axis readable
tick_step = max(len(x) // 30, 1)
ax.set_xticks(x[::tick_step])
ax.set_xticklabels(labels[::tick_step], rotation=90)

# Leave 5% headroom above the tallest stacked bar
max_total = activity_df[[layer_column for layer_column, _, _ in stacked_layers]].sum(axis=1).max()
ax.set_ylim(0, max_total * 1.05)

ax.set_xlabel("Date")
ax.set_ylabel("Minutes")
ax.set_title("Daily Activity Breakdown")
ax.legend(loc="upper right")
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()


In [None]:
# Same data as line series, one line per activity level.
level_colors = {
    "mins_very_active": "#1f78b4",
    "mins_moderately_active": "#fc8d62",
    "mins_lightly_active": "#a1d99b",
    "mins_sedentary": "#8c8c8c",
}

fig, ax = plt.subplots(figsize=(14, 5))
for col, color in level_colors.items():
    # "mins_very_active" -> "Very Active"
    legend_label = col.replace("mins_", "").replace("_", " ").title()
    ax.plot(activity_df["date"], activity_df[col], label=legend_label, linewidth=2, alpha=0.8, color=color)

ax.set_title("Activity Level Trends Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Minutes")
plt.xticks(rotation=45)
ax.grid(True, linestyle="--", alpha=0.5)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Collapse the three active levels into one total and compare it with sedentary time.
activity_df["active_total"] = (
    activity_df["mins_very_active"]
    .add(activity_df["mins_moderately_active"])
    .add(activity_df["mins_lightly_active"])
)
activity_df["sedentary_total"] = activity_df["mins_sedentary"]

fig, ax = plt.subplots(figsize=(14, 5))
for total_column, total_label, total_color in (
    ("active_total", "Total Active", "green"),
    ("sedentary_total", "Sedentary", "gray"),
):
    ax.plot(activity_df["date"], activity_df[total_column], label=total_label, color=total_color, linewidth=2)
ax.set_title("Total Active vs Sedentary Time")
ax.set_xlabel("Date")
ax.set_ylabel("Minutes")
ax.legend()
ax.grid(True, linestyle="--", alpha=0.5)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Box plot of the daily minutes for each activity level.
activity_minute_columns = ["mins_very_active", "mins_moderately_active", "mins_lightly_active", "mins_sedentary"]
activity_df[activity_minute_columns].plot.box(
    figsize=(10, 5), title="Distribution of Daily Activity Minutes", grid=True)



In [None]:
# Summary statistics of daily very-active minutes
activity_df["mins_very_active"].describe()

In [None]:
# Summary statistics of daily moderately-active minutes
activity_df["mins_moderately_active"].describe()

In [None]:
# Summary statistics of daily lightly-active minutes
activity_df["mins_lightly_active"].describe()

In [None]:
# Summary statistics of daily sedentary minutes
activity_df["mins_sedentary"].describe()

In [None]:
# Display the activity table, including the derived total columns.
activity_df

### STEPS!

In [None]:
# Total steps per day, summed over all step CSV exports.
start_date = pd.to_datetime("2024-05-14").date()
end_date = pd.to_datetime("2025-04-15").date()

# Find all relevant files
step_files = glob.glob("./Takeout_personal/Fitbit/Physical_Activity_GoogleData/steps_*.csv")

daily_steps = []

for file in step_files:
    df = pd.read_csv(file)

    # Parse timestamp and extract date.
    # NOTE(review): utc=True means each sample is assigned to its UTC
    # calendar day. The other Fitbit tables here use the dates written in the
    # export, which may be local days - confirm the two line up.
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df["date"] = df["timestamp"].dt.date

    # Filter by date range
    df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]

    # Group and sum steps per day within this file
    daily_totals = df.groupby("date")["steps"].sum().reset_index()

    daily_steps.append(daily_totals)

# Combine all files' data; a day may be split across files, so sum again.
if daily_steps:
    steps_df = pd.concat(daily_steps).groupby("date").sum().reset_index()
else:
    # pd.concat([]) raises "No objects to concatenate"; fall back to an empty
    # table with the expected columns and say why.
    print("No step files found; steps_df is empty.")
    steps_df = pd.DataFrame(columns=["date", "steps"])

steps_df = steps_df.rename(columns={"steps": "total_steps"}).sort_values("date")
steps_df


In [None]:
# Daily steps with a smoothed overlay.

# Rolling mean over 7 consecutive rows (days that have data); first 6 are NaN.
steps_df["rolling_avg"] = steps_df["total_steps"].rolling(window=7).mean()

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(steps_df["date"], steps_df["total_steps"], color="steelblue", alpha=0.4, label="Daily Steps")
ax.plot(steps_df["date"], steps_df["rolling_avg"], color="navy", linewidth=2, label="7-Day Rolling Avg")
ax.set_title("Daily Step Count with 7-Day Rolling Average")
ax.set_xlabel("Date")
ax.set_ylabel("Steps")
ax.grid(True, linestyle="--", alpha=0.6)
ax.legend()
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Summary statistics of the daily step totals
step_totals = steps_df["total_steps"]
max_steps = step_totals.max()
min_steps = step_totals.min()
avg_steps = step_totals.mean()
std_steps = step_totals.std()

print(
    f"Max steps in a day: {max_steps}\n"
    f"Min steps in a day: {min_steps}\n"
    f"Average steps per day: {avg_steps:.2f}\n"
    f"Standard deviation: {std_steps:.2f}"
)


# Strava!

In [None]:
# Load the Strava activity export and keep date, type and distance for the study window.
strava_df = pd.read_csv("./strava/activities.csv")

start_date = pd.to_datetime("2024-05-14").date()
end_date = pd.to_datetime("2025-04-15").date()

strava_clean = strava_df[["Activity Date", "Activity Type", "Distance"]].copy()

# Reduce the activity timestamp to a calendar date, then keep the window (inclusive).
strava_clean["Activity Date"] = pd.to_datetime(strava_clean["Activity Date"]).dt.date
strava_clean = strava_clean[strava_clean["Activity Date"].between(start_date, end_date)]

# Non-numeric distances become NaN and those rows are dropped.
strava_clean["Distance"] = pd.to_numeric(strava_clean["Distance"], errors="coerce")
strava_clean = strava_clean.dropna(subset=["Distance"]).sort_values("Activity Date")

# NOTE(review): the new column name says km; the unit of the export's
# "Distance" column is not checked here - confirm.
strava_clean = strava_clean.rename(
    columns={
        "Activity Date": "date",
        "Activity Type": "strava_activity_type",
        "Distance": "strava_activity_distance(km)",
    }
)

strava_clean


In [None]:
# Distance per activity over time, one line per activity type.
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(data=strava_clean, x="date", y="strava_activity_distance(km)", hue="strava_activity_type", ax=ax)
ax.set_title("Distance Over Time by Activity Type")
ax.set_xlabel("Date")
ax.set_ylabel("Distance (km)")
plt.xticks(rotation=45)
ax.grid(True, linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()


In [None]:
# Display the cleaned Strava table.
strava_clean


In [None]:
# Number of Strava activities per calendar month.
# pd.Grouper needs a real datetime column, so convert the date objects first.
strava_clean["date"] = pd.to_datetime(strava_clean["date"])

# NOTE(review): newer pandas releases appear to deprecate the "M" alias in
# favour of "ME" - confirm against the installed version before changing it.
month_bins = pd.Grouper(key="date", freq="M")
monthly_counts = strava_clean.groupby(month_bins).size()

fig, ax = plt.subplots(figsize=(14, 5))
monthly_counts.plot(kind="bar", color="skyblue", ax=ax)
ax.set_title("Total Number of Strava Activities per Month")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Activities")
plt.xticks(rotation=45)
# Whole-number y tick labels with thousands separators
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter("{x:,.0f}"))
fig.tight_layout()
plt.show()


In [None]:
# Spread of distances for each activity type.
# NOTE(review): recent seaborn versions may warn about `palette` without
# `hue` - confirm against the installed version.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=strava_clean, x="strava_activity_type", y="strava_activity_distance(km)", palette="Set2", ax=ax)
ax.set_title("Distribution of Distances by Activity Type")
ax.set_ylabel("Distance (km)")
fig.tight_layout()
plt.show()


In [None]:
# Activity counts by weekday (rows) and activity type (columns).
strava_clean["day_of_week"] = pd.to_datetime(strava_clean["date"]).dt.day_name()

ordered_days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Zero-fill at both steps. unstack() creates gaps for (day, type) pairs that
# never occur, and reindex() adds rows for weekdays with no activity at all.
# Filling only before the reindex left those new rows as NaN, which the
# heatmap draws as blank cells.
dow_counts = (
    strava_clean.groupby(["day_of_week", "strava_activity_type"])
    .size()
    .unstack(fill_value=0)
    .reindex(ordered_days, fill_value=0)
)

plt.figure(figsize=(10, 6))
sns.heatmap(dow_counts, annot=True, fmt=".0f", cmap="YlGnBu")
plt.title("Strava Activities by Day of Week and Type")
plt.tight_layout()
plt.show()


# LinkedIn

## Connections

In [None]:
# Load LinkedIn connections and keep date, position and company for the study window.
linkedin_df = pd.read_csv("./Linkedin/Connections.csv")

start_date = pd.to_datetime("2024-05-14")
end_date = pd.to_datetime("2025-04-15")

linkedin_clean = linkedin_df[["Connected On", "Position", "Company"]].copy()

# Unparseable dates become NaT and fall out of the window filter below.
linkedin_clean["Connected On"] = pd.to_datetime(linkedin_clean["Connected On"], errors="coerce")
in_window = linkedin_clean["Connected On"].between(start_date, end_date)

linkedin_clean = (
    linkedin_clean[in_window]
    .sort_values("Connected On")
    .rename(
        columns={
            "Connected On": "date",
            "Position": "linkedin_connection_position",
            "Company": "linkedin_connection_company",
        }
    )
)

linkedin_clean


In [None]:
# New LinkedIn connections per calendar month.
month_bins = pd.Grouper(key="date", freq="M")
monthly_connections = linkedin_clean.groupby(month_bins).size()

fig, ax = plt.subplots(figsize=(12, 5))
monthly_connections.plot(kind="bar", color="steelblue", ax=ax)
ax.set_title("LinkedIn Connections per Month")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Connections")

# Replace the full timestamps on the x-axis with short month names.
month_labels = [month_end.strftime("%b") for month_end in monthly_connections.index]
ax.set_xticks(range(len(month_labels)))
ax.set_xticklabels(month_labels, rotation=0)

fig.tight_layout()
plt.show()


In [None]:
# Five most frequent companies among the new connections.
top_companies = linkedin_clean["linkedin_connection_company"].value_counts().head(5)
top_companies.plot.barh(figsize=(8, 5), title="Top 5 Companies I Connected With")


In [None]:
# Five most frequent job titles among the new connections.
top_titles = linkedin_clean["linkedin_connection_position"].value_counts().head(5)
top_titles.plot.barh(figsize=(8, 5), title="Top 5 Connection Titles")


In [223]:
# One row per day: distinct positions, distinct companies, and the connection count.
def _join_unique(values):
    """Join the distinct non-null values as one '; '-separated string."""
    return "; ".join(values.dropna().astype(str).unique())


connections_by_day = linkedin_clean.groupby("date")

text_columns = ["linkedin_connection_position", "linkedin_connection_company"]
linkedin_summary = connections_by_day.agg({column: _join_unique for column in text_columns})

# Number of connections made on each day (aligned on the date index)
linkedin_summary["linkedin_connection_count"] = connections_by_day.size()

# Turn the date index back into a regular column
linkedin_summary = linkedin_summary.reset_index()


In [None]:
# Display the per-day LinkedIn connection summary.
linkedin_summary

#### Add message count

In [225]:
# LinkedIn messages per day inside the study window.
messages_df = pd.read_csv("./Linkedin/messages.csv")

# Rows whose DATE cannot be parsed are discarded.
messages_df["DATE"] = pd.to_datetime(messages_df["DATE"], errors="coerce")
messages_df = messages_df.dropna(subset=["DATE"])

# Calendar date of each message
messages_df["date"] = messages_df["DATE"].dt.date

start_date = pd.to_datetime("2024-05-14").date()
end_date = pd.to_datetime("2025-04-15").date()
in_window = messages_df["date"].between(start_date, end_date)
messages_df = messages_df[in_window]

# Only days with at least one message get a row.
message_counts = (
    messages_df.groupby("date")
    .size()
    .rename("linkedin_message_count")
    .reset_index()
)


In [None]:
# Display the per-day LinkedIn message counts.
message_counts

In [227]:
# Attach the daily message count to the daily connection summary.
# Both frames must key on plain date objects for the join to match.
linkedin_summary["date"] = pd.to_datetime(linkedin_summary["date"]).dt.date

# Drop a count column left by an earlier run of this cell. Without this, a
# re-run would produce linkedin_message_count_x / _y and the fillna below
# would fail with a KeyError.
linkedin_summary = linkedin_summary.drop(columns=["linkedin_message_count"], errors="ignore")

# NOTE(review): this is a left join, so days that have messages but no new
# connection are not in the result - confirm that is intended.
linkedin_summary = linkedin_summary.merge(message_counts, on="date", how="left")

# Connection days without any message get 0
linkedin_summary["linkedin_message_count"] = linkedin_summary["linkedin_message_count"].fillna(0).astype(int)


In [None]:
# Display the LinkedIn summary, now including the message count column.
linkedin_summary

In [None]:
# Connections and messages per day on one chart.
# Plotting wants real datetimes, in chronological order.
linkedin_summary["date"] = pd.to_datetime(linkedin_summary["date"])
linkedin_summary = linkedin_summary.sort_values("date")

fig, ax = plt.subplots(figsize=(14, 5))
for count_column, series_label, series_marker, series_color in (
    ("linkedin_connection_count", "Connections", "o", "steelblue"),
    ("linkedin_message_count", "Messages", "s", "darkorange"),
):
    ax.plot(linkedin_summary["date"], linkedin_summary[count_column], label=series_label, marker=series_marker, color=series_color)

ax.set_title("LinkedIn Connections and Messages Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
ax.grid(True, linestyle="--", alpha=0.5)
ax.legend()
fig.tight_layout()
plt.show()


# Instagram

## Messages

In [None]:
# Instagram messages sent per day, with a row for every day in the study window.
start_date = pd.to_datetime("2024-05-14")
end_date   = pd.to_datetime("2025-04-15")

# Every day in [start_date..end_date], both ends included.
all_dates = pd.date_range(start=start_date, end=end_date, freq='D')

# Midnight-normalised day -> number of messages sent that day
message_counts = defaultdict(int)

inbox_path = "instagram/your_instagram_activity/messages/inbox"
for convo_folder in os.listdir(inbox_path):
    convo_path = os.path.join(inbox_path, convo_folder)
    if not os.path.isdir(convo_path):
        continue
    for filename in os.listdir(convo_path):
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(convo_path, filename), "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
                for msg in data.get("messages", []):
                    # Only messages sent by this account's display name are counted.
                    if msg.get("sender_name") != "Maria":
                        continue
                    ts_ms = msg.get("timestamp_ms")
                    if not ts_ms:
                        continue
                    # ms -> s; fromtimestamp() yields local time for this machine.
                    dt = datetime.fromtimestamp(ts_ms / 1000.0)
                    day = dt.replace(hour=0, minute=0, second=0, microsecond=0)
                    # Compare the day, not the full timestamp. end_date is
                    # midnight, so comparing dt dropped every message sent
                    # during the final day even though all_dates includes it.
                    if start_date <= day <= end_date:
                        message_counts[day] += 1
            except Exception as e:
                print(f"Error in {filename}: {e}")

# One row per day of the window; days without messages get 0.
messages_df = pd.DataFrame({"date": all_dates})
messages_df["instagram_messages_sent"] = messages_df["date"].map(lambda d: message_counts.get(d, 0))

messages_df["instagram_messages_sent"] = messages_df["instagram_messages_sent"].astype(int)

print(messages_df.head(5))
print(messages_df.tail(5))


## Liked posts

In [None]:
# Analysis window. Both bounds are midnight timestamps and the window is
# inclusive at *day* granularity (every like made on end_date counts).
start_date = pd.to_datetime("2024-05-14")
end_date   = pd.to_datetime("2025-04-15")

# Full daily range again
full_dates = pd.date_range(start=start_date, end=end_date, freq="D")

# Load liked posts. The encoding is explicit so the read does not depend on
# the platform default (same as the messages export reader).
with open("./instagram/your_instagram_activity/likes/liked_posts.json", "r", encoding="utf-8") as f:
    post_data = json.load(f)

# day (midnight datetime) -> number of posts liked that day
post_counts = {}

# Loop over each post-like record
for item in post_data.get("likes_media_likes", []):
    for entry in item.get("string_list_data", []):
        ts = entry.get("timestamp")
        if not ts:
            continue
        # Values this large are treated as milliseconds and reduced to seconds.
        if ts > 1e12:
            ts = ts // 1000

        dt = datetime.fromtimestamp(ts)
        # Truncate to midnight BEFORE filtering. Comparing the raw timestamp
        # against end_date (which is midnight) would drop every like made
        # later on the final day.
        dt_midnight = dt.replace(hour=0, minute=0, second=0, microsecond=0)
        if start_date <= dt_midnight <= end_date:
            post_counts[dt_midnight] = post_counts.get(dt_midnight, 0) + 1

# Daily frame over the full range; days without likes get 0. Built like the
# messages frame, which also copes with an export that has no likes in range.
liked_posts_df = pd.DataFrame({"date": full_dates})
liked_posts_df["instagram_liked_posts_count"] = (
    liked_posts_df["date"].map(lambda d: post_counts.get(d, 0)).astype(int)
)

print(liked_posts_df.head(5))
print(liked_posts_df.tail(5))


In [None]:
# Outer-join the two daily frames on 'date' so a day present in only one of
# them is still kept.
instagram_count_cols = ["instagram_messages_sent", "instagram_liked_posts_count"]
instagram_df = messages_df.merge(liked_posts_df, on="date", how="outer")

# A day missing from one side shows up as NaN after the join; count it as 0.
instagram_df[instagram_count_cols] = (
    instagram_df[instagram_count_cols].fillna(0).astype(int)
)

# Chronological order (in place), then display the result.
instagram_df.sort_values("date", inplace=True)

instagram_df


In [None]:
# Summary statistics for the daily count of Instagram messages sent.
instagram_df["instagram_messages_sent"].describe()


In [None]:
# Summary statistics for the daily count of Instagram posts liked.
instagram_df["instagram_liked_posts_count"].describe()


In [None]:
# Daily Instagram activity with a 7-day rolling mean drawn on top.
# Each entry: (daily column, rolling column, legend stem, colour).
instagram_line_specs = (
    ("instagram_messages_sent", "messages_rolling", "Messages", "skyblue"),
    ("instagram_liked_posts_count", "likes_rolling", "Posts Liked", "orange"),
)

# Work on a copy so the rolling columns are not added to instagram_df itself.
rolling = instagram_df.copy()
for daily_col, smooth_col, _, _ in instagram_line_specs:
    rolling[smooth_col] = rolling[daily_col].rolling(window=7).mean()

plt.figure(figsize=(15, 5))

# Faint raw daily series first ...
for daily_col, _, legend_stem, line_color in instagram_line_specs:
    plt.plot(
        instagram_df["date"],
        instagram_df[daily_col],
        label=f"{legend_stem} (Daily)",
        color=line_color,
        alpha=0.3,
    )

# ... then the bold smoothed series over them.
for _, smooth_col, legend_stem, line_color in instagram_line_specs:
    plt.plot(
        rolling["date"],
        rolling[smooth_col],
        label=f"{legend_stem} (7-Day Avg)",
        color=line_color,
        linewidth=2.5,
    )

plt.title("Daily Instagram Activity with 7-Day Rolling Averages")
plt.xlabel("Date")
plt.ylabel("Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Tag every day with the start timestamp of its weekly period, then total
# both counters per week.
weekly = instagram_df.assign(
    week=instagram_df["date"].dt.to_period("W").dt.start_time
)
weekly_totals = (
    weekly
    .groupby("week")[["instagram_messages_sent", "instagram_liked_posts_count"]]
    .sum()
)

# Stacked bars: one bar per week, messages and likes on top of each other.
weekly_totals.plot(kind="bar", figsize=(15, 5), stacked=True, color=["skyblue", "orange"])
plt.title("Weekly Instagram Activity")
plt.xlabel("Week")
plt.ylabel("Total Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Calendar order for the x-axis (groupby alone would sort the names alphabetically).
weekday_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
]

# Label each day with its weekday name and average both counters per weekday.
dow = instagram_df.assign(day=instagram_df["date"].dt.day_name())
dow_summary = (
    dow
    .groupby("day")[["instagram_messages_sent", "instagram_liked_posts_count"]]
    .mean()
    .reindex(weekday_order)
)

dow_summary.plot(kind="bar", figsize=(10, 4), color=["skyblue", "orange"])
plt.title("Average Instagram Activity by Day of Week")
plt.ylabel("Average Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Scatter of daily messages sent vs. posts liked.
# Reads from instagram_df: merged_df is only created in a later cell (and there
# its columns carry a _df<index> suffix), so using it here breaks a top-to-bottom run.
plt.figure(figsize=(6, 6))
plt.scatter(
    instagram_df["instagram_messages_sent"],
    instagram_df["instagram_liked_posts_count"],
    alpha=0.5,
    color="purple",
)
plt.title("Relationship Between Messages Sent & Posts Liked")
plt.xlabel("Messages Sent")
plt.ylabel("Posts Liked")
plt.grid(True)
plt.tight_layout()
plt.show()


# Now, putting them all together

In [None]:
# Sanity check before merging everything: show the first rows of every
# per-source DataFrame, each under a label line.
frames_to_preview = [
    ("use_google_browser_data", use_google_browser_data),
    ("use_sleep_data", use_sleep_data),
    ("heart_rate_daily_avg_df", heart_rate_daily_avg_df),
    ("activity_df", activity_df),
    ("steps_df", steps_df),
    ("strava_clean", strava_clean),
    ("linkedin_summary", linkedin_summary),
    ("Instagram_df", instagram_df),
]

for preview_label, preview_frame in frames_to_preview:
    print(f"{preview_label}:")
    print(preview_frame.head(), "\n")


In [None]:
# `reduce` lives in functools and is not part of the notebook's import cell,
# so it is imported here to keep this cell runnable on a fresh kernel.
from functools import reduce

# All dataframes
dfs = [
    use_google_browser_data,
    use_sleep_data,
    heart_rate_daily_avg_df,
    activity_df,
    steps_df,
    strava_clean,
    linkedin_summary,
    instagram_df
]

# suffixed column name -> dtype it had before the merge
original_dtypes = {}

def rename_and_collect_dtypes(df, index):
    """Return a copy of ``df`` prepared for the date merge.

    The copy has ``date`` parsed to datetime and truncated to midnight, and
    every other column gets a ``_df{index}`` suffix (skipped if the column
    name already contains that suffix).

    Side effect: records each non-date column's dtype in the module-level
    ``original_dtypes`` dict, keyed by the suffixed name.

    Parameters:
        df: DataFrame with a ``date`` column.
        index: position of ``df`` in ``dfs``; used to build the suffix.

    Returns:
        The renamed copy; the input frame is left untouched.
    """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"]).dt.normalize()

    # Rename only if column doesn't already have suffix (prevents repeated renaming)
    suffix = f"_df{index}"
    df.columns = [
        col if col == "date" or suffix in col else f"{col}{suffix}"
        for col in df.columns
    ]

    for col in df.columns:
        if col != "date":
            original_dtypes[col] = df[col].dtype

    return df

# Clean and rename
dfs = [rename_and_collect_dtypes(df, i) for i, df in enumerate(dfs)]

# Merge all on 'date'; the outer join keeps a day present in any source.
merged_df = reduce(lambda left, right: pd.merge(left, right, on="date", how="outer"), dfs)

# Restore dtypes: the outer join introduces NaN for missing days.
for col in merged_df.columns:
    if col == "date":
        continue

    orig_dtype = original_dtypes.get(col, None)
    if orig_dtype is None:
        continue

    if pd.api.types.is_integer_dtype(orig_dtype):
        # Nullable integer keeps whole numbers while still allowing missing days.
        merged_df[col] = merged_df[col].astype("Int64")
    elif pd.api.types.is_float_dtype(orig_dtype):
        merged_df[col] = merged_df[col].astype(float)
    elif pd.api.types.is_object_dtype(orig_dtype) or pd.api.types.is_string_dtype(orig_dtype):
        merged_df[col] = merged_df[col].astype(object)

# Sort by date
merged_df.sort_values("date", inplace=True)

# info() prints its report itself and returns None, so it is not wrapped in print().
merged_df.info()
merged_df


In [None]:
# Write the merged frame to disk without the index column.
# Note: the column names had to be edited manually after saving.
merged_df.to_csv("merged_data.csv", index=False)


In [264]:
# Reload the saved dataset, add a combined Instagram metric, and write it back.
df = pd.read_csv("merged_data.csv", parse_dates=["date"])

# Want total social media interaction...
df["instagram_total_interactions"] = df["instagram_messages_sent"] + df["instagram_liked_posts_count"]

# This cell overwrites the file it reads, so on a second run 'rolling_avg' is
# already gone. errors="ignore" keeps the cell re-runnable instead of raising KeyError.
df = df.drop(columns=["rolling_avg"], errors="ignore")

df.to_csv("merged_data.csv", index=False)


In [None]:
# Display the current contents of `df`.
df

In [307]:
# Re-read the saved dataset from disk and show how many rows it holds.
merged_data = pd.read_csv("merged_data.csv")

num_rows = len(merged_data)
num_rows

375

In [None]:
# Display the frame loaded from merged_data.csv.
merged_data