In [238]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.io as pio
import plotly.graph_objects as go
import json
from scipy.stats import pearsonr

# Set Plotly's default renderer to "notebook" so figures are shown inside the notebook output.
pio.renderers.default = "notebook" 

# Project analysis roadmap: (1) statistical correlations, (2) comparisons against life events, (3) ML.

In [None]:
# Load the merged dataset; the "date" column is parsed into datetimes on read.
df = pd.read_csv("merged_data.csv", parse_dates=["date"])
df

In [240]:
# Explore correlations within my own behavior.
# Pearson correlation coefficient between every pair of numeric columns in df
# (non-numeric columns are skipped via numeric_only=True).
correlations = df.corr(numeric_only=True)


In [None]:
# Display the full correlation matrix computed in the previous cell.
correlations

In [None]:
# Heatmap of the correlation matrix; every cell is annotated with r to two decimals
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix of Behavior Data")
plt.show()


In [None]:
# Recalculate correlation matrix with numeric columns
corr_matrix = df.corr(numeric_only=True)

# Unstack the matrix to long format: a Series indexed by (column_a, column_b)
corr_pairs = corr_matrix.unstack()

# Keep every unordered pair exactly once by requiring column_a < column_b.
# This removes self-correlations (A-A) and mirrored duplicates (B-A) by LABEL.
# De-duplicating on the correlation value instead would also discard distinct
# pairs that merely share the same coefficient.
first_label = corr_pairs.index.get_level_values(0)
second_label = corr_pairs.index.get_level_values(1)
corr_pairs = corr_pairs[first_label < second_label]

# Rank by absolute strength. The sort key receives a Series, so its own .abs()
# is used; `np` is never imported in the visible notebook and raised NameError.
corr_pairs = corr_pairs.sort_values(key=lambda s: s.abs(), ascending=False)

# Show top 20 relationships
top_n = 20
print(corr_pairs.head(top_n))


In [None]:
# Hand-curated list of pairs whose strong correlation is expected (near-duplicate
# or mechanically linked measurements); they are removed to ease the analysis.
expected_links = [
    ("mins_sedentary", "sedentary_total"),
    ("instagram_total_interactions", "instagram_messages_sent"),
    ("active_total", "mins_lightly_active"),
    ("active_total", "sedentary_total"),
    ("mins_sedentary", "active_total"),
    ("total_steps", "active_total"),
    ("mins_lightly_active", "sedentary_total"),
    ("avg_heart_rate", "active_total"),
    ("total_steps", "mins_lightly_active"),
    ("mins_very_active", "avg_heart_rate"),
    ("mins_moderately_active", "active_total"),
    ("mins_sedentary", "avg_heart_rate"),
    ("mins_very_active", "mins_moderately_active"),
    ("total_steps", "mins_very_active"),
    ("total_steps", "avg_heart_rate"),
    ("mins_lightly_active", "avg_heart_rate"),
    ("total_steps", "mins_moderately_active"),
    ("mins_sedentary", "total_steps"),
    ("rolling_avg", "avg_heart_rate"),
    ("sleep_score", "sleep_hours"),
]

# Compute correlations in long format, one entry per unordered column pair.
# Pairs are de-duplicated by label (column_a < column_b) rather than by value,
# so distinct pairs with identical coefficients are all kept.
corr_matrix = df.corr(numeric_only=True)
corr_pairs = corr_matrix.unstack()
first_label = corr_pairs.index.get_level_values(0)
second_label = corr_pairs.index.get_level_values(1)
corr_pairs = corr_pairs[first_label < second_label]

# Sort by absolute strength using the Series' own .abs(); `np` is not imported
# in the visible notebook, so np.abs raised NameError here.
corr_pairs = corr_pairs.sort_values(key=lambda s: s.abs(), ascending=False)

# Filter out expected links (compared order-insensitively)
expected_set = set(tuple(sorted(pair)) for pair in expected_links)
filtered_pairs = [pair for pair in corr_pairs.index if tuple(sorted(pair)) not in expected_set]
filtered_corrs = corr_pairs.loc[filtered_pairs]

print(filtered_corrs.head(30))


In [None]:
# Pearson correlation coefficient (PCC) and p-value for every pair of numeric columns

# Restrict to numeric columns
numeric_df = df.select_dtypes(include=["number"])
cols = numeric_df.columns

# One record per column pair (each unordered pair visited once)
results = []
for pos, col1 in enumerate(cols):
    for col2 in cols[pos + 1:]:
        # Use only rows where both columns are present
        valid = numeric_df[[col1, col2]].dropna()
        if len(valid) <= 1:
            continue
        corr, pval = pearsonr(valid[col1], valid[col2])
        results.append(dict(
            var1=col1,
            var2=col2,
            correlation=corr,
            p_value=pval,
            abs_correlation=abs(corr),
        ))

# Tabulate, keep p < 0.05, strongest absolute correlation first
correlation_df = pd.DataFrame(results)
significant_corrs = (
    correlation_df[correlation_df["p_value"] < 0.05]
    .sort_values(by="abs_correlation", ascending=False)
)

# Display results
significant_corrs


In [None]:
# Drop columns that are near-duplicates of others to reduce collinearity and redundancy.
# errors="ignore" keeps this cell idempotent: re-running it once the columns are
# already gone no longer raises KeyError.
df = df.drop(
    columns=["sedentary_total", "instagram_total_interactions", "active_total"],
    errors="ignore",
)
df

In [None]:
# Pairwise Pearson r and p-value over the numeric columns of the current df
numeric_df = df.select_dtypes(include=["number"])
cols = numeric_df.columns

# Enumerate every unordered pair of columns up front
n_cols = len(cols)
col_pairs = [(cols[a], cols[b]) for a in range(n_cols) for b in range(a + 1, n_cols)]

results = []
for col1, col2 in col_pairs:
    # Rows missing either value cannot contribute to the correlation
    valid = numeric_df[[col1, col2]].dropna()
    if len(valid) > 1:
        corr, pval = pearsonr(valid[col1], valid[col2])
        record = {
            "var1": col1,
            "var2": col2,
            "correlation": corr,
            "p_value": pval,
            "abs_correlation": abs(corr),
        }
        results.append(record)

# Collect into a table
correlation_df = pd.DataFrame(results)

# Statistically significant pairs only (p < 0.05)
is_significant = correlation_df["p_value"] < 0.05
significant_corrs = correlation_df[is_significant]

# Strongest relationships first
significant_corrs = significant_corrs.sort_values(by="abs_correlation", ascending=False)

# Display results
significant_corrs


In [None]:
# Keep only rows where both plotted variables are present
plot_df = df[["avg_heart_rate", "mins_very_active"]].dropna()

# Compute correlation and p-value
corr, pval = pearsonr(plot_df["mins_very_active"], plot_df["avg_heart_rate"])

# Scatter plot with a fitted regression line
plt.figure(figsize=(8, 6))
sns.regplot(data=plot_df, x="mins_very_active", y="avg_heart_rate", scatter_kws={'alpha':0.6})
plt.title("Relationship Between Activity and Average Heart Rate")
# The x variable is mins_very_active, so label it as such ("Total Active Minutes"
# described a different column); wording matches the 3D plot's axis title.
plt.xlabel("Total Very Active Minutes")
plt.ylabel("Average Heart Rate")
plt.grid(True)

# Add text box with correlation and p-value (axes-fraction coordinates, top-left)
plt.text(
    0.05, 0.95,
    f"r = {corr:.2f}\np = {pval:.3g}",
    transform=plt.gca().transAxes,
    fontsize=12,
    verticalalignment='top',
    bbox=dict(boxstyle='round', facecolor='white', alpha=0.6)
)

plt.tight_layout()
plt.show()


In [None]:
# a cool 3D visualization!
# Keep only rows where all three plotted variables are present
plot_df = df[["avg_heart_rate", "mins_very_active", "total_steps"]].dropna()

# Marker styling: colour encodes the average heart rate
marker_style = dict(
    size=4,
    color=plot_df["avg_heart_rate"],
    colorscale='Viridis',
    opacity=0.7,
    colorbar=dict(title='Avg Heart Rate'),
)

# Single interactive 3D scatter trace
scatter_trace = go.Scatter3d(
    x=plot_df["mins_very_active"],
    y=plot_df["total_steps"],
    z=plot_df["avg_heart_rate"],
    mode='markers',
    marker=marker_style,
)
fig = go.Figure(data=[scatter_trace])

# Axis titles; the z-axis range is fixed manually to give the points more room
scene_layout = dict(
    xaxis_title='Total Very Active Minutes',
    yaxis_title='Total Steps',
    zaxis_title='Average Heart Rate',
    zaxis=dict(range=[50, 95]),
)
fig.update_layout(
    scene=scene_layout,
    title="Interactive 3D: Heart Rate vs Activity vs Steps",
    margin=dict(l=0, r=0, b=0, t=40),
)

fig.show()

# Also export a standalone HTML copy of the figure
fig.write_html("cool_plot.html")



In [None]:
# Rows with both variables present
plot_df = df[["total_sites_visited", "mins_lightly_active"]].dropna()

# Pearson r and its p-value
corr, pval = pearsonr(plot_df["total_sites_visited"], plot_df["mins_lightly_active"])

# Scatter plot with regression line
plt.figure(figsize=(8, 6))
ax = sns.regplot(
    data=plot_df,
    x="total_sites_visited",
    y="mins_lightly_active",
    scatter_kws={'alpha': 0.6},
)
ax.set_title("Total Sites Visited vs Lightly Active Minutes")
ax.set_xlabel("Total Sites Visited")
ax.set_ylabel("Minutes Lightly Active")
ax.grid(True)

# Annotate r and p in axes-fraction coordinates
stats_box = dict(boxstyle='round', facecolor='white', alpha=0.6)
ax.text(
    0.60, 0.95,
    f"r = {corr:.2f}\np = {pval:.3g}",
    transform=ax.transAxes,
    fontsize=12,
    verticalalignment='top',
    bbox=stats_box,
)

plt.tight_layout()
plt.show()


In [None]:
# Now look at more pairs of relationships: the (x, y) variable pairs to plot
pairs = [
    ("sleep_hours", "mins_sedentary"),
    ("total_sites_visited", "instagram_liked_posts_count"),
    ("total_sites_visited", "avg_heart_rate"),
    ("mins_moderately_active", "instagram_messages_sent"),
    ("sleep_score", "instagram_messages_sent"),
    ("sleep_hours", "avg_heart_rate"),
]

# One regression scatter plot per pair
for x_var, y_var in pairs:
    # Rows with both variables present
    plot_df = df[[x_var, y_var]].dropna()

    # Pearson r and its p-value
    corr, pval = pearsonr(plot_df[x_var], plot_df[y_var])

    # Human-readable labels derived from the column names
    x_label = x_var.replace("_", " ").title()
    y_label = y_var.replace("_", " ").title()

    plt.figure(figsize=(8, 6))
    ax = sns.regplot(data=plot_df, x=x_var, y=y_var, scatter_kws={'alpha': 0.6})
    ax.set_title(f"{x_label} vs {y_label}")
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.grid(True)

    # Annotate r and p in the top-left corner (axes-fraction coordinates)
    ax.text(
        0.05, 0.95,
        f"r = {corr:.2f}\np = {pval:.3g}",
        transform=ax.transAxes,
        fontsize=12,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.6),
    )

    plt.tight_layout()
    plt.show()


## Personal Timeline!

In [248]:
# Add events here! I removed mine for public posting.
# Each entry is a dict with an "event" name, an "event_type" label and
# inclusive "start"/"end" dates written as YYYY-MM-DD strings.
life_events = [
    {"event": "fam visit", "event_type": "visit", "start": "2024-05-15", "end": "2024-05-19"},
]


In [251]:
# all_events is an alias of life_events (same list object, not a copy)
all_events = life_events
# Save to a JSON file
with open("all_life_events.json", "w") as f:
    json.dump(all_events, f, indent=2)

In [252]:
# Pre-parse each event's inclusive [start, end] window once
event_windows = [
    (pd.to_datetime(e["start"]), pd.to_datetime(e["end"]), e)
    for e in all_events
]

# For every day, collect the names / types of all events whose window covers it.
# Lists follow the order of all_events; days outside every window get [].
df["events"] = df["date"].apply(
    lambda day: [e["event"] for start, end, e in event_windows if start <= day <= end]
)
df["event_types"] = df["date"].apply(
    lambda day: [e["event_type"] for start, end, e in event_windows if start <= day <= end]
)


In [None]:
from collections import Counter

# Count number of each type of event across all days; a day carrying several
# event types contributes once to each of them.
# (A filter of interview days used to be evaluated here, but its result was
# neither stored nor displayed, so it has been dropped.)
event_type_counts = Counter(et for sublist in df["event_types"] for et in sublist)
print(event_type_counts)


In [254]:
# NOTE(review): this bare `df` is not the last expression of the cell, so it is not displayed.
df
# Persist the event-annotated frame (index column omitted).
df.to_csv("merged_data_with_events.csv", index=False)


### Stat analysis

In [255]:
# Keep only days that carry at least one event type, then give every
# (day, event type) combination its own row
has_event = df["event_types"].map(len) > 0
df_exploded = df[has_event].explode("event_types")

# Numeric feature columns
numeric_cols = df_exploded.select_dtypes(include="number").columns

# Mean of each numeric feature per event type, transposed so that
# features are rows (sorted by name) and event types are columns
per_type_means = df_exploded.groupby("event_types")[numeric_cols].mean()
grouped_stats = per_type_means.T.sort_index()



In [None]:
# Z-score every feature (row) across event types so rows are comparable
row_mean = grouped_stats.mean(axis=1)
row_std = grouped_stats.std(axis=1)
grouped_normalized = grouped_stats.sub(row_mean, axis=0).div(row_std, axis=0)

# Diverging heatmap centred on 0 (the per-feature mean)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(grouped_normalized, cmap="coolwarm", center=0, annot=False, ax=ax)
ax.set_title("Average Numeric Feature Values by Event Type")
ax.set_xlabel("Event Type")
ax.set_ylabel("Feature")
plt.tight_layout()
plt.show()


In [None]:
from collections import Counter

# Tally every event type over all rows (each row holds a list of types)
event_type_counts = Counter()
for type_list in df["event_types"]:
    event_type_counts.update(type_list)

# Re-order from most to least frequent; ties keep first-seen order
event_type_counts = dict(event_type_counts.most_common())

for event_type, count in event_type_counts.items():
    print(f"{event_type}: {count}")


#### Hypothesis: Different event types (e.g. "lab meeting", "deadline", "holiday") significantly influence my behavior (activity, heart rate, screen time, social media use, etc.).



In [None]:
from scipy.stats import kruskal
import pandas as pd

# Kruskal-Wallis test to see whether a feature differs significantly across event types.

# Explode event_types so each (day, event type) combination is its own row
df_exploded = df[df["event_types"].map(len) > 0].explode("event_types")

# Get numeric columns
numeric_cols = df_exploded.select_dtypes(include="number").columns

# Top event types to consider (avoid tiny groups)
top_event_types = df_exploded["event_types"].value_counts().head(10).index

# Run Kruskal-Wallis and store results + per-event-type means
results = []

# The test compares at least two groups; with a single event type (as with the
# one public placeholder event) kruskal() raises, so skip the loop entirely.
if len(top_event_types) >= 2:
    # Slice the exploded frame once per event type instead of once per (type, feature)
    rows_by_type = {etype: df_exploded[df_exploded["event_types"] == etype] for etype in top_event_types}

    for col in numeric_cols:
        groups = [rows_by_type[etype][col].dropna() for etype in top_event_types]

        # Every group needs more than one observation
        if not all(len(g) > 1 for g in groups):
            continue

        try:
            stat, p = kruskal(*groups)
        except ValueError as err:
            # Some SciPy versions raise when the feature is constant across all groups
            print(f"Skipping {col}: {err}")
            continue

        # Compute per-event-type means for this feature
        means = {etype: rows_by_type[etype][col].mean() for etype in top_event_types}

        results.append({
            "feature": col,
            "statistic": stat,
            "p_value": p,
            "means_by_event_type": means
        })

# Format results; explicit columns keep the sort below valid even when nothing was tested
results_df = pd.DataFrame(results, columns=["feature", "statistic", "p_value", "means_by_event_type"])

# Sort by p-value (most significant first)
results_df = results_df.sort_values(by="p_value")

if results_df.empty:
    print("No feature could be tested (need at least two event types with >1 observation each).")

# Print every tested feature with its per-event-type means
for _, row in results_df.iterrows():
    print(f"\n Feature: {row['feature']}")
    print(f"p = {row['p_value']:.4f}")
    print("Event Type Means:")
    for etype, mean_val in row["means_by_event_type"].items():
        print(f"     {etype}: {mean_val:.2f}")


In [None]:
from scipy.stats import mannwhitneyu
import itertools

# Determine statistically significant differences of every feature for pairwise event comparison

# Explode event_types so each row has one type
df_exploded = df[df["event_types"].map(len) > 0].explode("event_types")

# Get numeric columns
numeric_cols = df_exploded.select_dtypes(include="number").columns

# Get most common event types (top 3)
top_event_types = df_exploded["event_types"].value_counts().head(3).index.tolist()

# Store results: one record per (feature, event-type pair)
pairwise_results = []

for col in numeric_cols:
    for et1, et2 in itertools.combinations(top_event_types, 2):
        group1 = df_exploded[df_exploded["event_types"] == et1][col].dropna()
        group2 = df_exploded[df_exploded["event_types"] == et2][col].dropna()

        # Both groups need more than one observation
        if len(group1) > 1 and len(group2) > 1:
            stat, p = mannwhitneyu(group1, group2, alternative='two-sided')
            pairwise_results.append({
                "feature": col,
                "event_type_1": et1,
                "event_type_2": et2,
                "p_value": p,
                "mean_1": group1.mean(),
                "mean_2": group2.mean()
            })

# Format results. Naming the columns explicitly keeps "p_value" present even when
# no pair could be tested (e.g. fewer than two event types), so the sort and the
# filter below do not raise KeyError on an empty frame.
pairwise_df = pd.DataFrame(
    pairwise_results,
    columns=["feature", "event_type_1", "event_type_2", "p_value", "mean_1", "mean_2"],
)
pairwise_df = pairwise_df.sort_values("p_value")

# Show top significant differences
print(pairwise_df[pairwise_df["p_value"] < 0.05])


In [None]:
sns.set(style="whitegrid", font_scale=1.1)

# Top features to visualize
top_features = ["sleep_score", "avg_heart_rate", "total_sites_visited", "instagram_messages_sent"]

# df_plot was used below without being defined anywhere in the visible notebook
# (NameError on a fresh kernel). Build it from the exploded frame, limited to the
# event types being compared.
# NOTE(review): presumably this is the frame that was originally intended — confirm.
df_plot = df_exploded[df_exploded["event_types"].isin(top_event_types)]

# Choose a color palette for distinct event types
palette = sns.color_palette("Set2", n_colors=len(top_event_types))

# One violin plot per feature, split by event type
for feature in top_features:
    plt.figure(figsize=(11, 5))
    ax = sns.violinplot(
        data=df_plot,
        x="event_types",
        y=feature,
        inner="quartile",
        scale="width",
        linewidth=1.1,
        palette=palette
    )

    ax.set_title(f"{feature.replace('_', ' ').title()} by Event Type", fontsize=14, weight="bold")
    ax.set_xlabel("Event Type", fontsize=12)
    ax.set_ylabel(feature.replace('_', ' ').title(), fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


#### looking at interview days!

In [None]:
from scipy.stats import mannwhitneyu

# Flag days whose event types include an interview (1) versus all other days (0)
df["is_interview_day"] = df["event_types"].apply(lambda x: "interview" in x).astype(int)

features_to_test = ["linkedin_connection_count", "linkedin_message_count"]

# Two-sided Mann-Whitney U test per LinkedIn feature: interview vs non-interview days
for col in features_to_test:
    group_interview = df[df["is_interview_day"] == 1][col].dropna()
    group_non = df[df["is_interview_day"] == 0][col].dropna()

    if len(group_interview) > 1 and len(group_non) > 1:
        stat, p = mannwhitneyu(group_interview, group_non, alternative="two-sided")
        print(f"{col}: p = {p:.4f}, mean (interview) = {group_interview.mean():.2f}, mean (non) = {group_non.mean():.2f}")

# Distribution of each LinkedIn feature by day type
for col in features_to_test:
    plt.figure(figsize=(8, 4))
    sns.violinplot(data=df, x="is_interview_day", y=col, palette="Set2", inner="quartile")
    plt.title(f"{col.replace('_', ' ').title()} on Interview vs Non-Interview Days")
    plt.xticks([0, 1], ["Non-Interview", "Interview"])
    plt.ylabel(col.replace('_', ' ').title())
    plt.xlabel("Day Type")
    plt.tight_layout()
    plt.show()

# Repeat the comparison for every numeric feature
results = []

for col in df.select_dtypes(include="number").columns:
    # The flag itself is not a feature
    if col == "is_interview_day":
        continue
    group_interview = df[df["is_interview_day"] == 1][col].dropna()
    group_non = df[df["is_interview_day"] == 0][col].dropna()

    if len(group_interview) > 1 and len(group_non) > 1:
        # Two-sided alternative stated explicitly, consistent with every other test in this notebook
        stat, p = mannwhitneyu(group_interview, group_non, alternative="two-sided")
        results.append({
            "feature": col,
            "p_value": p,
            "mean_interview": group_interview.mean(),
            "mean_non_interview": group_non.mean()
        })

# View sorted results. Explicit columns keep "p_value" present when nothing could
# be tested (e.g. no interview days in the data), so the sort does not raise KeyError.
pd.DataFrame(
    results,
    columns=["feature", "p_value", "mean_interview", "mean_non_interview"],
).sort_values("p_value").head(10)


In [None]:
# Box plot with the individual days overlaid as a swarm
plt.figure(figsize=(8, 4))
ax = sns.boxplot(data=df, x="is_interview_day", y="linkedin_connection_count", palette="Set2")
sns.swarmplot(data=df, x="is_interview_day", y="linkedin_connection_count", color=".3", alpha=0.6, ax=ax)
ax.set_title("LinkedIn Connection Count on Interview vs Non-Interview Days")
# Replace the 0/1 flag values with readable tick labels
plt.xticks([0, 1], ["Non-Interview", "Interview"])
ax.set_ylabel("LinkedIn Connection Count")
ax.set_xlabel("Day Type")
plt.tight_layout()
plt.show()




#### look at vet visit

In [None]:
# 1 on days whose event types include "vet visit", 0 otherwise
df["is_vet_visit_day"] = df["event_types"].apply(lambda x: "vet visit" in x).astype(int)
from scipy.stats import mannwhitneyu

features_to_test = [
    "sleep_score", "avg_heart_rate", "total_sites_visited",
    "instagram_messages_sent", "mins_very_active", "sleep_hours"
]

print("Feature comparison: Vet Visit vs Non-Vet Visit days\n")

# Row masks for the two day types
on_vet_day = df["is_vet_visit_day"] == 1
off_vet_day = df["is_vet_visit_day"] == 0

for col in features_to_test:
    group_vet = df.loc[on_vet_day, col].dropna()
    group_non = df.loc[off_vet_day, col].dropna()

    # Skip features where either side has at most one observation
    if len(group_vet) <= 1 or len(group_non) <= 1:
        continue
    stat, p = mannwhitneyu(group_vet, group_non, alternative="two-sided")
    print(f"{col}: p = {p:.4f} | mean (vet visit) = {group_vet.mean():.2f}, mean (non) = {group_non.mean():.2f}")

plot_features = ["avg_heart_rate", "mins_very_active", "sleep_hours"]

# Box plot plus swarm overlay for each selected feature
for feature in plot_features:
    pretty_name = feature.replace("_", " ").title()
    plt.figure(figsize=(8, 4))
    ax = sns.boxplot(data=df, x="is_vet_visit_day", y=feature, palette="Set3")
    sns.swarmplot(data=df, x="is_vet_visit_day", y=feature, color=".3", alpha=0.6, ax=ax)
    ax.set_title(f"{pretty_name} on Vet Visit vs Non-Vet Visit Days")
    plt.xticks([0, 1], ["Non-Vet Visit", "Vet Visit"])
    ax.set_ylabel(pretty_name)
    ax.set_xlabel("Day Type")
    plt.tight_layout()
    plt.show()


#### Looking at group lab meeting days!

In [None]:
# 1 on days whose event types include "group lab meeting", 0 otherwise
df["is_lab_meeting_day"] = df["event_types"].apply(lambda x: "group lab meeting" in x).astype(int)

from scipy.stats import mannwhitneyu

features_to_test = [
    "sleep_score", "avg_heart_rate", "total_sites_visited",
    "instagram_messages_sent", "mins_very_active", "sleep_hours"
]

print("Feature comparison: Lab Meeting vs Non-Lab Meeting Days\n")

for col in features_to_test:
    # Non-missing values of this feature on lab-meeting days and on all other days
    group_lab = df.loc[df["is_lab_meeting_day"].eq(1), col].dropna()
    group_non = df.loc[df["is_lab_meeting_day"].eq(0), col].dropna()

    enough_data = len(group_lab) > 1 and len(group_non) > 1
    if enough_data:
        stat, p = mannwhitneyu(group_lab, group_non, alternative="two-sided")
        print(f"{col}: p = {p:.4f} | mean (lab) = {group_lab.mean():.2f}, mean (non) = {group_non.mean():.2f}")

plot_features = ["avg_heart_rate", "total_sites_visited", "instagram_messages_sent"]

# Box plot plus swarm overlay for each selected feature
for feature in plot_features:
    label = feature.replace("_", " ").title()
    plt.figure(figsize=(8, 4))
    ax = sns.boxplot(data=df, x="is_lab_meeting_day", y=feature, palette="Set2")
    sns.swarmplot(data=df, x="is_lab_meeting_day", y=feature, color=".3", alpha=0.6, ax=ax)
    ax.set_title(f"{label} on Lab Meeting vs Non-Lab Days")
    plt.xticks([0, 1], ["Non-Lab Day", "Lab Meeting Day"])
    ax.set_ylabel(label)
    ax.set_xlabel("Day Type")
    plt.tight_layout()
    plt.show()


#### now looking at deadline days


In [None]:
# 1 on days whose event types include "deadline", 0 otherwise
df["is_deadline_day"] = df["event_types"].apply(lambda x: "deadline" in x).astype(int)
from scipy.stats import mannwhitneyu

features_to_test = [
    "sleep_score", "avg_heart_rate", "total_sites_visited",
    "instagram_messages_sent", "mins_very_active", "sleep_hours"
]

print("Feature comparison: Deadline vs Non-Deadline Days\n")

# Split the frame once; df is not modified inside the loop below
deadline_rows = df[df["is_deadline_day"] == 1]
other_rows = df[df["is_deadline_day"] == 0]

for col in features_to_test:
    group_deadline = deadline_rows[col].dropna()
    group_non = other_rows[col].dropna()

    # Need more than one observation on each side
    if min(len(group_deadline), len(group_non)) > 1:
        stat, p = mannwhitneyu(group_deadline, group_non, alternative="two-sided")
        print(f"{col}: p = {p:.4f} | mean (deadline) = {group_deadline.mean():.2f}, mean (non) = {group_non.mean():.2f}")

plot_features = ["avg_heart_rate", "total_sites_visited", "sleep_score"]

# Box plot plus swarm overlay for each selected feature
for feature in plot_features:
    readable = feature.replace("_", " ").title()
    plt.figure(figsize=(8, 4))
    ax = sns.boxplot(data=df, x="is_deadline_day", y=feature, palette="Set1")
    sns.swarmplot(data=df, x="is_deadline_day", y=feature, color=".3", alpha=0.6, ax=ax)
    ax.set_title(f"{readable} on Deadline vs Non-Deadline Days")
    plt.xticks([0, 1], ["Non-Deadline", "Deadline"])
    ax.set_ylabel(readable)
    ax.set_xlabel("Day Type")
    plt.tight_layout()
    plt.show()


In [None]:
# Look at behavior 5 days before and after an event!

# Sort df chronologically
df = df.sort_values("date").reset_index(drop=True)

# Get all deadline dates
deadline_dates = df[df["event_types"].apply(lambda x: "deadline" in x)]["date"]

# Half-width of the window around each deadline, in days
window_days = 5

# Signed day offset to the NEAREST deadline (NaN outside every window).
# Writing offsets deadline-by-deadline and letting the last write win relabels
# days whenever two windows overlap: with deadlines 3 days apart, the first
# deadline day itself ended up as -3 instead of 0. An existing offset is
# therefore replaced only when the new deadline is strictly closer (on ties
# the earlier deadline is kept).
nearest_offset = pd.Series(float("nan"), index=df.index)
for deadline_day in deadline_dates:
    # Whole-day difference to this deadline.
    # NOTE(review): assumes "date" holds calendar days without a time component — confirm.
    day_offsets = (df["date"] - deadline_day).dt.days
    within_window = day_offsets.abs() <= window_days
    is_closer = nearest_offset.isna() | (day_offsets.abs() < nearest_offset.abs())
    nearest_offset = nearest_offset.where(~(within_window & is_closer), day_offsets)

# Stored on df as before (now as a float column with NaN instead of None)
df["days_from_deadline"] = nearest_offset

# Subset to relevant window
df_deadline_window = df[df["days_from_deadline"].notna()].copy()
df_deadline_window["days_from_deadline"] = df_deadline_window["days_from_deadline"].astype(int)


# Features to include
features = ["sleep_hours", "avg_heart_rate", "instagram_messages_sent", "total_sites_visited", "total_steps"]

# Min-max normalize each feature within the window so they share one 0-1 axis
for col in features:
    min_val = df_deadline_window[col].min()
    max_val = df_deadline_window[col].max()
    df_deadline_window[f"{col}_norm"] = (df_deadline_window[col] - min_val) / (max_val - min_val)

# Melt to long format for seaborn: one row per (day offset, feature)
plot_df = df_deadline_window.melt(
    id_vars=["days_from_deadline"],
    value_vars=[f"{col}_norm" for col in features],
    var_name="feature",
    value_name="normalized_value"
)

# Clean up labels
plot_df["feature"] = plot_df["feature"].str.replace("_norm", "").str.replace("_", " ").str.title()

# Plot one line per feature against the day offset
plt.figure(figsize=(11, 6))
sns.lineplot(
    data=plot_df,
    x="days_from_deadline",
    y="normalized_value",
    hue="feature",
    marker="o"
)

plt.axvline(0, color="red", linestyle="--", lw=1.2, label="Deadline Day")
plt.title("Behavioral Trends Before and After Deadline Days")
plt.xlabel("Days From Deadline (0 = Deadline Day)")
plt.ylabel("Normalized Value (0–1)")
plt.grid(True)
plt.tight_layout()
plt.legend(title="Behavior Feature")
plt.show()


In [None]:
# Same windowed view, but around interview days

# Make sure the dates are sorted
df = df.sort_values("date").reset_index(drop=True)

# Get all interview dates
interview_dates = df[df["event_types"].apply(lambda x: "interview" in x)]["date"]

# Half-width of the window around each interview, in days
window_days = 5

# Signed day offset to the NEAREST interview (NaN outside every window).
# Letting the last interview processed overwrite earlier offsets mislabels days
# when windows overlap (an interview day could be stored as e.g. -2 relative to
# the next interview), so an offset is replaced only by a strictly closer one
# (on ties the earlier interview is kept).
nearest_offset = pd.Series(float("nan"), index=df.index)
for interview_day in interview_dates:
    # Whole-day difference to this interview.
    # NOTE(review): assumes "date" holds calendar days without a time component — confirm.
    day_offsets = (df["date"] - interview_day).dt.days
    within_window = day_offsets.abs() <= window_days
    is_closer = nearest_offset.isna() | (day_offsets.abs() < nearest_offset.abs())
    nearest_offset = nearest_offset.where(~(within_window & is_closer), day_offsets)

# Stored on df as before (now as a float column with NaN instead of None)
df["days_from_interview"] = nearest_offset

# Filter for only rows within -5 to +5 days
df_event_window = df[df["days_from_interview"].notna()].copy()
df_event_window["days_from_interview"] = df_event_window["days_from_interview"].astype(int)

# Features to include
features = ["sleep_hours", "avg_heart_rate", "instagram_messages_sent", "total_sites_visited", "total_steps"]

# Min-max normalize each column within the window for comparability
for col in features:
    min_val = df_event_window[col].min()
    max_val = df_event_window[col].max()
    df_event_window[f"{col}_norm"] = (df_event_window[col] - min_val) / (max_val - min_val)


# Melt to long format for seaborn: one row per (day offset, feature)
plot_df = df_event_window.melt(
    id_vars=["days_from_interview"],
    value_vars=[f"{col}_norm" for col in features],
    var_name="feature",
    value_name="normalized_value"
)

# Clean up feature names
plot_df["feature"] = plot_df["feature"].str.replace("_norm", "").str.replace("_", " ").str.title()

# Plot one line per feature against the day offset
plt.figure(figsize=(11, 6))
sns.lineplot(
    data=plot_df,
    x="days_from_interview",
    y="normalized_value",
    hue="feature",
    marker="o"
)

plt.axvline(0, color="red", linestyle="--", lw=1.2, label="Interview Day")
plt.title("Behavioral Trends Before and After Interview Days")
plt.xlabel("Days From Interview (0 = Interview Day)")
plt.ylabel("Normalized Value (0–1)")
plt.grid(True)
plt.tight_layout()
plt.legend(title="Behavior Feature")
plt.show()


In [None]:
from scipy.stats import mannwhitneyu

# 1 on days whose event types include "winter break", 0 otherwise
df["is_winter_break"] = df["event_types"].apply(lambda x: "winter break" in x).astype(int)

features = [
    "sleep_score", "sleep_hours", "avg_heart_rate", "total_sites_visited",
    "instagram_messages_sent", "mins_very_active", "total_steps"
]

# Mean of every feature outside (0) vs during (1) winter break. As a bare
# expression in the middle of the cell this table was computed but never shown,
# so it is printed explicitly.
print(df.groupby("is_winter_break")[features].mean().T)

print("Behavior Comparison: Winter Break vs Non-Break Days\n")

# Two-sided Mann-Whitney U test per feature
for col in features:
    break_days = df[df["is_winter_break"] == 1][col].dropna()
    non_break_days = df[df["is_winter_break"] == 0][col].dropna()

    # Need more than one observation on each side
    if len(break_days) > 1 and len(non_break_days) > 1:
        stat, p = mannwhitneyu(break_days, non_break_days, alternative='two-sided')
        print(f"{col}: p = {p:.4f}, mean (break) = {break_days.mean():.2f}, mean (non) = {non_break_days.mean():.2f}")


plot_features = ["sleep_hours", "total_sites_visited", "instagram_messages_sent"]

# Box plot plus swarm overlay for each selected feature
for feature in plot_features:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=df, x="is_winter_break", y=feature, palette="Set2")
    sns.swarmplot(data=df, x="is_winter_break", y=feature, color=".3", alpha=0.6)
    plt.title(f"{feature.replace('_', ' ').title()} During vs Outside Winter Break")
    plt.xticks([0, 1], ["Non-Break", "Winter Break"])
    plt.ylabel(feature.replace("_", " ").title())
    plt.xlabel("Day Type")
    plt.tight_layout()
    plt.show()



In [None]:
# Trips
from scipy.stats import mannwhitneyu

# 1 on days whose event types include "trip", 0 otherwise
df["is_trip"] = df["event_types"].apply(lambda x: "trip" in x).astype(int)

features = [
    "sleep_score", "sleep_hours", "avg_heart_rate", "total_sites_visited",
    "instagram_messages_sent", "mins_very_active", "total_steps"
]

# Mean of every feature on non-trip (0) vs trip (1) days. As a bare expression in
# the middle of the cell this table was computed but never shown, so it is
# printed explicitly.
print(df.groupby("is_trip")[features].mean().T)

print("Behavior Comparison: Trip vs Non-trip Days\n")

# Two-sided Mann-Whitney U test per feature
for col in features:
    trip_days = df[df["is_trip"] == 1][col].dropna()
    non_trip_days = df[df["is_trip"] == 0][col].dropna()

    # Need more than one observation on each side
    if len(trip_days) > 1 and len(non_trip_days) > 1:
        stat, p = mannwhitneyu(trip_days, non_trip_days, alternative='two-sided')
        print(f"{col}: p = {p:.4f}, mean (trip) = {trip_days.mean():.2f}, mean (non) = {non_trip_days.mean():.2f}")


plot_features = ["sleep_hours", "total_sites_visited", "instagram_messages_sent"]

# Box plot plus swarm overlay for each selected feature
for feature in plot_features:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=df, x="is_trip", y=feature, palette="Set2")
    sns.swarmplot(data=df, x="is_trip", y=feature, color=".3", alpha=0.6)
    plt.title(f"{feature.replace('_', ' ').title()} During vs Outside Trips")
    plt.xticks([0, 1], ["Non-Trip", "Trip"])
    plt.ylabel(feature.replace("_", " ").title())
    plt.xlabel("Day Type")
    plt.tight_layout()
    plt.show()



In [None]:
# Display df, which now also carries the is_* flag and days_from_* columns added in the cells above.
df

## ML!

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [None]:
import ast


def _parse_list_cell(cell):
    """Turn a CSV cell such as "['visit']" back into a Python list ([] when blank)."""
    return ast.literal_eval(cell) if cell else []


# The CSV stores the list-valued "events"/"event_types" columns as their string
# repr. Without converting them back, later `"summer" in event_list` checks run
# as substring searches on text instead of list membership tests.
# "date" is parsed as well, matching how merged_data.csv is loaded.
df = pd.read_csv(
    "merged_data_with_events.csv",
    parse_dates=["date"],
    converters={"events": _parse_list_cell, "event_types": _parse_list_cell},
)
df

In [None]:
# Map a day's event_types to one break label
def get_break_label(event_list):
    """Return "summer", "fall" or "winter" for a day's event types, else None.

    Markers are checked in priority order (summer, then fall, then
    "winter break") with the `in` operator; days matching none of them
    return None so they can be excluded.
    """
    marker_to_label = (
        ("summer", "summer"),
        ("fall", "fall"),
        ("winter break", "winter"),
    )
    for marker, label in marker_to_label:
        if marker in event_list:
            return label
    return None  # exclude non-break days

# Label every day with its break type (None for non-break days)
df["target_event"] = df["event_types"].apply(get_break_label)

# Keep only the days that belong to one of the 3 break types
is_break_day = df["target_event"].notna()
df_multi = df[is_break_day].copy()

# Confirm label distribution
print(df_multi["target_event"].value_counts())

features = [
    "sleep_score", "sleep_hours", "avg_heart_rate", "total_sites_visited",
    "mins_very_active", "instagram_messages_sent", "instagram_liked_posts_count",
    "linkedin_connection_count", "linkedin_message_count", "total_steps"
]

# Only rows with every feature (and the label) present can be used
required_cols = features + ["target_event"]
df_multi = df_multi.dropna(subset=required_cols)
X, y = df_multi[features], df_multi["target_event"]

# Random forest with class weights balancing the label frequencies
clf = RandomForestClassifier(class_weight="balanced", random_state=42)

# 5-fold stratified, shuffled cross-validation scored with weighted F1
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=cv, scoring="f1_weighted")

print("Cross-validated F1 (weighted) scores:", scores)
print("Mean F1 score: {:.4f}".format(scores.mean()))

# Fit on all labelled rows to read off the feature importances
clf.fit(X, y)
importances = pd.Series(clf.feature_importances_, index=features).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=importances.values, y=importances.index, palette="viridis", ax=ax)
ax.set_title("Feature Importance for Predicting Break Type")
ax.set_xlabel("Importance Score")
plt.tight_layout()
plt.show()

# Stratified 80/20 hold-out split; the classifier is refit on the training part only
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("\nClassification Report (Multi-Class, Test Set):\n")
print(classification_report(y_test, y_pred))



In [None]:
# Confusion matrix on the held-out test set, rows/columns ordered as the fitted classes
class_labels = clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Render the counts as integers on a blue colour scale
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap="Blues", values_format='d')

plt.title("Season Classification")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.grid(False)
plt.tight_layout()
plt.show()


##### Now just fall vs summer - BINARY

In [None]:
def get_fall_summer_label(event_list):
    """Return "summer" or "fall" for a day's event types, else None.

    "summer" takes priority over "fall"; both are tested with the `in`
    operator. Anything else yields None so the row can be excluded.
    """
    for season in ("summer", "fall"):
        if season in event_list:
            return season
    return None  # exclude non-summer/fall rows

# Label every day as "summer" / "fall" (None for everything else)
df["target_event"] = df["event_types"].apply(get_fall_summer_label)

# Keep only summer/fall
labelled = df["target_event"].notna()
df_binary = df[labelled].copy()

# Check distribution
print(df_binary["target_event"].value_counts())

features = [
    "sleep_score", "sleep_hours", "avg_heart_rate", "total_sites_visited",
    "mins_very_active", "instagram_messages_sent", "instagram_liked_posts_count",
    "linkedin_connection_count", "linkedin_message_count", "total_steps"
]

# Drop rows missing any feature or the label
df_binary = df_binary.dropna(subset=features + ["target_event"])
X = df_binary[features]
y = df_binary["target_event"]

# Class-weighted random forest, evaluated with 5-fold stratified CV (weighted F1)
clf = RandomForestClassifier(class_weight="balanced", random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=cv, scoring="f1_weighted")

print("Cross-validated F1 (weighted) scores:", scores)
print("Mean F1 score: {:.4f}".format(scores.mean()))

# Feature importances from a fit on every labelled row
clf.fit(X, y)
importances = pd.Series(clf.feature_importances_, index=features).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=importances.values, y=importances.index, palette="viridis", ax=ax)
ax.set_title("Feature Importance for Predicting Fall vs Summer Break")
ax.set_xlabel("Importance Score")
plt.tight_layout()
plt.show()

# Stratified 80/20 hold-out evaluation; the classifier is refit on the training part
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("\nClassification Report (Fall vs Summer):\n")
print(classification_report(y_test, y_pred))

# Confusion matrix on the test set, ordered as the fitted classes
class_labels = clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap="Blues", values_format='d')
plt.title("Season Classification")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.grid(False)
plt.tight_layout()
plt.show()