In [12]:
%load_ext autoreload
%autoreload 2

from webbrowser import get
import pandas as pd
from sklearn.metrics import accuracy_score, r2_score
from models import *
import logging
from pretty_logger import get_logger
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
df = pd.read_csv(datafile)
df["date"] = pd.to_datetime(df["day_survey"])
df["C"] = df["date"].apply(
    lambda date: (
        COVIDStatus.POST_COVID if date > date_covid else COVIDStatus.PRE_COVID
    )
)

df_head = df.head(5).copy()

df.rename(columns=reverse_ema_dictionary, inplace=True)
df.set_index(["uid", "date"], inplace=True)
len_before_drop_na = len(df)
df.dropna(subset=ema + physical + social, inplace=True)
# df.dropna(subset=ema, inplace=True)

len_after_drop_na = len(df)
print(
    f"Length before dropna: {len_before_drop_na}\n"
    f"Length a dropna: {len_after_drop_na}"
)

sets_df = pd.read_parquet(sets_file, engine="pyarrow")

Length before dropna: 34706
Length a dropna: 16311


In [14]:
full_dictionary

{'P1': 'excercise',
 'P2': 'studying',
 'P3': 'in house',
 'P4': 'sports',
 'S1': 'traveling',
 'S2': 'distance traveled',
 'S3': 'time in social location',
 'S4': 'visits',
 'S5': 'duration unlocked phone in social locations',
 'S6': 'frequency of unlocked phone in social locations',
 'S7': 'motion at social locations',
 'Z1': 'sleep_duration',
 'Z2': 'sleep start time',
 'Z3': 'sleep end time',
 'Y1': 'pam',
 'Y2': 'phq4_score',
 'Y3': 'phq2_score',
 'Y4': 'gad2_score',
 'Y5': 'social_level',
 'Y6': 'sse_score',
 'Y7': 'stress',
 'C': <enum 'COVIDStatus'>,
 'D1': 'gender',
 'D2': 'race',
 'D3': 'os',
 'D4': 'cohort year'}

In [15]:
# GAD2 > 3
# https://www.hiv.uw.edu/page/mental-health-screening/gad-2
(df["Y4"] > 3).sum() / len(df)

0.07491876647661087

In [16]:
# PHQ2 > 3
# https://www.hiv.uw.edu/page/mental-health-screening/phq-2
precovid_phq2_greater_than_3 = (
    df[df["C"] == COVIDStatus.PRE_COVID]["Y2"] > 3
).sum() / len(df)

postcovid_phq2_greater_than_3 = (
    df[df["C"] == COVIDStatus.POST_COVID]["Y2"] > 3
).sum() / len(df)

print(f"PHQ2 greater than 3 in precovid: {precovid_phq2_greater_than_3}")
print(f"PHQ2 greater than 3 in postcovid: {postcovid_phq2_greater_than_3}")

PHQ2 greater than 3 in precovid: 0.22070995034026117
PHQ2 greater than 3 in postcovid: 0.05395132119428606


In [17]:
df[df["C"] == COVIDStatus.POST_COVID]["Y2"].describe()

count    2844.000000
mean        2.614627
std         2.734058
min         0.000000
25%         0.000000
50%         2.000000
75%         4.000000
max        12.000000
Name: Y2, dtype: float64

In [18]:
def pre_and_post_distributions(df: pd.DataFrame, column: str):
    colorpre = "green"
    colorpost = "orangered"
    nbins = 6
    alpha = 0.4
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        df[df["C"] == COVIDStatus.POST_COVID][column],
        bins=nbins,
        edgecolor="k",
        alpha=alpha,
        color=colorpost,
        label="postcovid",
        stat="probability",
        ax=ax,
    )
    sns.histplot(
        df[df["C"] == COVIDStatus.PRE_COVID][column],
        bins=nbins,
        edgecolor="k",
        alpha=alpha,
        color=colorpre,
        label="precovid",
        stat="probability",
        ax=ax,
    )
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    ax.legend()
    fig.suptitle(f"{column}={full_dictionary[column]}")

    fig.savefig(Path(project_root, "temp", f"Pre_and_post_{column}.png"))
    fig.savefig(Path(project_root, "temp", f"Pre_and_post_{column}.svg"))
    plt.close(fig)
    return ax

In [19]:
# PHQ4 > 6
# https://www.oregonpainguidance.org/app/content/uploads/2016/05/PHQ-4.pdf
# PHQ2 > 3
# https://www.hiv.uw.edu/page/mental-health-screening/phq-2
precovid_phq4_greater_than_6 = (
    df[df["C"] == COVIDStatus.PRE_COVID]["Y3"] > 6
).sum() / len(df)

postcovid_phq4_greater_than_6 = (
    df[df["C"] == COVIDStatus.POST_COVID]["Y3"] > 6
).sum() / len(df)

print(f"PHQ4 greater than 6 in precovid: {precovid_phq4_greater_than_6}")
print(f"PHQ4 greater than 6 in postcovid: {postcovid_phq4_greater_than_6}")

PHQ4 greater than 6 in precovid: 0.0
PHQ4 greater than 6 in postcovid: 0.0


In [20]:
mean_precovid_phq4 = df[df["C"] == COVIDStatus.PRE_COVID]["Y3"].mean()
mean_postcovid_phq4 = df[df["C"] == COVIDStatus.POST_COVID]["Y3"].mean()
print(
    f"precovid mean phq4: {mean_precovid_phq4}"
    f"postcovid mean phq4: {mean_postcovid_phq4}"
)

precovid mean phq4: 1.28610677953516postcovid mean phq4: 1.3716596343178622


In [21]:
mean_precovid_phq2 = df[df["C"] == COVIDStatus.PRE_COVID]["Y2"].mean()
mean_postcovid_phq2 = df[df["C"] == COVIDStatus.POST_COVID]["Y2"].mean()
print(
    f"precovid mean phq2: {mean_precovid_phq2}"
    f"postcovid mean phq2: {mean_postcovid_phq2}"
)

precovid mean phq2: 2.2610826464691467postcovid mean phq2: 2.6146272855133614


In [22]:
for col in ema_dictionary.keys():
    pre_and_post_distributions(df, col)