In [21]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from models import *
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
df = pd.read_csv(datafile)
df["date"] = pd.to_datetime(df["day_survey"])
df["C"] = df["date"].apply(
    lambda date: (
        COVIDStatus.POST_COVID if date > date_covid else COVIDStatus.PRE_COVID
    )
)

df_head = df.head(5).copy()

df.rename(columns=reverse_ema_dictionary, inplace=True)
df.set_index(["uid", "date"], inplace=True)
len_before_drop_na = len(df)
df.dropna(subset=ema + physical + social, inplace=True)
# df.dropna(subset=ema, inplace=True)

len_after_drop_na = len(df)
print(
    f"Length before dropna: {len_before_drop_na}\n"
    f"Length a dropna: {len_after_drop_na}"
)

sets_df = pd.read_parquet(sets_file, engine="pyarrow")

Length before dropna: 34706
Length a dropna: 16311


In [23]:
# GAD2 > 3
# https://www.hiv.uw.edu/page/mental-health-screening/gad-2
(df["Y4"] > 3).sum() / len(df)

0.07491876647661087

In [24]:
# PHQ2 > 3
# https://www.hiv.uw.edu/page/mental-health-screening/phq-2
precovid_phq2_greater_than_3 = (
    df[df["C"] == COVIDStatus.PRE_COVID]["Y2"] > 3
).sum() / len(df)

postcovid_phq2_greater_than_3 = (
    df[df["C"] == COVIDStatus.POST_COVID]["Y2"] > 3
).sum() / len(df)

print(f"PHQ2 greater than 3 in precovid: {precovid_phq2_greater_than_3}")
print(f"PHQ2 greater than 3 in postcovid: {postcovid_phq2_greater_than_3}")

PHQ2 greater than 3 in precovid: 0.22070995034026117
PHQ2 greater than 3 in postcovid: 0.05395132119428606


In [25]:
df[df["C"] == COVIDStatus.POST_COVID]["Y2"].describe()

count    2844.000000
mean        2.614627
std         2.734058
min         0.000000
25%         0.000000
50%         2.000000
75%         4.000000
max        12.000000
Name: Y2, dtype: float64

In [26]:
def pre_and_post_distributions(df: pd.DataFrame, column: str):
    colorpre = "blue"
    colorpost = "red"
    nbins = 6
    alpha = 0.4
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        df[df["C"] == COVIDStatus.POST_COVID][column],
        bins=nbins,
        edgecolor="k",
        alpha=alpha,
        color=colorpost,
        label="postcovid",
        stat="probability",
        ax=ax,
    )
    sns.histplot(
        df[df["C"] == COVIDStatus.PRE_COVID][column],
        bins=nbins,
        edgecolor="k",
        alpha=alpha,
        color=colorpre,
        label="precovid",
        stat="probability",
        ax=ax,
    )
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    ax.legend()
    fig.suptitle(f"{column}={full_dictionary[column]}")
    folder = Path(
        project_root, "3.Counterfactual and ACEs estimation", "pictures"
    )

    if not folder.exists():
        folder.mkdir(parents=True)

    fig.savefig(Path(folder, f"Pre_and_post_{column}.png"))
    fig.savefig(Path(folder, f"Pre_and_post_{column}.svg"))
    plt.close(fig)
    return ax

In [27]:
# PHQ4 > 6
# https://www.oregonpainguidance.org/app/content/uploads/2016/05/PHQ-4.pdf
# PHQ2 > 3
# https://www.hiv.uw.edu/page/mental-health-screening/phq-2
precovid_phq4_greater_than_6 = (
    df[df["C"] == COVIDStatus.PRE_COVID]["Y3"] > 6
).sum() / len(df)

postcovid_phq4_greater_than_6 = (
    df[df["C"] == COVIDStatus.POST_COVID]["Y3"] > 6
).sum() / len(df)

print(f"PHQ4 greater than 6 in precovid: {precovid_phq4_greater_than_6}")
print(f"PHQ4 greater than 6 in postcovid: {postcovid_phq4_greater_than_6}")

PHQ4 greater than 6 in precovid: 0.0
PHQ4 greater than 6 in postcovid: 0.0


In [28]:
mean_precovid_phq4 = df[df["C"] == COVIDStatus.PRE_COVID]["Y3"].mean()
mean_postcovid_phq4 = df[df["C"] == COVIDStatus.POST_COVID]["Y3"].mean()
print(
    f"precovid mean phq4: {mean_precovid_phq4}"
    f"postcovid mean phq4: {mean_postcovid_phq4}"
)

precovid mean phq4: 1.28610677953516postcovid mean phq4: 1.3716596343178622


In [29]:
mean_precovid_phq2 = df[df["C"] == COVIDStatus.PRE_COVID]["Y2"].mean()
mean_postcovid_phq2 = df[df["C"] == COVIDStatus.POST_COVID]["Y2"].mean()
print(
    f"precovid mean phq2: {mean_precovid_phq2}"
    f"postcovid mean phq2: {mean_postcovid_phq2}"
)

precovid mean phq2: 2.2610826464691467postcovid mean phq2: 2.6146272855133614


In [30]:
for col in ema_dictionary.keys():
    pre_and_post_distributions(df, col)

In [31]:
for column in social + physical + ema + sleep:
    mean = df[column].mean()
    print(f"Mean of {column}:{full_dictionary[column]}={mean:.3f}")

Mean of S1:Traveling=1577.529
Mean of S2:Distance traveled=123422.310
Mean of S3:Time in social loc.=1.684
Mean of S4:Visits=4.681
Mean of S5:Duration unlocked phone in social loc.=12.507
Mean of S6:Frequency of unlocked phone in social loc.=10.399
Mean of S7:Motion at social locations=60.427
Mean of P1:Exercise=9789.249
Mean of P2:Studying=3.368
Mean of P3:At home=11.030
Mean of P4:Sports=0.321
Mean of Y1:pam=7.320
Mean of Y2:phq4_score=2.323
Mean of Y3:phq2_score=1.301
Mean of Y4:gad2_score=1.021
Mean of Y5:social_level=3.131
Mean of Y6:sse_score=12.915
Mean of Y7:stress=2.599
Mean of Z1:sleep_duration=7.114
Mean of Z2:sleep start time=42.404
Mean of Z3:sleep end time=99.315


In [32]:
for column in df.columns:
    if column in full_dictionary.keys():
        print(
            f"{column}:{full_dictionary[column]}:\n{df.describe()[column]}\n-----------------\n"
        )

Y1:pam:
count    16311.000000
mean         7.319968
std          4.402365
min          1.000000
25%          3.000000
50%          7.000000
75%         10.000000
max         16.000000
Name: Y1, dtype: float64
-----------------

Y2:phq4_score:
count    16311.000000
mean         2.322727
std          2.537266
min          0.000000
25%          0.000000
50%          2.000000
75%          4.000000
max         12.000000
Name: Y2, dtype: float64
-----------------

Y3:phq2_score:
count    16311.000000
mean         1.301024
std          1.382557
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max          6.000000
Name: Y3, dtype: float64
-----------------

Y4:gad2_score:
count    16311.000000
mean         1.021029
std          1.406776
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max          6.000000
Name: Y4, dtype: float64
-----------------

Y5:social_level:
count    16311.000000
mean         3.130709
std   

KeyError: 'C'

In [None]:
df.describe()["P1"]