Compress the time-varying survival data generated by get_asmbly_member_date_long_form.py
into short-form data suitable for time-invariant survival analysis.

Stratify the resulting data set into the following new dataframes:

- duration < 4 months
- 4 months <= duration <= 1 year
- 1 year < duration <= 3 years
- duration > 3 years

This module returns data suitable for survival regression analysis with
time-invariant covariates. Each row corresponds to one member and contains
the following pieces of data:
- email
- Neon ID
- First name
- Last name
- DiscourseID (None if none)
- Has Discourse ID (boolean)
- Has OP ID (boolean)
- Distance from Asmbly (np.nan if address unavailable)
- Time from Asmbly (np.nan if address unavailable)
- Gender (np.nan if unavailable)
- Age (np.nan if unavailable)
- Referral Source (np.nan if unavailable)
- Family Membership (boolean)
- Membership Cancelled (boolean)
- Membership Type (monthly or annual)
- Membership duration
- Total classes attended before first membership
- Waiver Signed (boolean)
- Orientation attended (boolean)
- Woodshop Safety attended (boolean)
- Metal Shop Safety attended (boolean)
- CNC Router class attended (boolean)
- Laser class attended (boolean)
- Steward (boolean)
- Teacher (boolean)

In [None]:
import datetime
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the long-form (time-varying) member data that the rest of the
# notebook compresses into one row per member.
df = pd.read_csv(
    filepath_or_buffer="../TimeVaryingSurvivalAnalysis/all_members_long_form.csv"
)

In [None]:
# Collapse the long-form rows into one summary row per member (``neon_id``):
# a total membership duration plus annual-membership and cancellation flags.
# The result, ``merged``, is joined back onto ``df`` further down.

is_monthly = df["membership_type"] == "MONTH"
is_annual = df["membership_type"] == "YEAR"

# Every MONTH row contributes 1 and every YEAR row contributes 12 to the
# duration. Rows are tallied through their non-null "email" values.
# NOTE(review): presumably the resulting unit is months -- confirm against
# the script that generates the long-form data.
monthly_counts = df[is_monthly].groupby("neon_id")["email"].count()
annual_counts = df[is_annual].groupby("neon_id")["email"].count() * 12

# Add the two tallies over the union of members; a member that is absent
# from one side contributes 0 for that side instead of producing NaN.
total_counts = (
    monthly_counts.add(annual_counts, fill_value=0)
    .rename("duration")
    .reset_index()
)

# Members that have at least one YEAR row.
annual_mem = pd.DataFrame({"neon_id": df.loc[is_annual, "neon_id"].unique()})
annual_mem["annual_membership"] = True

# Members with at least one cancelled row. De-duplicate so that a member
# with several cancelled rows cannot multiply rows in the merge below, and
# build the flag with ``assign`` so no write lands on a slice of ``df``.
cancelled = (
    df.loc[df["membership_cancelled"].eq(True), ["neon_id"]]
    .drop_duplicates()
    .assign(membership_cancelled=True)
)

merged = total_counts.merge(annual_mem, how="outer", on="neon_id").merge(
    cancelled, how="outer", on="neon_id"
)

# After the outer merges each flag column holds True where the member
# matched and NaN otherwise, so ``notna()`` yields the boolean flag.
# Assigning the result back (rather than a chained ``fillna(inplace=True)``)
# keeps working under pandas Copy-on-Write, and converting both flags after
# the last merge leaves no NaN on rows that only the second merge added.
merged["annual_membership"] = merged["annual_membership"].notna()
merged["membership_cancelled"] = merged["membership_cancelled"].notna()


In [None]:
# Columns removed from the long-form frame before its rows are
# de-duplicated and joined with the per-member summary.
drop = [
    "membership_type", "start", "stop", "start_date", "end_date",
    "first_name", "last_name", "discourse_id",
    "membership_cancelled", "volunteer",
    "num_classes_attended", "dollars_spent",
    "woodshop_classes", "metal_shop_classes", "lasers_classes",
    "textiles_classes", "electronics_classes", "3dp_classes",
    "distance_from_asmbly", "email",
]

df = df.drop(columns=drop)

In [None]:
# Remove rows that became identical once the columns above were dropped,
# renumbering the index from 0.
df = df.drop_duplicates(ignore_index=True)

In [None]:
# Attach the per-member summary columns; the inner join keeps only members
# present in both frames.
df = df.merge(merged, how="inner", on="neon_id")

In [None]:
# Impute missing covariates: median for the numeric columns, most frequent
# value for the categorical ones. All statistics are computed up front,
# before any value in ``df`` is modified.
age_med = df["age"].median()
gender_mode = df["gender"].mode()[0]
referral_mode = df["referral_source"].mode()[0]
time_med = df["time_from_asmbly"].median()

# Treat travel times above 10800 as outliers and replace them with the median.
# NOTE(review): 10800 presumably means seconds (3 hours) -- confirm the unit.
df.loc[df["time_from_asmbly"] > 10800, "time_from_asmbly"] = time_med

# Fill every column in one call on the frame itself. The previous
# ``df[col].fillna(..., inplace=True)`` form is chained assignment: it is
# deprecated in pandas 2.x and silently does nothing under Copy-on-Write.
df = df.fillna(
    {
        "age": age_med,
        "gender": gender_mode,
        "time_from_asmbly": time_med,
        "referral_source": referral_mode,
    }
)


In [None]:
# Show, per column, whether any missing values remain.
df.isna().any(axis=0)

In [None]:
# Write the cleaned frame to CSV without the index column.
df.to_csv(path_or_buf="all_members_cleaned_time_invariant.csv", index=False)