In [1]:
from dsx.ds_utils import *

Package loaded in Notebook Mode
Successfully imported ds_utils as Package


In [2]:
import re, datetime, qgrid, dtale
from pandarallel import pandarallel
# Initialise pandarallel with progress bars enabled; this has to run before the
# DataFrame.parallel_apply calls used further down in the notebook.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Global plot styling: fivethirtyeight theme plus a 16x9 figure size passed to seaborn.
# NOTE(review): `plt` and `sns` are not imported in any visible cell; presumably they
# come from the star import of dsx.ds_utils - confirm.
plt.style.use('fivethirtyeight')
sns.set_context(context={'figure.figsize': (16,9)})

In [4]:
# Turn off qgrid's 'forceFitColumns' grid option (presumably so wide frames keep
# their natural column widths instead of being squeezed - confirm against qgrid docs).
qgrid.set_grid_option('forceFitColumns', False)

In [5]:
# Project helper from dsx; per its printed output it sets the project directory.
# NOTE(review): relative paths used later (e.g. 'data/outputs/...') presumably
# resolve against that directory - confirm.
dsx.set_dirs()

Set project directory to /mnt/d/DSAI_Workstation/0003_ePrimer/workspace.
Property "dir_%" enabled


# Loading Data

In [46]:
# Load the processed dataset. The path is relative, so this depends on the
# current working directory at the time the cell runs.
df = pd.read_excel('data/outputs/processed_data.xlsx')

In [47]:
# Parse the enrolment and completion timestamps into pandas datetimes so they can
# be compared against datetime cut-offs and subtracted from each other later on.
df['Course_Enrolled_Date_SGT'] = pd.to_datetime(df['Course_Enrolled_Date_SGT'])
df['Course_Completed_Date_SGT'] = pd.to_datetime(df['Course_Completed_Date_SGT'])

In [48]:
# Keep only rows whose Agency is among the first 35 returned by the project's
# `ds.cumsum` accessor.
# NOTE(review): the accessor is defined outside this notebook; presumably it ranks
# agencies by row count - confirm.
top_n_agencies = df.ds.cumsum('Agency').head(35)['Agency']
df = df.loc[df['Agency'].isin(top_n_agencies)]

In [49]:
# Time-based split: enrolments strictly before the cut-off form the training set,
# enrolments on or after it form the test set. Both sides are explicit copies so
# columns can be added to them without touching `df`. Rows with a missing enrolment
# date satisfy neither comparison and therefore land in neither set.
train_date_cutoff = datetime.datetime(2020, 9, 14)
df_train = df.loc[df['Course_Enrolled_Date_SGT'] < train_date_cutoff].copy()
df_test = df.loc[df['Course_Enrolled_Date_SGT'] >= train_date_cutoff].copy()


In [50]:
# Group both splits so the same steps can be applied to each. The list holds
# references to df_train / df_test themselves, not copies.
data_set = [df_train, df_test]

# Survival Analysis Features

In [None]:
# TODO: the right_censor_date used for the train data needs to be reconsidered

In [12]:
def create_surva_feats_inplace(dfx, right_censor_date=datetime.datetime(2020, 10, 2, 0, 0, 0)):
    """Add the survival-analysis columns ``duration`` and ``event`` to ``dfx`` in place.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Must contain ``total_days``, ``Course_Enrolled_Date_SGT`` (datetime) and
        ``Module_Complete``. The frame is modified in place.
    right_censor_date : datetime.datetime
        Observation end used for rows whose ``total_days`` is missing: their
        duration becomes the time from enrolment to this date.

    Returns
    -------
    pandas.DataFrame
        The same ``dfx`` object that was passed in, so the result can be
        assigned or chained without losing the frame.
    """
    dfx['duration'] = dfx.total_days.copy()

    # Rows without an observed duration are right-censored at `right_censor_date`.
    censored = dfx['duration'].isnull()
    elapsed = right_censor_date - dfx.loc[censored, 'Course_Enrolled_Date_SGT']
    # Truncate to whole minutes, then express the result in (fractional) days.
    dfx.loc[censored, 'duration'] = elapsed.dt.total_seconds() // 60 / 60 / 24

    # 1 where Module_Complete is truthy, 0 otherwise.
    dfx['event'] = np.where(dfx.Module_Complete, 1, 0)
    return dfx

In [51]:
# The helper adds `duration` / `event` to the frame it receives, so it is called
# purely for that side effect. The entries of `data_set` keep pointing at
# df_train / df_test regardless of what the call returns.
# The train split is censored at its own cut-off; the test split uses the
# helper's default right-censor date.
create_surva_feats_inplace(data_set[0], train_date_cutoff)
create_surva_feats_inplace(data_set[1])

In [33]:
from lifelines import KaplanMeierFitter

In [41]:
def create_fitter_per_agency(dfx, fitter_max_range=365):
    """Fit one Kaplan-Meier estimator per agency.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Must contain the columns ``Agency``, ``duration`` and ``event``.
    fitter_max_range : int
        Last day of the timeline passed to each fit; the timeline is
        ``0, 1, ..., fitter_max_range``.

    Returns
    -------
    dict
        Maps each agency value found in ``dfx`` to its fitted ``KaplanMeierFitter``.
    """
    timeline = range(0, fitter_max_range + 1, 1)
    fitter_dict = {}
    # Group the frame that was passed in, not a notebook-level global.
    for agency, group in dfx.groupby('Agency'):
        fitter_dict[agency] = KaplanMeierFitter().fit(
            group['duration'], event_observed=group['event'], timeline=timeline
        )
    return fitter_dict

In [53]:
# Per-agency fitters, built from the training split.
fitters = create_fitter_per_agency(df_train)

In [55]:
# For every training row, evaluate its agency's fitted curve at the row's own
# duration. Rows are processed in parallel via pandarallel.
df_train['survival_p_agency'] = df_train.parallel_apply(
    lambda row: fitters[row['Agency']].predict(row['duration']),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1581), Label(value='0 / 1581'))), …

In [76]:
# Same feature for the test split, using the fitters stored in `fitters`.
# A test row whose Agency has no entry in `fitters` raises KeyError here.
df_test['survival_p_agency'] = df_test.parallel_apply(
    lambda row: fitters[row['Agency']].predict(row['duration']),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=354), Label(value='0 / 354'))), HB…

In [59]:
def generate_probablity_of_completion(fitter, start_period, end_period, freq='w'):
    """Relative drop of the fitted curve between two time points.

    Returns ``(S(start) - S(end)) / S(start)`` where ``S`` is ``fitter.predict``.

    Parameters
    ----------
    fitter : object exposing ``predict(t)``
    start_period, end_period : numbers expressed in units of ``freq``
    freq : str
        ``'w'`` multiplies both periods by 7, ``'M'`` by 28; any other value
        (including ``'d'``) leaves them untouched, i.e. they are taken as days.
    """
    days_per_unit = 7 if freq == 'w' else 28 if freq == 'M' else None
    if days_per_unit is not None:
        start_period *= days_per_unit
        end_period *= days_per_unit

    drop_between = fitter.predict(start_period) - fitter.predict(end_period)
    baseline = fitter.predict(start_period)
    return drop_between / baseline

In [93]:
def generate_proxy_hazard_data(fitter, days, event_threshold=365):
    """Tabulate the completion probability for every day from 0 to ``days``.

    For each integer day ``d`` in ``0..days`` the value is
    ``generate_probablity_of_completion(fitter, d, event_threshold, 'd')``.

    Returns a DataFrame with the columns ``duration`` (the day) and
    ``hazard_proba`` (the computed probability), one row per day.
    """
    records = []
    for day in range(days + 1):
        proba = generate_probablity_of_completion(fitter, day, event_threshold, 'd')
        records.append((day, proba))
    return pd.DataFrame(records, columns=["duration", "hazard_proba"])


In [94]:
# One proxy-hazard table per agency (days 0..180, event threshold 365), each
# tagged with the agency it was computed for.
df_data = [
    generate_proxy_hazard_data(fitters[agency], 180, 365).assign(Agency=agency)
    for agency in fitters
]

In [95]:
# Stack the per-agency tables into one frame with a fresh 0..n-1 index.
# Note: this rebinds `df_data` from a list to a DataFrame.
df_data = pd.concat(df_data, axis=0, ignore_index=True)

In [96]:
# Keep the current duration in `dur`, then round `duration` to whole days so it
# can be matched against the integer day grid of df_data.
# Re-running this cell copies the already-rounded values into `dur`.
df_train['dur'] = df_train['duration'].copy()
df_train['duration'] = df_train['duration'].round(0)

In [98]:
# Left-join the hazard table onto the training rows by (duration, Agency) using
# the project's `ds.merge` accessor.
# NOTE(review): `isnull='hazard_proba'` presumably makes the helper report how many
# rows ended up without a hazard_proba (the printed count and ratio) - confirm
# against the accessor's implementation.
df_train = df_train.ds.merge(df_data, 'left', ['duration', 'Agency'], isnull='hazard_proba')

(True, 18972)
(278, 0.014653173097195868)


In [99]:
# Rows that are still missing a hazard after the merge take the largest
# hazard_proba tabulated for their agency. The per-agency maximum is computed once
# with a groupby and used as a lookup, instead of re-filtering df_data for every row.
# Agencies absent from df_data stay NaN.
df_train.loc[df_train['hazard_proba'].isnull(), 'hazard_proba'] = df_train['Agency'].map(
    df_data.groupby('Agency')['hazard_proba'].max()
)