In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

from ds_utils.db.connectors import PivotDW

# pandas display options: fixed 4-decimal floats, up to 100 rows, all columns.
pd.set_option('display.float_format', lambda x: '%.4f' % x)
pd.set_option('display.max_rows', 100)
pd.set_option('display.min_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', True)

# Apply the style sheet FIRST and the white-background overrides AFTER it:
# plt.style.use() rewrites rcParams (the fivethirtyeight sheet sets its own
# figure/axes face colours), so overrides placed before it are discarded.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'white'


In [2]:
# This is a query on a materialized view.  Refresh once a day.
# Set to True to issue the REFRESH statement below before loading the data.
refresh_view = False

# Selects lapsed, non-prepaid STM/BTM rows for the listed plan types.
# The 30x1/60x1/90x1 plan types are folded into '90x1' and the 30/60/90 plan
# groups into '90'; rows whose duration_inforce exceeds duration_sold + 1 and
# rows in the '364+' plan group are excluded.
stm_query = """
    SELECT
        app_pid,
        datetime_created::DATE AS sold_date,
        date_part(MONTH, datetime_created::DATE)::INT AS sold_month,
        date_part(YEAR, datetime_created::DATE)::INT AS sold_year,
        effective_date,
        expiration_date,
        termination_date,
        CASE WHEN plan_type IN ('30x1', '60x1', '90x1') THEN '90x1' ELSE plan_type END AS plan_type,
        CASE WHEN plan_group IN ('30', '60', '90') THEN '90' ELSE plan_group END AS plan_group,
        duration_sold,
        duration_inforce,
        inforce_percent,
        asof_date,
        has_dental,
        has_vision,
        has_supp,
        state,
        plan_members,
        (children_members > 0)::INT AS has_children,
        spousal_members,
        children_members,
        primary_age,
        spouse_age,
        child_age_average
    FROM tracking.app_premium_revised
    WHERE status = 'LAPSED'
        AND is_prepaid = 0
        AND product_type IN ('STM', 'BTM')
        AND plan_type IN ('180x1', '180x2', '30x1', '60x1', '90x1', '90x2', '90x4', '364x1')
        AND plan_group != '364+'
        AND duration_inforce <= duration_sold + 1
    ;
"""

# NOTE(review): the refresh statement targets tracking.app_premium, but
# stm_query reads tracking.app_premium_revised -- confirm the refresh names
# the view that actually backs the query.
with PivotDW() as db_context:
    
    if refresh_view:
        db_context.exec("REFRESH MATERIALIZED VIEW tracking.app_premium;")
        
    # Only sold_date is passed as parse_dates; the other date columns are
    # returned as the connector delivers them.
    stm_df = db_context.to_df(stm_query, parse_dates='sold_date')

    
# Row count followed by a preview of the loaded frame.
print(len(stm_df))
stm_df.head()


INFO:root:query: SELECT app_pid, datetime_created::DATE AS sold_date, date_pa... executed in 12.26 seconds


74442


Unnamed: 0,app_pid,sold_date,sold_month,sold_year,effective_date,expiration_date,termination_date,plan_type,plan_group,duration_sold,duration_inforce,inforce_percent,asof_date,has_dental,has_vision,has_supp,state,plan_members,has_children,spousal_members,children_members,primary_age,spouse_age,child_age_average
0,205502,2019-05-07,5,2019,2019-05-15,2019-11-10,2019-10-14,180x1,180,180,152,0.8444,2021-03-26,0,0,0,TX,1,0,0,0,30.3,,
1,191889,2019-01-08,1,2019,2019-01-15,2019-04-14,2019-04-14,90x1,90,90,89,0.9888,2021-03-26,0,0,0,OH,1,0,0,0,55.53,,
2,205739,2019-05-09,5,2019,2019-06-01,2020-05-29,,364x1,364,364,364,1.0,2021-03-26,0,0,0,GA,1,0,0,0,58.35,,
3,211091,2019-06-24,6,2019,2019-06-26,2019-09-23,2019-09-23,90x1,90,90,89,0.9888,2021-03-26,0,0,0,GA,3,1,1,1,58.45,58.4,24.45
4,215646,2019-07-31,7,2019,2019-08-01,2020-01-27,,180x1,180,180,180,1.0,2021-03-26,0,0,0,VA,1,0,0,0,22.43,,


In [3]:
# Number of policies in each plan group.
stm_df.groupby('plan_group').size().rename('policies').reset_index()

Unnamed: 0,plan_group,policies
0,180,28033
1,364,36125
2,90,10284


In [4]:
def label_months(plan_group, month):
    """Return the sold-month bucket label for a policy.

    Each plan group partitions the calendar months into its own buckets.
    The label is the plan group followed by the bucket's months joined with
    underscores, e.g. ``label_months('180', 5)`` returns ``'180_3_4_5_6'``.

    Parameters
    ----------
    plan_group : str
        Plan group key; '90', '180' and '364' are recognised.
    month : int
        Month number the policy was sold in.

    Returns
    -------
    str or None
        The bucket label, or None when the plan group is not recognised or
        the month does not appear in any bucket of that plan group.
    """
    # The month order inside each bucket is kept exactly as originally
    # written because it is part of the label text (and so of any column
    # names later derived from the label).
    month_buckets = {
        '364': ([7, 8, 9], [4, 5, 6], [2, 3, 10], [1, 11, 12]),
        '180': ([10, 11], [8, 9, 12], [2, 1, 7], [3, 4, 5, 6]),
        '90': ([2, 6, 5], [1, 8, 7, 3], [12, 4, 10, 9, 11]),
    }

    # Buckets within a plan group are disjoint, so the first hit is the only hit.
    for bucket in month_buckets.get(plan_group, ()):
        if month in bucket:
            return f'{plan_group}_' + '_'.join(str(n) for n in bucket)
    return None


# Tag every policy with the sold-month bucket of its own plan group.
month_inputs = stm_df[['plan_group', 'sold_month']]
stm_df['month_group'] = month_inputs.apply(lambda row: label_months(*row), axis=1)

stm_df.head()

Unnamed: 0,app_pid,sold_date,sold_month,sold_year,effective_date,expiration_date,termination_date,plan_type,plan_group,duration_sold,duration_inforce,inforce_percent,asof_date,has_dental,has_vision,has_supp,state,plan_members,has_children,spousal_members,children_members,primary_age,spouse_age,child_age_average,month_group
0,205502,2019-05-07,5,2019,2019-05-15,2019-11-10,2019-10-14,180x1,180,180,152,0.8444,2021-03-26,0,0,0,TX,1,0,0,0,30.3,,,180_3_4_5_6
1,191889,2019-01-08,1,2019,2019-01-15,2019-04-14,2019-04-14,90x1,90,90,89,0.9888,2021-03-26,0,0,0,OH,1,0,0,0,55.53,,,90_1_8_7_3
2,205739,2019-05-09,5,2019,2019-06-01,2020-05-29,,364x1,364,364,364,1.0,2021-03-26,0,0,0,GA,1,0,0,0,58.35,,,364_4_5_6
3,211091,2019-06-24,6,2019,2019-06-26,2019-09-23,2019-09-23,90x1,90,90,89,0.9888,2021-03-26,0,0,0,GA,3,1,1,1,58.45,58.4,24.45,90_2_6_5
4,215646,2019-07-31,7,2019,2019-08-01,2020-01-27,,180x1,180,180,180,1.0,2021-03-26,0,0,0,VA,1,0,0,0,22.43,,,180_2_1_7


In [5]:
# Response plus candidate regressors kept for every plan-group model.
MODEL_COLUMNS = [
    'inforce_percent', 'has_dental', 'has_vision', 'primary_age', 'has_children', 'month_group'
]


def build_model_frame(policies, plan_group):
    """Build the all-float modelling frame for one plan group.

    Parameters
    ----------
    policies : pandas.DataFrame
        Frame containing ``plan_group`` and every column in ``MODEL_COLUMNS``.
    plan_group : str
        Value of ``plan_group`` to keep (e.g. '90', '180', '364').

    Returns
    -------
    pandas.DataFrame
        The selected rows with ``month_group`` one-hot encoded (first level
        dropped so it acts as the baseline), rows containing any missing
        value removed, and every column cast to float.
    """
    subset = policies.loc[policies['plan_group'] == plan_group, MODEL_COLUMNS]
    # Encode before dropping NaNs so the set of dummy columns is decided on
    # the full plan-group subset, exactly as the original cell did.
    encoded = pd.get_dummies(subset, columns=['month_group'], drop_first=True)
    return encoded.dropna().astype(float)


model_df_90 = build_model_frame(stm_df, '90')
model_df_180 = build_model_frame(stm_df, '180')
model_df_364 = build_model_frame(stm_df, '364')

model_df_180.head(10)

Unnamed: 0,inforce_percent,has_dental,has_vision,primary_age,has_children,month_group_180_2_1_7,month_group_180_3_4_5_6,month_group_180_8_9_12
0,0.8444,0.0,0.0,30.3,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,22.43,0.0,1.0,0.0,0.0
5,0.1666,0.0,0.0,18.9,0.0,0.0,0.0,1.0
12,0.9944,0.0,0.0,63.47,0.0,0.0,1.0,0.0
15,1.0,0.0,0.0,58.69,0.0,1.0,0.0,0.0
16,0.9944,0.0,0.0,30.51,1.0,0.0,1.0,0.0
22,0.9944,0.0,0.0,64.07,0.0,0.0,1.0,0.0
23,0.1657,0.0,0.0,26.07,0.0,0.0,0.0,1.0
25,0.5,0.0,0.0,22.37,0.0,0.0,1.0,0.0
28,0.6722,0.0,0.0,41.55,1.0,0.0,1.0,0.0


In [6]:
# Rows left in the 180-day modelling frame after rows with missing values were dropped.
len(model_df_180)

24534

In [8]:
# Column names of the 364-day modelling frame, including the generated month_group dummies.
model_df_364.columns

Index(['inforce_percent', 'has_dental', 'has_vision', 'primary_age',
       'has_children', 'month_group_364_2_3_10', 'month_group_364_4_5_6',
       'month_group_364_7_8_9'],
      dtype='object')

In [9]:
# OLS of inforce_percent for the 90-day plan group.
# `sm` (statsmodels.api) is imported once in the notebook's top import cell.
# Regressors: primary age, the has_children flag and the two month-bucket
# dummies; the bucket dropped by get_dummies(drop_first=True) is the baseline.
# The constant column is named 'intercept' so it appears under that name in
# model.params.
x = (
    model_df_90[['primary_age', 'has_children', 'month_group_90_1_8_7_3', 'month_group_90_2_6_5']]
    .astype(float)
    .assign(intercept=1.0)
)

y = model_df_90[['inforce_percent']]

model = sm.OLS(endog=y, exog=x).fit()
# In-sample predictions, indexed like model_df_90.
predictions_90 = model.predict(x)

model.summary()

0,1,2,3
Dep. Variable:,inforce_percent,R-squared:,0.006
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,14.19
Date:,"Fri, 09 Apr 2021",Prob (F-statistic):,1.52e-11
Time:,14:24:06,Log-Likelihood:,200.43
No. Observations:,9486,AIC:,-390.9
Df Residuals:,9481,BIC:,-355.1
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
primary_age,0.0008,0.000,4.492,0.000,0.000,0.001
has_children,-0.0189,0.006,-2.914,0.004,-0.032,-0.006
month_group_90_1_8_7_3,-0.0109,0.006,-1.940,0.052,-0.022,0.000
month_group_90_2_6_5,-0.0342,0.006,-5.336,0.000,-0.047,-0.022
intercept,0.8455,0.008,108.717,0.000,0.830,0.861

0,1,2,3
Omnibus:,2054.482,Durbin-Watson:,1.869
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3667.601
Skew:,-1.492,Prob(JB):,0.0
Kurtosis:,3.616,Cond. No.,155.0


In [12]:
# Fitted coefficients of the 90-day OLS model, keyed by regressor name.
model.params

primary_age               0.0008
has_children             -0.0189
month_group_90_1_8_7_3   -0.0109
month_group_90_2_6_5     -0.0342
intercept                 0.8455
dtype: float64

In [13]:
# OLS of inforce_percent for the 180-day plan group.
# `sm` (statsmodels.api) is imported once in the notebook's top import cell.
# .assign() returns a new frame, so adding the constant column no longer
# writes into a slice of model_df_180 (the source of the
# SettingWithCopyWarning this cell used to emit).
x = model_df_180[['has_vision', 'primary_age',
       'has_children', 'month_group_180_2_1_7', 'month_group_180_3_4_5_6',
       'month_group_180_8_9_12']].assign(intercept=1.0)

y = model_df_180[['inforce_percent']]

model_180 = sm.OLS(endog=y, exog=x).fit()
# Keep predictions under their own name: assigning them to model_df_180
# replaced the modelling frame and made this cell fail when re-run.
predictions_180 = model_180.predict(x)

model_180.summary()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['intercept'] = 1.0


0,1,2,3
Dep. Variable:,inforce_percent,R-squared:,0.034
Model:,OLS,Adj. R-squared:,0.034
Method:,Least Squares,F-statistic:,144.2
Date:,"Fri, 09 Apr 2021",Prob (F-statistic):,1.9799999999999997e-180
Time:,14:25:37,Log-Likelihood:,-7292.2
No. Observations:,24534,AIC:,14600.0
Df Residuals:,24527,BIC:,14660.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
has_vision,-0.0849,0.019,-4.417,0.000,-0.123,-0.047
primary_age,0.0018,0.000,12.327,0.000,0.002,0.002
has_children,-0.0427,0.006,-7.337,0.000,-0.054,-0.031
month_group_180_2_1_7,0.1244,0.007,18.030,0.000,0.111,0.138
month_group_180_3_4_5_6,0.1635,0.007,24.302,0.000,0.150,0.177
month_group_180_8_9_12,0.0813,0.007,12.404,0.000,0.068,0.094
intercept,0.5645,0.008,70.075,0.000,0.549,0.580

0,1,2,3
Omnibus:,15147.829,Durbin-Watson:,1.769
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3044.958
Skew:,-0.646,Prob(JB):,0.0
Kurtosis:,1.856,Cond. No.,385.0


In [17]:
# Fitted coefficients of the 180-day OLS model, keyed by regressor name.
model_180.params

has_vision                -0.0849
primary_age                0.0018
has_children              -0.0427
month_group_180_2_1_7      0.1244
month_group_180_3_4_5_6    0.1635
month_group_180_8_9_12     0.0813
intercept                  0.5645
dtype: float64

In [14]:
# Rows left in the 364-day modelling frame after rows with missing values were dropped.
len(model_df_364)

33877

In [15]:
# OLS of inforce_percent for the 364-day plan group.
# `sm` (statsmodels.api) is imported once in the notebook's top import cell.
x = (
    model_df_364[['has_vision', 'primary_age',
       'has_children', 'month_group_364_2_3_10', 'month_group_364_4_5_6',
       'month_group_364_7_8_9']]
    .astype(float)
    .assign(intercept=1.0)
)

y = model_df_364[['inforce_percent']]

model_364 = sm.OLS(endog=y, exog=x).fit()
# Keep predictions under their own name: assigning them to model_df_364
# replaced the modelling frame and made this cell fail when re-run.
predictions_364 = model_364.predict(x)

model_364.summary()

0,1,2,3
Dep. Variable:,inforce_percent,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.046
Method:,Least Squares,F-statistic:,273.6
Date:,"Fri, 09 Apr 2021",Prob (F-statistic):,0.0
Time:,14:25:40,Log-Likelihood:,-13404.0
No. Observations:,33877,AIC:,26820.0
Df Residuals:,33870,BIC:,26880.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
has_vision,-0.0514,0.015,-3.380,0.001,-0.081,-0.022
primary_age,0.0021,0.000,15.321,0.000,0.002,0.002
has_children,-0.0237,0.005,-4.574,0.000,-0.034,-0.014
month_group_364_2_3_10,-0.0642,0.005,-11.817,0.000,-0.075,-0.054
month_group_364_4_5_6,-0.1189,0.006,-21.158,0.000,-0.130,-0.108
month_group_364_7_8_9,-0.1665,0.005,-32.071,0.000,-0.177,-0.156
intercept,0.5258,0.007,77.482,0.000,0.512,0.539

0,1,2,3
Omnibus:,176279.73,Durbin-Watson:,1.754
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3617.719
Skew:,0.164,Prob(JB):,0.0
Kurtosis:,1.433,Cond. No.,339.0


In [16]:
# Fitted coefficients of the 364-day OLS model, keyed by regressor name.
model_364.params

has_vision               -0.0514
primary_age               0.0021
has_children             -0.0237
month_group_364_2_3_10   -0.0642
month_group_364_4_5_6    -0.1189
month_group_364_7_8_9    -0.1665
intercept                 0.5258
dtype: float64

In [None]:
# `predictions` was never defined anywhere in the notebook, so this cell
# raised NameError. Build it from the in-sample fitted values of the three
# plan-group models. Each series keeps the row index of its modelling frame,
# so the assignment below aligns on stm_df's index; policies that were
# dropped for missing values get NaN.
predictions = pd.concat([
    model.fittedvalues,       # 90-day model
    model_180.fittedvalues,
    model_364.fittedvalues,
])

stm_df['predictions'] = predictions
# The models predict the in-force share, so scale by the sold duration.
stm_df['predicted_duration_inforce'] = stm_df['predictions'] * stm_df['duration_sold']
# A bare expression in the middle of a cell is discarded; display it explicitly.
display(stm_df[['inforce_percent', 'predicted_duration_inforce']].head())

# Primary age rounded to the nearest 10, used as a coarse grouping key.
stm_df['primary_age_est'] = stm_df['primary_age'].round(-1)
stm_df.head()

In [None]:
# Signed error: actual minus predicted in-force duration.
actual_days = stm_df['duration_inforce']
stm_df['days_diff'] = actual_days - stm_df['predicted_duration_inforce']

# Absolute error as a share of the actual in-force duration.
# NOTE(review): a duration_inforce of 0 yields inf here -- confirm whether
# such rows can occur in the lapsed-policy extract.
stm_df['days_diff_pct'] = stm_df['days_diff'].abs() / actual_days


In [None]:
# Mean actual vs. predicted in-force duration and error per plan group,
# age band and sold-month bucket. The bucket column created earlier in this
# notebook is 'month_group'; 'month_perf_group' is never created, so grouping
# on it raised KeyError.
(
    stm_df
    .groupby(['plan_group', 'primary_age_est', 'month_group'])
    [['duration_inforce', 'predicted_duration_inforce', 'days_diff', 'days_diff_pct']]
    .mean()
    .reset_index()
)