# Models by lapse month

## Set up

In [1]:
## Import libraries

import autograd.numpy as np
from autograd import grad 
import matplotlib.pyplot as plt
%matplotlib inline
import boto3
import pandas as pd

##Import sklearn models & metrics

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression,RidgeClassifierCV, LassoCV, LogisticRegressionCV, SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, plot_roc_curve, roc_curve, RocCurveDisplay, auc, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE, SelectKBest, chi2, f_classif, SelectFromModel
from sklearn.svm import SVR, SVC    
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


In [3]:
## Data cleaning

# NOTE(review): `df1` is never assigned in the visible cells (the recorded run of
# this cell ended in a NameError) - the raw data must be loaded first; confirm source.
df = df1.copy(deep=False)

# Names of the categorical variables that the metadata file marks as kept (Drop == "No").
var_details = pd.read_csv('var_details.csv')
vars_cat = var_details.loc[
    (var_details.VarType == "categorical") & (var_details.Drop == "No"),
    'Variable',
].dropna()

df = df.drop(columns=['RMUSIC', 'RPHOTO', 'RVIDDVD', 'Count.of.Calls'])
df_cols = df.columns

# One-hot encode only those categorical variables that are present in the frame.
vars_cat = [name for name in vars_cat if name in df_cols]
df = pd.get_dummies(df, columns=vars_cat)

# Binary flag: anything other than the literal "No " (note the trailing space) becomes 1.
df['Voluntary.Disenrollment'] = np.where(df['Voluntary.Disenrollment'] != "No ", 1, 0)

# Keep every disenrolled row; keep non-disenrolled rows only when elapsedMonths > 12.
disenrolled = df['Voluntary.Disenrollment']
df = df.loc[((disenrolled == 0) & (df.elapsedMonths > 12)) | (disenrolled == 1)]


NameError: name 'df1' is not defined

In [None]:
## Create & summarize 3 month lapse model

# Keep every lapsed member plus non-lapsed members observed for more than 3 months.
# .copy() yields an independent frame, so the label assignments below write to
# df_3month itself rather than to a slice of df.
df_3month = df.loc[((df.Lapsed==0) & (df.elapsedMonths>3)) | (df.Lapsed==1)].copy()

# EarlyLapser is 1 when elapsedMonths <= 3 and 0 when elapsedMonths > 3.
df_3month.loc[df_3month.elapsedMonths<=3, 'EarlyLapser'] = 1
df_3month.loc[df_3month.elapsedMonths>3, 'EarlyLapser'] = 0

# drop=True: a plain reset_index() inserts the old row labels as a new column,
# and the feature matrix built later (which drops only the target-related
# columns) would then keep those row labels as a predictor.
df_3month = df_3month.reset_index(drop=True)

# info() writes its report directly and returns None, so it is not wrapped in print().
df_3month.info()
print(df_3month['EarlyLapser'].value_counts())
print(df_3month['Lapsed'].value_counts())
print(df_3month['Voluntary.Disenrollment'].value_counts())

# Lapse flag against tenure, with the 3-month label cutoff marked.
plt.scatter(df_3month['elapsedMonths'], df_3month['Lapsed'])
plt.vlines(x=3, ymin=0, ymax=1)
plt.xlabel('elapsedMonths')
plt.ylabel('Lapsed')
plt.show()

# Voluntary disenrollment flag against tenure, with the 12-month cutoff marked.
plt.scatter(df_3month['elapsedMonths'], df_3month['Voluntary.Disenrollment'])
plt.vlines(x=12, ymin=0, ymax=1)
plt.xlabel('elapsedMonths')
plt.ylabel('Voluntary.Disenrollment')
plt.show()


In [None]:
## Create & summarize 6 month lapse model

# Keep every lapsed member plus non-lapsed members observed for more than 6 months.
# .copy() yields an independent frame, so the label assignments below write to
# df_6month itself rather than to a slice of df.
df_6month = df.loc[((df.Lapsed==0) & (df.elapsedMonths>6)) | (df.Lapsed==1)].copy()

# EarlyLapser is 1 when elapsedMonths <= 6 and 0 when elapsedMonths > 6.
df_6month.loc[df_6month.elapsedMonths<=6, 'EarlyLapser'] = 1
df_6month.loc[df_6month.elapsedMonths>6, 'EarlyLapser'] = 0

# drop=True: a plain reset_index() inserts the old row labels as a new column,
# and the feature matrix built later (which drops only the target-related
# columns) would then keep those row labels as a predictor.
df_6month = df_6month.reset_index(drop=True)

print(df_6month['EarlyLapser'].value_counts())
print(df_6month['Lapsed'].value_counts())
print(df_6month['Voluntary.Disenrollment'].value_counts())

# Lapse flag against tenure, with the 6-month label cutoff marked.
plt.scatter(df_6month['elapsedMonths'], df_6month['Lapsed'])
plt.vlines(x=6, ymin=0, ymax=1)
plt.xlabel('elapsedMonths')
plt.ylabel('Lapsed')
plt.show()

# Voluntary disenrollment flag against tenure, with the 12-month cutoff marked.
plt.scatter(df_6month['elapsedMonths'], df_6month['Voluntary.Disenrollment'])
plt.vlines(x=12, ymin=0, ymax=1)
plt.xlabel('elapsedMonths')
plt.ylabel('Voluntary.Disenrollment')
plt.show()

In [None]:
## Create & summarize 1 year lapse model

# Keep every lapsed member plus non-lapsed members observed for more than 12 months.
# .copy() yields an independent frame, so the label assignments below write to
# df_1year itself rather than to a slice of df.
df_1year = df.loc[((df.Lapsed==0) & (df.elapsedMonths>12)) | (df.Lapsed==1)].copy()

# EarlyLapser is 1 when elapsedMonths <= 12 and 0 when elapsedMonths > 12.
df_1year.loc[df_1year.elapsedMonths<=12, 'EarlyLapser'] = 1
df_1year.loc[df_1year.elapsedMonths>12, 'EarlyLapser'] = 0

# drop=True: a plain reset_index() inserts the old row labels as a new column,
# and the feature matrix built later (which drops only the target-related
# columns) would then keep those row labels as a predictor.
df_1year = df_1year.reset_index(drop=True)

print(df_1year['EarlyLapser'].value_counts())
print(df_1year['Lapsed'].value_counts())
print(df_1year['Voluntary.Disenrollment'].value_counts())

# Lapse flag against tenure, with the 12-month label cutoff marked.
plt.scatter(df_1year['elapsedMonths'], df_1year['Lapsed'])
plt.vlines(x=12, ymin=0, ymax=1)
plt.xlabel('elapsedMonths')
plt.ylabel('Lapsed')
plt.show()

# Voluntary disenrollment flag against tenure, with the 12-month cutoff marked.
plt.scatter(df_1year['elapsedMonths'], df_1year['Voluntary.Disenrollment'])
plt.vlines(x=12, ymin=0, ymax=1)
plt.xlabel('elapsedMonths')
plt.ylabel('Voluntary.Disenrollment')
plt.show()

In [None]:
##Define functions

def categorical_evaluation(y_test, y_pred):
    """Print and plot evaluation metrics for already-classified predictions.

    Reports the ROC AUC (with the ROC curve plotted), the F1 score, the
    classification report and the confusion matrix of `y_pred` against `y_test`.

    Parameters
    ----------
    y_test : array-like
        True labels; class 1 is treated as the positive class.
    y_pred : array-like
        Predicted labels, passed unchanged to every metric.

    Returns
    -------
    None
    """
    # The thresholds returned by roc_curve are not needed here.
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label = 1)
    roc_auc = auc(fpr, tpr)
    print("AUC:", round(roc_auc, 2))
    print("ROC Curve: ")
    # Named roc_display so the IPython `display` helper is not shadowed.
    roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='Model')
    roc_display.plot()
    plt.show()
    # The value is formatted into the message; passing it as a second print
    # argument left a literal "{}" in the output.
    print("F1 Score: {}".format(round(f1_score(y_test, y_pred), 4)))
    print("All Model Metrics: \n", classification_report(y_test, y_pred))
    print("Model Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
    return None
   

def continuous_evaluation(y_test, y_pred):
    """Evaluate continuous scores: ROC/AUC first, then metrics at the best-F1 cutoff.

    The cutoff is searched over 1001 evenly spaced values in [0, 1]; the first
    value reaching the maximum F1 score is used. Note that the cutoff is chosen
    on the same `y_test` that the metrics are then reported on.

    Parameters
    ----------
    y_test : array-like
        True labels; class 1 is treated as the positive class.
    y_pred : array-like
        Continuous model outputs.

    Returns
    -------
    None
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label = 1)
    area = auc(fpr, tpr)
    print("AUC:", round(area, 2))
    print("ROC Curve: ")
    RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=area, estimator_name='Model').plot()
    plt.show()

    # F1 score at every candidate cutoff.
    cutoffs = np.linspace(0, 1, 1001)
    scores = [f1_score(y_test, classify(y_pred, cutoff)) for cutoff in cutoffs]

    # list.index returns the first position of the maximum.
    thresholdOpt = cutoffs[scores.index(max(scores))]
    y_class = classify(y_pred, thresholdOpt)

    print('Best Threshold: {}'.format(thresholdOpt))
    print("F1 Score: ", f1_score(y_test, y_class))
    print("All Model Metrics: \n", classification_report(y_test, y_class))
    print("Model Confusion Matrix: \n", confusion_matrix(y_test, y_class))

    return None

def classify_decision_boundary(y_test, y_pred):
    """Binarize continuous scores at the cutoff that maximizes F1 against `y_test`.

    Searches 1001 evenly spaced cutoffs in [0, 1] and uses the first one that
    reaches the maximum F1 score.

    Parameters
    ----------
    y_test : array-like
        True labels used to score each candidate cutoff.
    y_pred : array-like
        Continuous model outputs.

    Returns
    -------
    The result of `classify(y_pred, best_cutoff)`.
    """
    cutoffs = np.linspace(0, 1, 1001)
    scores = [f1_score(y_test, classify(y_pred, cutoff)) for cutoff in cutoffs]

    # list.index returns the first position of the maximum.
    best_cutoff = cutoffs[scores.index(max(scores))]
    return classify(y_pred, best_cutoff)

def classify(y, x):
    """Binarize scores `y` at cutoff `x`.

    Elements with `y >= x` map to 1.0; every other element maps to 0.0,
    including elements for which neither `y >= x` nor `y < x` holds.

    Returns a float array.
    """
    return np.where(y >= x, 1.0, 0.0)

def upsampler(x, y):
    """Balance the two EarlyLapser classes by oversampling the smaller one.

    Parameters
    ----------
    x : array-like or pandas.DataFrame
        Feature rows. A non-DataFrame input is paired with `y` by position.
    y : pandas.Series
        Labels named 'EarlyLapser' (values 0 / 1), one per row of `x`.

    Returns
    -------
    tuple
        (x_upsampled, y_upsampled): the median-imputed feature array produced
        by SimpleImputer.transform, and the matching 'EarlyLapser' Series.
    """
    # Positional inputs get y's index so the column-wise concat pairs each
    # feature row with its own label; pd.concat aligns on index labels, so a
    # default 0..n-1 index would only line up with an identically indexed y.
    if isinstance(x, pd.DataFrame):
        features = x
    else:
        features = pd.DataFrame(x, index=y.index)
    training = pd.concat([features, y], axis = 1)

    early = training[training['EarlyLapser'] == 1]
    retained = training[training['EarlyLapser'] == 0]

    # Compare row counts directly; indexing a column labelled 0 only works when
    # the features came from an unlabelled array.
    # NOTE(review): `stratify` receives the very frame being resampled rather
    # than a class-label vector - kept as in the original; confirm intent.
    if len(early) < len(retained):
        early = resample(early,
                         replace=True,               # sample with replacement
                         n_samples=len(retained),    # match number in the larger class
                         random_state=27,            # reproducible results
                         stratify=early)
    else:
        retained = resample(retained,
                            replace=True,            # sample with replacement
                            n_samples=len(early),    # match number in the other class
                            random_state=27,         # reproducible results
                            stratify=retained)

    # EarlyLapser rows first, then retained rows.
    upsampled = pd.concat([early, retained])

    y_upsampled = upsampled['EarlyLapser']
    x_upsampled = upsampled.drop('EarlyLapser', axis=1)
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
    imp_median.fit(x_upsampled)
    x_upsampled = imp_median.transform(x_upsampled)
    return(x_upsampled, y_upsampled)

def downsampler(x, y):
    """Balance the two EarlyLapser classes by subsampling the larger one.

    Parameters
    ----------
    x : array-like or pandas.DataFrame
        Feature rows. A non-DataFrame input is paired with `y` by position.
    y : pandas.Series
        Labels named 'EarlyLapser' (values 0 / 1), one per row of `x`.

    Returns
    -------
    tuple
        (x_downsampled, y_downsampled): the median-imputed feature array
        produced by SimpleImputer.transform, and the matching 'EarlyLapser' Series.
    """
    # Positional inputs get y's index so the column-wise concat pairs each
    # feature row with its own label; pd.concat aligns on index labels, so a
    # default 0..n-1 index would only line up with an identically indexed y.
    if isinstance(x, pd.DataFrame):
        features = x
    else:
        features = pd.DataFrame(x, index=y.index)
    training = pd.concat([features, y], axis = 1)

    early = training[training['EarlyLapser'] == 1]
    retained = training[training['EarlyLapser'] == 0]

    # Compare row counts directly; indexing a column labelled 0 only works when
    # the features came from an unlabelled array.
    # NOTE(review): `stratify` receives the very frame being resampled rather
    # than a class-label vector - kept as in the original; confirm intent.
    if len(early) < len(retained):
        retained = resample(retained,
                            replace=False,           # sample without replacement
                            n_samples=len(early),    # match number in the smaller class
                            random_state=27,         # reproducible results
                            stratify=retained)
    else:
        early = resample(early,
                         replace=False,              # sample without replacement
                         n_samples=len(retained),    # match number in the other class
                         random_state=27,            # reproducible results
                         stratify=early)

    # EarlyLapser rows first, then retained rows.
    downsampled = pd.concat([early, retained])

    y_downsampled = downsampled['EarlyLapser']
    x_downsampled = downsampled.drop('EarlyLapser', axis=1)
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
    imp_median.fit(x_downsampled)
    x_downsampled = imp_median.transform(x_downsampled)
    return(x_downsampled, y_downsampled)


In [None]:
%%time
##Create train test split

# Columns that hold the target or the quantities it is derived from; everything
# else in each cohort frame is used as a feature.
non_feature_cols = ['EarlyLapser', 'elapsedMonths', 'Lapsed', 'Voluntary.Disenrollment']

#3 month
y_3month = df_3month['EarlyLapser']
x_3month = df_3month.drop(columns=non_feature_cols)

# Median imputation is fitted on the whole cohort, before the split below.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
x_3month = imp_median.fit(x_3month).transform(x_3month)

(x_train_3month, x_test_3month,
 y_train_3month, y_test_3month) = train_test_split(x_3month, y_3month, random_state=0, test_size=0.2)

#6 month
y_6month = df_6month['EarlyLapser']
x_6month = df_6month.drop(columns=non_feature_cols)

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
x_6month = imp_median.fit(x_6month).transform(x_6month)

(x_train_6month, x_test_6month,
 y_train_6month, y_test_6month) = train_test_split(x_6month, y_6month, random_state=0, test_size=0.2)

#1 year
y_1year = df_1year['EarlyLapser']
x_1year = df_1year.drop(columns=non_feature_cols)

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
x_1year = imp_median.fit(x_1year).transform(x_1year)

(x_train_1year, x_test_1year,
 y_train_1year, y_test_1year) = train_test_split(x_1year, y_1year, random_state=0, test_size=0.2)


In [None]:
%%time
## Create upsamples and downsamples

# Resample ONLY the training partition of each horizon. Balancing the full data
# set first puts copies of the x_test_<horizon> rows into the resampled training
# sets, while the models trained on them are scored on x_test_<horizon>.
# reset_index(drop=True) gives the labels a 0..n-1 index so they line up with
# the positional rows of the feature array inside the samplers.
# The secondary 80/20 splits are kept so every previously defined name still
# exists; after upsampling with replacement their "test" halves can share
# duplicated rows with their "train" halves, so do not report metrics on them.

#3 month
x_3month_upsampled, y_3month_upsampled = upsampler(x_train_3month, y_train_3month.reset_index(drop=True))
x_3month_downsampled, y_3month_downsampled = downsampler(x_train_3month, y_train_3month.reset_index(drop=True))
x_train_3month_upsampled, x_test_3month_upsampled, y_train_3month_upsampled, y_test_3month_upsampled = train_test_split(x_3month_upsampled, y_3month_upsampled, test_size=0.2, random_state=0)
x_train_3month_downsampled, x_test_3month_downsampled, y_train_3month_downsampled, y_test_3month_downsampled = train_test_split(x_3month_downsampled, y_3month_downsampled, test_size=0.2, random_state=0)

#6 month
x_6month_upsampled, y_6month_upsampled = upsampler(x_train_6month, y_train_6month.reset_index(drop=True))
x_6month_downsampled, y_6month_downsampled = downsampler(x_train_6month, y_train_6month.reset_index(drop=True))
x_train_6month_upsampled, x_test_6month_upsampled, y_train_6month_upsampled, y_test_6month_upsampled = train_test_split(x_6month_upsampled, y_6month_upsampled, test_size=0.2, random_state=0)
x_train_6month_downsampled, x_test_6month_downsampled, y_train_6month_downsampled, y_test_6month_downsampled = train_test_split(x_6month_downsampled, y_6month_downsampled, test_size=0.2, random_state=0)

#1 year
x_1year_upsampled, y_1year_upsampled = upsampler(x_train_1year, y_train_1year.reset_index(drop=True))
x_1year_downsampled, y_1year_downsampled = downsampler(x_train_1year, y_train_1year.reset_index(drop=True))
x_train_1year_upsampled, x_test_1year_upsampled, y_train_1year_upsampled, y_test_1year_upsampled = train_test_split(x_1year_upsampled, y_1year_upsampled, test_size=0.2, random_state=0)
x_train_1year_downsampled, x_test_1year_downsampled, y_train_1year_downsampled, y_test_1year_downsampled = train_test_split(x_1year_downsampled, y_1year_downsampled, test_size=0.2, random_state=0)


In [None]:
##Create validation set

# Hold out 5% of each horizon's full data as a small validation split.
# NOTE(review): these splits are drawn from the full data independently of the
# 80/20 split, so they are not guaranteed to be disjoint from x_train_<horizon> - confirm.
(x_train_3month_val, x_test_3month_val,
 y_train_3month_val, y_test_3month_val) = train_test_split(x_3month, y_3month, random_state=0, test_size=0.05)

(x_train_6month_val, x_test_6month_val,
 y_train_6month_val, y_test_6month_val) = train_test_split(x_6month, y_6month, random_state=0, test_size=0.05)

(x_train_1year_val, x_test_1year_val,
 y_train_1year_val, y_test_1year_val) = train_test_split(x_1year, y_1year, random_state=0, test_size=0.05)

# Logistic Regression

## 3 month model

In [None]:
%%time
#Logistic Regression Model w/ stochastic gradient descent

# Settings shared by the three SGD fits in this cell (log loss, elastic-net
# penalty, balanced class weights, adaptive learning rate).
sgd_settings = dict(loss='log', warm_start=True, learning_rate='adaptive', eta0=0.1,
                    max_iter=10000, penalty='elasticnet', class_weight='balanced')

# --- Fit on the original training split ---
clf_3month = make_pipeline(StandardScaler(), SGDClassifier(**sgd_settings))
clf_3month.fit(x_train_3month, y_train_3month)
y_pred_3month_log = clf_3month.predict(np.array(x_test_3month))

# NOTE(review): cross_val_score only receives the 5% validation split;
# presumably it refits the pipeline on folds of that split rather than scoring
# the fit above - confirm this is the intended accuracy estimate.
scores = cross_val_score(clf_3month, x_test_3month_val, y_test_3month_val, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (round(scores.mean(),2), round(scores.std(), 2)))

categorical_evaluation(y_test_3month, y_pred_3month_log)

# --- Fit on the upsampled training split, evaluated on the original test split ---
print("UPSAMPLED: ")

clf_3month_upsampled = make_pipeline(StandardScaler(), SGDClassifier(**sgd_settings))
clf_3month_upsampled.fit(x_train_3month_upsampled, y_train_3month_upsampled)
y_pred_3month_log_upsampled = clf_3month_upsampled.predict(np.array(x_test_3month))

scores = cross_val_score(clf_3month_upsampled, x_test_3month_val, y_test_3month_val, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (round(scores.mean(),2), round(scores.std(), 2)))

categorical_evaluation(y_test_3month, y_pred_3month_log_upsampled)

# --- Fit on the downsampled training split, evaluated on the original test split ---
print("DOWNSAMPLED: ")

clf_3month_downsampled = make_pipeline(StandardScaler(), SGDClassifier(**sgd_settings))
clf_3month_downsampled.fit(x_train_3month_downsampled, y_train_3month_downsampled)
y_pred_3month_log_downsampled = clf_3month_downsampled.predict(np.array(x_test_3month))

scores = cross_val_score(clf_3month_downsampled, x_test_3month_val, y_test_3month_val, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (round(scores.mean(),2), round(scores.std(), 2)))

categorical_evaluation(y_test_3month, y_pred_3month_log_downsampled)


## 6 month model

In [None]:
%%time
##Logistic Regression Model w/ stochastic gradient descent

# Settings shared by the three SGD fits in this cell (log loss, elastic-net
# penalty, balanced class weights, adaptive learning rate).
sgd_settings = dict(loss='log', warm_start=True, learning_rate='adaptive', eta0=0.1,
                    max_iter=10000, penalty='elasticnet', class_weight='balanced')

# --- Fit on the original training split ---
clf_6month = make_pipeline(StandardScaler(), SGDClassifier(**sgd_settings))
clf_6month.fit(x_train_6month, y_train_6month)
y_pred_6month_log = clf_6month.predict(np.array(x_test_6month))

# NOTE(review): cross_val_score only receives the 5% validation split;
# presumably it refits the pipeline on folds of that split rather than scoring
# the fit above - confirm this is the intended accuracy estimate.
scores = cross_val_score(clf_6month, x_test_6month_val, y_test_6month_val, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (round(scores.mean(),2), round(scores.std(), 2)))

categorical_evaluation(y_test_6month, y_pred_6month_log)

# --- Fit on the upsampled training split, evaluated on the original test split ---
print("UPSAMPLED: ")

clf_6month_upsampled = make_pipeline(StandardScaler(), SGDClassifier(**sgd_settings))
clf_6month_upsampled.fit(x_train_6month_upsampled, y_train_6month_upsampled)
y_pred_6month_log_upsampled = clf_6month_upsampled.predict(np.array(x_test_6month))

scores = cross_val_score(clf_6month_upsampled, x_test_6month_val, y_test_6month_val, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (round(scores.mean(),2), round(scores.std(), 2)))

categorical_evaluation(y_test_6month, y_pred_6month_log_upsampled)

# --- Fit on the downsampled training split, evaluated on the original test split ---
print("DOWNSAMPLED: ")

clf_6month_downsampled = make_pipeline(StandardScaler(), SGDClassifier(**sgd_settings))
clf_6month_downsampled.fit(x_train_6month_downsampled, y_train_6month_downsampled)
y_pred_6month_log_downsampled = clf_6month_downsampled.predict(np.array(x_test_6month))

scores = cross_val_score(clf_6month_downsampled, x_test_6month_val, y_test_6month_val, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (round(scores.mean(),2), round(scores.std(), 2)))

categorical_evaluation(y_test_6month, y_pred_6month_log_downsampled)

## 1 year model

In [None]:
%%time
##Logistic Regression Model w/ stochastic gradient descent

# Settings shared by the three SGD fits in this cell (log loss, elastic-net
# penalty, balanced class weights, adaptive learning rate).
sgd_settings = dict(loss='log', warm_start=True, learning_rate='adaptive', eta0=0.1,
                    max_iter=10000, penalty='elasticnet', class_weight='balanced')

# --- Fit on the original training split ---
clf_1year = make_pipeline(StandardScaler(), SGDClassifier(**sgd_settings))
clf_1year.fit(x_train_1year, y_train_1year)
y_pred_1year_log = clf_1year.predict(np.array(x_test_1year))

# NOTE(review): cross_val_score only receives the 5% validation split;
# presumably it refits the pipeline on folds of that split rather than scoring
# the fit above - confirm this is the intended accuracy estimate.
scores = cross_val_score(clf_1year, x_test_1year_val, y_test_1year_val, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (round(scores.mean(),2), round(scores.std(), 2)))

categorical_evaluation(y_test_1year, y_pred_1year_log)

# --- Fit on the upsampled training split, evaluated on the original test split ---
print("UPSAMPLED: ")

clf_1year_upsampled = make_pipeline(StandardScaler(), SGDClassifier(**sgd_settings))
clf_1year_upsampled.fit(x_train_1year_upsampled, y_train_1year_upsampled)
y_pred_1year_log_upsampled = clf_1year_upsampled.predict(np.array(x_test_1year))

scores = cross_val_score(clf_1year_upsampled, x_test_1year_val, y_test_1year_val, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (round(scores.mean(),2), round(scores.std(), 2)))

categorical_evaluation(y_test_1year, y_pred_1year_log_upsampled)

# --- Fit on the downsampled training split, evaluated on the original test split ---
print("DOWNSAMPLED: ")

clf_1year_downsampled = make_pipeline(StandardScaler(), SGDClassifier(**sgd_settings))
clf_1year_downsampled.fit(x_train_1year_downsampled, y_train_1year_downsampled)
y_pred_1year_log_downsampled = clf_1year_downsampled.predict(np.array(x_test_1year))

scores = cross_val_score(clf_1year_downsampled, x_test_1year_val, y_test_1year_val, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (round(scores.mean(),2), round(scores.std(), 2)))

categorical_evaluation(y_test_1year, y_pred_1year_log_downsampled)

# Linear Regression w/ L1 penalty

## 3 month model

In [None]:
%%time
##Linear Regression Model w/ Lasso regularizer

# One scaling step followed by LassoCV, matching the other Lasso cells; the
# previous nested make_pipeline applied StandardScaler twice.
modellasso_3month = make_pipeline(StandardScaler(), LassoCV(max_iter = 10000, selection = "random")).fit(x_train_3month, y_train_3month)

# Continuous predictions for the 3-month test split; continuous_evaluation
# picks the classification cutoff.
y_pred_3month_lasso = modellasso_3month.predict(np.array(x_test_3month))

continuous_evaluation(y_test_3month, y_pred_3month_lasso)

In [None]:
%%time
##Upsampled Linear Regression Model w/ Lasso regularizer

# Standardize, then fit LassoCV on the upsampled 3-month training data.
modellasso_3month_upsampled = make_pipeline(
    StandardScaler(),
    LassoCV(selection="random", max_iter=10000),
)
modellasso_3month_upsampled.fit(x_train_3month_upsampled, y_train_3month_upsampled)

# Predict on the original 3-month test split.
y_pred_3month_lasso_upsampled = modellasso_3month_upsampled.predict(np.array(x_test_3month))

continuous_evaluation(y_pred=y_pred_3month_lasso_upsampled, y_test=y_test_3month)

In [None]:
%%time
##Downsampled Linear Regression Model w/ Lasso regularizer

# Standardize, then fit LassoCV on the downsampled 3-month training data.
modellasso_3month_downsampled = make_pipeline(
    StandardScaler(),
    LassoCV(selection="random", max_iter=10000),
)
modellasso_3month_downsampled.fit(x_train_3month_downsampled, y_train_3month_downsampled)

# Predict on the original 3-month test split.
y_pred_3month_lasso_downsampled = modellasso_3month_downsampled.predict(np.array(x_test_3month))

continuous_evaluation(y_pred=y_pred_3month_lasso_downsampled, y_test=y_test_3month)

## 6 month model

In [None]:
%%time
##Linear Regression Model w/ Lasso regularizer

# Standardize, then fit LassoCV on the 6-month training split.
modellasso_6month = make_pipeline(
    StandardScaler(),
    LassoCV(selection="random", max_iter=10000),
)
modellasso_6month.fit(x_train_6month, y_train_6month)

# Predict on the 6-month test split.
y_pred_6month_lasso = modellasso_6month.predict(np.array(x_test_6month))

continuous_evaluation(y_pred=y_pred_6month_lasso, y_test=y_test_6month)



In [None]:
%%time
##Upsampled Linear Regression Model w/ Lasso regularizer

# Standardize, then fit LassoCV on the upsampled 6-month training data.
modellasso_6month_upsampled = make_pipeline(
    StandardScaler(),
    LassoCV(selection="random", max_iter=10000),
)
modellasso_6month_upsampled.fit(x_train_6month_upsampled, y_train_6month_upsampled)

# Predict on the original 6-month test split.
y_pred_6month_lasso_upsampled = modellasso_6month_upsampled.predict(np.array(x_test_6month))

continuous_evaluation(y_pred=y_pred_6month_lasso_upsampled, y_test=y_test_6month)

In [None]:
%%time
##Downsampled Linear Regression Model w/ Lasso regularizer

# Standardize, then fit LassoCV on the downsampled 6-month training data.
modellasso_6month_downsampled = make_pipeline(
    StandardScaler(),
    LassoCV(selection="random", max_iter=10000),
)
modellasso_6month_downsampled.fit(x_train_6month_downsampled, y_train_6month_downsampled)

# Predict on the original 6-month test split.
y_pred_6month_lasso_downsampled = modellasso_6month_downsampled.predict(np.array(x_test_6month))

continuous_evaluation(y_pred=y_pred_6month_lasso_downsampled, y_test=y_test_6month)

## 1 year model

In [None]:
%%time
##Linear Regression Model w/ Lasso regularizer

# Standardize, then fit LassoCV on the 1-year training split.
# NOTE(review): random_state is set here while selection is left at its
# default, unlike the 3- and 6-month Lasso cells which pass
# selection="random" without a seed - confirm which configuration is intended.
modellasso_1year = make_pipeline(
    StandardScaler(),
    LassoCV(random_state=6, max_iter=10000),
)
modellasso_1year.fit(x_train_1year, y_train_1year)

# Predict on the 1-year test split.
y_pred_1year_lasso = modellasso_1year.predict(np.array(x_test_1year))

continuous_evaluation(y_pred=y_pred_1year_lasso, y_test=y_test_1year)

In [None]:
%%time
##Upsampled Linear Regression Model w/ Lasso regularizer

# Standardize, then fit LassoCV on the upsampled 1-year training data.
# NOTE(review): this cell uses max_iter=100000 and random_state=6 with the
# default selection, whereas the other upsampled Lasso cells use
# max_iter=10000 with selection="random" - confirm the difference is deliberate.
modellasso_1year_upsampled = make_pipeline(
    StandardScaler(),
    LassoCV(random_state=6, max_iter=100000),
)
modellasso_1year_upsampled.fit(x_train_1year_upsampled, y_train_1year_upsampled)

# Predict on the original 1-year test split.
y_pred_1year_lasso_upsampled = modellasso_1year_upsampled.predict(np.array(x_test_1year))

continuous_evaluation(y_pred=y_pred_1year_lasso_upsampled, y_test=y_test_1year)

In [None]:
%%time
##Downsampled Linear Regression Model w/ Lasso regularizer

# Standardize, then fit LassoCV on the downsampled 1-year training data.
modellasso_1year_downsampled = make_pipeline(
    StandardScaler(),
    LassoCV(selection="random", max_iter=10000),
)
modellasso_1year_downsampled.fit(x_train_1year_downsampled, y_train_1year_downsampled)

# Predict on the original 1-year test split.
y_pred_1year_lasso_downsampled = modellasso_1year_downsampled.predict(np.array(x_test_1year))

continuous_evaluation(y_pred=y_pred_1year_lasso_downsampled, y_test=y_test_1year)