In [1]:
# Importing the necessary packages
import pandas as pd
import numpy as np
from datetime import datetime
from math import ceil

In [2]:
# Reading in each csv file for the years of customer data
data = pd.read_csv('data.csv', encoding = 'cp1252')

In [3]:
# Spliting the Created column into two rows with the date and time as seperate columns
data['date_created'], data['time_created'] = data['Created'].str.split(' ', 1).str

In [4]:
# Printing out the first 5 rows of each column in the dataset
print(data.head())

   #Prod                    Agent Label +ID      CUSTOM Cancel Reason  \
0    1.0      Church Bulletin 0011 (109398)  021 - No reason provided   
1    1.0  Benefits and More 4 Paid (143595)   020 - Financial Reasons   
2    1.0  Benefits and More 4 Paid (143595)                       NaN   
3    1.0  Benefits and More 4 Paid (143595)   020 - Financial Reasons   
4    1.0       Benefits and More 4 (116063)                       NaN   

         Created           ID   Income  Last Pay Amount  Last Pay Complete  \
0  7/30/14 14:36  657917124.0  64780.0            34.95                1.0   
1  10/5/15 19:15  658935850.0  64780.0            29.95                1.0   
2   5/2/16 20:16  659370939.0  64780.0            29.95                1.0   
3  4/22/16 19:55  659356995.0  64780.0            39.95                1.0   
4  7/28/16 16:01  659940549.0  64780.0              NaN                NaN   

  Last Pay Date  Last Pay Status Last Pay Type  Pay Method              Stage  \
0       8/2

In [5]:
# Converting the two new split columns into datetime objects
data['date_created'] = pd.to_datetime(data['date_created'])
data['Last Pay Date'] = pd.to_datetime(data['Last Pay Date'])

# Taking the difference between the last pay date and the date the account was created 
# Using the dt.days function to calculate amount of days in between those dates
data['days'] = (data['Last Pay Date'] - data['date_created']).dt.days

# Dividing the number of days by 30 to get the number of months with the company
data['months'] = data['days']/30
data.head()

Unnamed: 0,#Prod,Agent Label +ID,CUSTOM Cancel Reason,Created,ID,Income,Last Pay Amount,Last Pay Complete,Last Pay Date,Last Pay Status,Last Pay Type,Pay Method,Stage,State,Status,ZipCodeNew,date_created,time_created,days,months
0,1.0,Church Bulletin 0011 (109398),021 - No reason provided,7/30/14 14:36,657917124.0,64780.0,34.95,1.0,2014-08-25,1.0,Refund,Discover,Back in Inventory,MA,Returned <30 days,1001.0,2014-07-30,14:36,26.0,0.866667
1,1.0,Benefits and More 4 Paid (143595),020 - Financial Reasons,10/5/15 19:15,658935850.0,64780.0,29.95,1.0,2015-10-05,1.0,Sale,Discover,Back in Inventory,MA,Returned < 90 days,1001.0,2015-10-05,19:15,0.0,0.0
2,1.0,Benefits and More 4 Paid (143595),,5/2/16 20:16,659370939.0,64780.0,29.95,1.0,2016-05-02,1.0,Sale,Visa,,MA,Switched to LW,1001.0,2016-05-02,20:16,0.0,0.0
3,1.0,Benefits and More 4 Paid (143595),020 - Financial Reasons,4/22/16 19:55,659356995.0,64780.0,39.95,1.0,2016-05-12,1.0,Refund,MasterCard,Back in Inventory,MA,Returned <30 days,1001.0,2016-04-22,19:55,20.0,0.666667
4,1.0,Benefits and More 4 (116063),,7/28/16 16:01,659940549.0,64780.0,,,NaT,,,ACH,Post Date Cancel,ME,Order Canceled,1001.0,2016-07-28,16:01,,


In [6]:
# Rounding up the 'months' and 'days' columns to the hundredth decimal place
data['months'] = np.round(data['months'], 2)
data['days'] = np.round(data['days'], 2)

## **Changing Column Types and Names**

In [7]:
# Showing the data types for each column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119833 entries, 0 to 119832
Data columns (total 20 columns):
#Prod                   119794 non-null float64
Agent Label +ID         119832 non-null object
CUSTOM Cancel Reason    80428 non-null object
Created                 119832 non-null object
ID                      119832 non-null float64
Income                  100711 non-null float64
Last Pay Amount         113509 non-null float64
Last Pay Complete       113509 non-null float64
Last Pay Date           113509 non-null datetime64[ns]
Last Pay Status         113509 non-null float64
Last Pay Type           113288 non-null object
Pay Method              118063 non-null object
Stage                   100142 non-null object
State                   119832 non-null object
Status                  119832 non-null object
ZipCodeNew              119832 non-null float64
date_created            119832 non-null datetime64[ns]
time_created            119832 non-null object
days                 

In [8]:
# Changing data column types to their appropriate types
data['ID'] = data['ID'].astype(str)
data['ZipCodeNew'] = data['ZipCodeNew'].astype(str)

In [9]:
# Deleting the row 'Created' as it's redundant
data.drop(['Created'], axis=1)

# Changing the order of the columns in the dataset
data = data[['ID', 'Agent Label +ID', 'date_created', 'time_created', 'Last Pay Date', 'Pay Method', 'State', 'ZipCodeNew',
             'Last Pay Amount', 'Last Pay Complete', 'Last Pay Status', 'Last Pay Type', '#Prod', 'CUSTOM Cancel Reason',
             'Stage', 'Status', 'Income', 'days', 'months']]

In [10]:
# Renaming the columns in the data.csv dataset
data.columns = ['id', 'agent', 'date_created', 'time_created', 'lp_date', 'pay_method', 'state', 'zip_code',
               'lp_amount', 'lp_complete', 'lp_status', 'lp_type', 'prod', 'cancel_reason', 'stage',
               'status', 'income', 'days', 'months']

## **Imputations**

In [11]:
# Same method we used in Data Mining to calculate number of missing values per column

# Creating an empty list for column names
names = []

# Creating an empty list for the number of null values in each column
values = []

# Checking for Missing Values
for col in data.columns:
    names.append(col)
    values.append(data[col].isnull().sum())
    print(names[-1],values[-1])

id 0
agent 1
date_created 1
time_created 1
lp_date 6324
pay_method 1770
state 1
zip_code 0
lp_amount 6324
lp_complete 6324
lp_status 6324
lp_type 6545
prod 39
cancel_reason 39405
stage 19691
status 1
income 19122
days 6324
months 6324


In [12]:
# Imputing income using the back-fill option
# Dataset is sorted by zip code so I wanted the imputation to take into account geographic proximity

data['income'] = data['income'].fillna(method='bfill')

In [13]:
# Deleting rows with no pay method given since most of these rows have many missing values other than pay method

data = data[pd.notnull(data['pay_method'])] 

# Deleting rows that do not include a Last Pay Date since the customer never paid for a product
data = data[pd.notnull(data['lp_date'])]

#Deleting one row where the zip code was never given
data = data[pd.notnull(data['zip_code'])]

# Deleting rows that include Quick Cancel because these customer canceled the order before the end of a full month
data = data[~data['stage'].isin(['Quick Cancel'])]

In [14]:
# Binning categorical columns to larger groups
# Printing unique values for columns
print(data['pay_method'].unique())

#Binning 'pay_method' into 4 groups: Credit card, ACH, Paper Check, & Other
data['pay_method'] = data['pay_method'].replace({'Visa': 'credit card', 'MasterCard': 'credit card', 
                                                 'Discover': 'credit card', 'American Express': 'credit card',
                                                'Mastercard': 'credit card', 'Maestro': 'credit card',
                                                'Paper_Check': 'paper check'})  

# Counting the number of instances for each value in the 'pay_method' column
print(data['pay_method'].value_counts())

['Discover' 'Visa' 'MasterCard' 'ACH' 'American Express' 'Paper_Check'
 'Other' 'Mastercard' 'Maestro']
credit card    93863
ACH            13262
paper check     1437
Other            307
Name: pay_method, dtype: int64


In [15]:
# Same method we used in Data Mining to calculate number of missing values per column

# Creating an empty list for column names
names = []

# Creating an empty list for the number of null values in each column
values = []

# Checking for Missing Values
for col in data.columns:
    names.append(col)
    values.append(data[col].isnull().sum())
    print(names[-1],values[-1])

id 0
agent 0
date_created 0
time_created 0
lp_date 0
pay_method 0
state 0
zip_code 0
lp_amount 0
lp_complete 0
lp_status 0
lp_type 200
prod 0
cancel_reason 35044
stage 17110
status 0
income 0
days 0
months 0


In [17]:
# Creating the churn binary variable using numpy's where function
# Setting acive to 1 and left as 0
data['active_customer'] = np.where(data['status'].str.contains('Active'), 1, 0)

In [18]:
# Creating a binary code for whether a customer staying over 18 months or not
# 1 = greater than 18 months
data['18_months'] = np.where(data['months'] >= 18., 1, 0)

In [23]:
print(data['cancel_reason'].value_counts())

021 - No reason provided                                           25890
020 - Financial Reasons                                            12409
026 - Does not like / want                                         11985
013 - Duplicate account                                             2833
011 - Changed Mind                                                  2588
014 - Obtained another product                                      1664
022 - Other                                                         1637
001 - Deactivation Due to Non Payment                               1549
017 - Dissatisfied with product                                     1340
028 - Refuses to Use                                                1313
001 - Passed Away                                                   1282
029 - Call Center Requested                                         1162
023 - Restricted                                                     727
004 - Assisted Living Facility                     

In [34]:
# Creating a dummy variable for customers who disclosed their reason for leaving being a financial reason
data['reason_financial'] = np.where((data['cancel_reason'] == '020 - Financial Reasons') 
                                    | (data['cancel_reason'] == '003 - Financial Reasons'), 1, 0)

# Printing out the count of each binary code to make sure the number of 1's match up with the number in the print
# statement above
print(data['reason_financial'].value_counts())

0    96146
1    12723
Name: reason_financial, dtype: int64


In [36]:
# Adding dummy variables to the 'data' dataframe for the column 'pay_method'
data = pd.concat([data, pd.get_dummies(data['pay_method'])], axis=1)

In [37]:
data.head()

Unnamed: 0,id,agent,date_created,time_created,lp_date,pay_method,state,zip_code,lp_amount,lp_complete,...,income,days,months,churn,18_months,reason_financial,ACH,Other,credit card,paper check
0,657917124.0,Church Bulletin 0011 (109398),2014-07-30,14:36,2014-08-25,credit card,MA,1001.0,34.95,1.0,...,64780.0,26.0,0.87,0,0,0,0,0,1,0
1,658935850.0,Benefits and More 4 Paid (143595),2015-10-05,19:15,2015-10-05,credit card,MA,1001.0,29.95,1.0,...,64780.0,0.0,0.0,0,0,1,0,0,1,0
2,659370939.0,Benefits and More 4 Paid (143595),2016-05-02,20:16,2016-05-02,credit card,MA,1001.0,29.95,1.0,...,64780.0,0.0,0.0,0,0,0,0,0,1,0
3,659356995.0,Benefits and More 4 Paid (143595),2016-04-22,19:55,2016-05-12,credit card,MA,1001.0,39.95,1.0,...,64780.0,20.0,0.67,0,0,1,0,0,1,0
5,657847231.0,Newspaper (109455),2014-06-16,15:40,2014-07-03,credit card,MA,1002.0,34.95,1.0,...,77857.0,17.0,0.57,0,0,1,0,0,1,0
