In [79]:
# Importing the necessary packages
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

# Reading in each csv file for the years of customer data
data = pd.read_csv('data (1) copy.csv', encoding = 'ISO-8859-1')

In [80]:
# Spliting the Created column into two rows with the date and time as seperate columns
data['date_created'], data['time_created'] = data['Created'].str.split(' ', 1).str

# Printing out the first 5 rows of each column in the dataset
print(data.head())

   #Prod                    Agent Label +ID        Created           ID  \
0    1.0      Church Bulletin 0011 (109398)  7/30/14 14:36  657917124.0   
1    1.0  Benefits and More 4 Paid (143595)  10/5/15 19:15  658935850.0   
2    1.0  Benefits and More 4 Paid (143595)   5/2/16 20:16  659370939.0   
3    1.0  Benefits and More 4 Paid (143595)  4/22/16 19:55  659356995.0   
4    1.0       Benefits and More 4 (116063)  7/28/16 16:01  659940549.0   

    Income  Last Pay Amount  Last Pay Complete Last Pay Date  Last Pay Status  \
0  36250.0            34.95                1.0       8/25/14              1.0   
1  36250.0            29.95                1.0       10/5/15              1.0   
2  36250.0            29.95                1.0        5/2/16              1.0   
3  36250.0            39.95                1.0       5/12/16              1.0   
4  36250.0              NaN                NaN           NaN              NaN   

  Last Pay Type  Pay Method State              Status  ZipCode

In [81]:
# Converting the two new split columns into datetime objects
data['date_created'] = pd.to_datetime(data['date_created'])
data['Last Pay Date'] = pd.to_datetime(data['Last Pay Date'])

# Taking the difference between the last pay date and the date the account was created 
# Using the dt.days function to calculate amount of days in between those dates
data['days'] = (data['Last Pay Date'] - data['date_created']).dt.days

# Dividing the number of days by 30 to get the number of months with the company
data['months'] = data['days']/30

In [82]:
# Extracting the year from the date_created column to create a 'year' column
data['year'] = pd.DatetimeIndex(data['date_created']).year

In [83]:
# Rounding up the 'months' and 'days' columns to the hundredth decimal place
data['months'] = np.round(data['months'], 2)
data['days'] = np.round(data['days'], 2)

In [84]:
# Showing the data types for each column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119833 entries, 0 to 119832
Data columns (total 20 columns):
#Prod                119794 non-null float64
Agent Label +ID      119832 non-null object
Created              119832 non-null object
ID                   119832 non-null float64
Income               118524 non-null float64
Last Pay Amount      113509 non-null float64
Last Pay Complete    113509 non-null float64
Last Pay Date        113509 non-null datetime64[ns]
Last Pay Status      113509 non-null float64
Last Pay Type        113288 non-null object
Pay Method           118063 non-null object
State                119832 non-null object
Status               119832 non-null object
ZipCodeNew           119832 non-null float64
Stage                100142 non-null object
date_created         119832 non-null datetime64[ns]
time_created         119832 non-null object
days                 113509 non-null float64
months               113509 non-null float64
year                 119832 

In [85]:
# Deleting the row 'Created' as it's redundant
data.drop(['Created'], axis=1)

# Changing the order of the columns in the dataset
data = data[['ID', 'year', 'Agent Label +ID', 'date_created', 'time_created', 'Last Pay Date', 'Pay Method', 
             'State', 'ZipCodeNew','Last Pay Amount', 'Last Pay Complete', 'Last Pay Status', 'Last Pay Type', 
             '#Prod', 'Stage', 'Status', 'Income', 'days', 'months']]

# Renaming the columns in the data.csv dataset
data.columns = ['id', 'year', 'agent', 'date_created', 'time_created', 'lp_date', 'pay_method', 'state', 'zip_code',
               'lp_amount', 'lp_complete', 'lp_status', 'lp_type', 'prod', 'stage', 'status', 'income', 'days', 'months']

## **Imputations**

In [86]:
# Same code we used in Data Mining to calculate number of missing values per column

# Creating an empty list for column names
names = []

# Creating an empty list for the number of null values in each column
values = []

# Checking for Missing Values
for col in data.columns:
    names.append(col)
    values.append(data[col].isnull().sum())
    print(names[-1],values[-1])

id 1
year 1
agent 1
date_created 1
time_created 1
lp_date 6324
pay_method 1770
state 1
zip_code 1
lp_amount 6324
lp_complete 6324
lp_status 6324
lp_type 6545
prod 39
stage 19691
status 1
income 1309
days 6324
months 6324


In [87]:
# Imputing the income column using the median of the income column
# Filling in any null values or 0 income values from the US Household Income dataset
data['income'] = data['income'].fillna(data['income'].median())
data['income'] = data['income'].replace(0, data['income'].median())

In [88]:
# Deleting rows with no pay method given since most of these rows have many missing values other than pay method
data = data[pd.notnull(data['pay_method'])] 

# Deleting rows that do not include a Last Pay Date since the customer never paid for a product
data = data[pd.notnull(data['lp_date'])]

# Deleting one row where the zip code was never given
data = data[pd.notnull(data['zip_code'])]

# Deleting rows with null lp_type values since no payment was received so no type was recorded
data = data[pd.notnull(data['lp_type'])]

# Deleting rows of customers who do not live in the United States
data = data[~data['state'].isin(['PR'])]
data = data[~data['state'].isin(['NT'])]
data = data[~data['state'].isin(['PE'])]
data = data[~data['state'].isin(['BC'])]

data.head()

Unnamed: 0,id,year,agent,date_created,time_created,lp_date,pay_method,state,zip_code,lp_amount,lp_complete,lp_status,lp_type,prod,stage,status,income,days,months
0,657917124.0,2014.0,Church Bulletin 0011 (109398),2014-07-30,14:36,2014-08-25,Discover,MA,1001.0,34.95,1.0,1.0,Refund,1.0,Back in Inventory,Returned <30 days,36250.0,26.0,0.87
1,658935850.0,2015.0,Benefits and More 4 Paid (143595),2015-10-05,19:15,2015-10-05,Discover,MA,1001.0,29.95,1.0,1.0,Sale,1.0,Back in Inventory,Returned < 90 days,36250.0,0.0,0.0
2,659370939.0,2016.0,Benefits and More 4 Paid (143595),2016-05-02,20:16,2016-05-02,Visa,MA,1001.0,29.95,1.0,1.0,Sale,1.0,,Switched to LW,36250.0,0.0,0.0
3,659356995.0,2016.0,Benefits and More 4 Paid (143595),2016-04-22,19:55,2016-05-12,MasterCard,MA,1001.0,39.95,1.0,1.0,Refund,1.0,Back in Inventory,Returned <30 days,36250.0,20.0,0.67
5,657847231.0,2014.0,Newspaper (109455),2014-06-16,15:40,2014-07-03,Visa,MA,1002.0,34.95,1.0,0.0,Sale,1.0,,Decline Cancel,74083.0,17.0,0.57


In [89]:
# Binning categorical columns to larger groups
# Printing unique values for columns
print(data['pay_method'].unique())

#Binning 'pay_method' into 4 groups: Credit card, ACH, Paper Check, & Other
data['pay_method'] = data['pay_method'].replace({'Visa': 'credit card', 'MasterCard': 'credit card', 
                                                 'Discover': 'credit card', 'American Express': 'credit card',
                                                'Mastercard': 'credit card', 'Maestro': 'credit card',
                                                'Paper_Check': 'paper check'})  

# Counting the number of instances for each value in the 'pay_method' column
print(data['pay_method'].value_counts())

['Discover' 'Visa' 'MasterCard' 'ACH' 'American Express' 'Paper_Check'
 'Other' 'Mastercard' 'Maestro']
credit card    97100
ACH            14412
paper check     1439
Other            306
Name: pay_method, dtype: int64


In [90]:
# Same code we used in Data Mining to calculate number of missing values per column

# Creating an empty list for column names
names = []

# Creating an empty list for the number of null values in each column
values = []

# Checking for Missing Values
for col in data.columns:
    names.append(col)
    values.append(data[col].isnull().sum())
    print(names[-1],values[-1])

id 0
year 0
agent 0
date_created 0
time_created 0
lp_date 0
pay_method 0
state 0
zip_code 0
lp_amount 0
lp_complete 0
lp_status 0
lp_type 0
prod 0
stage 16919
status 0
income 0
days 0
months 0


In [91]:
# Creating a binary code for whether a customer staying over 18 months or not
# 1 = greater than 18 months, 0 = less than 18 months
data['18_months'] = np.where(data['months'] >= 18., 1, 0)

In [92]:
# Deleting customers for the years 2018 and 2019
# Would not be an accurate representation of these customers since they mathematically have not reached 18 months yet
data = data[~((data['status'] == 'Active') & (data['year'] == 2018))]
data = data[~((data['status'] == 'Active') & (data['year'] == 2019))]

In [93]:
# Printing out the unique values for the 'status' column to see what values need to be combined
print(data['status'].unique())

['Returned <30 days' 'Returned < 90 days' 'Switched to LW'
 'Decline Cancel' 'Order Canceled' 'Returned' 'Decline' 'Duplicate'
 'Active' 'Returned to Sender' 'Deactivated' 'Restricted' 'Daily Decline'
 'Returned <60 Days' 'Suspend' 'In Process' 'Collections'
 'Chargeback Received' 'Test' 'Post Date']


In [94]:
# Replacing the status names to more general names
data['status'] = data['status'].replace({'Returned <30 days': 'Returned_90', 'Decline Cancel': 'Decline',
                                        'Order Canceled': 'Order Cancelled', 'Daily Decline': 'Decline', 
                                        'Returned <60 Days': 'Returned_90', 'Chargeback Received': 'Other',
                                        'In Process': 'Other', 'Post Date': 'Other', 'Suspend': 'Other',
                                        'Test': 'Other', 'Returned <90 days': 'Returned_90'}) 

In [95]:
# Using the method of one-hot encoding for the different forms of pay methods
data = pd.concat([data, pd.get_dummies(data['pay_method'], prefix = 'pay_method')], axis = 1)
data.columns

Index(['id', 'year', 'agent', 'date_created', 'time_created', 'lp_date',
       'pay_method', 'state', 'zip_code', 'lp_amount', 'lp_complete',
       'lp_status', 'lp_type', 'prod', 'stage', 'status', 'income', 'days',
       'months', '18_months', 'pay_method_ACH', 'pay_method_Other',
       'pay_method_credit card', 'pay_method_paper check'],
      dtype='object')

In [96]:
# Displaying all of the states in the 'states' column
data.state.unique()

array(['MA', 'TX', 'NY', 'CO', 'ME', 'PA', 'MT', 'NJ', 'CA', 'NC', 'RI',
       'SC', 'NH', 'GA', 'FL', 'KY', 'OH', 'MI', 'VT', 'AL', 'WI', 'CT',
       'IL', 'KS', 'VA', 'LA', 'NM', 'UT', 'NE', 'MD', 'DE', 'DC', 'WA',
       'MN', 'NV', 'WV', 'AZ', 'TN', 'MS', 'HI', 'IN', 'MO', 'AK', 'AR',
       'IA', 'ID', 'WY', 'SD', 'ND', 'OK', 'OR'], dtype=object)

In [97]:
# Grouping states into regions to avoid dimensionality problems when running the data in the models
data['state'] = data.state.replace({'MA': 'north_east', 'TX': 'south', 'NY': 'north_east', 'CO': 'rocky', 
                                    'ME': 'north_east', 'PA': 'north_east', 'MT': 'rocky', 'NJ': 'north_east', 
                                    'CA': 'west', 'NC': 'south_east', 'RI': 'north_east','SC': 'south_east', 
                                    'NH': 'north_east', 'GA': 'south_east', 'FL': 'south_east', 'KY': 'central', 
                                    'OH': 'central', 'MI': 'central', 'VT': 'north_east', 'AL': 'south', 'WI': 'central', 
                                    'CT': 'north_east','IL': 'central', 'KS': 'central', 'VA': 'north_east', 
                                    'LA': 'south', 'NM': 'south', 'UT': 'rocky', 'NE': 'rocky', 
                                    'DE': 'north_east', 'DC': 'north_east', 'WA': 'west','MD': 'north_east', 
                                    'MN': 'central', 'NV': 'west', 'WV': 'central', 'AZ': 'west', 'TN': 'central',
                                    'MS': 'south', 'HI': 'west', 'IN': 'central', 'MO': 'south', 'AK': 'west', 
                                    'AR': 'south', 'IA': 'central', 'ID': 'rocky', 'WY': 'rocky', 'SD': 'rocky', 
                                    'ND': 'rocky', 'OK': 'south', 'OR': 'west'})

In [98]:
# Prinitng out the new state bins to make sure no states were omitted in the replace dictionary
data.state.value_counts()

north_east    24464
south         24135
central       22298
south_east    21521
west          15060
rocky          3353
Name: state, dtype: int64

In [99]:
# Using the method of one-hot encoding for the different regions
data = pd.concat([data, pd.get_dummies(data['state'], prefix = 'region')], axis = 1)
data.columns

Index(['id', 'year', 'agent', 'date_created', 'time_created', 'lp_date',
       'pay_method', 'state', 'zip_code', 'lp_amount', 'lp_complete',
       'lp_status', 'lp_type', 'prod', 'stage', 'status', 'income', 'days',
       'months', '18_months', 'pay_method_ACH', 'pay_method_Other',
       'pay_method_credit card', 'pay_method_paper check', 'region_central',
       'region_north_east', 'region_rocky', 'region_south',
       'region_south_east', 'region_west'],
      dtype='object')

In [100]:
# Using the method of one-hot encoding for the different status types
data = pd.concat([data, pd.get_dummies(data['status'], prefix = 'status')], axis = 1)
data.columns

Index(['id', 'year', 'agent', 'date_created', 'time_created', 'lp_date',
       'pay_method', 'state', 'zip_code', 'lp_amount', 'lp_complete',
       'lp_status', 'lp_type', 'prod', 'stage', 'status', 'income', 'days',
       'months', '18_months', 'pay_method_ACH', 'pay_method_Other',
       'pay_method_credit card', 'pay_method_paper check', 'region_central',
       'region_north_east', 'region_rocky', 'region_south',
       'region_south_east', 'region_west', 'status_Active',
       'status_Collections', 'status_Deactivated', 'status_Decline',
       'status_Duplicate', 'status_Order Cancelled', 'status_Other',
       'status_Restricted', 'status_Returned', 'status_Returned < 90 days',
       'status_Returned to Sender', 'status_Returned_90',
       'status_Switched to LW'],
      dtype='object')

In [101]:
# dropping columns we will not be using in the models
data = data.drop(['id', 'agent', 'date_created', 'time_created', 'lp_date', 'pay_method', 'zip_code',
                  'days', 'lp_complete', 'lp_status', 'lp_type', 'prod', 'stage', 'status_Collections', 
                  'status_Deactivated', 'status_Duplicate', 'status_Order Cancelled', 'status_Other', 'status_Restricted',
                  'status_Returned to Sender','status_Switched to LW'], axis = 1)
data.columns

Index(['year', 'state', 'lp_amount', 'status', 'income', 'months', '18_months',
       'pay_method_ACH', 'pay_method_Other', 'pay_method_credit card',
       'pay_method_paper check', 'region_central', 'region_north_east',
       'region_rocky', 'region_south', 'region_south_east', 'region_west',
       'status_Active', 'status_Decline', 'status_Returned',
       'status_Returned < 90 days', 'status_Returned_90'],
      dtype='object')

In [102]:
# Calling data.head() to view the data and the dimensions of the new dataset
data.head()

Unnamed: 0,year,state,lp_amount,status,income,months,18_months,pay_method_ACH,pay_method_Other,pay_method_credit card,...,region_north_east,region_rocky,region_south,region_south_east,region_west,status_Active,status_Decline,status_Returned,status_Returned < 90 days,status_Returned_90
0,2014.0,north_east,34.95,Returned_90,36250.0,0.87,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,2015.0,north_east,29.95,Returned < 90 days,36250.0,0.0,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
2,2016.0,north_east,29.95,Switched to LW,36250.0,0.0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,2016.0,north_east,39.95,Returned_90,36250.0,0.67,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
5,2014.0,north_east,34.95,Decline,74083.0,0.57,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0


In [103]:
# Saving the imputed data as impute_data.csv to begin normalization
data.to_csv('imputed_data.csv',',')