In [1]:
# Importing the necessary packages
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import category_encoders as ce

In [2]:
# Reading in each csv file for the years of customer data
data = pd.read_csv('data (1) copy.csv', encoding = 'ISO-8859-1')

In [3]:
# Spliting the Created column into two rows with the date and time as seperate columns
data['date_created'], data['time_created'] = data['Created'].str.split(' ', 1).str

In [4]:
# Printing out the first 5 rows of each column in the dataset
print(data.head())

   #Prod                    Agent Label +ID        Created           ID  \
0    1.0      Church Bulletin 0011 (109398)  7/30/14 14:36  657917124.0   
1    1.0  Benefits and More 4 Paid (143595)  10/5/15 19:15  658935850.0   
2    1.0  Benefits and More 4 Paid (143595)   5/2/16 20:16  659370939.0   
3    1.0  Benefits and More 4 Paid (143595)  4/22/16 19:55  659356995.0   
4    1.0       Benefits and More 4 (116063)  7/28/16 16:01  659940549.0   

    Income  Last Pay Amount  Last Pay Complete Last Pay Date  Last Pay Status  \
0  74467.0            34.95                1.0       8/25/14              1.0   
1  74467.0            29.95                1.0       10/5/15              1.0   
2  74467.0            29.95                1.0        5/2/16              1.0   
3  74467.0            39.95                1.0       5/12/16              1.0   
4  74467.0              NaN                NaN           NaN              NaN   

  Last Pay Type  Pay Method State              Status  ZipCode

In [5]:
# Converting the two new split columns into datetime objects
data['date_created'] = pd.to_datetime(data['date_created'])
data['Last Pay Date'] = pd.to_datetime(data['Last Pay Date'])

# Taking the difference between the last pay date and the date the account was created 
# Using the dt.days function to calculate amount of days in between those dates
data['days'] = (data['Last Pay Date'] - data['date_created']).dt.days

# Dividing the number of days by 30 to get the number of months with the company
data['months'] = data['days']/30

In [6]:
# Extracting the year from the date_created column to create a 'year' column
data['year'] = pd.DatetimeIndex(data['date_created']).year

In [7]:
# Rounding up the 'months' and 'days' columns to the hundredth decimal place
data['months'] = np.round(data['months'], 2)
data['days'] = np.round(data['days'], 2)
data.head()

Unnamed: 0,#Prod,Agent Label +ID,Created,ID,Income,Last Pay Amount,Last Pay Complete,Last Pay Date,Last Pay Status,Last Pay Type,Pay Method,State,Status,ZipCodeNew,Stage,date_created,time_created,days,months,year
0,1.0,Church Bulletin 0011 (109398),7/30/14 14:36,657917124.0,74467.0,34.95,1.0,2014-08-25,1.0,Refund,Discover,MA,Returned <30 days,1001.0,Back in Inventory,2014-07-30,14:36,26.0,0.87,2014.0
1,1.0,Benefits and More 4 Paid (143595),10/5/15 19:15,658935850.0,74467.0,29.95,1.0,2015-10-05,1.0,Sale,Discover,MA,Returned < 90 days,1001.0,Back in Inventory,2015-10-05,19:15,0.0,0.0,2015.0
2,1.0,Benefits and More 4 Paid (143595),5/2/16 20:16,659370939.0,74467.0,29.95,1.0,2016-05-02,1.0,Sale,Visa,MA,Switched to LW,1001.0,,2016-05-02,20:16,0.0,0.0,2016.0
3,1.0,Benefits and More 4 Paid (143595),4/22/16 19:55,659356995.0,74467.0,39.95,1.0,2016-05-12,1.0,Refund,MasterCard,MA,Returned <30 days,1001.0,Back in Inventory,2016-04-22,19:55,20.0,0.67,2016.0
4,1.0,Benefits and More 4 (116063),7/28/16 16:01,659940549.0,74467.0,,,NaT,,,ACH,ME,Order Canceled,1001.0,Post Date Cancel,2016-07-28,16:01,,,2016.0


## **Changing Column Types and Names**

In [8]:
# Showing the data types for each column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119833 entries, 0 to 119832
Data columns (total 20 columns):
#Prod                119794 non-null float64
Agent Label +ID      119832 non-null object
Created              119832 non-null object
ID                   119832 non-null float64
Income               112987 non-null float64
Last Pay Amount      113509 non-null float64
Last Pay Complete    113509 non-null float64
Last Pay Date        113509 non-null datetime64[ns]
Last Pay Status      113509 non-null float64
Last Pay Type        113288 non-null object
Pay Method           118063 non-null object
State                119832 non-null object
Status               119832 non-null object
ZipCodeNew           119832 non-null float64
Stage                100142 non-null object
date_created         119832 non-null datetime64[ns]
time_created         119832 non-null object
days                 113509 non-null float64
months               113509 non-null float64
year                 119832 

In [9]:
# Changing data column types to their appropriate types
data['ID'] = data['ID'].astype(str)
data['ZipCodeNew'] = data['ZipCodeNew'].astype(str)

In [10]:
# Deleting the row 'Created' as it's redundant
data.drop(['Created'], axis=1)

# Changing the order of the columns in the dataset
data = data[['ID', 'year', 'Agent Label +ID', 'date_created', 'time_created', 'Last Pay Date', 'Pay Method', 'State', 'ZipCodeNew',
             'Last Pay Amount', 'Last Pay Complete', 'Last Pay Status', 'Last Pay Type', '#Prod', 'Stage',
             'Status', 'Income', 'days', 'months']]

In [11]:
# Renaming the columns in the data.csv dataset
data.columns = ['id', 'year', 'agent', 'date_created', 'time_created', 'lp_date', 'pay_method', 'state', 'zip_code',
               'lp_amount', 'lp_complete', 'lp_status', 'lp_type', 'prod', 'stage', 'status', 'income', 'days', 'months']

## **Imputations**

In [12]:
# Same method we used in Data Mining to calculate number of missing values per column

# Creating an empty list for column names
names = []

# Creating an empty list for the number of null values in each column
values = []

# Checking for Missing Values
for col in data.columns:
    names.append(col)
    values.append(data[col].isnull().sum())
    print(names[-1],values[-1])

id 0
year 1
agent 1
date_created 1
time_created 1
lp_date 6324
pay_method 1770
state 1
zip_code 0
lp_amount 6324
lp_complete 6324
lp_status 6324
lp_type 6545
prod 39
stage 19691
status 1
income 6846
days 6324
months 6324


In [13]:
# Imputing income using the back-fill option
# Dataset is sorted by zip code so I wanted the imputation to take into account geographic proximity

data['income'] = data['income'].fillna(data['income'].median())
data['income'] = data['income'].replace(0, data['income'].median())

In [14]:
# Deleting rows with no pay method given since most of these rows have many missing values other than pay method

data = data[pd.notnull(data['pay_method'])] 

# Deleting rows that do not include a Last Pay Date since the customer never paid for a product
data = data[pd.notnull(data['lp_date'])]

# Deleting one row where the zip code was never given
data = data[pd.notnull(data['zip_code'])]

# Deleting rows with null lp_type values since no payment was received so no type was recorded
data = data[pd.notnull(data['lp_type'])]

# Deleting rows that include Quick Cancel because these customer canceled the order before the end of a full month
data = data[~data['stage'].isin(['Quick Cancel'])]

In [15]:
# Binning categorical columns to larger groups
# Printing unique values for columns
print(data['pay_method'].unique())

#Binning 'pay_method' into 4 groups: Credit card, ACH, Paper Check, & Other
data['pay_method'] = data['pay_method'].replace({'Visa': 'credit card', 'MasterCard': 'credit card', 
                                                 'Discover': 'credit card', 'American Express': 'credit card',
                                                'Mastercard': 'credit card', 'Maestro': 'credit card',
                                                'Paper_Check': 'paper check'})  

# Counting the number of instances for each value in the 'pay_method' column
print(data['pay_method'].value_counts())

['Discover' 'Visa' 'MasterCard' 'ACH' 'American Express' 'Paper_Check'
 'Other' 'Mastercard' 'Maestro']
credit card    93676
ACH            13249
paper check     1437
Other            307
Name: pay_method, dtype: int64


In [16]:
# Same method we used in Data Mining to calculate number of missing values per column

# Creating an empty list for column names
names = []

# Creating an empty list for the number of null values in each column
values = []

# Checking for Missing Values
for col in data.columns:
    names.append(col)
    values.append(data[col].isnull().sum())
    print(names[-1],values[-1])

id 0
year 0
agent 0
date_created 0
time_created 0
lp_date 0
pay_method 0
state 0
zip_code 0
lp_amount 0
lp_complete 0
lp_status 0
lp_type 0
prod 0
stage 16919
status 0
income 0
days 0
months 0


In [17]:
# Creating a binary code for whether a customer staying over 18 months or not
# 1 = greater than 18 months
data['18_months'] = np.where(data['months'] >= 18., 1, 0)

In [18]:
# Deleting customers for the years 2018 and 2019 since they have not reached the 18 month mark yet
data = data[~((data['status'] == 'Active') & (data['year'] == 2018))]
data = data[~((data['status'] == 'Active') & (data['year'] == 2019))]

In [19]:
# Printing out the unique values and value counts to see what needs to be combined
print(data['status'].unique())
print(data['status'].value_counts())

# Changing names of categories into bins
data['status'] = data['status'].replace({'Returned <30 days': 'Returned < 90 days', 'Decline Cancel': 'Decline',
                                        'Order Canceled': 'Order Cancelled', 'Daily Decline': 'Decline', 
                                        'Returned <60 Days': 'Returned < 90 days', 'Chargeback Received': 'Other',
                                        'In Process': 'Other', 'Post Date': 'Other', 'Suspend': 'Other',
                                        'Test': 'Other'}) 

['Returned <30 days' 'Returned < 90 days' 'Switched to LW'
 'Decline Cancel' 'Returned' 'Order Canceled' 'Decline' 'Duplicate'
 'Active' 'Returned to Sender' 'Deactivated' 'Restricted' 'Daily Decline'
 'Returned <60 Days' 'Suspend' 'In Process' 'Collections'
 'Chargeback Received' 'Test' 'Post Date']
Returned               30552
Decline                23717
Returned <30 days      15418
Decline Cancel          8681
Order Canceled          7181
Returned < 90 days      6509
Switched to LW          4269
Active                  3292
Duplicate               3232
Deactivated             1756
Returned to Sender       966
Daily Decline            245
Restricted               228
Returned <60 Days        107
Collections               45
Chargeback Received       17
In Process                10
Post Date                  8
Suspend                    5
Test                       5
Name: status, dtype: int64


In [20]:
# Using the method of one-hot encoding for the different forms of pay methods
data = pd.concat([data, pd.get_dummies(data['pay_method'], prefix = 'pay_method')], axis = 1)
data.head()

Unnamed: 0,id,year,agent,date_created,time_created,lp_date,pay_method,state,zip_code,lp_amount,...,stage,status,income,days,months,18_months,pay_method_ACH,pay_method_Other,pay_method_credit card,pay_method_paper check
0,657917124.0,2014.0,Church Bulletin 0011 (109398),2014-07-30,14:36,2014-08-25,credit card,MA,1001.0,34.95,...,Back in Inventory,Returned < 90 days,74467.0,26.0,0.87,0,0,0,1,0
1,658935850.0,2015.0,Benefits and More 4 Paid (143595),2015-10-05,19:15,2015-10-05,credit card,MA,1001.0,29.95,...,Back in Inventory,Returned < 90 days,74467.0,0.0,0.0,0,0,0,1,0
2,659370939.0,2016.0,Benefits and More 4 Paid (143595),2016-05-02,20:16,2016-05-02,credit card,MA,1001.0,29.95,...,,Switched to LW,74467.0,0.0,0.0,0,0,0,1,0
3,659356995.0,2016.0,Benefits and More 4 Paid (143595),2016-04-22,19:55,2016-05-12,credit card,MA,1001.0,39.95,...,Back in Inventory,Returned < 90 days,74467.0,20.0,0.67,0,0,0,1,0
5,657847231.0,2014.0,Newspaper (109455),2014-06-16,15:40,2014-07-03,credit card,MA,1002.0,34.95,...,,Decline,55937.0,17.0,0.57,0,0,0,1,0


In [21]:
# Using the method of binary encoding we learned in week 6 of Data Mining to encode the states column
# Since the 'state' column has 50 different levels, I used this method to avoid any dimensionality problems.
encoder = ce.BinaryEncoder(cols=['state'])
data1 = encoder.fit_transform(data)
data1.head()

Unnamed: 0,id,year,agent,date_created,time_created,lp_date,pay_method,state_0,state_1,state_2,...,stage,status,income,days,months,18_months,pay_method_ACH,pay_method_Other,pay_method_credit card,pay_method_paper check
0,657917124.0,2014.0,Church Bulletin 0011 (109398),2014-07-30,14:36,2014-08-25,credit card,0,0,0,...,Back in Inventory,Returned < 90 days,74467.0,26.0,0.87,0,0,0,1,0
1,658935850.0,2015.0,Benefits and More 4 Paid (143595),2015-10-05,19:15,2015-10-05,credit card,0,0,0,...,Back in Inventory,Returned < 90 days,74467.0,0.0,0.0,0,0,0,1,0
2,659370939.0,2016.0,Benefits and More 4 Paid (143595),2016-05-02,20:16,2016-05-02,credit card,0,0,0,...,,Switched to LW,74467.0,0.0,0.0,0,0,0,1,0
3,659356995.0,2016.0,Benefits and More 4 Paid (143595),2016-04-22,19:55,2016-05-12,credit card,0,0,0,...,Back in Inventory,Returned < 90 days,74467.0,20.0,0.67,0,0,0,1,0
5,657847231.0,2014.0,Newspaper (109455),2014-06-16,15:40,2014-07-03,credit card,0,0,0,...,,Decline,55937.0,17.0,0.57,0,0,0,1,0


In [22]:
# Using the method of binary encoding we learned in week 6 of Data Mining to encode the 'status' column
encoder = ce.BinaryEncoder(cols=['status'])
data2 = encoder.fit_transform(data)

data2 = data2.drop(['id', 'year', 'agent', 'date_created', 'time_created', 'lp_date',
       'pay_method', 'state', 'zip_code', 'lp_amount', 'lp_complete',
       'lp_status', 'lp_type', 'prod', 'stage','income', 'days', 'months',
       '18_months', 'pay_method_ACH', 'pay_method_Other',
       'pay_method_credit card', 'pay_method_paper check'] ,axis=1)
data2.head()

Unnamed: 0,status_0,status_1,status_2,status_3,status_4
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,1,0
3,0,0,0,0,1
5,0,0,0,1,1


In [23]:
# Creating a new dataframe with the columns I will be using in the classification models
data1 = data1[['year', 'status', 'pay_method_ACH', 'pay_method_credit card', 
              'pay_method_paper check', 'state_0', 'state_1', 'state_2', 'state_3', 'state_4',
              'state_5', 'state_6',  'lp_amount', 'prod', 'income', 'months', '18_months']]

data1['state'] = data['state']
data1['status'] = data['status']

data1 = pd.concat([data1, data2], axis = 1)
data1.head()

Unnamed: 0,year,status,pay_method_ACH,pay_method_credit card,pay_method_paper check,state_0,state_1,state_2,state_3,state_4,...,prod,income,months,18_months,state,status_0,status_1,status_2,status_3,status_4
0,2014.0,Returned < 90 days,0,1,0,0,0,0,0,0,...,1.0,74467.0,0.87,0,MA,0,0,0,0,1
1,2015.0,Returned < 90 days,0,1,0,0,0,0,0,0,...,1.0,74467.0,0.0,0,MA,0,0,0,0,1
2,2016.0,Switched to LW,0,1,0,0,0,0,0,0,...,1.0,74467.0,0.0,0,MA,0,0,0,1,0
3,2016.0,Returned < 90 days,0,1,0,0,0,0,0,0,...,1.0,74467.0,0.67,0,MA,0,0,0,0,1
5,2014.0,Decline,0,1,0,0,0,0,0,0,...,1.0,55937.0,0.57,0,MA,0,0,0,1,1


In [24]:
# Saving the imputed data as impute_data.csv to begin normalization
data1.to_csv('imputed_data.csv',',')