In [1]:
# Importing the necessary packages
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

# Reading in the csv file of customer data
# The file is decoded as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8
data = pd.read_csv('data (1) copy.csv', encoding = 'ISO-8859-1')
# Displaying the (rows, columns) dimensions of the raw dataset
data.shape

(119833, 15)

In [2]:
# Splitting the 'Created' column into two new columns: the date and the time of creation.
# 'Created' holds strings such as '7/30/14 14:36', so we split once on the first space.
# expand=True returns the pieces as DataFrame columns (tuple-unpacking the .str accessor
# is not supported by current pandas) and n is passed by keyword, as newer pandas requires.
# reindex guarantees that both pieces exist even if no value contains a space.
created_parts = data['Created'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
data['date_created'] = created_parts[0]
data['time_created'] = created_parts[1]

# Printing out the first 5 rows of each column in the dataset
print(data.head())

   #Prod                    Agent Label +ID        Created           ID  \
0    1.0      Church Bulletin 0011 (109398)  7/30/14 14:36  657917124.0   
1    1.0  Benefits and More 4 Paid (143595)  10/5/15 19:15  658935850.0   
2    1.0  Benefits and More 4 Paid (143595)   5/2/16 20:16  659370939.0   
3    1.0  Benefits and More 4 Paid (143595)  4/22/16 19:55  659356995.0   
4    1.0       Benefits and More 4 (116063)  7/28/16 16:01  659940549.0   

    Income  Last Pay Amount  Last Pay Complete Last Pay Date  Last Pay Status  \
0  36250.0            34.95                1.0       8/25/14              1.0   
1  36250.0            29.95                1.0       10/5/15              1.0   
2  36250.0            29.95                1.0        5/2/16              1.0   
3  36250.0            39.95                1.0       5/12/16              1.0   
4  36250.0              NaN                NaN           NaN              NaN   

  Last Pay Type  Pay Method State              Status  ZipCode

In [3]:
# Parsing the creation date and the last payment date into datetime objects
for date_col in ('date_created', 'Last Pay Date'):
    data[date_col] = pd.to_datetime(data[date_col])

# Time elapsed between account creation and the most recent payment;
# .dt.days turns the timedelta into a whole number of days
tenure = data['Last Pay Date'] - data['date_created']
data['days'] = tenure.dt.days

# Approximating months with the company as days / 30
data['months'] = data['days'] / 30

In [4]:
# Creating a 'year' column holding the calendar year of date_created (via the .dt accessor)
data['year'] = data['date_created'].dt.year

In [5]:
# Rounding the 'months' and 'days' columns to two decimal places (nearest hundredth)
for numeric_col in ('months', 'days'):
    data[numeric_col] = data[numeric_col].round(2)

In [6]:
# Showing the data type and the non-null count for each column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119833 entries, 0 to 119832
Data columns (total 20 columns):
#Prod                119794 non-null float64
Agent Label +ID      119832 non-null object
Created              119832 non-null object
ID                   119832 non-null float64
Income               118524 non-null float64
Last Pay Amount      113509 non-null float64
Last Pay Complete    113509 non-null float64
Last Pay Date        113509 non-null datetime64[ns]
Last Pay Status      113509 non-null float64
Last Pay Type        113288 non-null object
Pay Method           118063 non-null object
State                119832 non-null object
Status               119832 non-null object
ZipCodeNew           119832 non-null float64
Stage                100142 non-null object
date_created         119832 non-null datetime64[ns]
time_created         119832 non-null object
days                 113509 non-null float64
months               113509 non-null float64
year                 119832 

In [7]:
# Deleting the column 'Created' as it is redundant now that it has been split
# into date_created / time_created. drop() returns a new frame, so the result
# must be assigned back for the deletion to take effect.
data = data.drop(columns=['Created'])

# Mapping each original column name to its new name.
# The order of the keys is the desired column order of the dataset.
column_names = {
    'ID': 'id',
    'year': 'year',
    'Agent Label +ID': 'agent',
    'date_created': 'date_created',
    'time_created': 'time_created',
    'Last Pay Date': 'lp_date',
    'Pay Method': 'pay_method',
    'State': 'state',
    'ZipCodeNew': 'zip_code',
    'Last Pay Amount': 'lp_amount',
    'Last Pay Complete': 'lp_complete',
    'Last Pay Status': 'lp_status',
    'Last Pay Type': 'lp_type',
    '#Prod': 'prod',
    'Stage': 'stage',
    'Status': 'status',
    'Income': 'income',
    'days': 'days',
    'months': 'months',
}

# Changing the order of the columns and renaming them in one step.
# Renaming by name (instead of assigning a positional list to data.columns)
# keeps every old/new pair visibly tied together.
data = data[list(column_names)].rename(columns=column_names)

## **Imputations**

In [8]:
# Customers created after the date 5/16/2018 cannot reach the 12 month threshold,
# because the csv files were created on 5/16/2019
created_too_late = data['date_created'] > '5/16/2018'

# Customers who did not stay with the company over 1 month;
# the company would not have received a payment from these customers
under_one_month = data['months'] < .99

# Customers flagged with a status / stage that is excluded from the analysis
switched_to_lw = data['status'] == 'Switched to LW'
quick_cancel = data['stage'] == 'Quick Cancel'

# Keeping every row that matches none of the exclusion rules
# (rows with missing values in these columns match no rule and are kept)
data = data[~(created_too_late | under_one_month | switched_to_lw | quick_cancel)]

data.shape

(60060, 19)

In [9]:
# Same approach we used in Data Mining to calculate number of missing values per column

# Column names of the dataset
names = list(data.columns)

# Number of null values in each column, in the same order as `names`
values = [data[col].isnull().sum() for col in names]

# Printing each column name next to its count of missing values
for col_name, null_count in zip(names, values):
    print(col_name, null_count)

id 1
year 1
agent 1
date_created 1
time_created 1
lp_date 5495
pay_method 1752
state 1
zip_code 1
lp_amount 5495
lp_complete 5495
lp_status 5495
lp_type 5514
prod 36
stage 4438
status 1
income 503
days 5495
months 5495


In [10]:
# Imputing the income column using the median of the income column.
# Both null values and 0 incomes are treated as missing. The zeros are turned into
# NaN first so that they do not drag down the median used for the imputation,
# and so that every imputed row receives the same value.
valid_income = data['income'].replace(0, np.nan)
data['income'] = valid_income.fillna(valid_income.median())

In [11]:
# Rows with a Last Pay Date: customers without one never paid for a product
has_payment = data['lp_date'].notnull()

# Rows with a zip code (one row was never given a zip code)
has_zip = data['zip_code'].notnull()

# State codes of customers who are excluded as not living in the United States
excluded_states = ['PR', 'NT', 'PE', 'BC']
in_excluded_state = data['state'].isin(excluded_states)

# Keeping only the rows that satisfy all three conditions
data = data[has_payment & has_zip & ~in_excluded_state]

data.shape

(54565, 19)

In [12]:
# Re-checking the number of missing values per column after removing invalid rows

# Column names of the dataset
names = list(data.columns)

# Number of null values in each column, in the same order as `names`
values = [data[col].isnull().sum() for col in names]

# Printing each column name next to its count of missing values
for col_name, null_count in zip(names, values):
    print(col_name, null_count)

id 0
year 0
agent 0
date_created 0
time_created 0
lp_date 0
pay_method 13
state 0
zip_code 0
lp_amount 0
lp_complete 0
lp_status 0
lp_type 19
prod 0
stage 1873
status 0
income 0
days 0
months 0


In [13]:
# Grouping pay_method into fewer groups
# Printing the unique values of the column before binning
print(data['pay_method'].unique())

# Binning 'pay_method' into 4 groups: Credit card, ACH, Paper Check, & Other
# Every card brand (including both spellings of MasterCard) becomes 'credit card'
card_brands = ['Visa', 'MasterCard', 'Discover', 'American Express', 'Mastercard', 'Maestro']
pay_method_groups = dict.fromkeys(card_brands, 'credit card')
pay_method_groups['Paper_Check'] = 'paper check'
# Missing pay methods are folded into the existing 'Other' group
pay_method_groups[np.nan] = 'Other'
data['pay_method'] = data['pay_method'].replace(pay_method_groups)

# Counting the number of instances for each value in the 'pay_method' column
print(data['pay_method'].value_counts())

['MasterCard' 'Discover' 'Visa' 'ACH' 'American Express' 'Paper_Check'
 'Mastercard' nan 'Other' 'Maestro']
credit card    45074
ACH             8121
paper check     1343
Other             27
Name: pay_method, dtype: int64


In [14]:
# Creating the binary target for whether a customer stayed 12 months or more
# 1 = 12 months or more, 0 = less than 12 months
stayed_12_months = data['months'] >= 12.
data['Y'] = stayed_12_months.astype(int)

In [15]:
# Printing out the unique values of the 'status' column to see which values need to be combined
print(data['status'].unique())

['Returned' 'Order Canceled' 'Returned < 90 days' 'Decline Cancel'
 'Active' 'Returned <30 days' 'Decline' 'Returned to Sender' 'Deactivated'
 'Restricted' 'Daily Decline' 'Duplicate' 'Returned <60 Days' 'Suspend'
 'Collections' 'Chargeback Received' 'Test' 'In Process']


In [16]:
# Replacing the status names with more general names
# Each general name is listed with the raw status values that are folded into it
status_groups = {
    'Returned_90': ['Returned <30 days', 'Returned <60 Days', 'Returned < 90 days'],
    'Decline': ['Decline Cancel', 'Daily Decline'],
    'Order Cancelled': ['Order Canceled'],
    'Other': ['Chargeback Received', 'In Process', 'Post Date', 'Suspend', 'Test'],
}

# Inverting the groups into a raw status -> general name lookup for replace()
status_lookup = {raw: general for general, raws in status_groups.items() for raw in raws}
data['status'] = data['status'].replace(status_lookup)

In [17]:
# Using the method of one-hot encoding for the different forms of pay methods
# dtype=int keeps the indicator columns as 0/1 integers; recent pandas versions
# would otherwise produce True/False columns
pay_method_dummies = pd.get_dummies(data['pay_method'], prefix='pay_method', dtype=int)
data = pd.concat([data, pay_method_dummies], axis=1)
data.columns

Index(['id', 'year', 'agent', 'date_created', 'time_created', 'lp_date',
       'pay_method', 'state', 'zip_code', 'lp_amount', 'lp_complete',
       'lp_status', 'lp_type', 'prod', 'stage', 'status', 'income', 'days',
       'months', 'Y', 'pay_method_ACH', 'pay_method_Other',
       'pay_method_credit card', 'pay_method_paper check'],
      dtype='object')

In [18]:
# Displaying all of the distinct values in the 'state' column
data.state.unique()

array(['MA', 'TX', 'NY', 'ME', 'NC', 'MT', 'RI', 'NH', 'KY', 'VT', 'CT',
       'PA', 'NJ', 'NM', 'CA', 'NE', 'VA', 'DE', 'DC', 'MD', 'WV', 'WI',
       'TN', 'SC', 'GA', 'FL', 'LA', 'AL', 'MS', 'MI', 'KS', 'MO', 'OH',
       'AR', 'IN', 'IA', 'WA', 'MN', 'SD', 'ND', 'IL', 'AZ', 'OK', 'OR',
       'CO', 'WY', 'ID', 'UT', 'NV', 'HI', 'AK'], dtype=object)

In [19]:
# Grouping states into regions to avoid dimensionality problems when running the data in the models
# Each region is listed with the state codes assigned to it
region_states = {
    'east': ['MA', 'NY', 'ME', 'PA', 'NJ', 'RI', 'NH', 'OH', 'VT', 'CT', 'VA', 'DE', 'DC', 'MD', 'WV'],
    'central': ['TX', 'MI', 'WI', 'IL', 'KS', 'NE', 'MN', 'IN', 'MO', 'IA', 'SD', 'ND', 'OK'],
    'west': ['CO', 'MT', 'CA', 'NM', 'UT', 'WA', 'NV', 'AZ', 'HI', 'AK', 'ID', 'WY', 'OR'],
    'south': ['NC', 'SC', 'GA', 'FL', 'KY', 'AL', 'LA', 'TN', 'MS', 'AR'],
}

# Inverting the groups into a state code -> region lookup for replace()
state_to_region = {code: region for region, codes in region_states.items() for code in codes}
data['state'] = data.state.replace(state_to_region)

In [27]:
# Printing how many customers fall into each region
region_counts = data['state'].value_counts()
print(region_counts)

# Creating a dataframe of the customers who stayed over 12 months (Y == 1)
# and printing their frequency per region
dataY = data[data['Y'].eq(1)]
print(dataY['state'].value_counts())

south      17585
east       14732
central    13073
west        9175
Name: state, dtype: int64
south      5903
east       5472
central    4470
west       3276
Name: state, dtype: int64


In [21]:
# Using the method of one-hot encoding for the different regions
# (the 'state' column holds the region names at this point)
# dtype=int keeps the indicator columns as 0/1 integers; recent pandas versions
# would otherwise produce True/False columns
region_dummies = pd.get_dummies(data['state'], prefix='region', dtype=int)
data = pd.concat([data, region_dummies], axis=1)
data.columns

Index(['id', 'year', 'agent', 'date_created', 'time_created', 'lp_date',
       'pay_method', 'state', 'zip_code', 'lp_amount', 'lp_complete',
       'lp_status', 'lp_type', 'prod', 'stage', 'status', 'income', 'days',
       'months', 'Y', 'pay_method_ACH', 'pay_method_Other',
       'pay_method_credit card', 'pay_method_paper check', 'region_central',
       'region_east', 'region_south', 'region_west'],
      dtype='object')

In [22]:
# Using the method of one-hot encoding for the different status types
# dtype=int keeps the indicator columns as 0/1 integers; recent pandas versions
# would otherwise produce True/False columns
status_dummies = pd.get_dummies(data['status'], prefix='status', dtype=int)
data = pd.concat([data, status_dummies], axis=1)
data.columns

Index(['id', 'year', 'agent', 'date_created', 'time_created', 'lp_date',
       'pay_method', 'state', 'zip_code', 'lp_amount', 'lp_complete',
       'lp_status', 'lp_type', 'prod', 'stage', 'status', 'income', 'days',
       'months', 'Y', 'pay_method_ACH', 'pay_method_Other',
       'pay_method_credit card', 'pay_method_paper check', 'region_central',
       'region_east', 'region_south', 'region_west', 'status_Active',
       'status_Collections', 'status_Deactivated', 'status_Decline',
       'status_Duplicate', 'status_Order Cancelled', 'status_Other',
       'status_Restricted', 'status_Returned', 'status_Returned to Sender',
       'status_Returned_90'],
      dtype='object')

In [23]:
# Columns we will not be using in the models
unused_columns = [
    'id', 'date_created', 'time_created', 'lp_date', 'days',
    'lp_complete', 'lp_status', 'lp_type', 'prod', 'stage', 'lp_amount',
    'status_Collections', 'status_Deactivated', 'status_Duplicate',
    'status_Order Cancelled', 'status_Other', 'status_Restricted',
    'status_Returned to Sender', 'pay_method_Other',
]

# Dropping those columns and displaying the columns that remain
data = data.drop(columns=unused_columns)
data.columns

Index(['year', 'agent', 'pay_method', 'state', 'zip_code', 'status', 'income',
       'months', 'Y', 'pay_method_ACH', 'pay_method_credit card',
       'pay_method_paper check', 'region_central', 'region_east',
       'region_south', 'region_west', 'status_Active', 'status_Decline',
       'status_Returned', 'status_Returned_90'],
      dtype='object')

In [24]:
# Printing the first 5 rows to view the data, followed by the
# (rows, columns) dimensions of the new dataset
print(data.head())
print(data.shape)

      year                              agent   pay_method state  zip_code  \
7   2017.0             Yasha - Pers1 (264925)  credit card  east    1002.0   
13  2015.0  Benefits and More 4 Paid (143595)  credit card  east    1010.0   
16  2014.0                 Newspaper (109455)  credit card  east    1013.0   
17  2014.0                  Hospital (109465)  credit card  east    1013.0   
19  2015.0  Benefits and More 4 Paid (143595)  credit card  east    1013.0   

             status   income  months  Y  pay_method_ACH  \
7          Returned  74083.0    3.73  0               0   
13         Returned  51250.0   13.67  1               0   
16  Order Cancelled  29243.0    2.07  0               0   
17      Returned_90  29243.0    3.00  0               0   
19         Returned  29243.0   13.57  1               0   

    pay_method_credit card  pay_method_paper check  region_central  \
7                        1                       0               0   
13                       1          

In [25]:
# Final check of the number of missing values per column before saving the dataset

# Column names of the dataset
names = list(data.columns)

# Number of null values in each column, in the same order as `names`
values = [data[col].isnull().sum() for col in names]

# Printing each column name next to its count of missing values
for col_name, null_count in zip(names, values):
    print(col_name, null_count)

year 0
agent 0
pay_method 0
state 0
zip_code 0
status 0
income 0
months 0
Y 0
pay_method_ACH 0
pay_method_credit card 0
pay_method_paper check 0
region_central 0
region_east 0
region_south 0
region_west 0
status_Active 0
status_Decline 0
status_Returned 0
status_Returned_90 0


In [26]:
# Saving the imputed data as imputed_data.csv to begin normalization
# The separator is passed by keyword: recent pandas versions no longer accept it positionally
data.to_csv('imputed_data.csv', sep=',')