# Loading Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Progress Bar

In [2]:
%%capture
from tqdm import tqdm_notebook
tqdm_notebook().pandas()

## Pandas Options

In [3]:
# Display every column and every row when a DataFrame is rendered.
# The fully-qualified option names are used on purpose: the short pattern
# 'max_columns' also matches 'styler.render.max_columns' on recent pandas
# versions and then raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Loading Files

## 1 - Read Default Data

In [4]:
# Location of the sample workbook.
# NOTE(review): absolute, machine-specific Windows path; the notebook only
# runs on a machine that has this exact folder layout.
path = 'D:\\Desktop\\MLPGD_Capstone_Project\\resources\\aaa_sample_data.xlsx'

# Load the workbook into the raw DataFrame used by the rest of the notebook.
df_original = pd.read_excel(path)

* Data Glance

In [5]:
# Preview the first two rows of the raw data.
df_original.head(2)

Unnamed: 0,Column1,Individual Key,Household Key,Member Flag,City,State - Grouped,ZIP5,ZIP9,FSV CMSI Flag,FSV Credit Card Flag,FSV Deposit Program Flag,FSV Home Equity Flag,FSV ID Theft Flag,FSV Mortgage Flag,INS Client Flag,TRV Globalware Flag,Number of Children,Responded to Catalog,Race,Length Of Residence,Mail Responder,Home Owner,Income,Date Of Birth,Children,Education,Dwelling Type,Credit Ranges,Language,Gender,Active Expiration Date,Address Change Date,Bad Address Flag,Billing Code Description,Birth Date MMDDYYYY,Branch Name,Cancel Date,Cancel Reason,County,Do Not Direct Mail Solicit,Email Available,Email Status,ERS ENT Count Year 1,ERS ENT Count Year 2,ERS ENT Count Year 3,ERS Member Cost Year 1,ERS Member Cost Year 2,ERS Member Cost Year 3,Right_Gender,Right_Individual Key,Join AAA Date,Join Club Date,Member Key,Member Map Location,Member Number Associate ID,Member Phone Type,Member Status,Member Tenure Years,Member Type,Membership ID,Months from Join to Cancel,Opt-Out - Publication,Reason Joined,Reinstate Date,Renew Method,ZIP,Mosaic Household,Mosaic Global Household,kcl_B_IND_MosaicsGrouping,New Mover Flag,Occupation Code,Occupation Group,Right_Dwelling Type,Move Distance,Occupant Type,Breakdown Map Location,Breakdown City,Breakdown State,Basic Cost,Calculated Tow Miles,Call Canceled,Call Killed,Call Status Recv Date,Cash Call,Clearing Code Last Description,Dispatch Code1 Description,Dispatch Code2Description,DTL Prob1 Code Description,Fleet Indicator,Is Duplicate,Is NSR,Member Match Flag,Member Number and Associate ID,Motorcycle Indicator,Plus Cost,Plus Indicator Description,Premier Cost,Prob1 Code Description,Prob2 Code Description,SC Call Club Code Description,SC Date,Rec ID,SC STS RSN Code Description,SC Vehicle Manufacturer Name,SC Vehicle Model Name,SVC Facility Name,SVC Facility Type,Total Cost,Tow Destination Latitude,Tow Destination Longitude,Tow Destination Name,Was Duplicated,Was Towed To AAR Referral
0,0,100000030,104625900,Y,NEW HAVEN,CT,65110,651113490,N,N,N,N,N,N,N,N,,,,,,,,NaT,,,Small or large multi-family w/apt number,,,Male,NaT,NaT,,,NaT,,NaT,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,
1,1,522115500,45007910,Y,WEST WARWICK,RI,28930,289338500,N,Y,N,N,N,N,N,N,One Child,,Caucasion / White - English,150.0,Yes,Home Owner,"30-39,999",1922-02-05,Yes,Some College,Small or large multi-family w/apt number,600-649,English,,2020-09-01,2005-01-28 12:41:09,F,New Member,1922-02-05,RI - WARWICK BRANCH,NaT,,KENT,0.0,0.0,,0.0,0.0,20.0,0.0,0.0,650.0,,522115500.0,1970-09-01,1970-09-01,15000140.0,"{41.696,-71.5308}",153000.0,VoIP,ACTIVE,490.0,Primary,1530.0,,Opt-In,,2019-08-23,AUTO RENEW,28930.0,Senior Discounts,Low Income Elders,Golden Year Guardians,N,,,,,,"{41,-71}",West Warwick,RI,325.0,8.0,N,N,2017-04-13 10:05:17,Y,CASH CALL ONLY C,Engine Overheat,,Engine Overheat,N,0.0,0.0,10.0,153000.0,N,0.0,Basic Membership,0.0,Tow,,AAA Northeast,2017-04-13,97073200.0,CASH CALL ONLY C,TOYOTA,CAMRY,ASTRO WRECKER SERVICE,independent repair,325.0,410.0,-710.0,Aar / Johnson's Auto Service,0.0,10.0


* General Info

In [6]:
# Summary of the raw frame: index range, column count, dtype counts and memory usage.
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21344 entries, 0 to 21343
Columns: 113 entries, Column1 to Was Towed To AAR Referral
dtypes: datetime64[ns](9), float64(31), int64(5), object(68)
memory usage: 18.4+ MB


In [7]:
# Per-column dtype of the raw frame.
df_original.dtypes

Column1                                    int64
Individual Key                             int64
Household Key                              int64
Member Flag                               object
City                                      object
State - Grouped                           object
ZIP5                                       int64
ZIP9                                       int64
FSV CMSI Flag                             object
FSV Credit Card Flag                      object
FSV Deposit Program Flag                  object
FSV Home Equity Flag                      object
FSV ID Theft Flag                         object
FSV Mortgage Flag                         object
INS Client Flag                           object
TRV Globalware Flag                       object
Number of Children                        object
Responded to Catalog                      object
Race                                      object
Length Of Residence                      float64
Mail Responder      

* Reading Dictionaries

In [8]:
# Location of the member data-dictionary workbook.
# NOTE(review): absolute, machine-specific Windows path.
path_dict_member = 'D:\\Desktop\\MLPGD_Capstone_Project\\resources\\member_data_dict.xlsx'

# Load the member data dictionary into a DataFrame.
dict_member = pd.read_excel(path_dict_member)

In [9]:
# Location of the roadside data-dictionary workbook.
# NOTE(review): absolute, machine-specific Windows path. The variable name is
# misspelt ("rodaside"); it is left as-is because other cells may reference it.
path_dict_rodaside = 'D:\\Desktop\\MLPGD_Capstone_Project\\resources\\roadside_data_dict.xlsx'

# Load the roadside data dictionary into a DataFrame.
dict_roadside = pd.read_excel(path_dict_rodaside)

# Data processing (Exploratory Data Analysis - EDA)

* Creating Dictionary to keep track of each operation and each filtering per operation

In [10]:
# Audit log of dropped columns: maps an operation label to the list of
# column names removed by that operation.
removed_columns = dict()

## 1 - Dropping Erroneous Column (added by Excel when the original CSV file was converted)

    1.1 - Defining the key for this operation 

In [11]:
# Record the column that is dropped in the next step under its own label.
removed_columns.update({'Erroneous Columns Removal': ['Column1']})

    1.2 - Performing Filtering

In [12]:
# Drop the spurious 'Column1' column.
# Rebinding the result (instead of `inplace=True`) together with
# `errors='ignore'` makes the cell idempotent: running it a second time no
# longer raises KeyError because the column is already gone.
df_original = df_original.drop(columns=['Column1'], errors='ignore')

## 2 - Evaluating number of null values

In [13]:
# Percentage of null values per column, highest first.
nullval_ratio = (
    df_original.isna()
    .sum()
    .mul(100)
    .div(df_original.shape[0])
    .sort_values(ascending=False)
)

In [14]:
# Display the null-value percentage of every column (sorted descending).
nullval_ratio

Dispatch Code2Description         100.000000
Prob2 Code Description             99.962519
Reason Joined                      98.182159
Reinstate Date                     97.680847
Move Distance                      96.996814
Occupant Type                      96.570465
Right_Dwelling Type                96.139430
Responded to Catalog               95.970765
Tow Destination Name               82.702399
Right_Gender                       74.779798
Months from Join to Cancel         61.689468
Cancel Date                        61.689468
Cancel Reason                      61.689468
Occupation Group                   58.854948
Occupation Code                    58.854948
Gender                             58.850262
Date Of Birth                      52.253561
Email Status                       48.833396
Address Change Date                46.640742
Home Owner                         45.516304
Tow Destination Latitude           44.654235
Tow Destination Longitude          44.654235
Income    

    2.1 - Removing columns with more than 70% of null values

            2.1.1 - Maintaining Original Data file 

In [15]:
# Work on an independent (deep) copy so that the raw frame stays untouched.
removed_null_data = df_original.copy(deep=True)

            2.1.2 - Defining Threshold 

In [16]:
# Maximum tolerated share of null values per column, in percent
# (compared against `nullval_ratio`, which is also expressed in percent).
threshold = 70.0

            2.1.3 - Defining the key for this operation

In [17]:
# Label for this operation in the audit log, e.g. '70.0% Threshold Column Removal'.
key2 = f'{threshold}% Threshold Column Removal'

# Start with an empty list; it is filled by the filtering step.
removed_columns[key2] = list()

            2.1.4 - Performing Filtering

In [18]:
# Collect every column whose null percentage exceeds the threshold.
# The list is rebuilt from scratch on each run (instead of appended to), so
# re-running the cell cannot produce duplicate entries in the audit log.
removed_columns[key2] = [
    c
    for c in tqdm_notebook(df_original.columns, desc='Process Progress')
    if nullval_ratio[c] > threshold
]

# Drop all selected columns in one call. `errors='ignore'` tolerates columns
# that are already gone (e.g. when the cell is executed a second time), where
# the previous per-column in-place drop raised KeyError.
removed_null_data = removed_null_data.drop(columns=removed_columns[key2], errors='ignore')

print('Removed Columns due selected treshold: ', removed_columns[key2])

HBox(children=(FloatProgress(value=0.0, description='Process Progress', max=112.0, style=ProgressStyle(descrip…


Removed Columns due selected treshold:  {'Erroneous Columns Removal': ['Column1'], '70.0% Threshold Column Removal': ['Responded to Catalog', 'Right_Gender', 'Reason Joined', 'Reinstate Date', 'Right_Dwelling Type', 'Move Distance', 'Occupant Type', 'Dispatch Code2Description', 'Prob2 Code Description', 'Tow Destination Name']}


## 3 - Subjective Evaluation

    3.1 - Rename the FSV, INS, and TRV columns, which are product-purchase flags

In [19]:
# Column names containing each product-flag marker (case-sensitive substring match).
matching_FSV = list(filter(lambda name: "FSV" in name, removed_null_data.columns))
matching_INS = list(filter(lambda name: "INS" in name, removed_null_data.columns))
matching_TRV = list(filter(lambda name: "TRV" in name, removed_null_data.columns))

# Concatenate in FSV, INS, TRV order; this order fixes the numbering used by the rename step.
final_matching = [*matching_FSV, *matching_INS, *matching_TRV]
print(final_matching)

['FSV CMSI Flag', 'FSV Credit Card Flag', 'FSV Deposit Program Flag', 'FSV Home Equity Flag', 'FSV ID Theft Flag', 'FSV Mortgage Flag', 'INS Client Flag', 'TRV Globalware Flag']


In [20]:
# Rename each matched flag column to 'Purchased Product <n>', numbering from 1
# in the order given by `final_matching`.
counter = 1

for c in final_matching:
    removed_null_data.rename(columns={c: f'Purchased Product {counter}'}, inplace=True)
    counter += 1

In [22]:
# Preview the first two rows after the rename.
removed_null_data.head(2)

Unnamed: 0,Individual Key,Household Key,Member Flag,City,State - Grouped,ZIP5,ZIP9,Purchased Product 1,Purchased Product 2,Purchased Product 3,Purchased Product 4,Purchased Product 5,Purchased Product 6,Purchased Product 7,Purchased Product 8,Number of Children,Race,Length Of Residence,Mail Responder,Home Owner,Income,Date Of Birth,Children,Education,Dwelling Type,Credit Ranges,Language,Gender,Active Expiration Date,Address Change Date,Bad Address Flag,Billing Code Description,Birth Date MMDDYYYY,Branch Name,Cancel Date,Cancel Reason,County,Do Not Direct Mail Solicit,Email Available,Email Status,ERS ENT Count Year 1,ERS ENT Count Year 2,ERS ENT Count Year 3,ERS Member Cost Year 1,ERS Member Cost Year 2,ERS Member Cost Year 3,Right_Individual Key,Join AAA Date,Join Club Date,Member Key,Member Map Location,Member Number Associate ID,Member Phone Type,Member Status,Member Tenure Years,Member Type,Membership ID,Months from Join to Cancel,Opt-Out - Publication,Renew Method,ZIP,Mosaic Household,Mosaic Global Household,kcl_B_IND_MosaicsGrouping,New Mover Flag,Occupation Code,Occupation Group,Breakdown Map Location,Breakdown City,Breakdown State,Basic Cost,Calculated Tow Miles,Call Canceled,Call Killed,Call Status Recv Date,Cash Call,Clearing Code Last Description,Dispatch Code1 Description,DTL Prob1 Code Description,Fleet Indicator,Is Duplicate,Is NSR,Member Match Flag,Member Number and Associate ID,Motorcycle Indicator,Plus Cost,Plus Indicator Description,Premier Cost,Prob1 Code Description,SC Call Club Code Description,SC Date,Rec ID,SC STS RSN Code Description,SC Vehicle Manufacturer Name,SC Vehicle Model Name,SVC Facility Name,SVC Facility Type,Total Cost,Tow Destination Latitude,Tow Destination Longitude,Was Duplicated,Was Towed To AAR Referral
0,100000030,104625900,Y,NEW HAVEN,CT,65110,651113490,N,N,N,N,N,N,N,N,,,,,,,NaT,,,Small or large multi-family w/apt number,,,Male,NaT,NaT,,,NaT,,NaT,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,
1,522115500,45007910,Y,WEST WARWICK,RI,28930,289338500,N,Y,N,N,N,N,N,N,One Child,Caucasion / White - English,150.0,Yes,Home Owner,"30-39,999",1922-02-05,Yes,Some College,Small or large multi-family w/apt number,600-649,English,,2020-09-01,2005-01-28 12:41:09,F,New Member,1922-02-05,RI - WARWICK BRANCH,NaT,,KENT,0.0,0.0,,0.0,0.0,20.0,0.0,0.0,650.0,522115500.0,1970-09-01,1970-09-01,15000140.0,"{41.696,-71.5308}",153000.0,VoIP,ACTIVE,490.0,Primary,1530.0,,Opt-In,AUTO RENEW,28930.0,Senior Discounts,Low Income Elders,Golden Year Guardians,N,,,"{41,-71}",West Warwick,RI,325.0,8.0,N,N,2017-04-13 10:05:17,Y,CASH CALL ONLY C,Engine Overheat,Engine Overheat,N,0.0,0.0,10.0,153000.0,N,0.0,Basic Membership,0.0,Tow,AAA Northeast,2017-04-13,97073200.0,CASH CALL ONLY C,TOYOTA,CAMRY,ASTRO WRECKER SERVICE,independent repair,325.0,410.0,-710.0,0.0,10.0


    3.2 - Removing columns that we are not allowed to use or that are not ethical to work with


        3.2.1 - Creating Dictionary Key

In [23]:
# Label for this operation in the audit log.
key3 = 'Unethical Columns'

# Columns excluded on ethical grounds.
# TODO: decide whether Gender is also unethical information for market segmentation.
removed_columns.update({key3: ['Race', 'Language']})

        3.2.2 - Performing Filtering

In [24]:
# Drop every column registered under `key3`.
for c in tqdm_notebook(removed_columns[key3], desc='Process Progress'):

    # Guard against columns that are no longer present (same check as the
    # redundancy-removal step), so re-running the cell does not raise KeyError.
    if c in removed_null_data.columns:

        removed_null_data.drop(columns=c, inplace=True)

print('Removed Columns after filtering process: ', removed_columns[key3])

HBox(children=(FloatProgress(value=0.0, description='Process Progress', max=2.0, style=ProgressStyle(descripti…


Removed Columns after filtering process:  {'Erroneous Columns Removal': ['Column1'], '70.0% Threshold Column Removal': ['Responded to Catalog', 'Right_Gender', 'Reason Joined', 'Reinstate Date', 'Right_Dwelling Type', 'Move Distance', 'Occupant Type', 'Dispatch Code2Description', 'Prob2 Code Description', 'Tow Destination Name'], 'Unethical Columns': ['Race', 'Language']}


In [25]:
# Preview the first two rows after removing the unethical columns.
removed_null_data.head(2)

Unnamed: 0,Individual Key,Household Key,Member Flag,City,State - Grouped,ZIP5,ZIP9,Purchased Product 1,Purchased Product 2,Purchased Product 3,Purchased Product 4,Purchased Product 5,Purchased Product 6,Purchased Product 7,Purchased Product 8,Number of Children,Length Of Residence,Mail Responder,Home Owner,Income,Date Of Birth,Children,Education,Dwelling Type,Credit Ranges,Gender,Active Expiration Date,Address Change Date,Bad Address Flag,Billing Code Description,Birth Date MMDDYYYY,Branch Name,Cancel Date,Cancel Reason,County,Do Not Direct Mail Solicit,Email Available,Email Status,ERS ENT Count Year 1,ERS ENT Count Year 2,ERS ENT Count Year 3,ERS Member Cost Year 1,ERS Member Cost Year 2,ERS Member Cost Year 3,Right_Individual Key,Join AAA Date,Join Club Date,Member Key,Member Map Location,Member Number Associate ID,Member Phone Type,Member Status,Member Tenure Years,Member Type,Membership ID,Months from Join to Cancel,Opt-Out - Publication,Renew Method,ZIP,Mosaic Household,Mosaic Global Household,kcl_B_IND_MosaicsGrouping,New Mover Flag,Occupation Code,Occupation Group,Breakdown Map Location,Breakdown City,Breakdown State,Basic Cost,Calculated Tow Miles,Call Canceled,Call Killed,Call Status Recv Date,Cash Call,Clearing Code Last Description,Dispatch Code1 Description,DTL Prob1 Code Description,Fleet Indicator,Is Duplicate,Is NSR,Member Match Flag,Member Number and Associate ID,Motorcycle Indicator,Plus Cost,Plus Indicator Description,Premier Cost,Prob1 Code Description,SC Call Club Code Description,SC Date,Rec ID,SC STS RSN Code Description,SC Vehicle Manufacturer Name,SC Vehicle Model Name,SVC Facility Name,SVC Facility Type,Total Cost,Tow Destination Latitude,Tow Destination Longitude,Was Duplicated,Was Towed To AAR Referral
0,100000030,104625900,Y,NEW HAVEN,CT,65110,651113490,N,N,N,N,N,N,N,N,,,,,,NaT,,,Small or large multi-family w/apt number,,Male,NaT,NaT,,,NaT,,NaT,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,
1,522115500,45007910,Y,WEST WARWICK,RI,28930,289338500,N,Y,N,N,N,N,N,N,One Child,150.0,Yes,Home Owner,"30-39,999",1922-02-05,Yes,Some College,Small or large multi-family w/apt number,600-649,,2020-09-01,2005-01-28 12:41:09,F,New Member,1922-02-05,RI - WARWICK BRANCH,NaT,,KENT,0.0,0.0,,0.0,0.0,20.0,0.0,0.0,650.0,522115500.0,1970-09-01,1970-09-01,15000140.0,"{41.696,-71.5308}",153000.0,VoIP,ACTIVE,490.0,Primary,1530.0,,Opt-In,AUTO RENEW,28930.0,Senior Discounts,Low Income Elders,Golden Year Guardians,N,,,"{41,-71}",West Warwick,RI,325.0,8.0,N,N,2017-04-13 10:05:17,Y,CASH CALL ONLY C,Engine Overheat,Engine Overheat,N,0.0,0.0,10.0,153000.0,N,0.0,Basic Membership,0.0,Tow,AAA Northeast,2017-04-13,97073200.0,CASH CALL ONLY C,TOYOTA,CAMRY,ASTRO WRECKER SERVICE,independent repair,325.0,410.0,-710.0,0.0,10.0


    3.3 - Removing Columns that represent the same information in a different way (information redundancy)

        3.3.1 - Creating Dictionary Key

In [26]:
# Label for this operation in the audit log.
key4 = 'Redundancy Information Columns'

# Columns judged to repeat information that is already carried by another column.
# Open questions:
#   - Is the City variable relevant once we already have lat/long location?
#   - Address Change Date? Bad Address Flag? Reinstate Date (y)? Call Status Recv Date?
removed_columns[key4] = [
    'Individual Key',
    'State - Grouped',
    'ZIP5',
    'ZIP9',
    'Children',
    'Birth Date MMDDYYYY',
    'Cancel Date',
    'County',
    'Do Not Direct Mail Solicit',
    'Right_Individual Key',
    'Member Key',
    'Member Number Associate ID',
    'Membership ID',
    'Reinstate Date',
    'ZIP',
    'Mosaic Household',
    'kcl_B_IND_MosaicsGrouping',
    'Occupation Code',
    'Breakdown State',
    'Call Killed',
    'Clearing Code Last Description',
    'Dispatch Code1 Description',
    'DTL Prob1 Code Description',
    'Is Duplicate',
    'Member Match Flag',
    'Member Number and Associate ID',
    'SC Date',
    'Rec ID',
    'SC STS RSN Code Description',
    'SC Vehicle Model Name',
    'SVC Facility Name',
    'SVC Facility Type',
    'Tow Destination Latitude',
    'Tow Destination Longitude',
]

        3.3.2 - Performing Filtering

In [29]:
# Drop every column registered under `key4`.
for c in tqdm_notebook(removed_columns[key4], desc='Process Progress'):

    # Some listed columns may already have been removed by an earlier step; skip those.
    if c not in removed_null_data.columns:
        continue

    removed_null_data.drop(columns=c, inplace=True)

print('Removed Columns after filtering process: ', removed_columns[key4])

HBox(children=(FloatProgress(value=0.0, description='Process Progress', max=34.0, style=ProgressStyle(descript…


Removed Columns after filtering process:  ['Individual Key', 'State - Grouped', 'ZIP5', 'ZIP9', 'Children', 'Birth Date MMDDYYYY', 'Cancel Date', 'County', 'Do Not Direct Mail Solicit', 'Right_Individual Key', 'Member Key', 'Member Number Associate ID', 'Membership ID', 'Reinstate Date', 'ZIP', 'Mosaic Household', 'kcl_B_IND_MosaicsGrouping', 'Occupation Code', 'Breakdown State', 'Call Killed', 'Clearing Code Last Description', 'Dispatch Code1 Description', 'DTL Prob1 Code Description', 'Is Duplicate', 'Member Match Flag', 'Member Number and Associate ID', 'SC Date', 'Rec ID', 'SC STS RSN Code Description', 'SC Vehicle Model Name', 'SVC Facility Name', 'SVC Facility Type', 'Tow Destination Latitude', 'Tow Destination Longitude']


* Checking columns with little variation in their values (counting null values)

# Label for this operation in the audit log.
key_single_value = 'Single Value Columns'

# Number of distinct values per column, computed once for the whole frame.
# `nunique()` ignores nulls by default, exactly as in the original check.
unique_counts = removed_null_data.nunique()

# Columns holding a single distinct value carry no information.
# They are stored under their own key: `removed_columns` is a dict, so the
# previous `removed_columns.append(c)` could not work.
removed_columns[key_single_value] = unique_counts[unique_counts == 1].index.tolist()

# Drop them in one call instead of mutating the frame while iterating over
# its columns. `errors='ignore'` keeps the cell safe to re-run.
removed_null_data = removed_null_data.drop(columns=removed_columns[key_single_value], errors='ignore')

print('Removed so far: ', removed_columns)

        We still need to keep an eye on the variables that have only 2 unique values, but that will be more of a subjective evaluation

* Check correlation Matrix

# Correlation matrix of the numeric columns.
# Non-numeric columns are filtered out explicitly: on recent pandas versions
# `DataFrame.corr()` raises on object columns instead of silently skipping them.
# NOTE(review): the observations below mention columns (e.g. ZIP5, Rec ID) that
# are dropped in step 3.3; confirm at which stage this plot is meant to run.
corrmat = removed_null_data.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(15, 10))
# NOTE(review): the original comment mentioned a 70% minimum correlation, but no
# such filter has ever been applied; the full matrix is drawn.
sns.heatmap(corrmat, square=True, ax=ax)
ax.set_title('Correlation matrix of numeric columns');

* Observations

  1- There is an "island" of correlated variables between Rec ID and Tow Destination Longitude; this will be evaluated later, but it looks promising
  2- Basic Cost has a high correlation with the variables within this "island"
  3- Individual Key and Right_Individual Key are practically the same variable
  4- ZIP5 and ZIP9 are practically the same variable
  5- Months from Join to Cancel has no correlation whatsoever with the Premier Cost variable
  

    From the correlation matrix of the entire dataset (excluding the columns with more than 70% null values), we can see that some columns still show either no correlation or total correlation with all the other variables, and that some columns are problematic. Therefore, the best thing to do is to drop them.

# Columns flagged as problematic in the correlation analysis.
trouble_columns = [
    'Is Duplicate',
    'Member Match Flag',
]

# Columns that will not be used in the analysis.
columns_will_not_use = [
    'Individual Key',
    'Member Flag',
    'Right_Individual Key',
]