# Loading Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

## Progress Bar

In [None]:
%%capture
# The cell magic above captures (hides) everything this cell would display.
from tqdm import tqdm_notebook
tqdm_notebook().pandas()  # calls tqdm's pandas hook on a throw-away bar instance — presumably to enable progress_apply; TODO confirm it is still needed

## Pandas Options

In [None]:
# Display every column and every row when a DataFrame is printed.
# The fully qualified option name is used on purpose: the bare 'max_columns'
# pattern can match more than one registered option, which makes pandas raise
# OptionError instead of setting anything.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Loading Files

## 1 - Read Default Data

In [None]:
# NOTE(review): absolute path on the author's machine — adjust before running elsewhere
path = 'D:\\Desktop\\MLPGD_Capstone_Project\\resources\\aaa_sample_data.xlsx'

df_original = pd.read_excel(path)  # raw workbook; source frame for the processing below

* Data Glance

In [None]:
df_original.head(n=2)  # preview the first two rows

* General Info

In [None]:
df_original.info()  # summary of the raw frame before any cleaning

In [None]:
df_original.dtypes;  # the trailing ';' suppresses the cell output; remove it to list the dtypes

* Reading Dictionaries

In [None]:
# NOTE(review): absolute path on the author's machine — adjust before running elsewhere
path_dict_member = 'D:\\Desktop\\MLPGD_Capstone_Project\\resources\\member_data_dict.xlsx'

dict_member = pd.read_excel(path_dict_member)  # data dictionary workbook for the member columns

In [None]:
# NOTE(review): absolute path on the author's machine; the variable name carries a typo ("rodaside") — kept because other cells may reference it
path_dict_rodaside = 'D:\\Desktop\\MLPGD_Capstone_Project\\resources\\roadside_data_dict.xlsx'

dict_roadside = pd.read_excel(path_dict_rodaside)  # data dictionary workbook for the roadside columns

# Data processing 

* Creating Dictionary to keep track of each operation and each filtering per operation

In [None]:
# Audit trail of the cleaning steps: operation label -> what that operation removed.
removed_columns, removed_rows = dict(), dict()

## 1 - Dropping Erroneous Column (excel did that on converting the csv original file)

    1.1 - Defining the key for this operation 

In [None]:
removed_columns['Erroneous Columns Removal'] = ['Column1']  # log the column that step 1.2 drops

    1.2 - Performing Filtering

In [None]:
df_original.drop(columns='Column1', inplace=True)  # mutates df_original in place; re-running this cell raises because the column is already gone

In [None]:
df_original.info()  # confirm that 'Column1' is no longer listed

## 2 - Grouping by Household Key (main objective) and separating by variable type

    2.1 - Defining Data Type Columns and creating individual DataFrame per each

In [None]:
# Flag-like columns; they are handled apart from the other numerical and categorical data.
binary_columns = [
    'FSV CMSI Flag', 'FSV Credit Card Flag', 'FSV Deposit Program Flag',
    'FSV Home Equity Flag', 'FSV ID Theft Flag', 'FSV Mortgage Flag',
    'INS Client Flag', 'TRV Globalware Flag', 'Responded to Catalog',
    'Mail Responder', 'Home Owner', 'Children', 'Gender', 'Bad Address Flag',
    'Do Not Direct Mail Solicit', 'Email Available', 'Opt-Out - Publication',
    'New Mover Flag', 'Occupant Type', 'Call Canceled', 'Call Killed',
    'Cash Call', 'Fleet Indicator', 'Is Duplicate', 'Is NSR',
    'Member Match Flag', 'Was Duplicated', 'Was Towed To AAR Referral',
]

# One frame per variable type. The binary, string and date frames start with the
# Household Key so they can be grouped by it later; the numerical frame starts empty.
binary = pd.DataFrame({'Household Key': df_original['Household Key']})

numericals = pd.DataFrame()

string = pd.DataFrame({'Household Key': df_original['Household Key']})

dates = pd.DataFrame({'Household Key': df_original['Household Key']})

    2.2 - Filling each Individual Dataframe with the proper columns

In [None]:
numeric_dtypes = ['int64', 'float64']
datetime_dtypes = ['<M8[ns]']

for c in tqdm_notebook(df_original.columns, desc='Process Progress'):

    column_dtype = df_original.dtypes[c]

    # Pick the destination frame: the explicit binary list takes priority, then the dtype decides.
    if c in binary_columns:
        destination = binary
    elif column_dtype in numeric_dtypes:
        destination = numericals
    elif column_dtype in datetime_dtypes:
        destination = dates
    else:
        destination = string  # anything that is neither binary, numeric nor datetime

    destination[c] = df_original[c]
    

    2.3 - Grouping by Household Key

In [None]:
# Collapse the individual-level rows to one row per Household Key by averaging.
numericals = numericals.groupby('Household Key').mean()

# `binary` still mixes numeric and non-numeric flag columns at this point (they are
# separated in step 2.3.1). numeric_only=True averages just the numeric ones; without
# it, recent pandas versions raise TypeError on the non-numeric columns instead of
# silently skipping them.
binary_numericals = binary.groupby('Household Key').mean(numeric_only=True)

        2.3.1 - Converting Dates to Year and Separating Binary Numericals from Binary Categoricals

In [None]:
# Numeric flag columns were already averaged into binary_numericals; remove them so
# that `binary` only keeps the categorical flags (plus the Household Key).
numeric_flag_columns = [
    name
    for name in tqdm_notebook(binary.columns, desc='Process Progress')
    if name != 'Household Key' and binary.dtypes[name] in ['int64', 'float64']
]

binary.drop(columns=numeric_flag_columns, inplace=True)

In [None]:
for c in tqdm_notebook(dates.columns, desc='Process Progress'):
    if c == 'Household Key':
        continue  # the grouping key is not a date

    # Keep only the calendar year of each date column.
    dates[c] = dates[c].astype('datetime64[ns]').dt.year

        2.3.2 - Grouping by mean and Household Key

In [None]:
dates  = dates.groupby('Household Key').mean() # one row per Household Key holding the mean year of each date column

        2.3.3 - Grouping by mode and Household Key

In [None]:
# Most frequent value(s) of every categorical column within each Household Key.
string = string.groupby('Household Key').apply(pd.DataFrame.mode)

In [None]:
string.head(n=2) # mode() can return several rows per group, so the result carries a multi-index (Household Key, mode row); the following cells flatten it back to one row per Household Key

In [None]:
post_processed_string = pd.DataFrame() # empty frame that will receive one flattened row per Household Key

In [None]:
collapsed_rows = []  # single-row frames, concatenated once after the loop instead of on every iteration

for house_id in tqdm_notebook(df_original['Household Key'].unique().tolist(), desc='Process Progress'): # iterating over all unique Household Keys

    house_modes = string.loc[house_id]  # every mode row returned for this Household Key (looked up once)
    first_row = house_modes.iloc[0]
    result = first_row

    if len(house_modes.index) > 1: # more than one mode row: merge the first two rows

        second_row = house_modes.iloc[1]
        missing_in_second = second_row.isnull()

        # NOTE(review): where BOTH rows hold a value (a tie between two modes) the '+'
        # below concatenates strings / adds numbers; this keeps the notebook's existing
        # behaviour — confirm that a combined value is really wanted for ties.
        result = first_row + second_row

        # Where the second row is null the sum is null too, so fall back to the first row.
        result[missing_in_second] = first_row[missing_in_second]

    collapsed_rows.append(result.to_frame().T)

if collapsed_rows:  # pd.concat rejects an empty list; leave the frame untouched in that case
    post_processed_string = pd.concat(collapsed_rows)

In [None]:
post_processed_string = post_processed_string.set_index('Household Key') # use the Household Key column as the row index

post_processed_string.index = post_processed_string.index.astype(int) # converting the index to integer

post_processed_string.head(n=3)  # preview the flattened frame

In [None]:
# Most frequent value(s) of every categorical flag within each Household Key.
binary_categoricals = binary.groupby('Household Key').apply(pd.DataFrame.mode)

In [None]:
binary_categoricals.head(n=5) # mode() can return several rows per group, so the result carries a multi-index (Household Key, mode row); the following cells flatten it back to one row per Household Key

In [None]:
post_processed_binary_categoricals = pd.DataFrame() # empty frame that will receive one flattened binary-categorical row per Household Key

In [None]:
collapsed_rows = []  # single-row frames, concatenated once after the loop instead of on every iteration

for house_id in tqdm_notebook(df_original['Household Key'].unique().tolist(), desc='Process Progress'): # iterating over all unique Household Keys

    house_modes = binary_categoricals.loc[house_id]  # every mode row returned for this Household Key (looked up once)
    first_row = house_modes.iloc[0]
    result = first_row

    if len(house_modes.index) > 1: # more than one mode row: merge the first two rows

        second_row = house_modes.iloc[1]
        missing_in_second = second_row.isnull()

        # NOTE(review): where BOTH rows hold a value (a tie between two modes) the '+'
        # below concatenates strings / adds numbers; this keeps the notebook's existing
        # behaviour — confirm that a combined value is really wanted for ties.
        result = first_row + second_row

        # Where the second row is null the sum is null too, so fall back to the first row.
        result[missing_in_second] = first_row[missing_in_second]

    collapsed_rows.append(result.to_frame().T)

if collapsed_rows:  # pd.concat rejects an empty list; leave the frame untouched in that case
    post_processed_binary_categoricals = pd.concat(collapsed_rows)

In [None]:
post_processed_binary_categoricals = post_processed_binary_categoricals.set_index('Household Key') # use the Household Key column as the row index

post_processed_binary_categoricals.index = post_processed_binary_categoricals.index.astype(int) # converting the index to integer

post_processed_binary_categoricals.head(n=3)  # preview the flattened frame

In [None]:
# Number of rows per variable-type frame; they should agree if every frame has one row per Household Key.
print(*(len(frame.index) for frame in (numericals, binary_numericals, post_processed_binary_categoricals, dates, post_processed_string)))

In [None]:
# Number of columns per variable-type frame.
print(*(len(frame.columns) for frame in (numericals, binary_numericals, post_processed_binary_categoricals, dates, post_processed_string)))

## 3 - Evaluating number of null values

In [None]:
# Labels of the five variable-type frames, and the frames themselves in the same order.
sequence = ['Numerical', 'Binary Numericals', 'Dates', 'Binary Categoricals', 'Strings']

data_type_collection = [numericals, binary_numericals, dates, post_processed_binary_categoricals, post_processed_string]

columns_nullval_ratio = {}  # label -> percentage of nulls per column, highest first

for idx, var_type in enumerate(sequence):

    frame = data_type_collection[idx]
    null_percentage = frame.isnull().sum() * 100 / len(frame)

    columns_nullval_ratio[var_type] = null_percentage.sort_values(ascending=False)

In [None]:
# Print the null percentage of every column, one variable-type frame at a time.
for var_type in sequence:

    print(var_type +' Column Null Values','\n', columns_nullval_ratio[var_type], '\n') # long output: one line per column

    3.1 - Removing columns beyond given threshold

            3.1.1 - First Checkpoint

In [None]:
checkpoint_1 = {}  # first checkpoint: label -> independent copy of that variable-type frame

for idx, var_type in enumerate(sequence):
    checkpoint_1[var_type] = data_type_collection[idx].copy()

            3.1.2 - Defining Column Threshold 

In [None]:
column_threshold = 40.0 # percentage of nulls at or above which a column is removed (compared with >= in the filtering cell)

            3.1.3 - Defining the key for this operation

In [None]:
# Label of this operation in the removal log, starting with an empty list of removed columns.
key2 = f'{column_threshold}% Column Threshold Removal'

removed_columns[key2] = list()

            3.1.4 - Performing Filtering

In [None]:
# These two columns are kept even though they are known to exceed the threshold.
preserved_columns = ('Occupation Code', 'Occupation Group')

for c in tqdm_notebook(df_original.columns, desc='Process Progress'):

    if c in preserved_columns:
        continue

    for var_type in sequence:

        frame = checkpoint_1[var_type]

        if c not in frame.columns:
            continue  # this column lives in another variable-type frame (or was dropped already)

        if columns_nullval_ratio[var_type][c] >= column_threshold:

            frame.drop(columns=c, inplace=True)

            removed_columns[key2].append(c)  # keep the removal log in sync

print('Removed Columns after filtering process: ', removed_columns[key2])

In [None]:
# Recompute the per-column null percentages on the filtered frames.
for var_type in sequence:

    columns_nullval_ratio[var_type] = (checkpoint_1[var_type].isnull().sum() * 100 / len(checkpoint_1[var_type])).sort_values(ascending=False)

# The original second loop contained only a commented-out print, which left the `for`
# without a body and made the whole cell a syntax error. To inspect the refreshed
# ratios, print columns_nullval_ratio[var_type] for each entry of `sequence`
# (the output is long).

    3.2 - Removing rows (Household Keys) beyond given threshold

        3.2.1 - Global Mean of Null Values for all Household Keys available

In [None]:
rows_nullval_ratio = {}  # label -> number of null cells per row (Household Key), highest first

for var_type in sequence:

    nulls_per_row = checkpoint_1[var_type].isnull().sum(axis=1)

    rows_nullval_ratio[var_type] = nulls_per_row.sort_values(ascending=False)

In [None]:
# Total number of null cells in each variable-type frame.
sum_num_null_rows = [round(rows_nullval_ratio[var_type].sum()) for var_type in sequence]

# Average number of null cells per Household Key across all frames.
household_count = len(rows_nullval_ratio['Numerical'].index.tolist())
total_mean = round(sum(sum_num_null_rows) / household_count)

print('\n','Mean Number of Null values on Rows:', total_mean)

        3.2.2 - Creating Key and Dictionary for this operation

In [None]:
# Label of this operation in the row-removal log; the log is re-created with that single, empty entry.
row_key1 = f'{total_mean} Number of Null per Row Threshold Removal'

removed_rows = {row_key1: []}

        3.2.3 - Global Mean of each Household Key available

In [None]:
house_mean = {}  # Household Key -> null count accumulated over every variable-type frame

for house_id in tqdm_notebook(checkpoint_1[sequence[0]].index.tolist(), desc='Process Progress'):

    nulls_by_frame = [rows_nullval_ratio[var_type][house_id] for var_type in sequence]

    # .mean() of the single accumulated value leaves it numerically unchanged.
    house_mean[house_id] = sum(nulls_by_frame).mean()

        3.2.4 - Perform Filtering

In [None]:
# Households whose accumulated null count reaches the global mean are removed.
houses_to_drop = [
    house_id
    for house_id in tqdm_notebook(checkpoint_1[sequence[0]].index.tolist(), desc='Process Progress')
    if house_mean[house_id] >= total_mean
]

# Drop them from every variable-type frame in one call per frame; errors='ignore'
# skips keys a frame no longer contains, so re-running the cell does not raise.
for var_type in sequence:

    checkpoint_1[var_type].drop(index=houses_to_drop, errors='ignore', inplace=True)

# Log every removed Household Key exactly once (the nested loops used before logged
# it once per frame, i.e. five times).
removed_rows[row_key1].extend(houses_to_drop)

print('Removed Rows after filtering process: ', removed_rows[row_key1])

In [None]:
# Recompute the per-row null counts now that rows have been removed.
for idx, var_type in enumerate(sequence):

    remaining_nulls = checkpoint_1[var_type].isnull().sum(axis=1)

    rows_nullval_ratio[var_type] = remaining_nulls.sort_values(ascending=False)

In [None]:
# Rows left in each variable-type frame after the row filter.
for var_type in sequence:

    print(var_type + ':', len(checkpoint_1[var_type].index)) # total number of rows in each type of dataframe

## 4 - Subjective Evaluation

    4.1 - Rename FSV, INS, and TRV columns that are related to products purchase flags

In [None]:
# Columns of the binary-categorical frame whose name contains one of the product mnemonics.
binary_categorical_names = list(checkpoint_1['Binary Categoricals'].columns)

matching_FSV, matching_INS, matching_TRV = (
    [name for name in binary_categorical_names if mnemonic in name]
    for mnemonic in ("FSV", "INS", "TRV")
)

final_matching = [*matching_FSV, *matching_INS, *matching_TRV]  # FSV first, then INS, then TRV
print(final_matching)

In [None]:
# Rename the product flags to 'Purchased Product 1', 'Purchased Product 2', ... in list order.
for counter, c in enumerate(final_matching, start=1):

    checkpoint_1['Binary Categoricals'].rename(columns={c: f'Purchased Product {counter}'}, inplace=True)

In [None]:
checkpoint_1['Binary Categoricals'].head(n=2)  # check the renamed product columns

    4.2 -  Removing columns that are not allowed or not ethical to work


        4.2.1 - Creating Dictionary Key

In [None]:
key3 = 'Unethical Columns' # label of this operation in the removal log

removed_columns[key3] = ['Race', 'Language', 'Gender'] # columns to drop on ethical grounds; Gender also cannot describe a whole household

        4.2.2 - Performing Filtering

In [None]:
for c in tqdm_notebook(removed_columns[key3], desc='Process Progress'):

    # Frames that currently hold this column (membership test on a DataFrame checks its columns).
    holders = [var_type for var_type in sequence if c in checkpoint_1[var_type]]

    for var_type in holders:
        checkpoint_1[var_type].drop(columns=c, inplace=True)

print('Removed Columns after filtering process: ', removed_columns[key3])

    4.3 - Removing Columns that represent the same information in a different way (information redundancy)

        4.3.1 - Creating Dictionary Key

In [None]:
key4 = 'Redundancy Information Columns' # label of this operation in the removal log

# Columns judged redundant with information kept elsewhere. The filtering cell only
# drops the ones that are still present in a frame, so names removed by earlier steps are harmless here.
removed_columns[key4] = ['Individual Key', 'State - Grouped', 'ZIP5', 'ZIP9', 'Children', 'Birth Date MMDDYYYY', 'Cancel Date', 'County', 'Do Not Direct Mail Solicit', 'Right_Individual Key', 'Member Key', 'Member Number Associate ID', 'Membership ID', 'Reinstate Date', 'ZIP', 'Mosaic Household', 'kcl_B_IND_MosaicsGrouping', 'Occupation Code', 'Breakdown State', 'Call Killed', 'Clearing Code Last Description', 'Dispatch Code1 Description','DTL Prob1 Code Description', 'Is Duplicate', 'Member Match Flag', 'Member Number and Associate ID', 'SC Date', 'Rec ID', 'SC STS RSN Code Description', 'SC Vehicle Model Name', 'SVC Facility Name', 'SVC Facility Type', 'Tow Destination Latitude', 'Tow Destination Longitude', 'Member Map Location'] # column names

        4.3.2 - Performing Filtering

In [None]:
for c in tqdm_notebook(removed_columns[key4], desc='Process Progress'):
    for var_type in sequence:
        frame = checkpoint_1[var_type]
        if c not in frame:
            continue  # column not held by this frame
        frame.drop(columns=c, inplace=True)

print('Removed Columns after filtering process: ', removed_columns[key4])

In [None]:
# Columns left in each variable-type frame after the redundancy filter.
for var_type in sequence:

    print(var_type + ':', len(checkpoint_1[var_type].columns)) # number of remaining columns for each dataframe type after the filtering process 

    4.4 - Checking columns with a low percentage of variational information (counting null values)

        4.4.1 - Creating Dictionary Key

In [None]:
key5 = 'Low Variational Information Columns' # label of this operation in the removal log

removed_columns[key5] = [] # filled by the filtering cell with the columns it drops

        4.4.2 - Performing Filtering

In [None]:
for var_type in tqdm_notebook(sequence, desc='Process Progress'): # for every dataframe type

    frame = checkpoint_1[var_type]

    # Distinct-value count of every column, computed once per frame instead of once per
    # column; dropping one column does not change the count of the others.
    distinct_counts = frame.nunique()

    # A column with exactly one distinct value carries no information.
    constant_columns = [c for c in frame.columns if distinct_counts[c] == 1]

    frame.drop(columns=constant_columns, inplace=True)

    removed_columns[key5].extend(constant_columns) # keep the removal log in sync

print('Removed Columns after filtering process: ', removed_columns[key5])

In [None]:
# Columns left in each variable-type frame after removing the constant columns.
for var_type in sequence:

    print(var_type + ':', len(checkpoint_1[var_type].columns)) # number of remaining columns for each dataframe type after the filtering process 

    4.5 - Removing Columns with Unnecessary Information (extremely subjective)

        4.5.1 - Creating Dictionary Key

In [None]:
key6 = 'Unnecessary Columns' # label of this operation in the removal log

# Columns judged unnecessary (a subjective choice). The filtering cell only drops the
# ones that are still present in a frame.
removed_columns[key6] = ['Address Change Date', 'Bad Address Flag', 'Billing Code Description', 'Join Club Date', 'Member Phone Type', 'Mosaic Global Household', 'New Mover Flag', 'Call Canceled', 'Is NSR', 'Plus Indicator Description', 'Was Duplicated', 'Was Towed To AAR Referral', 'Branch Name', 'Member Map Location', 'Breakdown Map Location'] # column names

        4.5.2 - Performing Filtering

In [None]:
for c in tqdm_notebook(removed_columns[key6], desc='Process Progress'):

    # Frames that currently hold this column (membership test on a DataFrame checks its columns).
    holders = [var_type for var_type in sequence if c in checkpoint_1[var_type]]

    for var_type in holders:
        checkpoint_1[var_type].drop(columns=c, inplace=True)

print('Removed Columns after filtering process: ', removed_columns[key6])

In [None]:
# Columns left in each variable-type frame after removing the unnecessary columns.
for var_type in sequence:

    print(var_type + ':', len(checkpoint_1[var_type].columns)) # number of remaining columns for each dataframe type after the filtering process 

## 4 - Summarize Information and Fill Nulls in the given columns 

        Saving checkpoint 

In [None]:
# Second checkpoint: an independent copy of every frame of the first checkpoint.
checkpoint_2 = {}

for var_type in sequence:
    checkpoint_2[var_type] = checkpoint_1[var_type].copy()

        4.1 - Dealing first with the Numericals Dataframe

In [None]:
checkpoint_2[sequence[0]].head(n=3) # preview of the numerical frame (sequence[0] is its label) to recall which variables it holds

                4.1.1 - Lets take a look at the Length Of Residence variable

In [None]:
checkpoint_2[sequence[0]]['Length Of Residence'].unique() # distinct values of Length Of Residence (this lists the values, it does not count the nulls)

                    4.1.1.1 - Rounding the values and converting them to integer type to prevent any error

In [None]:
# Round the per-household mean, mark missing values with 0 and store the column as integers.
# Done on the whole column at once: rounding the cells one by one with the built-in
# round() fails on a NaN before the NaN check can replace it.
residence = checkpoint_2[sequence[0]]['Length Of Residence']

checkpoint_2[sequence[0]]['Length Of Residence'] = residence.round().fillna(0).astype(int)  # 0 is the "missing" placeholder from here on

In [None]:
Counter(checkpoint_2[sequence[0]]['Length Of Residence'].tolist());  # frequency of every value; the trailing ';' suppresses the output

    As we can see, we still have a considerable amount of null values in the Length Of Residence variable. If we replaced these values by the mean of the variable we would be disregarding the geographical relation. Therefore, we will use the mean related to the City variable to predict the null values

In [None]:
checkpoint_2['Strings']['City'].isnull().sum() # number of Household Keys with null City values

In [None]:
# Temporarily attach the City column to the numerical frame. The Series itself is
# assigned (not its .values) so that pandas matches rows on the Household Key index:
# the two frames were built in different row orders, and a positional copy would
# give households the city of another household.
checkpoint_2[sequence[0]]['City'] = checkpoint_2['Strings']['City']

                4.1.1.2 - Luckily, we can rely on the City variable, stored in the Strings dataframe. Thus, we can proceed with the prediction of the null values

In [None]:
# Replace the 0 placeholders of Length Of Residence by the mean observed in the same City.
numerical_frame = checkpoint_2[sequence[0]]

residence = numerical_frame['Length Of Residence'].astype(float)

# Hide the placeholders so they neither count as observations nor drag the city means down.
observed = residence.where(residence != 0)

city_mean = observed.groupby(numerical_frame['City']).mean()  # City -> mean of the observed values

estimate = numerical_frame['City'].map(city_mean)  # NaN when the City is null or has no observed value

# Whole-column assignment instead of the chained `frame[col][idx] = value`, which is not
# guaranteed to write into the frame. Households without a usable city mean keep the 0 placeholder.
numerical_frame['Length Of Residence'] = observed.fillna(estimate).fillna(0)

                4.1.1.3 - Droping City variable from Numerical dataframe type

In [None]:
checkpoint_2[sequence[0]].drop(columns='City', inplace=True)  # the helper column is no longer needed in the numerical frame

        4.1.2 - Rename ERS ENT columns that are related to number of roadside calls and creating a column that presents the sum of it

            4.1.2.1 Finding given columns

In [None]:
# Columns of the numerical frame whose name contains the ERS ENT mnemonic.
numerical_column_names = list(checkpoint_2[sequence[0]].columns)

matching_ERS_ENT = [name for name in numerical_column_names if "ERS ENT" in name]

print(matching_ERS_ENT)

            4.1.2.2 Renaming Columns

In [None]:
# Rename the ERS ENT columns in list order. The spelling 'calss' is kept on purpose:
# the cell that sums the calls looks the columns up by exactly these names.
for counter, c in enumerate(matching_ERS_ENT, start=1):

    checkpoint_2[sequence[0]].rename(columns={c: f'Number of Roadside calss for Year {counter}'}, inplace=True)

In [None]:
checkpoint_2[sequence[0]].head(n=2)  # check the renamed roadside-call columns

            4.1.2.3 - Calculating Total Amount of Calls

In [None]:
yearly_call_columns = ['Number of Roadside calss for Year 1', 'Number of Roadside calss for Year 2', 'Number of Roadside calss for Year 3']

numerical_frame = checkpoint_2[sequence[0]]

# Clean every yearly column as a whole: missing -> 0 calls, round, store as integers.
# Casting the whole column to int inside a per-row loop fails as long as later rows
# still hold NaN, and repeats the cast once per row.
for c in yearly_call_columns:

    numerical_frame[c] = numerical_frame[c].fillna(0).round().astype(int)

# Total number of calls per Household Key, kept as a plain list in row order
# because the next cell appends it to the frame as a new column.
new_column = numerical_frame[yearly_call_columns].sum(axis=1).tolist()

In [None]:
column_name = 'Total Number of Roadside Calls' # naming the new column

checkpoint_2[sequence[0]][column_name] = new_column # a list is assigned by position, so it must be in the row order of the numerical frame

checkpoint_2[sequence[0]][column_name] = checkpoint_2[sequence[0]][column_name].astype(int) # converting to integer type

In [None]:
checkpoint_2[sequence[0]].head(n=2)  # check the new total-calls column

## (Exploratory Data Analysis - EDA)

## Purchase Probability Prediction

## Clustering By Household Types

    4.1 - Number of Purchased Products

        4.1.1 - Creating Dictionary Key

In [None]:
key7 = 'Merging total number of purchased products by each row' # label of this operation in the removal log

# The individual product flags; they are summed into one total and then dropped by the cells below.
removed_columns[key7] = ['Purchased Product 1',	'Purchased Product 2',	'Purchased Product 3',	'Purchased Product 4',	'Purchased Product 5',	'Purchased Product 6',	'Purchased Product 7',	'Purchased Product 8'] # column names

        4.1.2 - Performing Summary

            4.1.2.1 - Converting String Values in Numerical Values and filling Null values as 0 purchased products for empty different Household Keys

In [None]:
for c in tqdm_notebook(removed_columns[key7], desc='Process Progress'): # for each Purchased Product column
    for var_type in sequence: # for each dataframe type

        frame = checkpoint_2[var_type]

        if c not in frame.columns: # the column does not belong to this dataframe
            continue

        # 'N', 'Null' and missing values mean "not purchased". Missing values need
        # isnull(): a comparison with np.nan is always False, which encoded them as 1.
        not_purchased = frame[c].isin(['N', 'Null']) | frame[c].isnull()

        frame[c] = (~not_purchased).astype(int) # 0 = not purchased, 1 = purchased

            4.1.2.2 - Converting Columns to Integer Type

In [None]:
for c in tqdm_notebook(removed_columns[key7], desc='Process Progress'):
    for var_type in sequence:
        frame = checkpoint_2[var_type]
        if c not in frame.columns:
            continue  # column not held by this frame
        frame[c] = frame[c].astype(int)  # store the 0/1 flags as integers

        4.1.2.3 - Summarize all Purchased Columns

In [None]:
new_column = [] # totals, in the row order of the frame that holds the product columns

column_name = 'Total Number of Purchased Products' # naming the new column

for var_type in tqdm_notebook(sequence, desc='Process Progress'): # for each dataframe type

    frame = checkpoint_2[var_type]

    # Product columns present in THIS frame. The membership test is made per column here;
    # it used to rely on a loop variable left over from the previous cell.
    purchase_columns = [c for c in removed_columns[key7] if c in frame.columns]

    if not purchase_columns:
        continue

    # Number of flags equal to 1 in each row.
    new_column.extend((frame[purchase_columns] == 1).sum(axis=1).tolist())

        4.1.2.4 - Drop Separate Purchased Data and Concatenate Total Purchased Data

In [None]:
totals_source = None # label of the frame the totals in new_column were computed from

for c in tqdm_notebook(removed_columns[key7], desc='Process Progress'):
    for var_type in sequence: # for each dataframe type
        if c in checkpoint_2[var_type].columns: # if the column belongs to the dataframe

            checkpoint_2[var_type].drop(columns=c, inplace=True)

            flag = var_type

            totals_source = var_type

if totals_source is None:

    # No product column was found in this run: nothing to align on, keep the positional assignment.
    checkpoint_2[sequence[0]][column_name] = new_column

else:

    # new_column follows the row order of the frame that held the product columns, which
    # is not necessarily the row order of the numerical frame. Labelling the totals with
    # that frame's Household Key index lets pandas place each total on the right household.
    checkpoint_2[sequence[0]][column_name] = pd.Series(new_column, index=checkpoint_2[totals_source].index)



print('Removed Columns after filtering process: ', removed_columns[key7])

In [None]:
# Columns left in each variable-type frame of the second checkpoint.
for var_type in sequence:

    print(var_type + ':', len(checkpoint_2[var_type].columns))

    4.2 - Correcting Number of Children column

        4.2.1 - Converting string values into numerical values and filling null values with 0 children for Household Keys without an entry

In [None]:
# Find the frame that holds 'Number of Children' and remember its label in `flag`.
# NOTE(review): if no frame holds the column, `flag` and `mask_n_children` keep whatever value earlier cells left in them.
for var_type in sequence:

    if 'Number of Children' in checkpoint_2[var_type].columns:

        mask_n_children = checkpoint_2[var_type]['Number of Children'].isnull() # creating mask of null values in Number of Children column

        flag = var_type

In [None]:
# First word of the textual entry -> number of children.
children_by_word = {'No': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5, 'Six': 6}

# Work on the frame recorded in `flag` (the one that holds the column). The loop variable
# `var_type` of the previous cell always ends on the last label of `sequence`, and the
# Household Key labels of the index are not valid positions into `.values`.
children_frame = checkpoint_2[flag]

raw_children = children_frame['Number of Children']

is_missing = raw_children.isnull()

first_word = raw_children[~is_missing].astype(str).str.split().str[0]

recognised = first_word.map(children_by_word).dropna().astype(int)

cleaned = raw_children.astype(object)  # object copy, so numbers can replace the text entries
cleaned[is_missing] = 0                # a missing entry is read as "no children"
cleaned.loc[recognised.index] = recognised  # entries with an unknown first word are left untouched

children_frame['Number of Children'] = cleaned

        4.2.2 - Converting Column to Integer Type


In [None]:
# NOTE(review): `filtered_data` is not defined in the visible part of the notebook — confirm which frame this cell is meant to use
filtered_data['Number of Children'] = filtered_data['Number of Children'].astype(int)

In [None]:
filtered_data.info()  # summary of filtered_data after the conversion

    4.3 - Length of Residence

    The main approach here is to take the mean value of each Household Key and apply it to every row of that Household Key. After that, if any null value still exists in the column, we will use the mean or the mode of the City column to fill it.

        4.3.1 - Replace the placeholder value that appears instead of Null in the Length Of Residence column. The placeholder is approximately -9.223372e+18, which matches the minimum 64-bit integer value stored as a float

In [None]:
filtered_data['Length Of Residence'].unique()  # the value -9.223372e+18 shows up as the placeholder for Null in this column; it has to be replaced by np.nan

In [None]:
# A length of residence cannot be negative, so every negative entry (this includes the
# -9.223372e+18 placeholder) is turned into np.nan. Selecting by value instead of by
# `unique()[0]` keeps valid data intact when the first row is not a placeholder, and makes
# the cell safe to re-run.
residence = filtered_data['Length Of Residence']
filtered_data['Length Of Residence'] = residence.mask(residence < 0, np.nan)

In [None]:
filtered_data['Length Of Residence'].unique() # the Null values of the Length Of Residence column should now show up as nan

        4.3.2 - Checking if there are Household Keys with at least one value for the Length Of Residence column and apply it to the rest of them

In [None]:
filtered_data.groupby('Household Key')['Length Of Residence'].unique(); # Lists the distinct Length Of Residence values of every Household Key; the same household can carry several values because the sheet has one row per individual. NOTE: remove the ';' at the end of the command to print the output - be advised, the output is extensive!

In [None]:
# Fill the Null values of Length Of Residence with the mean of the known values of the same
# Household Key.
#
# `transform('mean')` returns, for every row, the mean of its household with Nulls ignored.
# Households whose values are all Null get a Null mean, so their rows stay untouched and are
# handled by the later (City / State) steps. The group means are computed once instead of
# re-running the groupby for every household.
household_mean = filtered_data.groupby('Household Key')['Length Of Residence'].transform('mean')

filtered_data['Length Of Residence'] = filtered_data['Length Of Residence'].fillna(household_mean)

        4.3.3 - Now we focus on predicting the rest of the Null values, but this time through groups of cities

In [None]:
filtered_data.groupby('City')['Length Of Residence'].unique(); # Lists the distinct Length Of Residence values of every City; this larger grouping is used to fill more Null values. NOTE: remove the ';' at the end of the command to print the output - be advised, the output is extensive!

In [None]:
# Fill the remaining Null values of Length Of Residence with the mean of the known values
# of the same City.
#
# `transform('mean')` returns, for every row, the mean of its city with Nulls ignored.
# Cities whose values are all Null get a Null mean, so their rows stay untouched and are
# left for the state-level step. The group means are computed once instead of re-running
# the groupby for every city.
city_mean = filtered_data.groupby('City')['Length Of Residence'].transform('mean')

filtered_data['Length Of Residence'] = filtered_data['Length Of Residence'].fillna(city_mean)

            4.3.4 - Now we focus on predicting the rest of the Null values on the biggest and final scale as possible, the states.

            Note: You may have noticed that we have excluded the State - Grouped column but, since we haven't eliminated any row yet, we can still use the row indexes of that column taken from the original dataframe

In [None]:
# Row labels of the original dataframe, split by the value of 'State - Grouped'.
state_of_row = df_original['State - Grouped']

ct_idxs, ri_idxs = (df_original.index[state_of_row == state_code].tolist() for state_code in ('CT', 'RI')) # CT rows first, RI rows second

In [None]:
# Mean Length Of Residence of the rows of each state (Nulls are ignored by mean()).
length_of_residence = filtered_data['Length Of Residence']

ct_mean, ri_mean = length_of_residence[ct_idxs].mean(), length_of_residence[ri_idxs].mean()

print(
    'CT mean value considering all Null values so far filled: ', ct_mean, '\n',
    'RI mean value considering all Null values so far filled: ', ri_mean,
)

In [None]:
# Fill whatever is still Null in Length Of Residence with the state mean:
# rows listed in ct_idxs receive ct_mean, every other row receives ri_mean.
#
# The Null mask and the CT membership are computed once for the whole column instead of
# once per row.
still_null = filtered_data['Length Of Residence'].isnull()
in_ct = filtered_data.index.isin(ct_idxs)

filtered_data.loc[still_null & in_ct, 'Length Of Residence'] = ct_mean # Null rows of the CT state
filtered_data.loc[still_null & ~in_ct, 'Length Of Residence'] = ri_mean # all remaining Null rows

In [None]:
filtered_data['Length Of Residence'].isnull().values.any() # True if the Length Of Residence column still has at least one Null value

In [None]:
# Print the column dtypes and non-null counts after filling Length Of Residence.
filtered_data.info()

    4.4 - Summarize E-mail and Mail Information

        4.4.1 - Creating List of Columns to be summarized

In [None]:
# Mail / e-mail related columns handled in this section; they are reported on below and dropped in step 4.4.6.
mail_var = ['Mail Responder', 'Email Available', 'Email Status', 'Opt-Out - Publication']

        4.4.2 - Checking the Existence of Null Values in these columns

In [None]:
# Report, for each mail-related column, whether it contains Null values and its stored null ratio.
for col in mail_var:

    has_nulls = filtered_data[col].isnull().values.any() # True when at least one value is Null
    null_share = nullval_ratio[col] # ratio taken from the nullval_ratio lookup

    print('Flag if column ', col, ' has null values. ', has_nulls, ' and its percentage ', null_share)

        As we can see, all the binary categorical columns have Null values. However, the Email Available and Opt-Out - Publication columns are almost completely filled and, according to the spreadsheet dictionary, those two are direct AAA information rather than third-party information. So, we will treat them as the most reliable data and also use the Member Status to "predict" a summary of all this information

In [None]:
# Distinct Email Status values per Household Key (remove the trailing ';' to display the output).
filtered_data.groupby('Household Key')['Email Status'].unique();

        4.4.3 - Give the name of the new column to be added on the dataframe

In [None]:
new_mail_key = 'Email Correspondent' # name of the summary column

filtered_data[new_mail_key] = "" # add the column with an empty-string placeholder in every row

        4.4.4 - To summarize all this information we need to ensure that, within the same Household Key, all the variables agree with each other. For that we will use the Member Status and Email Available variables as pivot information

In [None]:
# Make Member Status, Email Available and Opt-Out - Publication agree inside every household:
#   * Member Status         -> 'ACTIVE' if any member is ACTIVE, otherwise 'PENDING' if any member is PENDING
#   * Email Available       -> 10 if any member has the value 10
#   * Opt-Out - Publication -> 'Opt-In' if any member has 'Opt-In'
# Households without any of these values are left unchanged.
#
# Each rule is evaluated once for the whole frame instead of re-running
# groupby(...).unique() several times for every household.
household_key = filtered_data['Household Key']


def _rows_of_households_with(column, value):
    """Return a row mask selecting every household that has at least one row where column == value."""
    matching_keys = household_key[filtered_data[column] == value].unique()
    return household_key.isin(matching_keys)


# all masks are computed from the untouched data before anything is overwritten
has_active = _rows_of_households_with('Member Status', 'ACTIVE')
has_pending = _rows_of_households_with('Member Status', 'PENDING')
has_email = _rows_of_households_with('Email Available', 10)
has_opt_in = _rows_of_households_with('Opt-Out - Publication', 'Opt-In')

filtered_data.loc[has_active, 'Member Status'] = 'ACTIVE' # at least one active member: whole household is active
filtered_data.loc[has_pending & ~has_active, 'Member Status'] = 'PENDING' # no active member but at least one pending
filtered_data.loc[has_email, 'Email Available'] = 10 # at least one member with e-mail available
filtered_data.loc[has_opt_in, 'Opt-Out - Publication'] = 'Opt-In' # at least one member opted in


        4.4.5 - Filling the new column

In [None]:
# Fill 'Email Correspondent': 1 for every row of a household that has at least one ACTIVE
# member, at least one member with Email Available == 10 and at least one 'Opt-In' member;
# 0 for every other row.
#
# The three conditions are evaluated once for the whole frame instead of scanning the frame
# for every household, and every row receives an integer (no "" placeholder can survive).
household_key = filtered_data['Household Key']


def _household_has(column, value):
    """Return a row mask selecting every household that has at least one row where column == value."""
    matching_keys = household_key[filtered_data[column] == value].unique()
    return household_key.isin(matching_keys)


is_correspondent = (
    _household_has('Member Status', 'ACTIVE')
    & _household_has('Email Available', 10)
    & _household_has('Opt-Out - Publication', 'Opt-In')
)

filtered_data['Email Correspondent'] = is_correspondent.astype(int) # True/False -> 1/0

        4.4.6 -  Dropping Email information Columns

In [None]:
# Record, under this operation's key, which columns are removed by the e-mail summary step.
key8 = 'Merging Availability of Email Contact'

removed_columns[key8] = mail_var # stores the same list object as mail_var (not a copy)

In [None]:
# Drop all summarized mail / e-mail columns in a single call. If one of them is missing,
# drop() raises before removing anything, so the frame is never left half-processed.
filtered_data.drop(columns=mail_var, inplace=True)

print('Removed Columns after filtering process: ', removed_columns[key8])

In [None]:
# Print the column dtypes and non-null counts after dropping the mail columns.
filtered_data.info()

In [None]:
# Save the processed dataframe as a CSV checkpoint (the row index is not written).
# NOTE(review): the path is relative and uses Windows separators, unlike the absolute input paths - confirm the working directory.
path_out = '..\\resources\\checkpoint.csv'
filtered_data.to_csv(path_or_buf=path_out, index=False)

        We still need to keep an eye on the variables that only have 2 unique values, but it will be more a subjective evaluation 

* Check correlation Matrix

# Correlation matrix of remove_null_data, drawn as a heatmap
corrmat = remove_null_data.corr()
f, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corrmat, square=True); # NOTE(review): the full matrix is plotted; no 70% correlation threshold is applied by this call

* Observations

  1- There is an "island" between Rec ID and Tow Destination Longitude; this will be evaluated later, but it looks promising
  2- Basic Cost has a high correlation with the variables within this "island" 
  3- Individual Key and Right_Individual Key are practically the same variable
  4- ZIP5 and ZIP9 are practically the same variable
  5- Months from Join to Cancel has no correlation whatsoever with the Premier Cost variable
  

    By the correlation matrix of the entire dataset (minus the columns with more than 70% of null values), we can see that there are still some columns with no correlation, or total correlation with all the variables, and some trouble columns. Therefore, the best thing to do is to drop them.

# Columns flagged as troublesome after inspecting the correlation matrix
# (presumably dropped in a later step - confirm).
trouble_columns= ['Is Duplicate', 'Member Match Flag']

# Identifier / flag columns that will not be used (presumably dropped in a later step - confirm).
columns_will_not_use = ['Individual Key', 'Member Flag', 'Right_Individual Key']