## Fix the household composition inconsistencies and check with marital status

#### Load the data

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
# Switch to the user's default locale (the empty string asks for it) so that
# the locale-aware `:n` number format used when reporting row counts applies
# the local digit-grouping conventions.
import locale
locale.setlocale(locale.LC_ALL, '')

In [None]:
# Load the three dunnhumby tables (comma-separated, which is the read_csv
# default) in one pass and bind each one to its own name.
hh_demographic, transaction_data, product = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/dunnhumby_complete_csv/hh_demographic.csv',
        '../data/dunnhumby_complete_csv/transaction_data.csv',
        '../data/dunnhumby_complete_csv/product.csv',
    )
)

We first change the marital status codes to more intuitive ones, setting the married entries to M and the single ones to S.

In [None]:
# Recode the marital status: 'A' becomes 'M' (married) and 'B' becomes 'S'
# (single); every other value is left untouched.
hh_demographic['MARITAL_STATUS_CODE'] = hh_demographic['MARITAL_STATUS_CODE'].replace({'A': 'M', 'B': 'S'})

#### Explore again the household composition information

We already know from the data exploration that 2 Adults No Kids and 2 Adults Kids make sense. So we do not take into account these compositions.

In [None]:
# The two-adult compositions are skipped here; for every other composition,
# list the distinct household sizes and kid categories that occur with it.
compositions_to_skip = ("2 Adults No Kids", "2 Adults Kids")
for hh_composition in hh_demographic['HH_COMP_DESC'].unique():
    if hh_composition in compositions_to_skip:
        continue
    composition_rows = hh_demographic.loc[hh_demographic['HH_COMP_DESC'] == hh_composition]
    distinct_sizes = composition_rows['HOUSEHOLD_SIZE_DESC'].unique()
    distinct_kid_categories = composition_rows['KID_CATEGORY_DESC'].unique()
    print("Looking at the household composition:", hh_composition)
    print("Household size unique information:", distinct_sizes)
    print("Kids number unique information:", distinct_kid_categories)
    print()

#### Analysing the composition 1 Adult Kids

In [None]:
# Restrict to the "1 Adult Kids" households and, for each household size
# found among them, report which kid categories appear.
hh_composition = "1 Adult Kids"
hh_demographic_1adultkids = hh_demographic.loc[hh_demographic['HH_COMP_DESC'].eq(hh_composition)]
sizes_1adultkids = hh_demographic_1adultkids['HOUSEHOLD_SIZE_DESC']
for household_size in sizes_1adultkids.unique():
    kid_categories = hh_demographic_1adultkids.loc[
        sizes_1adultkids == household_size, 'KID_CATEGORY_DESC'
    ].unique()
    print(f"For house hold size {household_size}, with 1 adult, there are", kid_categories, "kid categories")

We have problems where the household size is 3 or 4 units. Let's explore these in a better way.

In [None]:
# "1 Adult Kids" households that report three members but only one kid.
size_is_3 = hh_demographic_1adultkids['HOUSEHOLD_SIZE_DESC'].eq('3')
kids_is_1 = hh_demographic_1adultkids['KID_CATEGORY_DESC'].eq('1')
hh_demographic_1adultkids[size_is_3 & kids_is_1]

We can see here that there are entries where the composition is 1 Adult Kids, the household size is 3 and the number of kids is 1. This means one of the following: there is one more person living in the house, the household size is wrong, the composition is wrong.<br>
However, we can notice that all of these entries have a marital status M, which stands for married. We can assume then that the composition is wrong and there is an actual couple living in the house, with 1 kid.

In [None]:
# "1 Adult Kids" households that report four members but only two kids.
size_is_4 = hh_demographic_1adultkids['HOUSEHOLD_SIZE_DESC'].eq('4')
kids_is_2 = hh_demographic_1adultkids['KID_CATEGORY_DESC'].eq('2')
hh_demographic_1adultkids[size_is_4 & kids_is_2]

The same holds for a household size of 4.

#### Analysing composition Single Male/Female

It does not make sense for the household size to be bigger than 1.

In [None]:
# "Single Female" households that nevertheless report two members:
# which marital statuses do they carry?
hh_composition = "Single Female"
is_single_female_size2 = (
    hh_demographic['HH_COMP_DESC'].eq(hh_composition)
    & hh_demographic['HOUSEHOLD_SIZE_DESC'].eq('2')
)
hh_demographic_singlefemale_size2 = hh_demographic.loc[is_single_female_size2]

print(hh_demographic_singlefemale_size2['MARITAL_STATUS_CODE'].unique())

All of them look married, again.

In [None]:
# "Single Male" households that nevertheless report two members:
# which marital statuses do they carry?
hh_composition = "Single Male"
is_single_male_size2 = (
    hh_demographic['HH_COMP_DESC'].eq(hh_composition)
    & hh_demographic['HOUSEHOLD_SIZE_DESC'].eq('2')
)
hh_demographic_singlemale_size2 = hh_demographic.loc[is_single_male_size2]

print(hh_demographic_singlemale_size2['MARITAL_STATUS_CODE'].unique())

As above.

We now want to check if the marital status, the number of kids and the household size are always coherent with each other. If they are, then we can assume that the household composition information is sometimes wrong. Hence, we can correct this parameter or just discard it, since it does not carry more information than the other three.

#### Are the marital status, the number of kids and the household size coherent with each other?

In [None]:
# For each marital status (in sorted order), count the households in every
# (composition, household size, kid category) combination that occurs.
breakdown_columns = ['HH_COMP_DESC', 'HOUSEHOLD_SIZE_DESC', 'KID_CATEGORY_DESC']
marital_codes = np.sort(hh_demographic['MARITAL_STATUS_CODE'].unique())
for marital_status in marital_codes:
    hh_demographic_current_marital = hh_demographic.loc[
        hh_demographic['MARITAL_STATUS_CODE'] == marital_status
    ]
    combination_counts = hh_demographic_current_marital.groupby(breakdown_columns).size()
    print("Marital status:", marital_status)
    print(combination_counts)
    print()

We can conclude that the marital status is always coherent with the household size and the number of kids. Combined with the findings above, we can say that in these cases the household composition is wrong and we will not consider that.<br>
We have some incongruities in the household size / number of kids when the marital status is Single, so we discard these entries.<br>
If the marital status is Unknown, we fall back on the household size / number of children information and we give the corresponding marital status, when it makes sense.

#### Cleaning up the dataset

In [None]:
# Work on an independent (deep) copy so the loaded table stays untouched.
hh_demographic_fxd = hh_demographic.copy(deep=True)

Dropping the entries marked as Single with inconsistencies in the household size / number of kids.

In [None]:
# (household size, kid category) pairs that imply two adults in the house and
# therefore contradict a Single marital status. The order is kept so the
# collected index list comes out in the same sequence.
single_contradictions = [
    ('2', 'None/Unknown'),
    ('3', '1'),
    ('4', '2'),
]

is_single = hh_demographic_fxd['MARITAL_STATUS_CODE'] == 'S'
dropindex = []
for size_desc, kids_desc in single_contradictions:
    contradicts = (
        is_single
        & (hh_demographic_fxd['HOUSEHOLD_SIZE_DESC'] == size_desc)
        & (hh_demographic_fxd['KID_CATEGORY_DESC'] == kids_desc)
    )
    dropindex.extend(hh_demographic_fxd.index[contradicts].tolist())
print(dropindex)
print(len(dropindex), "entries dropped.")

# Remove the collected rows by index label.
hh_demographic_fxd = hh_demographic_fxd.drop(index=dropindex)

Assigning the correct marital status to the entries marked as Unknown, when the household size and the number of children are coherent with each other.

In [None]:
# Marital status assigned to an Unknown ('U') entry, keyed by its
# (household size, kid category) pair: the pairs that leave room for two
# adults map to 'M', the pairs that leave room for exactly one map to 'S'.
status_by_size_and_kids = {
    ('3', '1'): 'M',
    ('4', '2'): 'M',
    ('2', 'None/Unknown'): 'M',
    ('2', '1'): 'S',
    ('3', '2'): 'S',
    ('4', '3+'): 'S',
    ('1', 'None/Unknown'): 'S',
}

# The pairs are mutually exclusive, so the set of Unknown rows can be
# computed once up front: no row is matched by more than one pair.
was_unknown = hh_demographic_fxd['MARITAL_STATUS_CODE'] == 'U'
for (size_desc, kids_desc), inferred_status in status_by_size_and_kids.items():
    matches_pair = (
        (hh_demographic_fxd['HOUSEHOLD_SIZE_DESC'] == size_desc)
        & (hh_demographic_fxd['KID_CATEGORY_DESC'] == kids_desc)
    )
    hh_demographic_fxd.loc[was_unknown & matches_pair, 'MARITAL_STATUS_CODE'] = inferred_status

# Whatever is still Unknown could not be resolved and is discarded.
still_unknown = hh_demographic_fxd['MARITAL_STATUS_CODE'] == 'U'
hh_demographic_fxd = hh_demographic_fxd[~still_unknown]

In [None]:
# Repeat the per-marital-status breakdown on the cleaned table to confirm
# which (composition, household size, kid category) combinations remain.
grouping_keys = ['HH_COMP_DESC', 'HOUSEHOLD_SIZE_DESC', 'KID_CATEGORY_DESC']
remaining_statuses = np.sort(hh_demographic_fxd['MARITAL_STATUS_CODE'].unique())
for marital_status in remaining_statuses:
    hh_demographic_current_marital = hh_demographic_fxd.loc[
        hh_demographic_fxd['MARITAL_STATUS_CODE'] == marital_status
    ]
    remaining_counts = hh_demographic_current_marital.groupby(grouping_keys).size()
    print("Marital status:", marital_status)
    print(remaining_counts)
    print()

In [None]:
# Report how many households survive the clean-up. len(df) is the row count
# directly; counting the non-null cells of every row first only to take the
# length of that result was needless work. `:n` formats the numbers according
# to the active locale.
print(f"The number of entries goes from {len(hh_demographic):n} to {len(hh_demographic_fxd):n}.")

42 entries were discarded because the marital status Single did not match the household size / number of children or because, for the Unknown marital status, the household size and the number of children did not carry enough information to conclude anything about the marital status.

In [None]:
# Final shaping, expressed as one chain:
#   1. drop the household composition column,
#   2. turn the 'None/Unknown' kid category into '0',
#   3. rename the kid category column to KIDS_DESC,
#   4. renumber the rows from 0 after the earlier removals.
hh_demographic_fxd = (
    hh_demographic_fxd
    .drop(columns=['HH_COMP_DESC'])
    .replace({'KID_CATEGORY_DESC': {'None/Unknown': '0'}})
    .rename(columns={'KID_CATEGORY_DESC': 'KIDS_DESC'})
    .reset_index(drop=True)
)

In [None]:
# Save the fixed dataframe as a tab-separated file without the index column.
# exist_ok=True creates the output folder only when it is missing, in a single
# call, instead of a separate exists-check followed by a create (which can
# race with another process creating the folder in between).
os.makedirs("saved_structures", exist_ok=True)
hh_demographic_fxd.to_csv("saved_structures/hh_demographic_fix_hhcomp.csv", sep='\t', index=False)

In [None]:
# Show the final cleaned demographics table as the cell output.
hh_demographic_fxd