In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import HTML, display




In [2]:
# Render matplotlib figures inside the notebook output area.
%matplotlib inline
# Switch on seaborn's default plot styling for every figure drawn below.
sns.set()

In [3]:
# Paths of the three CSV inputs that the next cell reads with pd.read_csv.
# NOTE(review): unlike the other two, the behaviour-features path has no
# 'data/' prefix -- confirm that file really lives in the working directory.
transactions, households, hh_behaviour = (
    'data/transaction_data.csv',
    'data/hh_demographic.csv',
    'hh_behaviour_features.csv',
)

In [4]:
# Load each input CSV into its own DataFrame (default parsing options).
df_trans = pd.read_csv(filepath_or_buffer=transactions)
df_hh = pd.read_csv(filepath_or_buffer=households)
df_behav = pd.read_csv(filepath_or_buffer=hh_behaviour)

In [64]:
# Preview the household demographics table. head() keeps the cell output to a
# handful of rows instead of rendering the entire frame.
df_hh.head(10)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key,new_income_range
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,35-49K
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,50-74K
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8,25-34K
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13,75-99K
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16,50-74K
5,65+,B,Under 15K,Homeowner,2 Adults No Kids,2,None/Unknown,17,0-14K
6,45-54,A,100-124K,Homeowner,2 Adults No Kids,2,None/Unknown,18,100-149K
7,35-44,B,15-24K,Unknown,Single Female,1,None/Unknown,19,15-24K
8,25-34,A,75-99K,Renter,2 Adults No Kids,2,None/Unknown,20,75-99K
9,45-54,A,75-99K,Homeowner,2 Adults No Kids,2,None/Unknown,22,75-99K


In [113]:
# Create the (initially empty) figure that the plotting cells below fill in
# through fig.add_subplot(3, 2, k) and that the final cell saves to PDF.
fig = plt.figure()

<matplotlib.figure.Figure at 0x10ff98b10>

In [6]:
# Households per age bracket: count household_key within each AGE_DESC group
# and expose the result under the column name 'age_count'.
age_dist = (
    df_hh[['AGE_DESC', 'household_key']]
    .groupby('AGE_DESC')
    .count()
    .rename(columns={'household_key': 'age_count'})
)

In [114]:
# Panel 1 of the 3x2 grid: household count per age group.
ax1 = fig.add_subplot(3,2,1)
# One bar per row of age_dist; deriving the positions from the data keeps the
# bars and tick labels in step if the set of age groups ever changes.
N = np.arange(len(age_dist))
# Centre each bar on its integer position and place the tick there too. A
# fixed N + 0.4 offset only lines up when bars are edge-aligned, which is not
# the default in matplotlib >= 2.0.
ax1.bar(N, age_dist.age_count, align='center')
ax1.set_ylabel('')
ax1.set_xlabel('Age Group', fontsize=8)
ax1.set_xticks(N)
ax1.tick_params(axis='both', which='major', labelsize=6)
ax1.set_xticklabels(age_dist.index)

[<matplotlib.text.Text at 0x110e0c6d0>,
 <matplotlib.text.Text at 0x110defb10>,
 <matplotlib.text.Text at 0x110e7cc50>,
 <matplotlib.text.Text at 0x110e8a410>,
 <matplotlib.text.Text at 0x110e8ab90>,
 <matplotlib.text.Text at 0x110e93350>]

In [76]:
# Show the per-age-group household counts that feed the first subplot.
age_dist

Unnamed: 0_level_0,age_count
AGE_DESC,Unnamed: 1_level_1
19-24,46
25-34,142
35-44,194
45-54,288
55-64,59
65+,72


In [20]:
# Translate the single-letter marital status codes into readable labels and
# store them in a new column; codes absent from the dict become NaN.
marital_labels = {'A': 'Married', 'B': 'Single', 'U': 'Unknown'}
df_hh['new_marriage_code'] = df_hh['MARITAL_STATUS_CODE'].map(marital_labels)

In [21]:
# Households per marital-status label, exposed as 'marital_status_count'.
marriage_dist = (
    df_hh[['new_marriage_code', 'household_key']]
    .groupby('new_marriage_code')
    .count()
    .rename(columns={'household_key': 'marital_status_count'})
)


In [115]:
# Panel 2 of the 3x2 grid: household count per marital status.
ax2 = fig.add_subplot(3,2,2)
# One bar per row of marriage_dist rather than a hard-coded count.
N = np.arange(len(marriage_dist))
# Centre each bar on its integer position and place the tick there too. A
# fixed N + 0.4 offset only lines up when bars are edge-aligned, which is not
# the default in matplotlib >= 2.0.
ax2.bar(N, marriage_dist.marital_status_count, align='center')
ax2.set_ylabel('')
ax2.set_xlabel('Relationship Status', fontsize=8)
ax2.set_xticks(N)
ax2.tick_params(axis='both', which='major', labelsize=6)
ax2.set_xticklabels(marriage_dist.index)

[<matplotlib.text.Text at 0x110ec9090>,
 <matplotlib.text.Text at 0x110eb5450>,
 <matplotlib.text.Text at 0x110def650>]

In [77]:
# Show the per-marital-status household counts that feed the second subplot.
marriage_dist

Unnamed: 0_level_0,marital_status_count
new_marriage_code,Unnamed: 1_level_1
Married,340
Single,117
Unknown,344


In [23]:
# Collapse the twelve raw income brackets into eight coarser ranges: the two
# 100K-149K brackets are merged and everything from 150K upward is pooled.
income_labels = {
    'Under 15K': '0-14K',
    '15-24K': '15-24K',
    '25-34K': '25-34K',
    '35-49K': '35-49K',
    '50-74K': '50-74K',
    '75-99K': '75-99K',
    '100-124K': '100-149K',
    '125-149K': '100-149K',
    '150-174K': '150K+',
    '175-199K': '150K+',
    '200-249K': '150K+',
    '250K+': '150K+',
}
df_hh['new_income_range'] = df_hh['INCOME_DESC'].map(income_labels)

In [24]:
# Households per income range. groupby sorts the labels as strings, so the
# rows are reindexed into ascending income order afterwards.
income_order = [
    '0-14K', '15-24K', '25-34K', '35-49K',
    '50-74K', '75-99K', '100-149K', '150K+',
]
income_dist = (
    df_hh[['new_income_range', 'household_key']]
    .groupby('new_income_range')
    .count()
    .rename(columns={'household_key': 'number_hh'})
    .reindex(income_order)
)


In [116]:
# Panel 3 of the 3x2 grid: household count per income range.
ax3 = fig.add_subplot(3,2,3)
# One bar per row of income_dist rather than a hard-coded count.
N = np.arange(len(income_dist))
# Centre each bar on its integer position and place the tick there too. A
# fixed N + 0.4 offset only lines up when bars are edge-aligned, which is not
# the default in matplotlib >= 2.0.
ax3.bar(N, income_dist.number_hh, align='center')
ax3.set_ylabel('')
ax3.set_xlabel('Income Range', fontsize=8)
ax3.set_xticks(N)
ax3.tick_params(axis='both', which='major', labelsize=6)
ax3.set_xticklabels(income_dist.index)

[<matplotlib.text.Text at 0x110f18850>,
 <matplotlib.text.Text at 0x110f42b50>,
 <matplotlib.text.Text at 0x110fb4550>,
 <matplotlib.text.Text at 0x110fb4cd0>,
 <matplotlib.text.Text at 0x110fc0490>,
 <matplotlib.text.Text at 0x110fc0c10>,
 <matplotlib.text.Text at 0x110fca3d0>,
 <matplotlib.text.Text at 0x110fcab50>]

In [78]:
# Show the per-income-range household counts that feed the third subplot.
income_dist

Unnamed: 0_level_0,number_hh
new_income_range,Unnamed: 1_level_1
0-14K,61
15-24K,74
25-34K,77
35-49K,172
50-74K,192
75-99K,96
100-149K,72
150K+,57


In [26]:
# Fold the "Probable ..." categories into their definite counterparts so only
# Renter / Homeowner / Unknown remain; unlisted values become NaN.
homeowner_labels = {
    'Probable Renter': 'Renter',
    'Renter': 'Renter',
    'Probable Owner': 'Homeowner',
    'Homeowner': 'Homeowner',
    'Unknown': 'Unknown',
}
df_hh['new_homeowner_code'] = df_hh['HOMEOWNER_DESC'].map(homeowner_labels)

In [27]:
# Households per simplified home-ownership label, exposed as 'homeowner_count'.
homeowner_dist = (
    df_hh[['new_homeowner_code', 'household_key']]
    .groupby('new_homeowner_code')
    .count()
    .rename(columns={'household_key': 'homeowner_count'})
)


In [117]:
# Panel 4 of the 3x2 grid: household count per home-ownership status.
ax4 = fig.add_subplot(3,2,4)
# One bar per row of homeowner_dist rather than a hard-coded count.
N = np.arange(len(homeowner_dist))
# Centre each bar on its integer position and place the tick there too. A
# fixed N + 0.4 offset only lines up when bars are edge-aligned, which is not
# the default in matplotlib >= 2.0.
ax4.bar(N, homeowner_dist.homeowner_count, align='center')
ax4.set_ylabel('')
ax4.set_xlabel('Living Status', fontsize=8)
ax4.set_xticks(N)
ax4.tick_params(axis='both', which='major', labelsize=6)
ax4.set_xticklabels(homeowner_dist.index)

[<matplotlib.text.Text at 0x110ffa310>,
 <matplotlib.text.Text at 0x111005150>,
 <matplotlib.text.Text at 0x11103f5d0>]

In [79]:
# Show the per-ownership-status household counts that feed the fourth subplot.
homeowner_dist

Unnamed: 0_level_0,homeowner_count
new_homeowner_code,Unnamed: 1_level_1
Homeowner,515
Renter,53
Unknown,233


In [29]:
# Households per KID_CATEGORY_DESC value. Despite its name, the
# 'number_of_kids' column holds the household count for each category.
children_dist = (
    df_hh[['KID_CATEGORY_DESC', 'household_key']]
    .groupby('KID_CATEGORY_DESC')
    .count()
    .rename(columns={'household_key': 'number_of_kids'})
)


In [118]:
# Panel 5 of the 3x2 grid: household count per number-of-children category.
ax5 = fig.add_subplot(3,2,5)
# One bar per row of children_dist rather than a hard-coded count.
N = np.arange(len(children_dist))
# Centre each bar on its integer position and place the tick there too. A
# fixed N + 0.4 offset only lines up when bars are edge-aligned, which is not
# the default in matplotlib >= 2.0.
ax5.bar(N, children_dist.number_of_kids, align='center')
ax5.set_ylabel('')
ax5.set_xlabel('Number of Children', fontsize=8)
ax5.set_xticks(N)
ax5.tick_params(axis='both', which='major', labelsize=6)
ax5.set_xticklabels(children_dist.index)

[<matplotlib.text.Text at 0x1110755d0>,
 <matplotlib.text.Text at 0x11107fbd0>,
 <matplotlib.text.Text at 0x1110d6a90>,
 <matplotlib.text.Text at 0x1110e2250>]

In [81]:
# Show the per-children-category household counts that feed the fifth subplot.
children_dist

Unnamed: 0_level_0,number_of_kids
KID_CATEGORY_DESC,Unnamed: 1_level_1
1,114
2,60
3+,69
None/Unknown,558


In [31]:
# Households per HOUSEHOLD_SIZE_DESC value, exposed under the column 'homesize'.
homesize_dist = (
    df_hh[['HOUSEHOLD_SIZE_DESC', 'household_key']]
    .groupby('HOUSEHOLD_SIZE_DESC')
    .count()
    .rename(columns={'household_key': 'homesize'})
)


In [119]:
# Panel 6 of the 3x2 grid: household count per household size.
ax6 = fig.add_subplot(3,2,6)
# One bar per row of homesize_dist rather than a hard-coded count.
N = np.arange(len(homesize_dist))
# Centre each bar on its integer position and place the tick there too. A
# fixed N + 0.4 offset only lines up when bars are edge-aligned, which is not
# the default in matplotlib >= 2.0.
ax6.bar(N, homesize_dist.homesize, align='center')
ax6.set_ylabel('')
ax6.set_xlabel('Household Size', fontsize=8)
ax6.set_xticks(N)
ax6.tick_params(axis='both', which='major', labelsize=6)
ax6.set_xticklabels(homesize_dist.index)

[<matplotlib.text.Text at 0x1110edf50>,
 <matplotlib.text.Text at 0x111069a90>,
 <matplotlib.text.Text at 0x111166650>,
 <matplotlib.text.Text at 0x111166dd0>,
 <matplotlib.text.Text at 0x111172590>]

In [83]:
# Show the per-household-size counts that feed the sixth subplot.
homesize_dist

Unnamed: 0_level_0,homesize
HOUSEHOLD_SIZE_DESC,Unnamed: 1_level_1
1,255
2,318
3,109
4,53
5+,66


In [120]:
# Re-space the six subplots so the panels and their labels fit the figure.
fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
# Write the assembled figure as a PDF; the relative path resolves against the
# current working directory.
fig.savefig('exploratory_graphs.pdf', dpi=300, format='pdf')