# Employing Naive Bayes Classifiers to determine demographic labels

In this notebook we will begin the process of classifying our households, such that we might populate recommendations for each one based on their real attributes.

To do so, I'd like to train models using the labeled data -- the households for which we already have demographic information. This classifies as Supervised Learning, because we have accurate labels already.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import dtcj
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB


from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# loading data
demo = dtcj.load_demo()

merged = dtcj.load_merged()

hh_agg = dtcj.load_hh_agg(merged)

In [3]:
def test_Multinomial_NB():
    
    # for each of our targets
    for target in ['single_couple_family']:
        print(target)
        # define feature space;
            # remove data-leaked features 
            # negative values to positive
            # multi-collinearity? 
            # add target column as the last column.
        test_hh_agg = abs(hh_agg.drop(['R', 'F', 'M', 'RFM', 'RFM Bins'], axis=1))
        df = test_hh_agg.merge(demo[['household_key', target]], on='household_key')

        # set feature space and target variable
        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values

        # train test split the data -- employing stratify. 
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

        # hyperparameter optimization
        for alpha in [0.005, 0.05, 0.5, 1, 10, 100, 1000, 100000, 1000000000]:
            # Instatiate our model
            mnbmodel = MultinomialNB(alpha=alpha)
            # Fit our model
            mnbmodel.fit(X_train, y_train)

            # simple print for results
            print('alpha:', alpha,  'score', mnbmodel.score(X_test, y_test))

test_Multinomial_NB() # this is calculating the target single_couple_family

single_couple_family
alpha: 0.005 score 0.36318407960199006
alpha: 0.05 score 0.36318407960199006
alpha: 0.5 score 0.36318407960199006
alpha: 1 score 0.36318407960199006
alpha: 10 score 0.36318407960199006
alpha: 100 score 0.36318407960199006
alpha: 1000 score 0.3582089552238806
alpha: 100000 score 0.39800995024875624
alpha: 1000000000 score 0.39800995024875624


In [4]:
def test_Gaussian_NB():
    
    for target in ['income_50K+', 'age_45+']:
    # define feature space; remove data-leaked features and multi-collinearity?   
        test_hh_agg = abs(hh_agg.drop(['R', 'F', 'M', 'RFM', 'RFM Bins'], axis=1))
        df = test_hh_agg.merge(demo[['household_key', target]], on='household_key')

        # set features and target
        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values


        # train test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

        # scale the data
        ss = StandardScaler().fit(X_train)
        X_train = ss.transform(X_train)
        X_test = ss.transform(X_test)

        # hyperparameter optimization

        # Instatiate our model
        gnbmodel = GaussianNB()
        # Fit our model
        gnbmodel.fit(X_train, y_train)

        # simple print for results
        print(target, gnbmodel.score(X_test, y_test))

test_Gaussian_NB() # this is calculating the target income_50K+

income_50K+ 0.572139303482587
age_45+ 0.5074626865671642


In [5]:
def test_Bernoulli_NB():
        
    for target in ['single', 'couple', 'has_kids']:
        print(target)
    # define feature space; remove data-leaked features and multi-collinearity?   
        test_hh_agg = abs(hh_agg.drop(['R', 'F', 'M', 'RFM', 'RFM Bins'], axis=1))
        df = test_hh_agg.merge(demo[['household_key', target]], on='household_key')

        # set features and target
        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values


        # train test split
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # scale the data
        ss = StandardScaler().fit(X_train)
        X_train = ss.transform(X_train)
        X_test = ss.transform(X_test)

        # hyperparameter optimization
        for alpha in [0.005, 0.05, 0.5, 1, 10, 100, 1000, 100000, 1000000000]:
            # Instatiate our model
            bnbmodel = BernoulliNB(alpha=alpha)
            # Fit our model
            bnbmodel.fit(X_train, y_train)

            # simple print for results
            print('alpha:', alpha,  'score', bnbmodel.score(X_test, y_test))
            
test_Bernoulli_NB()

single
alpha: 0.005 score 0.5522388059701493
alpha: 0.05 score 0.5522388059701493
alpha: 0.5 score 0.5522388059701493
alpha: 1 score 0.5522388059701493
alpha: 10 score 0.5522388059701493
alpha: 100 score 0.5771144278606966
alpha: 1000 score 0.582089552238806
alpha: 100000 score 0.582089552238806
alpha: 1000000000 score 0.582089552238806
couple
alpha: 0.005 score 0.5522388059701493
alpha: 0.05 score 0.5522388059701493
alpha: 0.5 score 0.5522388059701493
alpha: 1 score 0.5472636815920398
alpha: 10 score 0.5671641791044776
alpha: 100 score 0.5671641791044776
alpha: 1000 score 0.6218905472636815
alpha: 100000 score 0.6218905472636815
alpha: 1000000000 score 0.6218905472636815
has_kids
alpha: 0.005 score 0.6318407960199005
alpha: 0.05 score 0.6318407960199005
alpha: 0.5 score 0.6318407960199005
alpha: 1 score 0.6318407960199005
alpha: 10 score 0.6368159203980099
alpha: 100 score 0.6318407960199005
alpha: 1000 score 0.7014925373134329
alpha: 100000 score 0.6865671641791045
alpha: 1000000000 