In [12]:
import numpy as np

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The old
# module is kept only as a fallback for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

from data import load_df

In [21]:
#  Load the dataset through the project-local `data.load_df` helper.
#  NOTE(review): the previews below show a 32561 x 15 table whose last column,
#  `class`, holds the label -- presumably a pandas DataFrame; confirm in data.py.
df = load_df()

In [6]:
# Peek at the first 3 rows to see the column names and the kind of values they hold
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0


In [7]:
# (number of rows, number of columns)
df.shape

(32561, 15)

In [8]:
# Unpack the shape tuple once, then report each dimension on its own line
num_rows, num_columns = df.shape[0], df.shape[1]
print('Number of rows: {}'.format(num_rows))
print('Number of columns: {}'.format(num_columns))

Number of rows: 32561
Number of columns: 15


In [68]:
# Every column except the 'class' label is treated as a feature
features = [name for name in df.columns if name != 'class']

# Preview the feature values of the first 5 rows (head() defaults to 5)
df.loc[:, features].head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [11]:
# Preview the label column on its own for the same first 5 rows
# (the list selector keeps the result a one-column DataFrame)
df.loc[:, ['class']].head()

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0


In [13]:
class NaiveBayes:
    """
    Outline of the Naive Bayes classifier.

    This version only fixes the method names; every method is a no-op that
    returns None. A complete implementation of the same class follows later
    in this notebook.
    """

    def __init__(self):
        """
        Create a NaiveBayes object (no state is set up in this outline).
        """

    def classify(self):
        """
        Placeholder: classify new data instances using the NaiveBayes
        model parameters fit through the learn method.
        """

    def learn(self):
        """
        Placeholder: learn the NaiveBayes model parameters from training data.
        """

    def score(self):
        """
        Placeholder: return the classification accuracy obtained from
        classifying a set of testing data with the learned parameters.
        """

In [60]:
class NaiveBayes:
    """
    Naive Bayes classifier for a binary (0/1) label stored in a 'class' column.

    Non-numeric features are modelled with per-class multinomial
    probabilities and numeric features with per-class normal distributions.
    Class scores are accumulated in log space, so multiplying many small
    likelihoods (or evaluating a far outlier) cannot underflow to zero and
    silently force a prediction.
    """

    # Floor applied to learned standard deviations. A feature that is constant
    # within a class has std 0, and a class with a single sample has a NaN
    # sample std; either would make the Gaussian density undefined.
    _MIN_STD = 1e-9

    def __init__(self):
        """
        Constructor for NaiveBayes object.
        """
        # Dict mapping feature name -> learned parameters (None until learn()).
        #   categorical feature: {value: {0: P(value|0), 1: P(value|1)}}
        #   numeric feature:     {0: {'mean', 'std'}, 1: {'mean', 'std'}}
        self._parameters = None
        # Names of the features that learn() treated as categorical
        self._categorical = set()
        # Prior probability of each class
        self._prob_0 = None
        self._prob_1 = None

    def classify(self, samples):
        """
        Classify new data instances using the NaiveBayes model parameters
        fit through the learn object method.

        Parameters
        ----------
        samples : DataFrame containing (at least) every learned feature column.

        Returns
        -------
        list of int : predicted label (0 or 1) for each row, in row order.
            Ties between the two class scores are resolved as class 1.

        Raises
        ------
        ValueError : if learn() has not been called yet.
        Exception  : if a learned feature is absent from `samples`.
        """
        if self._parameters is None:
            raise ValueError('Please train the NaiveBayes model on training data ' +
                             'before calling the classify method.')

        predictions = []

        # One dict per row: {feature name: value}
        for data_point in samples.to_dict(orient='records'):
            score_0, score_1 = self._log_scores(data_point)
            predictions.append(0 if score_0 > score_1 else 1)

        return predictions

    def learn(self, train):
        """
        Learn the NaiveBayes model parameters from training data.

        Parameters
        ----------
        train : DataFrame with a 'class' column holding 0/1 labels; every
            other column is used as a feature.

        Raises
        ------
        ValueError : if `train` has no rows.
        """
        df = train

        num_samples = df.shape[0]
        if num_samples == 0:
            raise ValueError('Cannot train the NaiveBayes model on an empty dataset.')

        # Split the rows once per class instead of re-filtering for every feature
        rows_by_class = {label: df[df['class'] == label] for label in (0, 1)}
        class_counts = {label: rows.shape[0] for label, rows in rows_by_class.items()}

        parameters = {}
        categorical = set()

        for feature in (col for col in df if col != 'class'):
            column = df[feature]

            if self._is_numeric(column):
                # Numerical feature: per-class mean and standard deviation
                # specify a normal distribution
                parameters[feature] = {
                    label: self._fit_gaussian(rows[feature])
                    for label, rows in rows_by_class.items()
                }
            else:
                # Categorical feature: fraction of each class that takes on
                # each value defines a multinomial distribution
                categorical.add(feature)
                parameters[feature] = {
                    value: {
                        label: self._fraction((rows[feature] == value).sum(),
                                              class_counts[label])
                        for label, rows in rows_by_class.items()
                    }
                    for value in column.unique()
                }

        # Publish the model only once every feature has been fit, so a failure
        # part-way through never leaves a half-trained object behind
        self._prob_0 = class_counts[0] / float(num_samples)
        self._prob_1 = class_counts[1] / float(num_samples)
        self._categorical = categorical
        self._parameters = parameters

    def score(self, samples):
        """
        Return the classification error obtained from classifying
        a set of testing data using the learned Naive Bayes parameters.

        The result is the fraction (0.0 - 1.0) of rows whose prediction
        differs from the value in the 'class' column of `samples`.
        """
        y_true = np.asarray(samples['class'].tolist())
        y_pred = np.asarray(self.classify(samples))

        return 1 - np.mean(y_true == y_pred)

    def _log_scores(self, data_point):
        """
        Return [score_0, score_1]: log prior plus summed log likelihoods of
        one row (a dict of feature name -> value) under each class.
        """
        scores = [self._safe_log(self._prob_0), self._safe_log(self._prob_1)]

        for feature, params in self._parameters.items():

            if feature not in data_point:
                raise Exception('Data missing learned features.')

            value = data_point[feature]

            if feature in self._categorical:
                probs = params.get(value)
                if probs is None:
                    # Value never seen during training: there is no evidence
                    # for either class, so the feature is left out of this
                    # row's scores rather than raising KeyError.
                    continue
                for label in (0, 1):
                    scores[label] += self._safe_log(probs[label])
            else:
                for label in (0, 1):
                    if scores[label] == -np.inf:
                        # Class already ruled out (e.g. it had no training
                        # rows, so its mean is NaN); keep the score at -inf
                        # instead of turning it into NaN.
                        continue
                    scores[label] += self._gaussian_log_pdf(
                        value, params[label]['mean'], params[label]['std'])

        return scores

    def _fit_gaussian(self, values):
        """
        Return {'mean', 'std'} for one class's values of a numeric feature,
        with the std floored at _MIN_STD.
        """
        if values.shape[0] == 0:
            return {'mean': float('nan'), 'std': self._MIN_STD}

        mean = float(values.mean())
        std = float(values.std())

        # `not (std > floor)` is also true for NaN (single-sample class)
        if not std > self._MIN_STD:
            std = self._MIN_STD

        return {'mean': mean, 'std': std}

    @staticmethod
    def _is_numeric(column):
        """
        True if the column's dtype is boolean, integer or float. Object,
        string and categorical dtypes all report kind 'O' and are therefore
        handled as categorical features.
        """
        return getattr(column.dtype, 'kind', 'O') in 'biuf'

    @staticmethod
    def _fraction(count, total):
        """
        count / total as a Python float; 0.0 when the class has no rows.
        """
        return float(count) / total if total else 0.0

    @staticmethod
    def _safe_log(probability):
        """
        Natural log of a probability, mapping 0 (and NaN) to -inf without
        emitting a numpy warning.
        """
        return np.log(probability) if probability > 0 else -np.inf

    def _gaussian_log_pdf(self, x, mu, sigma):
        """
        Log of the Gaussian density at x. Computed directly rather than as
        log(_gaussian_pdf(...)) so that far outliers stay finite.
        """
        return -np.log(sigma * np.sqrt(2 * np.pi)) - (x - mu)**2 / (2 * sigma**2)

    def _gaussian_pdf(self, x, mu, sigma):
        """
        Returns the density of a point x according to a Gaussian distribution
        with mean mu and standard deviation sigma (sigma must be non-zero).
        """
        return 1/(sigma * np.sqrt(2 * np.pi))*np.exp( - (x - mu)**2 / (2 * sigma**2) )

In [63]:
def train_test_model(_train, _test, features):
    """
    Fit a NaiveBayes model on the given feature columns of `_train` and
    print its classification error (as a percentage) on `_test`.

    Parameters
    ----------
    _train, _test : DataFrames that contain every column named in
        `features` plus the 'class' label column. Neither is modified.
    features : iterable of column names to use as model inputs.
    """
    # list() accepts any iterable of names; `features + ['class']` only worked
    # for lists (a tuple, for instance, raises TypeError).
    columns = list(features) + ['class']

    # Select the relevant columns first so only that subset gets copied
    train = _train[columns].copy()
    test = _test[columns].copy()

    # Initialize object
    nb = NaiveBayes()

    # Train model
    nb.learn(train)

    # Evaluate on test set; score() returns an error fraction, shown here as a percentage
    print('NaiveBayes model with features: {}'.format(features))
    print('Test error: {}'.format(nb.score(test)*100))

In [52]:
# Split data into training and testing split: 70% of the rows go to `train`,
# the remainder to `test`. The fixed random_state makes the split repeatable.
train, test = train_test_split(df, train_size=.7, random_state=42)

In [64]:
# Baseline: a model that uses the single numeric feature 'age'
train_test_model(train, test, ['age'])

NaiveBayes model with features: ['age']
Test error: 23.687173712764864


In [65]:
# Combine the numeric 'age' feature with the categorical 'education' feature
train_test_model(train, test, ['age', 'education'])

NaiveBayes model with features: ['age', 'education']
Test error: 20.800491350189375


In [67]:
# Fit and evaluate one single-feature Naive Bayes model per column,
# skipping the 'class' label itself
single_features = (name for name in df if name != 'class')
for name in single_features:
    train_test_model(train, test, [name])
    print()

NaiveBayes model with features: ['age']
Test error: 23.687173712764864

NaiveBayes model with features: ['workclass']
Test error: 23.328897533012594

NaiveBayes model with features: ['fnlwgt']
Test error: 23.687173712764864

NaiveBayes model with features: ['education']
Test error: 21.772955266659842

NaiveBayes model with features: ['education-num']
Test error: 21.772955266659842

NaiveBayes model with features: ['marital-status']
Test error: 23.687173712764864

NaiveBayes model with features: ['occupation']
Test error: 23.687173712764864

NaiveBayes model with features: ['relationship']
Test error: 23.687173712764864

NaiveBayes model with features: ['race']
Test error: 23.687173712764864

NaiveBayes model with features: ['sex']
Test error: 23.687173712764864

NaiveBayes model with features: ['capital-gain']
Test error: 20.595762104616643

NaiveBayes model with features: ['capital-loss']
Test error: 23.43126215579896

NaiveBayes model with features: ['hours-per-week']
Test error: 23.

In [69]:
# Test on all features: `features` is the list of every column except 'class'
# built in an earlier cell, so that cell must have been run first
train_test_model(train, test, features)

NaiveBayes model with features: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
Test error: 16.480704268604775
