In [34]:
import numpy as np
from get_data import data

In [43]:
df = data()
df = df[[col for col in df if col != 'label']]

In [47]:
class NaiveBayes:
    
    def __init__(self, df, ignore_missing=False):
        self.df = df
        self.learned_params = None
        
        if ignore_missing:
            self.categorical_features = ['education', 'marital-status', 'relationship', 'race', 'sex']
        else:
            self.categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 
                        'relationship', 'race', 'sex', 'native-country']
            
    def gaussian_pdf(self, mu, sigma, x):
        return 1/(sigma * np.sqrt(2 * np.pi))*np.exp( - (x - mu)**2 / (2 * sigma**2) )
        
        
    def classify(self, samples):
        
        if self.learned_params is None:
            self.learn_parameters()
            
        predictions = []
        
        df = self.df
        
        categorical_features = self.categorical_features
        
        learned_params = self.learned_params
        
        for sample in samples:
            
            class_0 = []
            class_1 = []
            
            for feature in sample:
                if feature in learned_params:
                    
                    value = sample[feature]
                    
                    if feature in categorical_features:
                        class_0_prob = learned_params[feature][value][0]
                        class_1_prob = learned_params[feature][value][1]
                        class_0.append(class_0_prob)
                        class_1.append(class_1_prob)
                    else:
                        c0_mean = learned_params[feature][0]['mean']
                        c0_std = learned_params[feature][0]['std']
                        c1_mean = learned_params[feature][1]['mean']
                        c1_std = learned_params[feature][1]['std']
                        class_0_prob = self.gaussian_pdf(c0_mean, c0_std, value)
                        class_1_prob = self.gaussian_pdf(c1_mean, c1_std, value)
                        class_0.append(class_0_prob)
                        class_1.append(class_1_prob)
                        
            c_0_prob = np.prod(class_0)
            c_1_prob = np.prod(class_1)
                    
            if c_0_prob > c_1_prob:
                predictions.append(0)
            else:
                predictions.append(1)
                        
        return predictions
        
            
    def learn_parameters(self):
        
        df = self.df
        features = [col for col in df if col != 'class']
        categorical_features = self.categorical_features

        num_class_0 = df[df['class'] == 0].shape[0]
        num_class_1 = df[df['class'] == 1].shape[0]

        # Hold the learned parameters
        # Mean and std for continuous quantities
        # Feature-value probabilities for categorical data
        learned_probabilities = {}

        for feature in features:

            # Learn probabilities for categorical variables
            if feature in categorical_features:  
                # List of unique values the feature can take
                unique_values = df[feature].unique()
                # Probabilities for given values conditioned on class
                conditional_probs = {}

                # Calculate these probabilities
                for value in unique_values:
                    class_0_prob = df[ (df[feature] == value) & (df['class'] == 0) ].shape[0] / float(num_class_0)
                    class_1_prob = df[ (df[feature] == value) & (df['class'] == 1) ].shape[0] / float(num_class_1)
                    conditional_probs[value] = {0 : class_0_prob, 1 : class_1_prob}

                learned_probabilities[feature] = conditional_probs

            else:
                # Get the mean and std for each class
                class_0_mean = df[ df['class'] == 0 ][feature].mean()
                class_0_std = df[ df['class'] == 0 ][feature].std()
                class_1_mean = df[ df['class'] == 1 ][feature].mean()
                class_1_std = df[ df['class'] == 1 ][feature].std()

                learned_probabilities[feature] = {0:{'mean': class_0_mean, 'std': class_0_std},
                                                  1: {'mean': class_1_mean, 'std': class_1_std}}
                
        self.learned_params = learned_probabilities

In [51]:
test = df.copy()[:20]

In [48]:
nb = NaiveBayes(df)
nb.learn_parameters()

In [52]:
nb.classify(test.to_dict(orient='records'))

[0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [53]:
test['class'].tolist()

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0]