In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder,LabelBinarizer

In [2]:
# Toy traffic dataset: 18 observations laid out as
# 3 weather types x 2 day types x 3 times of day.
weather = ['Clear'] * 6 + ['Rainy'] * 6 + ['Snowy'] * 6

# Within each weather type: three workday rows followed by three weekend rows.
timeOfWeek = (['Workday'] * 3 + ['Weekend'] * 3) * 3

# The Morning/Lunch/Evening cycle repeats for each (weather, day type) pair.
timeOfDay = ['Morning', 'Lunch', 'Evening'] * 6

# Target label, one entry per observation, in the same row order as the features.
trafficJam = [
    'Yes', 'No', 'Yes',     # Clear, Workday
    'No', 'No', 'No',       # Clear, Weekend
    'Yes', 'Yes', 'Yes',    # Rainy, Workday
    'No', 'No', 'No',       # Rainy, Weekend
    'Yes', 'Yes', 'Yes',    # Snowy, Workday
    'Yes', 'No', 'Yes',     # Snowy, Weekend
]

In [3]:
# Assemble the four parallel lists into one frame, one row per observation.
column_names = ['weather','timeOfWeek','timeOfDay','trafficJam']
observations = zip(weather, timeOfWeek, timeOfDay, trafficJam)
df = pd.DataFrame(observations, columns=column_names)
df

Unnamed: 0,weather,timeOfWeek,timeOfDay,trafficJam
0,Clear,Workday,Morning,Yes
1,Clear,Workday,Lunch,No
2,Clear,Workday,Evening,Yes
3,Clear,Weekend,Morning,No
4,Clear,Weekend,Lunch,No
5,Clear,Weekend,Evening,No
6,Rainy,Workday,Morning,Yes
7,Rainy,Workday,Lunch,Yes
8,Rainy,Workday,Evening,Yes
9,Rainy,Weekend,Morning,No


In [4]:
# Pull each feature out of the frame as an (n_samples, 1) column vector.
# NOTE: this rebinds the list names defined in cell 2 to ndarrays, so cell 3
# should not be re-run after this cell without re-running cell 2 first.
weather, timeOfWeek, timeOfDay = (
    df[column].values.reshape(-1, 1)
    for column in ('weather', 'timeOfWeek', 'timeOfDay')
)

In [5]:
# Sanity check: after the reshape each feature is a 2-D column of shape (n_samples, 1).
weather.shape,timeOfWeek.shape

((18, 1), (18, 1))

In [6]:
def preprocess():
    """Encode the notebook-level ``df`` into model-ready arrays.

    The three categorical feature columns are read directly from ``df`` (rather
    than from separately reshaped globals) and ordinal-encoded in one pass; the
    ``trafficJam`` target is one-hot encoded.

    Returns
    -------
    X : ndarray of int, shape (n_samples, 3)
        Category ids in the range 0..n_categories-1 for the columns
        ``weather``, ``timeOfWeek`` and ``timeOfDay`` (in that order).
    y_ : ndarray of int, shape (n_samples, n_classes)
        One-hot encoded target; column ``j`` corresponds to ``classes[j]``.
    classes : ndarray
        Class labels as ordered by ``LabelBinarizer.classes_``.
    """
    feature_columns = ['weather', 'timeOfWeek', 'timeOfDay']

    # A single OrdinalEncoder encodes every column independently, so one
    # encoder replaces the three per-column encoders.
    feature_encoder = OrdinalEncoder()
    X = feature_encoder.fit_transform(df[feature_columns])
    # The encoder returns floats; the counting code downstream needs integer ids.
    X = X.astype(int)

    # One-hot encode the target.
    lb = LabelBinarizer()
    y_ = lb.fit_transform(df['trafficJam'])
    if y_.shape[1] == 1:
        # For a two-class target LabelBinarizer emits a single 0/1 column;
        # expand it to two columns so every class gets its own indicator.
        y_ = np.concatenate((1 - y_, y_), axis=1)
    return X,y_,lb.classes_

In [7]:
# Build the encoded feature matrix, the one-hot target and the class labels.
# `classes` is read later by predict() to map an argmax index back to a label.
X,y,classes = preprocess()
X.shape, y.shape

((18, 3), (18, 2))

In [8]:
def counts_based_onclass(X,y):
    """Count, per feature and per class, how often each category occurs.

    Parameters
    ----------
    X : array-like of int, shape (n_samples, n_features)
        Ordinal-encoded features (non-negative category ids).
    y : array-like, shape (n_samples, n_classes)
        One-hot encoded class membership.

    Returns
    -------
    count_matrix : list of ndarray
        One array per feature with shape (n_classes, n_categories), where
        ``count_matrix[i][j, k]`` is the number of samples of class ``j``
        whose feature ``i`` has category ``k``.
    n_features : int
    n_classes : int
    class_count : ndarray, shape (n_classes,)
        Number of samples per class (used for the prior probabilities).
    """
    X = np.asarray(X)
    y = np.asarray(y)

    n_features = X.shape[1]
    n_classes = y.shape[1]

    # Boolean membership mask; column j selects the samples of class j.
    class_masks = y.astype(bool)

    count_matrix = []
    for i in range(n_features):
        X_feature = X[:, i]
        # Every class row must have the same width. Without `minlength`,
        # bincount stops at the largest category *seen in that class*, which
        # produces ragged rows when a class never contains the highest category.
        n_categories = int(X_feature.max()) + 1 if X_feature.size else 0
        count_feature = [
            np.bincount(X_feature[class_masks[:, j]], minlength=n_categories)
            for j in range(n_classes)
        ]
        count_matrix.append(np.array(count_feature))

    # Samples per class; independent of the feature loop, so computed once.
    class_count = y.sum(axis=0)

    return count_matrix,n_features,n_classes,class_count

In [9]:
# Per-feature category counts split by class, plus the per-class sample totals.
count_matrix,n_features,n_classes,class_count = counts_based_onclass(X,y)

In [10]:
# One array per feature: rows are classes, columns are category ids.
count_matrix

[array([[4, 3, 1],
        [2, 3, 5]], dtype=int64),
 array([[7, 1],
        [2, 8]], dtype=int64),
 array([[2, 4, 2],
        [4, 2, 4]], dtype=int64)]

In [11]:
def calculate_likelihood_probs(count_matrix,alpha,n_features):
    """Turn per-class category counts into smoothed log-likelihood tables.

    For each of the first ``n_features`` entries of ``count_matrix``, ``alpha``
    is added to every count and each class row is normalised by its own total,
    all in log space.

    Returns a list with one array per feature, the same shape as the
    corresponding count array.
    """
    def _smoothed_log_prob(counts):
        # Additive smoothing: alpha is added to every (class, category) cell.
        smoothed = counts + alpha
        # Row totals as a column vector so the subtraction broadcasts per class.
        row_totals = smoothed.sum(axis=1).reshape(-1, 1)
        return np.log(smoothed) - np.log(row_totals)

    return [_smoothed_log_prob(count_matrix[idx]) for idx in range(n_features)]

In [12]:
def calculate_prior_probs(class_count):
    """Return the log prior of each class: log(count_j) - log(total count)."""
    log_total = np.log(class_count.sum())
    return np.log(class_count) - log_total

In [13]:
# Log prior of each class, computed from the per-class sample counts.
prior_probs = calculate_prior_probs(class_count)

In [14]:
# Smoothed log-likelihood tables; alpha=1 adds one to every category count (add-one smoothing).
log_probs = calculate_likelihood_probs(count_matrix,1,n_features)

In [15]:
# One array per feature: rows are classes, columns are category ids, values are log-probabilities.
log_probs

[array([[-0.78845736, -1.01160091, -1.70474809],
        [-1.46633707, -1.178655  , -0.77318989]]),
 array([[-0.22314355, -1.60943791],
        [-1.38629436, -0.28768207]]),
 array([[-1.29928298, -0.78845736, -1.29928298],
        [-0.95551145, -1.46633707, -0.95551145]])]

In [16]:
def predict(query_point,log_probs,prior_probs,class_labels=None):
    """Predict the class of one ordinal-encoded sample with Naive Bayes.

    Parameters
    ----------
    query_point : sequence of int
        Category id of each feature, in the same order as ``log_probs``.
    log_probs : list of ndarray
        Per-feature log-likelihood tables of shape (n_classes, n_categories).
    prior_probs : array-like, shape (n_classes,)
        Log prior probability of each class.
    class_labels : sequence, optional
        Label for each class index. When omitted, the notebook-level
        ``classes`` (as returned by ``preprocess``) is used, which keeps
        existing three-argument calls working.

    Returns
    -------
    The label whose log prior plus summed feature log-likelihoods is largest.

    Raises
    ------
    ValueError
        If ``query_point`` does not have one entry per feature table.
    """
    if class_labels is None:
        # Backward compatible fallback to the notebook global.
        class_labels = classes

    if len(query_point) != len(log_probs):
        raise ValueError(
            'query_point has %d features but log_probs has %d tables'
            % (len(query_point), len(log_probs))
        )

    prior_probs = np.asarray(prior_probs)
    # Sizes come from the arguments instead of notebook globals.
    scores = np.zeros(prior_probs.shape[0])
    for feature_table, category in zip(log_probs, query_point):
        # Column `category` holds log P(feature = category | class) for every class.
        scores += feature_table[:, category]
    # Finally add the log prior of each class.
    scores += prior_probs
    # The class with the highest joint log-probability wins.
    return class_labels[np.argmax(scores)]

In [17]:
# Cross-check the manual implementation against scikit-learn's CategoricalNB.
# alpha=1.0 is sklearn's default and matches the alpha=1 used for `log_probs`.
clf = CategoricalNB(alpha=1.0)
clf.fit(X, df['trafficJam'])
print('Sklearn feature log-probabilities\n',clf.feature_log_prob_)
print('Manually implemented likelihood probabilities\n',log_probs)

print('Sklearn feature prior-probabilities\n',clf.class_log_prior_)
print('Manually implemented prior probabilities\n',prior_probs)

print()
print('Sklearn predict',clf.predict(X[4:5]))
print('Manual predict',predict(X[4],log_probs,prior_probs))

# Make the comparison a hard check instead of relying on reading the printout.
assert len(clf.feature_log_prob_) == len(log_probs)
for sklearn_table, manual_table in zip(clf.feature_log_prob_, log_probs):
    np.testing.assert_allclose(sklearn_table, manual_table)
np.testing.assert_allclose(clf.class_log_prior_, prior_probs)

# Predictions must agree on every row, not just the single query shown above.
manual_predictions = [predict(row, log_probs, prior_probs) for row in X]
assert list(clf.predict(X)) == manual_predictions

Sklearn feature log-probabilities
 [array([[-0.78845736, -1.01160091, -1.70474809],
       [-1.46633707, -1.178655  , -0.77318989]]), array([[-0.22314355, -1.60943791],
       [-1.38629436, -0.28768207]]), array([[-1.29928298, -0.78845736, -1.29928298],
       [-0.95551145, -1.46633707, -0.95551145]])]
Manually implemented likelihood probabilities
 [array([[-0.78845736, -1.01160091, -1.70474809],
       [-1.46633707, -1.178655  , -0.77318989]]), array([[-0.22314355, -1.60943791],
       [-1.38629436, -0.28768207]]), array([[-1.29928298, -0.78845736, -1.29928298],
       [-0.95551145, -1.46633707, -0.95551145]])]
Sklearn feature prior-probabilities
 [-0.81093022 -0.58778666]
Manually implemented prior probabilities
 [-0.81093022 -0.58778666]

Sklearn predict ['No']
Manual predict No
