In [286]:
# MAKE SURE TO INLCUDE THESE IMPORT STATEMENTS

import pandas as pd
import numpy as np

In [None]:
# THE VALUES ASND RESULTS YOU SEE IN THIS TEST PROGRAM IS FROM THE WINE-QUALITY DATASET

# SINCE THIS ALGORITHM ONLY WORKS ON A BINARY CLASSIFICATION PROBLEM, WE HAVE CONVERTED THE RESPONSE VARIABLE TO BE BINARY

# FOR THIS CASE, WE TOOK ALL VALUES OF 'QUALITY' < 6 AS 0 AND 'QUALITY' > 6 AS 1

In [503]:
# ********************* CORE FUNCTIONS ********************* #

# THIS FUNCTION WOULD PRE-PROCESS THE INCOMING RAW DATASET
# THE PREPROCESSING INVOLVES - CONVERTING NUMERICAL FIELDS INTO BINNED CATEGORICAL ENCODED COLUMNS
# IN THE OUTPUT WE OBTAIN THE PROCESSED DARTASET IN WHICH NUMERICAL VALUES HAVE BEEN ENCODED AS PER BINNED VALUES
# ** NOTE ** - THIS FUNCTION COMES WITH A HYPERPARAMETER - THAT SPECIFIES HOW MANY LEVELS OF THE NUMERICAL VARIABLE TO CREATE

def preprocess_data(data, response, hyperparameter):
    res = data[response]
    processed_df = pd.DataFrame()
    col_list = list(data.columns)
    col_list.remove(response)

    for col in col_list:
        if len(data[col].unique())>15:
            feature = pd.qcut(np.log(data[col]+1), hyperparameter, labels=range(hyperparameter))
        else:
            feature = data[col]
        processed_df[col] = feature
    processed_df['y'] = res
    return processed_df

############################################################################################################################

# THIS FUNCTION COMPUTES THE PRIOR PROBABILITY OF THE BINARY CLASSES AND TAKES ITS LOG
# IN THE OUTPUT WE GET THE LOG PRIOR VALUES AND CLASS 0 AND CLASS 1

def log_priors(data):
    prior0 = np.log(np.round(data[data['y']==0]['y'].count()/len(data), 2))
    prior1 = np.log(np.round(data[data['y']==1]['y'].count()/len(data), 2))
    return prior0, prior1

############################################################################################################################

# THIS FUNCTION PERFORMS THE ACTUAL TRAINING ON THE DATA - IT COMPUTES THE LIKELIHOOD SCORES FOR EACH FEATURE
# THE OUTPUT IS A DICTIONARY THAT STORES THE TRAINED LIKELIHOOD SCORES FOR EACH FEATURE AND EACH LEVEL WITHIN THEM
# THESE ARE THE SCORES THAT ARE ULTIMATELY APPLIED ONTO THE TEST DATASET TO COMPUTE PREDICTIONS

def compute_likelihoods(data):
    col_list = list(data.columns)
    col_list.remove('y')

    p0 = data[data['y']==0]
    p1 = data[data['y']==1]

    feature_level_probs = {}

    for f in col_list:
        levels = list(data[f].unique())
        level_probs = {}

        for i in levels:
            f_lev_0 = np.log(np.round(((p0[p0[f]==i][f].count())+1)/(len(p0)+len(col_list)), 2))
            f_lev_1 = np.log(np.round(((p1[p1[f]==i][f].count())+1)/(len(p1)+len(col_list)), 2))
            level_probs[i] = (f_lev_0, f_lev_1)

        feature_level_probs[f] = level_probs
    return feature_level_probs

############################################################################################################################

# THIS IS THE NULLSPACE NAIVE BAYES FUNCTION THAT TAKES THE PRE-PROCESSED TEST DATA AS INPUT
# IT ALSO TAKES AS INPUT, THE PRIOR PROBABILITIES AND LEARNED LIKELIHOOD SCORES
# THE OUTPUT IS THE NEW TEST DATAFRAME IN WHICH A COLUMN OF PREDICTED LABELS IS PRESENT AS WELL

def NS_NB(data, prior_0, prior_1, feature_scores):
    col_list_test = list(data.columns)
    col_list_test.remove('y')
    preds = []

    for i in range(len(data)):
        evidence0 = prior_0
        evidence1 = prior_1
        for f in col_list_test:
            for k in list(feature_scores[f].keys()):
                if data[f][i] == k:
                    evidence0 += feature_scores[f][k][0]
                    evidence1 += feature_scores[f][k][1]
                else:
                    pass
        if max(evidence0, evidence1) == evidence0:
            preds.append(0)
        elif max(evidence0, evidence1) == evidence1:
            preds.append(1)
    new_df = data.copy()
    new_df['predicted'] = preds
    return new_df

In [504]:
# THIS IS A MASTER FUNCTION THAT CALLS ALL THE COMPONENT FUNCTIONS TO TRAIN THE NAIVE BAYES MODEL

def train_NB_ns(data, response, hyper_param):
    p_df = preprocess_data(data, response, hyper_param)
    lp0, lp1 = log_priors(p_df)
    probs = compute_likelihoods(p_df)
    return (lp0, lp1, probs)

In [509]:
# THIS IS AN EXAMPLE OF HOW WE CALL THE TRAINING FUNCTION ON THE TRAINING DATASET
# WE PASS THE FOLLOWING VALUES TO IT - THE INPUT DATAFRAME OF PREDICTOR COLUMNS, THE RESPONSE VARIABLE NAME, 
# AND THE HYPERPARAMETER VALUE 

# ** NOTE ** YOU CAN EXPERIMENT WITH THE HYPERPARAMETER VALUE - BUT KEEP IT BETWEEN 2 AND 14.

LP0, LP1, SCORES = train_NB_ns(df_train, 'y', 11)

In [510]:
# THIS IS THE RUNNING FUNCTION - IT RUNS THE TRAINED SCORES ON THE TEST DATASET
# FIRST THE TEST DATA IS PREPROCESSED USING THE SAME METHOD AS THE TRAIN DATASET TO ENSURE CONSISTENCY
# THEN THE TRAINED SCORES ARE APPLIED TO ARRIVE AT PREDICTIONS
# THE OUTPUT IS THE ACCURACY SCORE!

# INPUTS ARE AS FOLLOWS - ENTER THE TEST DATAFRAME, THE LEARNED PARAMETER VALUES FROM THE TRAINING CALL (ABOVE)
# AND FINALLY, ENTER THE RESPONSE COLUMN NAME AND THE HYPERPARAMETER 

# ** NOTE ** PLEASE INCLUDE THE SAME HYPERPARAMETER VALUE AS ENTERED IN THE TRAIN PROGRAM

def run_NB_ns(frame, logprior0, logprior1, scores, response, hyper_param):
    new_frame = preprocess_data(frame, response, hyper_param)
    p = NS_NB(new_frame, logprior0, logprior1, scores)
    accuracy = p[p['y']==p['predicted']]['y'].count()/len(p)
    return accuracy

In [511]:
# THIS IS AN EXAMPLE OF HOW WE CALL THE RUN FUNCTION TO COMPUTE THE ACCURACY

ACCURACY = run_NB_ns(df_test, LP0, LP1, SCORES, 'y', 11)

In [512]:
# THE ACCURACY IS THEN PRINTED OUT

ACCURACY

0.726530612244898