# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from numpy import log2 as log
import sys
import math
import pprint
import statistics

# Dataset Preparation and Train Test Split

In [2]:
def categorize_age(df):
    if df['Age'] < 8:
        return 1
    elif 8 <= df['Age'] <= 12:
        return 2
    else:
        return 3

df = pd.read_csv("abalone.data", sep="," , header = None, 
                  names=['Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
                         'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings'])

df['Age'] = df['Rings'] + 1.5
df['Age'].unique()

array([16.5,  8.5, 10.5, 11.5,  9.5, 21.5, 17.5, 20.5, 15.5, 12.5, 13.5,
       19.5, 14.5,  6.5,  5.5,  7.5, 22.5, 18.5, 23.5,  2.5,  4.5, 27.5,
       24.5, 30.5,  3.5, 28.5, 26.5, 25.5])

## Code Explanation
Since Age (Rings + 1.5) looks like a continuous variable as it is hard to predict correctly since there are essentially 20+ classes here, I decided to separate them into three classes using categorize_age function. For abalones with Age < 8, they are encoded as 1 or "Young", 8 <= Age <= 12 is encoded as 2 or "Middle-Aged" and Age > 12 is encoded as 3 or "Old". This will help with the accuracy of the model. 

In [3]:
df['Age'] = df.apply(categorize_age, axis = 1)
className = 'Age'

df_train = df.sample(frac =.70)
df_train = df_train.sort_index()

df_test = df.drop(df_train.index)

## Code Explanation
Using the sample() function, the dataset is separated into 70% for training. Then, by dropping the rows of the original df that matches to the row index of the df_train, we can get the remaining 30% data for testing.

In [4]:
df_trainn = df_train.copy()
df_train_x = df_trainn.drop(['Age'], axis = 1)
df_train_y = df_trainn.pop("Age")

In [5]:
df_testt = df_test.copy()
df_test_x = df_testt.drop(['Age'], axis = 1)
df_test_y = df_testt.pop("Age")

## Code Explanation
Here, a copy of the train and test data is created. Both df_train_x and df_test_x will be used in making the predictions, as both of them only contains the predictors needed to make the predictions. Additionally, both df_train_y and df_test_y serve as the "validator" to validate the predictions made by the model, as well as getting the accuracy of both train and test data.

In [6]:
df_train

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Age
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,3
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,2
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,2
5,I,0.425,0.300,0.095,0.3515,0.1410,0.0775,0.1200,8,2
6,F,0.530,0.415,0.150,0.7775,0.2370,0.1415,0.3300,20,3
...,...,...,...,...,...,...,...,...,...,...
4171,M,0.560,0.430,0.155,0.8675,0.4000,0.1720,0.2290,8,2
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,3
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,2
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,2


In [7]:
df_test

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Age
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,2
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,2
9,F,0.550,0.440,0.150,0.8945,0.3145,0.1510,0.3200,19,3
10,F,0.525,0.380,0.140,0.6065,0.1940,0.1475,0.2100,14,3
12,M,0.490,0.380,0.135,0.5415,0.2175,0.0950,0.1900,11,3
...,...,...,...,...,...,...,...,...,...,...
4163,I,0.390,0.310,0.085,0.3440,0.1810,0.0695,0.0790,7,2
4165,I,0.405,0.300,0.085,0.3035,0.1500,0.0505,0.0880,7,2
4167,M,0.500,0.380,0.125,0.5770,0.2690,0.1265,0.1535,9,2
4170,M,0.550,0.430,0.130,0.8395,0.3155,0.1955,0.2405,10,2


# Probability (Categorical) and Mean/Standard Deviation (Continuous)

In [8]:
def create_probability_dict (df):
    table = {}
    table_mean_std = {}
    global usePerc
    global useMean

    # determine values for the label
    label_count = df[className].value_counts().sort_index()
    table["label_types"] = label_count.index.to_numpy()
    table["label_count"] = label_count.values

    #Loop every features except the label
    for feature in df.drop(className, axis=1).columns:
        
        #If the values are categorical
        if len(df[feature].unique()) < 10:
            table[feature] = {}
            usePerc.append(feature)
            counter = df.groupby(className)[feature].value_counts()
            df_counter = counter.unstack(className)
            
            #If df_counter contains missing values, replace it with 0. 
            #Laplace Estimator will be used later Instead of Gaussian Distribution if probability is zero
            if df_counter.isna().any(axis=None):
                df_counter.fillna(value=0, inplace=True)

            # calculate probabilities
            df_probabilities = df_counter / df_counter.sum()
            for val in df_probabilities.index:
                probabilities = df_probabilities.loc[val].to_numpy()
                table[feature][val] = probabilities
                
        #If the values are continuous
        else:
            table_mean_std[feature] = {}
            useMean.append(feature)
            #calculate mean, for '1', '2', '3'
            meanArr = []
            for x in table["label_types"]:
                grouped = df.groupby(className)[feature].sum()[x]
                mea_n = int(grouped) / len(df[df[className] == x])
                meanArr.append(mea_n)
                
            stdArr = []
            i = 0
            for x in table["label_types"]:
                b = df_train[feature][df_train[className] == x]
                subtractedArr = 0
                
                for y in b:
                    calc = (y - meanArr[i]) ** 2
                    subtractedArr = subtractedArr + calc
                
                i = i+1
                variance = subtractedArr / len(b) 
                std_dev = math.sqrt(variance)
                stdArr.append(std_dev)
                
            table_mean_std[feature]["Mean"] = meanArr 
            table_mean_std[feature]["Std dev"] = stdArr 
        
            
    return table, table_mean_std

## Code Explanation
A function to create two dictionaries necessary for the probability calculation. The function returns two dictionaries, the first dictionary, "table", contains the probabilities of categorical variables. The second dictionary, "table_mean_std", contains the mean and standard deviation of the continuous variables.

First, the Age group and the rows count is defined as "label_types" and "label_counts" inside the "table" dictionary. 

For every features inside the dataframe, a check is performed to determine whether the feature is continuous or categorical. To get the probabilities of the categorical variable, a groupby is performed to group the data based on the Age group ("className") and using value_counts, we can get the count of each of the values inside the "feature". The count is then divided by df_counter.sum(), which is the total rows of each Age classes (there are three Age classes, "1", "2", and "3").

To get the mean of the continuous variable, a for loop is performed for every Age class (label_types). Inside the for loop, we can get the sum of every value in the feature that has x (either 1, 2, or 3) as their Age class, and then dividing it by the total rows where the label column is equal to x. The three mean values are appended into a list called meanArr. Likewise, to get the standard deviation, a for loop is performed for every Age class. Then, we filter the dataframe to only include the rows where Age group is equal to x. Every value inside feature is subtracted by the mean and then squared. The sum of these values are then divided by the total number of rows where Age group is equal to x to get the variance. The square root of the variance is the standard deviation, which is appended to the stdArr list. The "table_mean_std" now contains the mean and standard deviation for every continuous feature.

In [9]:
usePerc = []
useMean = []
prob_table, cont_table = create_probability_dict(df_train)
pprint.pprint(prob_table)


{'Sex': {'F': array([0.05345912, 0.28193833, 0.43657817]),
         'I': array([0.8490566 , 0.35053493, 0.10521141]),
         'M': array([0.09748428, 0.36752675, 0.45821042])},
 'label_count': array([ 318, 1589, 1017], dtype=int64),
 'label_types': array([1, 2, 3], dtype=int64)}


## Code Explanation
For reference, we look at the partial dictionary -> {'Sex': {'F': array([0.04126984, 0.28356336, 0.4453202 ]). The probability of an abalone with "F" as their Sex having an Age class of 1 (Young) is 0.04126984. Likewise, the probability of an abalone with "F" as their Sex having an Age class of 2 (Middle Aged) is 0.28356336. Therefore, these arrays serve as the "lookup" table to calculate the total probability.

In [10]:
pprint.pprint(cont_table)

{'Diameter': {'Mean': [0.2389937106918239,
                       0.4040276903713027,
                       0.464110127826942],
              'Std dev': [0.07270019274661217,
                          0.07978661022271767,
                          0.0678055206701565]},
 'Height': {'Mean': [0.07861635220125786,
                     0.13593455003146634,
                     0.16322517207472959],
            'Std dev': [0.02600912200432039,
                        0.039199245243721065,
                        0.029498201407598346]},
 'Length': {'Mean': [0.31761006289308175,
                     0.5217117684078036,
                     0.5889872173058014],
            'Std dev': [0.09074518772131643,
                        0.09725920392469882,
                        0.08256345760991925]},
 'Rings': {'Mean': [5.355345911949685, 8.692259282567653, 13.257620452310718],
           'Std dev': [0.8774711949431682,
                       1.0561679516471785,
                       2.71354507691

## Code Explanation
For reference, we look at the partial dictionary -> {'Diameter': {'Mean': [0.23885350318471338, 0.404126213592233, 0.46465696465696466]. For abalones with Age group of 1, the mean of Diameter is 0.23885350318471338. Likewise, the mean of Diameter of abalones with Age group of 2 is 0.404126213592233. These arrays are used to get the mean and standard deviation required to calculate the probability of continuous variables.

# Make Predictions

In [11]:
def predict_example(row, probability_table, mean_table, use_perc, use_mean): #BELUM
    
    probability_label = probability_table["label_count"] / sum(probability_table["label_count"])
    for feature in row.index:
        value = row[feature]
        if feature in use_perc:
            try:
                zeroCount = False
                probabilities = probability_table[feature][value]
                for x in probabilities:
                    if x == 0: #Check if frequency is zero
                        zeroCount = True
                        
                #If frequency is zero, use Laplace Estimator to calculate the probability
                if zeroCount == True:
                    i = 0
                    probabilitiess = []
                    for x in probability_table["label_types"]:
                        nc = df_train[df_train[feature] == value].groupby(className)[feature].count()[x]
                        n = int(probability_table["label_count"][i])
                        m = int(len(probability_table["label_types"]))
                        probability = (nc + 1)/ (n + m)
                        probabilitiess.append(probability)
                        i = i+1
                    probability_label = probability_label * probabilitiess
                else:
                    probability_label = probability_label * probabilities
                
            except KeyError:
                continue
        else:
            stand_dev = mean_table[feature]['Std dev']
            mea_nn = mean_table[feature]['Mean']
            probabilities = []
            for x in range(len(stand_dev)):
                probability = 1/(math.sqrt(2*math.pi)*stand_dev[x]) * math.e ** (-1 * ((value - mea_nn[x])**2)/(2 * stand_dev[x]**2))
                probabilities.append(probability)
                
            #print(probabilities)
            probability_label = probability_label * probabilities
                            
    index_max_class = probability_label.argmax()
    prediction = probability_table["label_types"][index_max_class]
    
    return prediction

## Code Explanation
This function returns the prediction of a single row. First, we need to get the probability of the row being "1", being "2", and being "3" (stored inside probability_label). To do that, we use our "probability_table" dictionary to get the "label_count" array ('label_count': array([ 327, 1581, 1016]) and then dividing them with the total rows to get the probability.

For every features inside the row, we get the value. Then, using use_perc list, we filter out features that are categorical and continuous. 

If the feature is categorical, we simply take out the probabilities stored inside probability_table. Another check is performed if the one of the probability is equal to zero (zero counts/occurrences). If the probability is indeed zero, we use Laplace Estimator to calculate the probability, instead of using the probability_table. The probabilities are then multiplied by the probability_label. The multiplied value are then assigned to probability_label, ready for the next for loop if there is one.

If the feature is continuous, we need to get the mean and standard deviation of the feature from the mean_table. Then, using the Gaussian Naive Bayes formula, we get the three probabilities, which are then multipled to probability_label and stored to probability_label, readu for the next loop if there is one.

After all the features are looped, the index of the highest value in probability_label, which has three probability values, are taken and stored into index_max_class. The prediction for the row is taken from the value of dictionary with key "label_types" ('label_types': array([1, 2, 3]) and getting the value (1, 2 or 3) inside the array using index_max_class, which is either 0, 1, or 2.

In [12]:
preds_train = df_train_x.apply(predict_example, axis=1, args=(prob_table, cont_table, usePerc, useMean,))
preds = df_test_x.apply(predict_example, axis=1, args=(prob_table, cont_table, usePerc, useMean,))

## Code Explanation
Here, for every row (axis=1) in df_train_x and df_test_x, we apply the function predict_example to return the prediction.

# Model Evaluation

In [13]:
predictions_correct_train = preds_train == df_train_y
accuracy_train = predictions_correct_train.mean()
print(f"Train Accuracy: {accuracy_train:.3f}")

Train Accuracy: 0.742


In [14]:
predictions_correct = preds == df_test_y
accuracy = predictions_correct.mean()
print(f"Test Accuracy: {accuracy:.3f}")

Test Accuracy: 0.738


## Findings
As shown above, our Naive Bayes model has an accuracy of 74.2% for the train data set and an accuracy of 73.8% for the test data set. This shows that the model isn't overfitted as there is no significant difference between both of them.