In [1]:
import pandas as pd
from collections import Counter

In [2]:
def read_dataset(file_path: str) -> pd.DataFrame:
    dataset = pd.read_csv(file_path)
    return dataset

***Prior Class*** ________________ **P(c) = Nc / N**

In [3]:
def calculate_class_count(dataset, unique_class, class_attribute):
    """
        Returns number of occurrences of given class
    """
    unique_class_column = dataset.loc[:,[class_attribute]]
    class_count = 0
    for _,row in unique_class_column.iterrows():
        dataset_class = row[0]
        if dataset_class == unique_class: 
            class_count += 1
    return class_count

def calculate_document_count(dataset):
    document_count = len(dataset)
    return document_count

def calculate_class_probibility(unique_class_count, document_count): 
    """
        Returns probibility of given class
        
        Prior Class => P(c) = Nc / N
    """
    class_probibility = unique_class_count / document_count
    return class_probibility

***Liklihood***   ____________  **P(w |c) = count(w,c) + 1 / count(c) + |V|**

In [4]:
def get_unique_tokens(testdata, tokens_attribute):
    """    
        get all the words, make their set to get word's occuurrence once,
        then returns all those in a list
    """
    tokens_df = testdata.loc[:,[tokens_attribute]]
    test_tokens_set = set()
    for _,row in tokens_df.iterrows():
        tokens = row[0].split(" ")
        for token in tokens:
            test_tokens_set.add(token)
    unique_test_tokens = list(test_tokens_set)
    return (unique_test_tokens)


def count_class_unique_token(dataset, unique_token, unique_class, unique_test_tokens, \
                                            tokens_attribute, class_attribute):
    """
        Returns count of given token within class
    """
    origional_data = dataset.loc[:,[tokens_attribute, class_attribute]]
    token_count = 0
    for _,row in origional_data.iterrows():
        words_row = row[0].split(" ")
        df_class = row[1] 
        if df_class == unique_class:
            
            for token in words_row:
                
                if token in unique_test_tokens and token == unique_token:   
                    token_count += 1
    return token_count

def count_total_class_tokens(dataset, unique_token, unique_class, tokens_attribute, class_attribute):
    """
        Returns count of all the words in given class
    """
    origional_data = dataset.loc[:,[tokens_attribute, class_attribute]]
    tokens_count = 0
#     filtered_data = origional_data[origional_data["Class"] == unique_class]
    for _,row in origional_data.iterrows():
        words_row = row[0].split(" ")
        df_class = row[1] 
        if df_class == unique_class:
            for token in words_row:  
                tokens_count += 1
    return tokens_count

def count_vucabulary_tokens(dataset, tokens_attribute, class_attribute):
    """
        get all words, make their set to get unique set, adds resultant set's occurrences and returns it
    """
    origional_data = dataset.loc[:3,[tokens_attribute,class_attribute]]
    tokens_set = set()
    for _,row in origional_data.iterrows():
        words_row = row[0].split(" ")
        for token in words_row: 
            tokens_set.add(token)
    vucabulary_list = list(tokens_set)
    vucabulary_count = 0
    for i in vucabulary_list:
        vucabulary_count += 1
    
    return vucabulary_count

def calculate_hypothesis(class_unique_token_count, tokens_count, vucablary_count):
    """
        Returns hypothesis of given class,
        
        Hypothesis Formula:
        Liklihood => P(w |c) = count(w,c) + 1 / count(c) + |V|
        
        count(w,c): count the occurrencesof specific word in given class
        count(c): count of all the words in given class
        V: count of occurances of set of all words within dataset
    """
    hypothesis = (class_unique_token_count + 1) / (total_class_tokens + vucablary_count)
    return hypothesis

***Maximum posterior***____ **P(c|d) = P(c|d) * P(c)**

In [5]:
def get_unique_classes(dataset, class_attribute):
    """
        Returns count of given word in test class
    """
    sub_df = dataset.loc[:,[class_attribute]]
    unique_classes = set()
    for _,row in sub_df.iterrows():
        distinct_class= row[0]
        unique_classes.add(distinct_class)
    unique_classes = list(unique_classes)
    return unique_classes

In [6]:
def count_unique_token_testdata(testdata, unique_token, tokens_attribute):
    """
        Returns count of given word in testdata
    """
    sub_testdata = testdata.loc[:,[tokens_attribute]]
    unique_class_count = 0
    for _,row in sub_testdata.iterrows():
        test_tokens = row[0].split(" ")
        for token in test_tokens:
            if token == unique_token:
                unique_class_count += 1
    return unique_class_count

In [7]:
def calculate_class_posterior(hypothesis, class_probibility, unique_test_tokens, test_class_unique_token_count):
    """
        Returns posterior of a class 
        
        Class Posterior => P(c|d) = P(w|c) * P(c)
        
        P(w|c): Product of all the hypothesis of words in given class
        P(c): Probability of give class
    """
    product_hypothesis_into_class_count = 1 
    for word in unique_test_tokens:
        hypothesis_into_class_count = hypothesis * test_class_unique_token_count
        product_hypothesis_into_class_count *= hypothesis_into_class_count
    class_posterior = class_probibility * product_hypothesis_into_class_count
    return class_posterior

## Naive Bayes

In [8]:
if __name__ == '__main__':
    """
        This is the main entrypoint, orchestrating the Naive Bayes algorithm
    """
    
    dataset = read_dataset('dataset.csv')
    testdata = read_dataset('testdata.csv')
    class_attribute = input("Enter class column name: ")
    tokens_attribute = input("Enter words column name: ")
    
    classes_posterior = {}
    
    unique_test_tokens = get_unique_tokens(testdata, tokens_attribute)
    document_count = calculate_document_count(dataset) 
    unique_classes = get_unique_classes(dataset, class_attribute)
    vucablary_count = count_vucabulary_tokens(dataset, tokens_attribute, class_attribute)
    for unique_token in unique_test_tokens:
        testdata_unique_token_count = count_unique_token_testdata(testdata, unique_token, tokens_attribute)
    for unique_class in unique_classes:
        unique_class_count = calculate_class_count(dataset, unique_class, class_attribute)
        class_probibility = calculate_class_probibility(unique_class_count, document_count)
        for unique_token in unique_test_tokens:
            
            class_unique_token_count = count_class_unique_token(dataset, unique_token, unique_class,\
                                                                         unique_test_tokens, tokens_attribute,\
                                                                         class_attribute)

            total_class_tokens = count_total_class_tokens(dataset, unique_token, unique_class,\
                                                                tokens_attribute, class_attribute)  
            hypothesis = calculate_hypothesis(class_unique_token_count,total_class_tokens,\
                                                                vucablary_count)

        class_posterior = calculate_class_posterior(hypothesis, class_probibility, unique_test_tokens, testdata_unique_token_count)

        print(f"\nclass_posterior of {unique_class}: ", class_posterior)
        classes_posterior[unique_class] = class_posterior
    maximum_posterior = max(classes_posterior)
    print("\nmaximum_posterior: ",maximum_posterior)

Enter class column name: Class
Enter words column name: Words

class_posterior of j:  0.0027434842249657062

class_posterior of c:  0.0002733236151603498

maximum_posterior:  j
