In [None]:
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical attributes with Laplace smoothing.

    Likelihoods are estimated per class as
    ``(count + 1) / (rows_in_class + distinct_values_of_attribute)`` so that a
    value never observed within a class still gets a non-zero probability and
    the likelihoods of every attribute add up to 1 inside each class.
    """

    def __init__(self, classes, classes_column_name):
        """Create an untrained classifier.

        Args:
            classes: iterable with the class labels to model.
            classes_column_name: name of the DataFrame column holding the label.
        """
        # class -> column -> value -> smoothed likelihood P(value | class)
        self.relative_frequencies = {}
        # class -> prior probability P(class)
        self.classes_probabilities = {}
        self.classes = classes
        self.classes_column_name = classes_column_name
        # class -> number of training rows labelled with that class
        self.row_count_by_class = {}
        # column -> number of distinct values observed in the training data
        self.value_count_by_column = {}

    def train(self, data_df, verbose=True):
        """Estimate class priors and smoothed attribute likelihoods.

        Args:
            data_df: DataFrame containing the attribute columns plus the class
                column named ``classes_column_name``.
            verbose: when True (the default) print the likelihood table and the
                prior of every class once they have been computed.
        """
        attribute_columns = data_df.columns.drop(self.classes_column_name)

        # The smoothing term of each attribute is its own cardinality, measured
        # over the whole training set rather than over a single class.
        self.value_count_by_column = {
            column_name: data_df[column_name].nunique()
            for column_name in attribute_columns
        }

        for c in self.classes:
            class_df = data_df[data_df[self.classes_column_name] == c].drop(self.classes_column_name, axis=1)
            class_size = len(class_df)
            self.row_count_by_class[c] = class_size
            self.classes_probabilities[c] = class_size / len(data_df)

            # value_counts tallies every distinct value of a column in one pass
            # (missing values are ignored by default).
            self.relative_frequencies[c] = {
                column_name: {
                    value: (count + 1) / (class_size + self.value_count_by_column[column_name])
                    for value, count in class_df[column_name].value_counts(sort=False).items()
                }
                for column_name in class_df.columns
            }

            if verbose:
                print(self.relative_frequencies[c])
                print(self.classes_probabilities[c])

    def classify(self, sample):
        """Return the class with the highest posterior score for ``sample``.

        Args:
            sample: single-row DataFrame whose columns are attribute names seen
                during training (see ``create_sample_df``).

        Returns:
            The class label maximizing ``P(class) * prod P(value | class)``.
            Ties are resolved in favour of the class listed first.
        """
        maximizing_class = None
        maximizing_prod = -1
        for c in self.classes:
            prod = self.classes_probabilities[c]

            for column_name in sample.columns:
                attr_likelihoods = self.relative_frequencies[c][column_name]
                # Positional access: the sample's index labels are irrelevant.
                value = sample[column_name].iloc[0]

                if value in attr_likelihoods:
                    prod *= attr_likelihoods[value]
                else:
                    # Value never seen with this class: smoothed zero count.
                    prod *= 1 / (self.row_count_by_class[c] + self.value_count_by_column[column_name])

            if prod > maximizing_prod:
                maximizing_prod = prod
                maximizing_class = c

        return maximizing_class

    def create_sample_df(self, columns, sample):
        """Wrap a list of attribute values into a single-row DataFrame.

        Args:
            columns: column names, in the same order as ``sample``.
            sample: attribute values of the observation to classify.
        """
        return pd.DataFrame([sample], columns=columns)

In [None]:
# Load the training data; the first CSV row provides the column names.
data_df = pd.read_csv("./PreferenciasBritanicos.csv", header=0)

# Fit the classifier on the full data set, using "Nacionalidad" as the label.
nbclassifier = NaiveBayesClassifier(["I", "E"], "Nacionalidad")
nbclassifier.train(data_df)

# Every column except the label is an attribute of the samples below.
columns = data_df.columns.drop("Nacionalidad")

# Classify each hand-written sample and print the predicted class.
for attribute_values in ([1, 0, 1, 1, 0], [0, 1, 1, 0, 1]):
    sample_df = nbclassifier.create_sample_df(columns, attribute_values)
    print(nbclassifier.classify(sample_df))
