In [32]:
import math
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import StandardScaler

In [33]:
class NaiveBayesTextClassifier:
    """Naive Bayes text classifier with add-one (Laplace) smoothing.

    Training estimates, for every class, a prior (share of training rows that
    belong to the class) and a smoothed likelihood for every token seen in
    that class. Classification multiplies the prior by the likelihood of each
    token of the sample and ranks the classes by the resulting score.
    """

    def __init__(self, classes, text_column_name, classes_column_name, tokenizer):
        """Store the configuration; no statistics are computed until ``train``.

        Args:
            classes: Iterable of class labels to model.
            text_column_name: Name of the DataFrame column holding the text.
            classes_column_name: Name of the DataFrame column holding the label.
            tokenizer: Callable mapping one text to an iterable of tokens.
        """
        self.relative_frequencies = {}  # class -> {token: smoothed likelihood}
        self.classes_probabilities = {}  # class -> prior probability
        self.classes = classes
        self.classes_column_name = classes_column_name
        self.text_column_name = text_column_name
        self.row_count_by_class = {}  # class -> number of training rows
        self.tokenizer = tokenizer

    def train(self, data_df):
        """Estimate class priors and smoothed token likelihoods from ``data_df``.

        Args:
            data_df: DataFrame containing the text and class columns named in
                the constructor.

        Raises:
            ValueError: If ``data_df`` has no rows (priors would be undefined).
        """
        total_rows = len(data_df)
        if total_rows == 0:
            raise ValueError("cannot train on an empty DataFrame")

        for c in self.classes:
            class_df = data_df[data_df[self.classes_column_name] == c]
            class_rows = len(class_df)
            self.row_count_by_class[c] = class_rows
            self.classes_probabilities[c] = class_rows / total_rows

            # Count every token occurrence across all rows of this class.
            token_counts = Counter()
            for text in class_df[self.text_column_name]:
                for token in self.tokenizer(text):
                    token_counts[token] += 1

            # Add-one smoothing over (rows in class + number of classes).
            # NOTE(review): the numerator counts occurrences while the
            # denominator counts rows, so a token repeated inside rows can get
            # a value above 1 -- confirm whether per-row presence was intended.
            denominator = class_rows + len(self.classes)
            self.relative_frequencies[c] = {
                token: (count + 1) / denominator
                for token, count in token_counts.items()
            }

    def classify(self, sample, log_scores=False):
        """Score ``sample`` against every class, best class first.

        Args:
            sample: Raw text to classify; it is passed to the tokenizer.
            log_scores: When False (default) each score is the product
                ``prior * likelihood(token_1) * ...``. When True the natural
                log of that product is accumulated as a sum instead, which
                keeps long samples from underflowing to 0.0 for every class.

        Returns:
            dict mapping class -> score, ordered by descending score. Tokens
            not seen for a class during training use the smoothed value
            ``1 / (rows in class + number of classes)``.

        Raises:
            KeyError: If a class has no trained statistics (``train`` was not
                called).
        """
        # Materialize once: the tokens are re-read for every class, which
        # would silently yield nothing after the first class for a generator.
        tokens = list(self.tokenizer(sample))
        scores = {}

        for c in self.classes:
            prior = self.classes_probabilities[c]
            likelihoods = self.relative_frequencies[c]
            unseen = 1 / (self.row_count_by_class[c] + len(self.classes))

            if log_scores:
                # A class with no training rows has prior 0, i.e. log-score -inf.
                score = math.log(prior) if prior > 0 else float("-inf")
                for token in tokens:
                    score += math.log(likelihoods.get(token, unseen))
            else:
                score = prior
                for token in tokens:
                    score *= likelihoods.get(token, unseen)

            scores[c] = score

        return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))


In [34]:
def tokenize(text: str):
    """Split ``text`` into tokens on runs of whitespace.

    No case folding or punctuation stripping is applied, so ``"Messi,"`` and
    ``"Messi"`` are distinct tokens. An empty or all-whitespace string yields
    an empty list.
    """
    tokens = text.split()
    return tokens


In [35]:
# Load the tab-separated news dataset; its first line holds the column names.
data_df = pd.read_csv("./Noticias_argentinas.txt", sep="\t", header=0)
classes = ["Economia", "Salud", "Ciencia y Tecnologia", "Deportes"]

# Keep only the rows whose category is one of the classes being modelled.
is_selected_category = data_df["categoria"].isin(classes)
data_df = data_df[is_selected_category]

# Fit the classifier on headlines ("titular") labelled by "categoria".
nbclassifier = NaiveBayesTextClassifier(
    classes=classes,
    text_column_name="titular",
    classes_column_name="categoria",
    tokenizer=tokenize,
)
nbclassifier.train(data_df)

sample_columns = data_df.columns.drop("categoria")

# Print the per-class scores (best class first) for a few sample headlines.
for headline in (
    "Histórico: Los Pumas derrotaron por primera vez a los All Blacks en Nueva Zelanda",
    "Maradona negó haber criticado a Messi, disparó otra vez contra Scaloni y también la ligó Solari",
    "Guzmán",
):
    print(nbclassifier.classify(headline))


{'Deportes': 1.0923717845242864e-31, 'Economia': 2.3878352702903526e-33, 'Ciencia y Tecnologia': 2.0081003185176624e-34, 'Salud': 8.598773060888454e-36}
{'Deportes': 3.582880382438505e-33, 'Economia': 3.16071890106868e-42, 'Ciencia y Tecnologia': 2.575323374214025e-43, 'Salud': 4.424901195844235e-44}
{'Deportes': 6.484258102769242e-05, 'Ciencia y Tecnologia': 6.484251139620591e-05, 'Economia': 6.484240667796619e-05, 'Salud': 6.484223142111342e-05}
