In [41]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import StandardScaler

In [42]:
class NaiveBayesTextClassifier:
    """Naive Bayes classifier over tokenized text stored in a DataFrame column.

    Training records, for every configured class, the class prior (share of
    training rows carrying that label) and a smoothed relative frequency for
    each token seen in that class::

        (token_occurrences + 1) / (rows_in_class + number_of_classes)

    A token that was never seen for a class is scored with
    ``1 / (rows_in_class + number_of_classes)`` at classification time.
    """

    # Name of the column that ``test`` adds to its result with the prediction.
    predicted_class_column_name = 'prediccion'

    def __init__(self, classes, text_column_name, classes_column_name, tokenizer):
        """Store the configuration; no training happens here.

        Args:
            classes: sequence of class labels to model.
            text_column_name: name of the DataFrame column holding the text.
            classes_column_name: name of the DataFrame column holding the label.
            tokenizer: callable that receives one text value and returns an
                iterable of tokens.
        """
        self.relative_frequencies = {}  # class -> {token: smoothed frequency}
        self.classes_probabilities = {}  # class -> prior probability
        self.classes = classes
        self.classes_column_name = classes_column_name
        self.text_column_name = text_column_name
        self.row_count_by_class = {}  # class -> number of training rows
        self.tokenizer = tokenizer

    def train(self, data_df):
        """Estimate class priors and smoothed token frequencies from ``data_df``.

        Args:
            data_df: DataFrame containing the text column and the label column
                named in the constructor.

        Raises:
            ValueError: if ``data_df`` has no rows (priors would be undefined).
        """
        total_rows = len(data_df)
        if total_rows == 0:
            raise ValueError("cannot train on an empty DataFrame")

        labels = data_df[self.classes_column_name]
        class_count = len(self.classes)

        for c in self.classes:
            # Select only the text column: iterating a Series avoids building a
            # one-row DataFrame per training row.
            class_texts = data_df.loc[labels == c, self.text_column_name]
            class_rows = len(class_texts)

            self.row_count_by_class[c] = class_rows
            self.classes_probabilities[c] = class_rows / total_rows

            # Total occurrences of every token across all rows of this class.
            token_counts = Counter(
                token
                for text in class_texts
                for token in self.tokenizer(text))

            # NOTE(review): occurrences are counted per token, not per row, while
            # the denominator is a row count, so a token repeated within rows can
            # get a value above 1. Confirm whether per-row presence was intended.
            smoothing_denominator = class_rows + class_count
            self.relative_frequencies[c] = {
                token: (count + 1) / smoothing_denominator
                for token, count in token_counts.items()}

    def classify(self, sample):
        """Score ``sample`` against every class.

        Args:
            sample: a single text value accepted by the tokenizer.

        Returns:
            dict mapping each class to ``prior * product(token likelihoods)``,
            ordered from the highest score to the lowest. The scores are not
            normalized.
        """
        # Materialize the tokens once: they are re-read for every class, and a
        # tokenizer returning a generator would be exhausted after the first one.
        tokens = list(self.tokenizer(sample))
        class_count = len(self.classes)
        scores = {}

        for c in self.classes:
            likelihoods = self.relative_frequencies[c]
            # Score used for tokens never seen in this class during training.
            unseen_likelihood = 1 / (self.row_count_by_class[c] + class_count)

            # NOTE(review): this is a plain product of floats; for very long
            # samples it may underflow to 0.0 for every class.
            score = self.classes_probabilities[c]
            for token in tokens:
                score *= likelihoods.get(token, unseen_likelihood)

            scores[c] = score

        return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    def test(self, test_df):
        """Classify every row of ``test_df``.

        Only the text column is read, so ``test_df`` does not need to contain
        the label column.

        Args:
            test_df: DataFrame containing the text column named in the
                constructor.

        Returns:
            A copy of ``test_df`` with an extra column, named by
            ``predicted_class_column_name``, holding the highest-scoring class
            of each row. ``test_df`` itself is not modified.
        """
        predicted_classes = []

        for text in test_df[self.text_column_name]:
            scores = self.classify(text)
            predicted_classes.append(max(scores, key=scores.get))

        # append results column to new dataframe
        results_df = test_df.copy()
        results_df[self.predicted_class_column_name] = predicted_classes

        return results_df

    def get_confusion_matrix(self):
        """Placeholder: not implemented yet, currently returns ``None``."""
        pass

    def get_evaluation_metrics_by_class(self, test_results_df):
        """Placeholder: not implemented yet, currently returns ``None``."""
        pass

    def plot_roc_curve(self):
        """Placeholder: not implemented yet, currently returns ``None``."""
        pass


In [43]:
def tokenize(text: str):
    """Split ``text`` into tokens on runs of whitespace.

    Punctuation and letter case are left untouched, so ``"Hola,"`` and
    ``"hola"`` are different tokens. Returns a list of strings, empty when
    ``text`` is empty or contains only whitespace.
    """
    tokens = text.split()
    return tokens


In [44]:
# Categories kept for the experiment; rows with any other category are dropped.
classes = ["Economia", "Salud", "Ciencia y Tecnologia", "Deportes"]

# Load the tab-separated news dataset (first line is the header) and keep only
# the rows whose category is one of the selected classes.
data_df = pd.read_csv("./Noticias_argentinas.txt", sep='\t', header=0)
data_df = data_df.loc[data_df["categoria"].isin(classes)]

# Headlines ("titular") are the text; "categoria" is the label.
nbclassifier = NaiveBayesTextClassifier(
    classes=classes,
    text_column_name="titular",
    classes_column_name="categoria",
    tokenizer=tokenize,
)

# Reproducible 80/20 split: sample the training rows, the rest is the test set.
train_set = data_df.sample(frac=0.8, random_state=1)
test_set = data_df.drop(index=train_set.index)

nbclassifier.train(train_set)
print(nbclassifier.test(test_set))

# Every column except the label.
sample_columns = data_df.columns.drop("categoria")

# Score a few hand-picked headlines, printing one result per line.
print(*(nbclassifier.classify(headline) for headline in (
    "Histórico: Los Pumas derrotaron por primera vez a los All Blacks en Nueva Zelanda",
    "Maradona negó haber criticado a Messi, disparó otra vez contra Scaloni y también la ligó Solari",
    "Guzmán",
)), sep="\n")






                  fecha                                            titular  \
20     11/14/2018 10:44  Superclásico: River sacó a la venta más entrad...   
22      11/14/2018 9:36  Maradona negó haber criticado a Messi, disparó...   
35      11/14/2018 7:52  En lo que va del año, Iosper invirtió 160.000....   
38     11/14/2018 10:19  ¿Un milagro? el tumor cerebra de una paciente ...   
40     11/13/2018 22:09  Cómo hacer para recuperar un mensaje de Whatsa...   
...                 ...                                                ...   
30796   12/5/2018 16:48  Autorizaron a viajar a Madrid a Rafael Di Zeo,...   
30802   12/5/2018 17:35  El rival de Rafael Di Zeo por el comando de "L...   
30806   12/5/2018 14:39  Nos dio mucha alegría ver nacer al primer bebé...   
30818    12/5/2018 8:31  El Galaxy S10 de Samsung contará con 4 cámaras...   
30823   12/5/2018 17:20      Tumblr prohibirá los contenidos pornográficos   

                            fuente             categoria  \
20 