In [26]:
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import numpy as np


In [27]:
import numpy as np
import pandas as pd


class Metrics:
    @staticmethod
    def k_fold_cross_validation(x, y, k: int, x_column_names: list = None, y_column_names: list = None):
        if k <= 0 or k > len(x):
            raise ValueError(
                "k must be greater than 0 and less than the number of rows in the dataset")

        if len(x) != len(y):
            raise ValueError(
                "The number of rows in the dataset must be equal to the number of rows in the expected output")

        # Shuffle the dataset
        shuffled_dataset = list(zip(x, y))
        np.random.shuffle(shuffled_dataset)
        x, y = map(
            np.array, zip(*shuffled_dataset))

        fold_len = int(len(x) / k)
        folds = []

        print(x[0:2])
        print("============")
        print(y[0:2])

        # Split the dataset into k folds
        for i in range(k):
            x_test = x[i *
                       fold_len: (i + 1) * fold_len]
            y_test = y[i *
                       fold_len: (i + 1) * fold_len]

            x_train = np.concatenate(
                [x[:i * fold_len],
                 x[(i + 1) * fold_len:]])

            y_train = np.concatenate(
                [y[:i * fold_len],
                 y[(i + 1) * fold_len:]])

            # If df_columns is not None, then we need to create a dataframe for both sets
            if x_column_names is not None:
                x_train = pd.DataFrame(
                    x_train, columns=x_column_names)
                x_test = pd.DataFrame(
                    x_test, columns=x_column_names)
                y_train = pd.DataFrame(
                    y_train, columns=y_column_names)
                y_test = pd.DataFrame(
                    y_test, columns=y_column_names)

            # Load the test and train sets into the folds
            folds.append({
                'x_train': x_train,
                'y_train': y_train,
                'x_test': x_test,
                'y_test': y_test,
            })

        return folds

    @staticmethod
    def k_fold_cross_validation_eval(x, y, model, k: int, x_column_names: list = None, y_column_names: list = None):
        if model is None:
            raise ValueError("Model cannot be None")

        folds = Metrics.k_fold_cross_validation(
            x, y, k=k, x_column_names=x_column_names, y_column_names=y_column_names)

        # Evaluate the model on each fold
        results = []
        for fold in folds:
            # Train the model, on this case the expected output is present on the train set
            # so ignore the train_set_expected_output field
            x_train = fold['x_train']
            y_train = fold['y_train']
            model.train(pd.concat([x_train, y_train], axis=1))

            # Evaluate the model on the test set
            x_test = fold['x_test']
            y_test = fold['y_test']
            results.append(model.test(pd.concat([x_test, y_test], axis=1)))

        return results


In [28]:
class NaiveBayesTextClassifier():
    predicted_class_column_name = 'prediccion'

    def __init__(self, classes, text_column_name, classes_column_name, tokenizer):
        self.relative_frequencies = {}  # by_class_by_value
        self.classes_probabilities = {}
        self.classes = classes
        self.classes_column_name = classes_column_name
        self.text_column_name = text_column_name
        self.row_count_by_class = {}
        self.tokenizer = tokenizer

    def train(self, data_df):
        for c in self.classes:
            class_df = data_df[data_df[self.classes_column_name] == c]
            self.row_count_by_class[c] = len(class_df)
            self.classes_probabilities[c] = len(class_df) / len(data_df)

            values_appearances = {}
            # iterate over rows within class
            for i in range(len(class_df)):
                row = class_df.iloc[[i]]

                tokenized_text = self.tokenizer(
                    row[self.text_column_name].values[0])
                # iterate over values within row
                for token in tokenized_text:
                    # initialize possible value if not present in map, otherwise increment appereances
                    if token not in values_appearances:
                        values_appearances[token] = 1
                    else:
                        values_appearances[token] += 1

            # calculate relative frequencies
            self.relative_frequencies[c] = {token: (token_count + 1) / (len(class_df) + len(
                self.classes)) for token, token_count in values_appearances.items()}

    def classify(self, sample):
        tokenized_sample = self.tokenizer(sample)
        classification = {}

        maximizing_class = None
        maximizing_prod = -1
        for c in self.classes:

            prod = self.classes_probabilities[c]
            token_likelihoods = self.relative_frequencies[c]
            laplace_constant = 1 / \
                (self.row_count_by_class[c] + len(self.classes))

            for token in tokenized_sample:
                prod *= token_likelihoods[token] if token in token_likelihoods else laplace_constant

            classification[c] = prod

        return dict(sorted(classification.items(), key=lambda item: item[1], reverse=True))

    def test(self, test_df):
        predicted_classes = []

        for i in range(len(test_df)):
            row = test_df.iloc[[i]]
            row_class = row[self.classes_column_name].values[0]

            classification = self.classify(
                row[self.text_column_name].values[0])
            predicted_class = max(classification, key=classification.get)

            predicted_classes.append(predicted_class)

        # append results column to new dataframe
        results_df = test_df.copy()
        results_df[self.predicted_class_column_name] = predicted_classes

        return results_df

    def get_confusion_matrix(self):
        pass

    def get_evaluation_metrics_by_class(self, test_results_df):
        pass

    def plot_roc_curve(self):
        pass


In [29]:
def tokenize(text: str):
    return text.split()


In [30]:
data_df = pd.read_csv("./Noticias_argentinas.txt", header=0, sep='\t')
classes = ["Economia", "Salud", "Ciencia y Tecnologia", "Deportes"]

data_df = data_df[data_df["categoria"].isin(classes)]


nbclassifier = NaiveBayesTextClassifier(
    classes, "titular", "categoria", tokenize)
# Get expected output for dataset

y = data_df.loc[:, ["categoria"]]
x = data_df.loc[:, ["titular"]]
print(len(y))
# print("Expected output: ", expected_output)
print(Metrics.k_fold_cross_validation_eval(x.values.tolist(), y.values.tolist(
), model=nbclassifier, x_column_names=x.columns, y_column_names=y.columns, k=2))

# train_set = data_df.sample(frac=0.8, random_state=1)
# print("======================")
# print(train_set.index)
# test_set = data_df.drop(train_set.index)

# nbclassifier.train(train_set)
# print(nbclassifier.test(test_set))

# sample_columns = data_df.columns.drop("categoria")

# print(nbclassifier.classify("Histórico: Los Pumas derrotaron por primera vez a los All Blacks en Nueva Zelanda"))
# print(nbclassifier.classify("Maradona negó haber criticado a Messi, disparó otra vez contra Scaloni y también la ligó Solari"))
# print(nbclassifier.classify("Guzmán"))


15406


AttributeError: 'Series' object has no attribute 'columns'