In [None]:
# Load libraries

# Data Manipulation
import pandas as pd
# import stanza
# stanza.download("es")

In [None]:
# Load the dataset
raw_df = pd.read_csv("./../datasets/initial_dataset.csv")

# Load stop words (one entry per line of the file).
# The encoding is pinned to UTF-8 so that any non-ASCII characters decode the
# same way on every platform instead of depending on the locale's default.
with open("stop_words.txt", "r", encoding="utf-8") as f:
    stop_words = f.read().splitlines()

# stop_words

In [None]:
# Preview the first five rows of the loaded dataset (five is head()'s default).
raw_df.head()

In [None]:
# Pipeline
from sklearn.pipeline import Pipeline
# TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Decision Tree Classifier -- imported from the public package path; the
# underscore-prefixed `sklearn.tree._classes` module is a private detail that
# may move between scikit-learn releases.
from sklearn.tree import DecisionTreeClassifier
# Split into train and test sets
from sklearn.model_selection import train_test_split
# Custom transformers
from custom_transformers import Cleaner, StopWordsRemover, Lemmatizer

# Work on a copy so the frame loaded from disk is not touched by this cell.
copy = raw_df.copy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    copy, copy['class'], test_size=0.2, random_state=42)

# Text preprocessing steps, followed by TF-IDF vectorization and the classifier.
pipeline = Pipeline(steps=[
    ("cleaner", Cleaner()),
    ("stop_words_remover", StopWordsRemover(stop_words=stop_words)),
    ("lemmatizer", Lemmatizer()),
    ("tfidf", TfidfVectorizer()),
    # The tree permutes features randomly at each split, so the seed is fixed
    # to make the fitted model reproducible (same seed as the split above).
    ("classifier", DecisionTreeClassifier(random_state=42)),
])

# Only the 'tweet' column is fed to the pipeline; 'class' is the target.
model = pipeline.fit(X=X_train['tweet'], y=y_train)

In [None]:
# model.set_params(classifier__max_depth=None)

In [None]:
# Look the vectorizer up by its step name rather than by position, so this
# keeps working if steps are added to or reordered in the pipeline.
feature_names = model.named_steps["tfidf"].get_feature_names_out()
print(feature_names.shape)
print(feature_names)

# Predict the class of every held-out tweet.
prediction = model.predict(X_test['tweet'])

# assign() builds a new frame instead of writing a column into the frame
# returned by train_test_split, which can raise pandas' SettingWithCopyWarning.
# Rebinding X_test keeps the 'prediction' column available under the same name.
X_test = X_test.assign(prediction=prediction)
X_test

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Evaluation
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

print("\n Reporte de Clasificación:")
print(classification_report(y_test, prediction))

# Confusion matrix: labels are taken from the full dataset, in the order in
# which they first appear in the 'class' column.
classes = copy['class'].unique()
cm = confusion_matrix(y_test, prediction, labels=classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
print('Matriz de Confusión:')
disp.plot()
plt.show()

# Per-class counts derived from the confusion matrix
# (rows = true labels, columns = predicted labels).
TP = np.diag(cm)
FP = cm.sum(axis=0) - TP
FN = cm.sum(axis=1) - TP
TN = cm.sum() - (FP + FN + TP)

# Number of correctly classified instances (sum of the diagonal)
correct = TP.sum()
print('\nInstancias clasificadas correctamente:\n\n', correct)

# Number of misclassified instances (everything off the diagonal)
incorrect = cm.sum() - correct
print('\nInstancias clasificadas incorrectamente:\n\n', incorrect)

# TP rate (recall) for each class
TPR = TP / (TP + FN)
print('\nTP Rate (Recall) por clase:\n\n', TPR)

# FP rate for each class
FPR = FP / (FP + TN)
print('\nFP Rate por clase:\n\n', FPR)

# Overall accuracy of the model on the test set
accuracy = accuracy_score(y_test, prediction)
print('\nExactitud:\n\n', accuracy)

In [None]:
# Don't truncate the output
# pd.set_option("display.max_colwidth", None)
# copy

In [None]:
import pickle
from pathlib import Path

# Save the model
model_path = Path("./../models/model.pkl")

# Create the target directory if it is missing; otherwise open() raises
# FileNotFoundError (e.g. on a fresh checkout without a models/ folder).
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, "wb") as f:  # wb = write binary
    pickle.dump(model, f)

In [None]:
import pickle

# Load the model
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# model files that were produced by this project.
with open("./../models/model.pkl", "rb") as f:  # rb = read binary
    model_loaded = pickle.load(f)

# Predict on the full dataset with the reloaded model. The result gets its own
# name so the test-set `prediction` used by the evaluation cell is not
# overwritten (re-running the evaluation would otherwise compare arrays of
# different lengths), and raw_df itself is left unmodified.
full_prediction = model_loaded.predict(raw_df['tweet'])
raw_df_predicted = raw_df.assign(prediction=full_prediction)
raw_df_predicted

In [None]:
# data = pd.read_csv("./data/dataset.csv")
# model_loaded.predict(data['text'])