In [1]:
import re
import pickle
import pandas as pd
from textblob import TextBlob
from spellchecker import SpellChecker
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [2]:
RANDOM_SEED = 8

In [3]:
df_conta = pd.read_excel("sentences/sentences_conta.xlsx")
df_clima = pd.read_excel("sentences/sentences_clima.xlsx")
df_eletro = pd.read_excel("sentences/sentences_eletro.xlsx")
dfs = [df_conta, df_clima, df_eletro]

## Cleaning the Datasets

In [4]:
for df in dfs:
    df["Sentença"] = df["Sentença"].str.lower()
    df["Sentença"] = df["Sentença"].str.replace(r"\s\-\s|\-\-+", " ", regex=True)
    df["Sentença"] = df["Sentença"].str.replace(r"[^\w\s\-]", " ", regex=True)
    df["Sentença"] = df["Sentença"].str.replace("foxbot ", "", regex=False)

## Adapting Data for MultinomialNB

In [5]:
# Adapts data to be used by the MultinomialNB model and already splits into X and y.
def adaptToModel(df, vectorizer, objective=None):
    txts = df["Sentença"].tolist()
    if objective == "train":
        counts = vectorizer.fit_transform(txts)
        return counts, df["Intenção"]
    elif objective == "test":
        counts = vectorizer.transform(txts)
        return counts, df["Intenção"]
    else:
        raise ValueError("Defina o objetivo ('train' ou 'test')")

## Split Datasets into Train and Test

In [6]:
# Splits dataset into stratified train and test sets.
models = {
    "conta" : {"model" : MultinomialNB(), "vectorizer" : CountVectorizer(), "df" : df_conta},
    "clima" : {"model" : MultinomialNB(), "vectorizer" : CountVectorizer(), "df" : df_clima},
    "eletro" : {"model" : MultinomialNB(), "vectorizer" : CountVectorizer(), "df" : df_eletro}
}
for sub_class in models.keys():
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_SEED)
    for train_index, test_index in split.split(models[sub_class]["df"], models[sub_class]["df"]["Intenção"]):
        strat_train_set = models[sub_class]["df"].loc[train_index]
        strat_test_set = models[sub_class]["df"].loc[test_index]
    models[sub_class]["train"] = strat_train_set
    models[sub_class]["test"] = strat_test_set

## Training the Models

In [7]:
for sub_class in models.keys():
    X_train, y_train = adaptToModel(models[sub_class]["train"], vectorizer=models[sub_class]["vectorizer"], objective="train")
    models[sub_class]["model"].fit(X_train, y_train)


## Testing the Models

In [8]:
for sub_class in models.keys():
    X_test, y_test = adaptToModel(models[sub_class]["test"], vectorizer=models[sub_class]["vectorizer"], objective="test")
    pred = cross_val_predict(models[sub_class]["model"], X_test, y_test, cv=3, n_jobs=-1)
    score = accuracy_score(y_test, pred)
    models[sub_class]["accuracy"] = score

In [9]:
for sub_class in models.keys():
    print(f"Acurácia {sub_class}: {models[sub_class]['accuracy'] * 100:.2f}%")

Acurácia conta: 91.67%
Acurácia clima: 85.19%
Acurácia eletro: 95.24%


## Saving the Models

In [10]:
for sub_class in models.keys():
    pickle.dump(models[sub_class]["model"], open(f"models/model_{sub_class}_v0.sav", "wb"))
    pickle.dump(models[sub_class]["vectorizer"], open(f"vectorizers/vectorizer_{sub_class}_v0.sav", "wb"))