In [1]:
import re
import os
import pickle
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [2]:
RANDOM_SEED = 36

In [3]:
data = pd.read_excel("new_sentences/newSentences.xlsx", index_col=0)
df_conta = data[(data["Intenção"] == "Consultar saldo da poupança") | (data["Intenção"] == "Consultar saldo da conta-corrente")].reset_index(drop=True)
df_clima = data[(data["Intenção"] == "Temperatura") | (data["Intenção"] == "Chuva")].reset_index(drop=True)
df_eletro = data[(data["Intenção"] == "Ar-condicionado") | (data["Intenção"] == "Luz")].reset_index(drop=True)

data["Intenção"] = data["Intenção"].str.replace("Consultar saldo da poupança", "Consultar saldo da conta", regex=False)
data["Intenção"] = data["Intenção"].str.replace("Consultar saldo da conta-corrente", "Consultar saldo da conta", regex=False)
data["Intenção"] = data["Intenção"].str.replace("Temperatura", "Obter informações relativas ao clima", regex=False)
data["Intenção"] = data["Intenção"].str.replace("Chuva", "Obter informações relativas ao clima", regex=False)
data["Intenção"] = data["Intenção"].str.replace("Ar-condicionado", "Interagir com a luz ou o ar-condicionado", regex=False)
data["Intenção"] = data["Intenção"].str.replace("Luz", "Interagir com a luz ou o ar-condicionado", regex=False)


dfs = [data, df_conta, df_clima, df_eletro]

In [4]:
data

Unnamed: 0,Sentença,Intenção
0,Hoje tem gol do Gabigol?,Não sei
1,O Thiago tem pipi grande?,Não sei
2,Mostre uma imagem do meu Ma,Não sei
3,Quem vai sair do BBB semana que vem?,Não sei
4,Quanto foi o jogo do Palmeiras hoje?,Não sei
5,Quantas estrelas existem no céu?,Não sei
6,Como tá a temperatura?,Obter informações relativas ao clima
7,Qual a chance de chover?,Obter informações relativas ao clima
8,"Ligue a luz, por gentileza",Interagir com a luz ou o ar-condicionado
9,"Desligue o ar, foxbot!",Interagir com a luz ou o ar-condicionado


## Cleaning the Dataset

In [5]:
for df in dfs:
    df["Sentença"] = df["Sentença"].str.lower()
    df["Sentença"] = df["Sentença"].str.replace(r"\s\-\s|\-\-+", " ", regex=True)
    df["Sentença"] = df["Sentença"].str.replace(r"[^\w\s\-]", " ", regex=True)
    df["Sentença"] = df["Sentença"].str.replace("foxbot ", "", regex=False)

## Adapting Data for MultinomialNB

In [6]:
# Adapts data to be used by the MultinomialNB model and already splits into X and y.
def adaptToModel(df, vectorizer, objective=None):
    txts = df["Sentença"].tolist()
    if objective == "train":
        counts = vectorizer.fit_transform(txts)
        return counts, df["Intenção"]
    elif objective == "test":
        counts = vectorizer.transform(txts)
        return counts, df["Intenção"]
    else:
        raise ValueError("Defina o objetivo ('train' ou 'test')")

## Split Datasets into Train and Test

In [7]:
def loadModels():
    modelTypes = ["main", "clima", "conta", "eletro"]
    models = {}
    vectorizers = {}
    files = os.listdir("./models")
    for modelType in modelTypes:
        finalNumber = 0
        if modelType == "main":
            filename = f"model_v{finalNumber}.sav"
            pattern = r"model_v\d+.sav"
            pattern2 = r"model_v(\d)+.sav"
        else:
            filename = f"model_{modelType}_v{finalNumber}.sav"
            pattern = fr"model_{modelType}_v\d+.sav"
            pattern2 = fr"model_{modelType}_v(\d)+.sav"
        for f in files:
            if re.fullmatch(pattern, f):
                current = int(re.sub(pattern2, r"\1", filename))
                possible = int(re.sub(pattern2, r"\1", f))
                if possible > current:
                    filename = f
                    finalNumber = possible
        if modelType == "main":
            model = pickle.load(open(f"models/model_v{finalNumber}.sav", "rb"))
            vectorizer = pickle.load(
                open(f"vectorizers/vectorizer.sav", "rb")
            )
        else:
            model = pickle.load(
                open(f"models/model_{modelType}_v{finalNumber}.sav", "rb")
            )
            vectorizer = pickle.load(
                open(f"vectorizers/vectorizer_{modelType}.sav", "rb")
            )
        models[modelType] = model
        vectorizers[modelType] = vectorizer
    return models, vectorizers, finalNumber

In [8]:
lastModels, lastVectorizers, num = loadModels()

In [9]:
df1 = pd.read_excel("sentences/sentences.xlsx")
df1_conta = pd.read_excel("sentences/sentences_conta.xlsx")
df1_clima = pd.read_excel("sentences/sentences_clima.xlsx")
df1_eletro = pd.read_excel("sentences/sentences_eletro.xlsx")

df1 = df1.append(data, ignore_index=True)
df1_conta = df1_conta.append(df_conta, ignore_index=True)
df1_clima = df1_clima.append(df_clima, ignore_index=True)
df1_eletro = df1_eletro.append(df_eletro, ignore_index=True)

In [10]:
# Splits dataset into stratified train and test sets.
models = {
    "main" : {
        "model" : lastModels["main"], "vectorizer" : lastVectorizers["main"], "newData" : data, "consolidated" : df1 
    },
    "conta" : {
        "model" : lastModels["conta"], "vectorizer" : lastVectorizers["conta"], "newData" : df_conta, "consolidated" : df1_conta
    },
    "clima" : {
        "model" : lastModels["clima"], "vectorizer" : lastVectorizers["clima"], "newData" : df_clima, "consolidated" : df1_clima
    },
    "eletro" : {
        "model" : lastModels["eletro"], "vectorizer" : lastVectorizers["eletro"], "newData" : df_eletro, "consolidated" : df1_eletro
    }
}

for sub_class in models.keys():
    classNames = models[sub_class]["newData"]["Intenção"].unique().tolist()
    
    X = models[sub_class]["vectorizer"].transform(models[sub_class]["newData"]["Sentença"].tolist())
    y = models[sub_class]["newData"]["Intenção"]
    models[sub_class]["model"].partial_fit(X, y, classes=classNames)

In [11]:
for sub_class in models.keys():
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_SEED)
    for train_index, test_index in split.split(models[sub_class]["consolidated"], models[sub_class]["consolidated"]["Intenção"]):
        strat_train_set = models[sub_class]["consolidated"].loc[train_index]
        strat_test_set = models[sub_class]["consolidated"].loc[test_index]
    models[sub_class]["train"] = strat_train_set
    models[sub_class]["test"] = strat_test_set

## Training the models

In [12]:
for sub_class in models.keys():
    X_train, y_train = adaptToModel(models[sub_class]["train"], vectorizer=models[sub_class]["vectorizer"], objective="train")
    models[sub_class]["model"].fit(X_train, y_train)


## Testing the models

In [13]:
for sub_class in models.keys():
    X_test, y_test = adaptToModel(models[sub_class]["test"], vectorizer=models[sub_class]["vectorizer"], objective="test")
    pred = cross_val_predict(models[sub_class]["model"], X_test, y_test, cv=5, n_jobs=-1)
    score = accuracy_score(y_test, pred)
    models[sub_class]["accuracy"] = score

In [14]:
for sub_class in models.keys():
    print(f"Acurácia {sub_class}: {models[sub_class]['accuracy'] * 100:.2f}%")

Acurácia main: 80.00%
Acurácia conta: 96.00%
Acurácia clima: 92.86%
Acurácia eletro: 100.00%


In [15]:
pd.DataFrame().to_excel("new_sentences/newSentences.xlsx")

## Saving the models

In [16]:
for sub_class in models.keys():
    if sub_class == "main":
        pickle.dump(models[sub_class]["model"], open(f"models/model_v{num + 1}.sav", "wb"))
    else:
        pickle.dump(models[sub_class]["model"], open(f"models/model_{sub_class}_v{num + 1}.sav", "wb"))
        