# Função NLP
## Versão 1.1.1 
### Versão Python utilizada: Python 3.11.3 (Anaconda) | IDE: VSCode

In [1]:
import numpy as np
import pandas as pd
import nltk
import unidecode 
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score


In [2]:
dataset = pd.read_csv('dados.csv')

In [3]:
def NLP(dataset,algoritmo): # Input: Dataset inteiro: 2 colunas: texto + rótulo ; qual algoritmo usar

    # Bloco 1: Processamento do texto

    corpus = []
    for i in range(0,dataset.shape[0]):
        texto = re.sub('[^a-zA-Z]',' ',unidecode.unidecode(dataset['texto'][i]) )
        texto = texto.lower()
        texto = texto.split()
        ps = PorterStemmer()
        texto = [ps.stem(word) for word in texto if not word in set(stopwords.words('portuguese'))]
        texto = ' '.join(texto)
        corpus.append(texto)

    # Bloco 2: Embedding do texto

    # BoW
    cv=CountVectorizer(max_features = 5000)
    X=cv.fit_transform(corpus).toarray()
    y=dataset.iloc[:,1].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

    # Bloco 3 : Algorítmos de NLP
    metricas = {}

    if algoritmo == "rf":

    # Random forest
        model_RF=RandomForestClassifier(n_estimators = 400,criterion="entropy",random_state = 0)
        model_RF.fit(X_train,y_train)

        y_pred_RF = model_RF.predict(X_test)
        metricas["Random Forest"] = {
            "acurácia": accuracy_score(y_test, y_pred_RF),
            "precisão": precision_score(y_test, y_pred_RF),
            "f1 score": f1_score(y_test, y_pred_RF)
        }

        df = pd.DataFrame(metricas)
        df = df.transpose()
        print(df)

    elif algoritmo == "xg":

    # XGBoost
        model_XGB = XGBClassifier(n_estimators = 400, max_depth = 2, learning_rate = 1, objective = 'binary:logistic')
        model_XGB.fit(X_train, y_train)
        y_pred_XGB = model_XGB.predict(X_test)
        metricas["XGBoost"] = {
            "acurácia": accuracy_score(y_test, y_pred_XGB),
            "precisão": precision_score(y_test, y_pred_XGB),
            "f1 score": f1_score(y_test, y_pred_XGB)
        }

        df = pd.DataFrame(metricas)
        df = df.transpose()
        print(df)
    elif algoritmo == "cb":

    # Catboost:
        model_CB = CatBoostClassifier(iterations = 3,
                            depth = 3,
                            learning_rate = 1,
                            loss_function = 'Logloss',
                            verbose = False)
        model_CB.fit(X_train, y_train)
        y_pred_CB = model_CB.predict(X_test)
        metricas["Catboost"] = {
            "acurácia": accuracy_score(y_test, y_pred_CB),
            "precisão": precision_score(y_test, y_pred_CB),
            "f1 score": f1_score(y_test, y_pred_CB)
        }

        df = pd.DataFrame(metricas)
        df = df.transpose()
        print(df)



    elif algoritmo == "gbc":

    # Gradient Boosting Classifier:
        model_GBC = GradientBoostingClassifier()
        model_GBC.fit(X_train, y_train)

        y_pred_gbc = model_GBC.predict(X_test)
        metricas["Gradient Boosting Classifier"] = {
            "acurácia": accuracy_score(y_test, y_pred_gbc),
            "precisão": precision_score(y_test, y_pred_gbc),
            "f1 score": f1_score(y_test, y_pred_gbc)
        }

        df = pd.DataFrame(metricas)
        df = df.transpose()
        print(df)

    elif algoritmo == "svm":

    # SVM:
        model_SVM = svm.SVC()
        model_SVM.fit(X_train, y_train)

        y_pred_svm = model_SVM.predict(X_test)
        metricas["SVM"] = {
            "acurácia": accuracy_score(y_test, y_pred_svm),
            "precisão": precision_score(y_test, y_pred_svm),
            "f1 score": f1_score(y_test, y_pred_svm)
        }

        df = pd.DataFrame(metricas)
        df = df.transpose()
        print(df)

    elif algoritmo == "gnb":

    # Naive Bayes Gaussian:
        model_GNB = GaussianNB()
        model_GNB.fit(X_train, y_train)

        y_pred_GNB = model_GNB.predict(X_test)
        metricas["Naive Bayes Gaussian"] = {
            "acurácia": accuracy_score(y_test, y_pred_GNB),
            "precisão": precision_score(y_test, y_pred_GNB),
            "f1 score": f1_score(y_test, y_pred_GNB)
        }

        df = pd.DataFrame(metricas)
        df = df.transpose()
        print(df)

    elif algoritmo == "mnb":

    # Naive Bayes Multinomial:
        model_MNB = MultinomialNB()
        model_MNB.fit(X_train, y_train)

        y_pred_MNB = model_MNB.predict(X_test)
        metricas["Naive Bayes Multinomial"] = {
            "acurácia": accuracy_score(y_test, y_pred_MNB),
            "precisão": precision_score(y_test, y_pred_MNB),
            "f1 score": f1_score(y_test, y_pred_MNB)
        }

        df = pd.DataFrame(metricas)
        df = df.transpose()
        print(df)

    elif algoritmo == "cnb":

    # Naive Bayes Complement:
        model_CNB = ComplementNB()
        model_CNB.fit(X_train, y_train)

        y_pred_CNB = model_CNB.predict(X_test)
        metricas["Naive Bayes Complement"] = {
            "acurácia": accuracy_score(y_test, y_pred_CNB),
            "precisão": precision_score(y_test, y_pred_CNB),
            "f1 score": f1_score(y_test, y_pred_CNB)
        }

        df = pd.DataFrame(metricas)
        df = df.transpose()
        print(df)

    else:
        model_RF = RandomForestClassifier(n_estimators = 400,criterion = "entropy",random_state = 0)
        model_RF.fit(X_train,y_train)     
        model_XGB = XGBClassifier(n_estimators = 400, max_depth = 2, learning_rate = 1, objective = 'binary:logistic')
        model_XGB.fit(X_train, y_train)
        model_CB = CatBoostClassifier(iterations = 3,
                            depth = 3,
                            learning_rate = 1,
                            loss_function = 'Logloss',
                            verbose = False)
        model_CB.fit(X_train, y_train)
        model_GBC = GradientBoostingClassifier()
        model_GBC.fit(X_train, y_train)
        model_SVM = svm.SVC()
        model_SVM.fit(X_train, y_train)
        model_GNB = GaussianNB()
        model_GNB.fit(X_train, y_train)
        model_MNB = MultinomialNB()
        model_MNB.fit(X_train, y_train)
        model_CNB = ComplementNB()
        model_CNB.fit(X_train, y_train)

        # Bloco 4: Acumulador/organizador de resultados
        metricas = {}

        # Random Forest:
        y_pred_RF = model_RF.predict(X_test)
        metricas["Random Forest"] = {
            "acurácia": accuracy_score(y_test, y_pred_RF),
            "precisão": precision_score(y_test, y_pred_RF),
            "f1 score": f1_score(y_test, y_pred_RF)
        }

        # XGBoost:
        y_pred_XGB = model_XGB.predict(X_test)
        metricas["XGBoost"] = {
            "acurácia": accuracy_score(y_test, y_pred_XGB),
            "precisão": precision_score(y_test, y_pred_XGB),
            "f1 score": f1_score(y_test, y_pred_XGB)
        }

        # Catboost:
        y_pred_CB = model_CB.predict(X_test)
        metricas["Catboost"] = {
            "acurácia": accuracy_score(y_test, y_pred_CB),
            "precisão": precision_score(y_test, y_pred_CB),
            "f1 score": f1_score(y_test, y_pred_CB)
        }

        # Gradient Boosting Classifier:
        y_pred_gbc = model_GBC.predict(X_test)
        metricas["Gradient Boosting Classifier"] = {
            "acurácia": accuracy_score(y_test, y_pred_gbc),
            "precisão": precision_score(y_test, y_pred_gbc),
            "f1 score": f1_score(y_test, y_pred_gbc)
        }

        # SVM:
        y_pred_svm = model_SVM.predict(X_test)
        metricas["SVM"] = {
            "acurácia": accuracy_score(y_test, y_pred_svm),
            "precisão": precision_score(y_test, y_pred_svm),
            "f1 score": f1_score(y_test, y_pred_svm)
        }

        # Naive Bayes Gaussian:
        y_pred_GNB = model_GNB.predict(X_test)
        metricas["Naive Bayes Gaussian"] = {
            "acurácia": accuracy_score(y_test, y_pred_GNB),
            "precisão": precision_score(y_test, y_pred_GNB),
            "f1 score": f1_score(y_test, y_pred_GNB)
        }

        # Naive Bayes Multinomial:
        y_pred_MNB = model_MNB.predict(X_test)
        metricas["Naive Bayes Multinomial"] = {
            "acurácia": accuracy_score(y_test, y_pred_MNB),
            "precisão": precision_score(y_test, y_pred_MNB),
            "f1 score": f1_score(y_test, y_pred_MNB)
        }

        # Naive Bayes Complement:
        y_pred_CNB = model_CNB.predict(X_test)
        metricas["Naive Bayes Complement"] = {
            "acurácia": accuracy_score(y_test, y_pred_CNB),
            "precisão": precision_score(y_test, y_pred_CNB),
            "f1 score": f1_score(y_test, y_pred_CNB)
        }

        # Bloco 5: Saída
        # Crie o DataFrame a partir do dicionário de métricas
        df = pd.DataFrame(metricas)

        # Transponha o DataFrame para que os nomes dos algoritmos sejam nas linhas e as métricas nas colunas
        df = df.transpose()

        # Imprima o DataFrame como uma tabela
        print(df)

In [4]:
NLP(dataset,algoritmo="rf")

               acurácia  f1 score  precisão
Random Forest   0.70383  0.656126  0.704883


In [5]:
NLP(dataset,algoritmo="cb")

          acurácia  f1 score  precisão
Catboost  0.553191  0.108659  0.666667


In [6]:
NLP(dataset=dataset,algoritmo="todos")

                              acurácia  precisão  f1 score
Random Forest                 0.703830  0.704883  0.656126
XGBoost                       0.660426  0.641434  0.617450
Catboost                      0.553191  0.666667  0.108659
Gradient Boosting Classifier  0.640000  0.783654  0.435247
SVM                           0.603404  0.738854  0.332378
Naive Bayes Gaussian          0.577872  0.525597  0.650704
Naive Bayes Multinomial       0.665532  0.650407  0.619555
Naive Bayes Complement        0.667234  0.629758  0.650581
