# Função NLP
## Versão 1.0.0 
### Versão Python utilizada: Python 3.11.3 (Anaconda) | IDE: VSCode

In [1]:
import numpy as np
import pandas as pd
import nltk
import unidecode 
import os
import re
import matplotlib.pyplot as plt
import sklearn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
dataset = pd.read_csv('dados.csv')

In [15]:
def NLP(dataset,algoritmo): # Input: Dataset inteiro: 2 colunas: texto + rótulo ; qual algoritmo usar

    # Bloco 1: Processamento do texto

    corpus = []
    for i in range(0,dataset.shape[0]):
        texto = re.sub('[^a-zA-Z]',' ',unidecode.unidecode(dataset['texto'][i]) )
        texto = texto.lower()
        texto = texto.split()
        ps = PorterStemmer()
        texto = [ps.stem(word) for word in texto if not word in set(stopwords.words('portuguese'))]
        texto = ' '.join(texto)
        corpus.append(texto)

    # Bloco 2: Embedding do texto

    # BoW
    cv=CountVectorizer(max_features = 5000)
    X=cv.fit_transform(corpus).toarray()
    y=dataset.iloc[:,1].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

    # Bloco 3 : Algorítmos de NLP

    if algoritmo == "rf":

    # Random forest
        model_RF=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
        model_RF.fit(X_train,y_train)

        y_pred_RF = model_RF.predict(X_test)
        return(confusion_matrix(y_test, y_pred_RF))

    elif algoritmo == "xgb":

    # XGBoost:
        model_XGB = XGBClassifier(n_estimators=400, max_depth=2, learning_rate=1, objective='binary:logistic')
        model_XGB.fit(X_train, y_train)

        y_pred_XGB = model_XGB.predict(X_test)
        return(confusion_matrix(y_test, y_pred_XGB))

    elif algoritmo == "cb":

    # Catboost:
        model_CB = CatBoostClassifier(iterations=3,
                            depth=3,
                            learning_rate=1,
                            loss_function='Logloss',
                            verbose=True)
        model_CB.fit(X_train, y_train)
        y_pred_CB = model_CB.predict(X_test)
        return(confusion_matrix(y_test, y_pred_CB))

    elif algoritmo == "gbc":

    # Gradient Boosting Classifier:
        model_GBC = GradientBoostingClassifier()
        model_GBC.fit(X_train, y_train)

        y_pred_gbc = model_GBC.predict(X_test)
        return(confusion_matrix(y_test, y_pred_gbc))

    elif algoritmo == "svm":

    # SVM:
        model_SVM = svm.SVC()
        model_SVM.fit(X_train, y_train)

        y_pred_svm = model_SVM.predict(X_test)
        return(confusion_matrix(y_test, y_pred_svm))

    elif algoritmo == "gnb":

    # Naive Bayes Gaussian:
        model_GNB = GaussianNB()
        model_GNB.fit(X_train, y_train)

        y_pred_GNB = model_GNB.predict(X_test)
        return(confusion_matrix(y_test, y_pred_GNB))

    elif algoritmo == "mnb":

    # Naive Bayes Multinomial:
        model_MNB = MultinomialNB()
        model_MNB.fit(X_train, y_train)

        y_pred_MNB = model_MNB.predict(X_test)
        return(confusion_matrix(y_test, y_pred_MNB))

    elif algoritmo == "cnb":

    # Naive Bayes Complement:
        model_CNB = MultinomialNB()
        model_CNB.fit(X_train, y_train)

        y_pred_CNB = model_CNB.predict(X_test)
        return(confusion_matrix(y_test, y_pred_CNB))

    else:
        model_RF=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
        model_RF.fit(X_train,y_train)     
        model_XGB = XGBClassifier(n_estimators=400, max_depth=2, learning_rate=1, objective='binary:logistic')
        model_XGB.fit(X_train, y_train)
        model_CB = CatBoostClassifier(iterations=3,
                            depth=3,
                            learning_rate=1,
                            loss_function='Logloss',
                            verbose=True)
        model_CB.fit(X_train, y_train)
        model_GBC = GradientBoostingClassifier()
        model_GBC.fit(X_train, y_train)
        model_SVM = svm.SVC()
        model_SVM.fit(X_train, y_train)
        model_GNB = GaussianNB()
        model_GNB.fit(X_train, y_train)
        model_MNB = MultinomialNB()
        model_MNB.fit(X_train, y_train)
        model_CNB = MultinomialNB()
        model_CNB.fit(X_train, y_train)

        # Bloco 4: Acumulador/organizador de resultados

        # Random Forest:
        y_pred_RF = model_RF.predict(X_test)
        cm1 = confusion_matrix(y_test, y_pred_RF)

        # XGBoost:
        y_pred_XGB = model_XGB.predict(X_test)
        cm2 = confusion_matrix(y_test, y_pred_XGB)

        # Catboost:
        y_pred_CB = model_CB.predict(X_test)
        cm3 = confusion_matrix(y_test, y_pred_CB)

        # Gradient Boosting Classifier:
        y_pred_gbc = model_GBC.predict(X_test)
        cm4 = confusion_matrix(y_test, y_pred_gbc)

        # SVM:
        y_pred_svm = model_SVM.predict(X_test)
        cm5 = confusion_matrix(y_test, y_pred_svm)

        # Naive Bayes Gaussian:
        y_pred_GNB = model_GNB.predict(X_test)
        cm6 = confusion_matrix(y_test, y_pred_GNB)

        # Naive Bayes Multinomial:
        y_pred_MNB = model_MNB.predict(X_test)
        cm7 = confusion_matrix(y_test, y_pred_MNB)

        # Naive Bayes Complement:
        y_pred_CNB = model_CNB.predict(X_test)
        cm8 = confusion_matrix(y_test, y_pred_CNB)

        # Bloco 5: Saída
        # Saída esperada: Acurária; F1-Score e Precisão de cada algoritmo | Acabei optanto pela matriz de confusão por enquanto, pela simplicidade de implementação. Futuramente irei alterar para o output solicitado.
        

        return cm1,cm2,cm3,cm4,cm5,cm6,cm7,cm8

In [16]:
NLP(dataset,algoritmo="rf")

array([[495, 139],
       [209, 332]], dtype=int64)

In [17]:
NLP(dataset=dataset,algoritmo="todos")

0:	learn: 0.6779021	total: 178ms	remaining: 356ms
1:	learn: 0.6681406	total: 192ms	remaining: 96.2ms
2:	learn: 0.6608500	total: 206ms	remaining: 0us


(array([[495, 139],
        [209, 332]], dtype=int64),
 array([[454, 180],
        [219, 322]], dtype=int64),
 array([[618,  16],
        [509,  32]], dtype=int64),
 array([[588,  46],
        [379, 162]], dtype=int64),
 array([[593,  41],
        [425, 116]], dtype=int64),
 array([[217, 417],
        [ 79, 462]], dtype=int64),
 array([[462, 172],
        [221, 320]], dtype=int64),
 array([[462, 172],
        [221, 320]], dtype=int64))