# Documentation

# Sample

In [1]:
import sys 
!{sys.executable} -m pip install nltk 
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install seaborn
!{sys.executable} -m pip install sklearn

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting tqdm
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting joblib
  Using cached joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Collecting regex>=2021.8.3
  Downloading regex-2022.3.15-cp310-cp310-macosx_10_9_x86_64.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.9/288.9 KB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tqdm, regex, joblib, nltk
Successfully installed joblib-1.1.0 nltk-3.7 regex-2022.3.15 tqdm-4.64.0
Collecting matplotlib
  Downloading matplotlib-3.5.1-cp310-cp310-macosx_10_9_x86_64.whl (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

products_data = pd.read_csv("produtos.csv", delimiter=';', encoding='utf-8')

# Concatenate the name and description columns. The explicit space keeps the last
# word of the name and the first word of the description as separate tokens
# (without it they are glued into a single word). A NaN in either column still
# propagates to NaN here, so such rows are removed by the dropna below.
products_data['informacao'] = products_data['nome'] + ' ' + products_data['descricao']

# Drop rows whose text or category is NaN, then discard the raw text columns.
# Reassigning instead of using inplace=True keeps each step a plain expression.
products_data = (
    products_data
    .dropna(subset=['informacao', 'categoria'])
    .drop(columns=['nome', 'descricao'])
)

stop_words = set(stopwords.words("portuguese"))
tokenizer = nltk.RegexpTokenizer(r"\w+")

# Lower-case, tokenize on word characters, and only then remove stopwords.
# Filtering after tokenization means a stopword written next to punctuation
# (e.g. "de," or "(para") is still recognised and removed.
products_data['tokens'] = products_data['informacao'].str.lower().apply(
    lambda text: [token for token in tokenizer.tokenize(text) if token not in stop_words]
)
products_data = products_data.drop(columns=['informacao'])  # the raw text is no longer needed

# Join each token list back into one space-separated string for the vectorizer.
products_data["strings"] = products_data["tokens"].str.join(" ")
products_data.head()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Split the data into train and test sets (20% held out); the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    products_data["strings"],
    products_data["categoria"],
    test_size=0.2,
    random_state=10,
)

# CountVectorizer: learn the vocabulary from the training split only.
# Fitting on the full corpus would let words that occur only in the held-out rows
# shape the feature space, leaking test information into the evaluation.
vect = CountVectorizer()
vect.fit(X_train)

# Bundle the vectorized splits, their labels, and the fitted vectorizer.
dataset = {
    "X_train": vect.transform(X_train),
    "X_test": vect.transform(X_test),
    "y_train": y_train,
    "y_test": y_test,
    "vect": vect,
}

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the vectorized training split.
# `fit` returns the estimator itself, so construction and training can be chained.
clf = MultinomialNB().fit(dataset["X_train"], dataset["y_train"])

# Keep the classifier together with the vectorizer that produced its features,
# so that new text can be transformed with the same vocabulary before predicting.
model = dict(clf=clf, vect=dataset["vect"])

In [None]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Multinomial NB: predict a category for every row of the held-out split.
y_prediction = model["clf"].predict(dataset["X_test"])

# accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
# ground-truth labels first so the call matches that signature.
accuracy = accuracy_score(dataset["y_test"], y_prediction)

# Leave the score as the cell's last expression so it is actually displayed.
accuracy


In [None]:
# Sample product title to classify. It is wrapped in a list because it is passed
# to model["vect"].transform(...) further down, which takes a collection of
# documents rather than a single bare string.
input_message = ["Figura Transformers Prime War Deluxe - E9687 - Hasbro"]

In [None]:
print(input_message)
input_message = model["vect"].transform(input_message)

In [None]:
final_prediction = model["clf"].predict(input_message)[0]
print("Predicted value: " + final_prediction)