# Acquisitor and Cleaner

In [5]:
import nltk
import unicodedata
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from marvin_python_toolbox.common.data import MarvinData

nltk.download('stopwords')
stop_words = stopwords.words('portuguese')

initial_dataset = pd.read_csv(
    MarvinData.download_file("https://s3.amazonaws.com/automl-example/produtos.csv"),
    delimiter=";", encoding='utf-8')

def remove_nonlatin(string):
    new_chars = []
    for char in string:
        if char == '\n':
            new_chars.append(' ')
            continue
        try:
            if unicodedata.name(char).startswith(('LATIN', 'SPACE')):
                new_chars.append(char)
        except:
            continue
    return ''.join(new_chars)

def pre_processor(text):
    stops = set(stopwords.words("portuguese"))
    text = remove_nonlatin(text)
    words = text.lower().split()
    words = ' '.join([w for w in words if not w in stops])
    return words

initial_dataset["text"] = initial_dataset["nome"] + " " + initial_dataset["descricao"]
initial_dataset.drop(["descricao", "nome"], axis=1, inplace=True)
initial_dataset.dropna(inplace=True)
initial_dataset['text'] = initial_dataset['text'].apply(pre_processor)

marvin_initial_dataset = initial_dataset

[nltk_data] Downloading package stopwords to /home/rafael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Training preparator

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test = train_test_split(
    marvin_initial_dataset["text"], 
    marvin_initial_dataset["categoria"], 
    test_size = 0.2, 
    random_state = 10
)

vect = CountVectorizer()
vect.fit(marvin_initial_dataset["text"])

encoder = LabelEncoder()
encoder.fit(marvin_initial_dataset["categoria"])

marvin_dataset = {
    "X_train": vect.transform(X_train),
    "X_test": vect.transform(X_test),
    "y_train": encoder.transform(y_train),
    "y_test" : encoder.transform(y_test),
    "vect": vect,
    "encoder": encoder
}

# Training

In [10]:
import autosklearn.classification as automl

clf = automl.AutoSklearnClassifier(
    include_preprocessors=["no_preprocessing",],
    exclude_preprocessors=None
)
clf.fit(marvin_dataset["X_train"], marvin_dataset["y_train"])

marvin_model = {
    "clf" : clf,
    "vect": marvin_dataset["vect"],
    "encoder": marvin_dataset["encoder"]
}



# Evaluation

In [11]:
from sklearn.metrics import classification_report


y_prediction = marvin_model["clf"].predict(marvin_dataset["X_test"])

report = classification_report(y_prediction, marvin_dataset["y_test"])

print(report)

marvin_metrics = report

             precision    recall  f1-score   support

          0       0.98      0.97      0.98       142
          1       0.98      1.00      0.99       122
          2       0.99      0.99      0.99       156
          3       1.00      0.98      0.99       164

avg / total       0.99      0.99      0.99       584



# Predictor preparation

In [12]:
input_message = {"text": """God of War"""}

In [13]:
input_message = marvin_model["vect"].transform([input_message["text"]])

# Predictor

In [14]:
final_prediction = marvin_model["clf"].predict(input_message)[0]

In [15]:
print(final_prediction)

1
