# Dependencias adicionais

In [1]:
# Dependencias do preprocessamento
import nltk
nltk.download("stopwords")
!pip3 install unidecode joblib flask_restful
!pip install unidecode joblib flask_restful

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [59]:
# Necessario para treinar modelo usando os cuda cores
#!conda install -y -c rapidsai -c nvidia -c numba -c conda-forge -c anaconda cudf=0.9 cuml=0.9 cugraph=0.9 python=3.7 anaconda::cudatoolkit=10.0

# Solução

In [2]:
from google.cloud import bigquery
import pandas as pd
import missingno as msno
import nltk
import numpy as np

client = bigquery.Client(location="US")
print("Client creating using default project: {}".format(client.project))

Client creating using default project: hackathon-08


## Data Ingestion

In [45]:
query = """
SELECT
    eanIsbn,
    nome,
    categoria,
    subCategoriaNivel4,
    departamento,
    tipoProduto,
    cor,
    sabor,
    tamanho,
    skuReferencia,
    skuNetshoes,
    precoPor,
    precoDe,
    valorGenero,
    peso,
    marca,
    dimensao.altura,
    dimensao.largura,
    dimensao.profundidade,
    descricao,
    nomeGenero,
    CASE
      WHEN skuReferencia IS NOT NULL AND skuNetshoes IS NULL THEN skuReferencia
      ELSE skuNetshoes
    END as skuNetshoesReferencia
    FROM `hackathon-08.dadosbrutos.sellerProducts`
    WHERE 
    (skuReferencia IS NULL and statusProdutoLojista.statusLojista = "APROVADO" and skuNetshoes IS NOT NULL
    OR
    skuReferencia IS NOT NULL and statusProdutoLojista.statusLojista = "APROVADO" and skuNetshoes IS NULL
    OR
    skuReferencia IS NOT NULL and statusProdutoLojista.statusLojista = "APROVADO" and skuNetshoes IS NOT NULL)
    LIMIT 10000
"""
query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
)  # API request - starts the query

df_big_query = query_job.to_dataframe()

In [36]:
df_big_query.to_csv('raw_data.csv', index=False)

In [52]:
df_big_query = pd.read_csv('raw_data.csv')
df_big_query.columns

Index(['eanIsbn', 'nome', 'categoria', 'subCategoriaNivel4', 'departamento',
       'tipoProduto', 'cor', 'sabor', 'tamanho', 'skuReferencia',
       'skuNetshoes', 'precoPor', 'precoDe', 'valorGenero', 'peso', 'marca',
       'altura', 'largura', 'profundidade', 'descricao', 'nomeGenero',
       'skuNetshoesReferencia'],
      dtype='object')

# Data preprocessing

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer
import joblib 

In [54]:
def normalize(s):
    if str(s) == 'nan' or type(s) == type(None):
        return ""
    return unidecode(str(s)).lower()

In [56]:
def data_preprocessing(df, training=True):
    # Concatena colunas de texto
    df['descr'] = df.apply(lambda r: f"{r['nome']} {r['descricao']} {r['eanIsbn']}", axis=1)
    _ = df.pop('nome')
    _ = df.pop('descricao')
    _ = df.pop('eanIsbn')
    
    if training:
        _ = df.pop('skuReferencia')
        _ = df.pop('skuNetshoes')
    
    del _
    
    text_cols = ['categoria', 'subCategoriaNivel4', 'departamento', 'tipoProduto', 'cor', 'sabor', 'tamanho', 'valorGenero', 'marca', 'nomeGenero', 'descr']

    for col in text_cols:
        df[col] = df[col].apply(normalize)
        df[col] = pd.Categorical(df[col])

    for col in text_cols:
        if col == 'descr':
            continue
        dummies = pd.get_dummies(df[col], prefix=col, dummy_na=True).iloc[:, :-1]
        df = pd.concat([df, dummies], axis=1)
        _ = df.pop(col)
        
    stop_words = nltk.corpus.stopwords.words("portuguese")
    
    if training:
        tvec = TfidfVectorizer(min_df=.1, max_df=.2, stop_words=stop_words, ngram_range=(1, 2))
        tvec_weights = tvec.fit_transform(df.descr)
        joblib.dump(tvec, 'tvecModel.pkl')
    else:
        tvec = joblib.load('tvecModel.pkl')
        tvec_weights = tvec.transform(df.descr)
    
    descr_w = pd.DataFrame(tvec_weights.todense())

    _ = df.pop('descr')

    df = pd.concat([df, descr_w], axis=1)
    
    if training:
        y = df.pop('skuNetshoesReferencia').values
        
    X = df.iloc[:, :].values
    
    if training:
        # Treina Standard Scaler 
        std_scaler = StandardScaler()
        X = std_scaler.fit_transform(X)
        joblib.dump(std_scaler, 'StandardScalerModel.pkl')
    else:
        std_scaler = joblib.load('StandardScalerModel.pkl')
        print(X)
        X = std_scaler.transform(X)
    
    if training:
        imputer = Imputer(missing_values=np.nan, strategy='mean')
        imputer = imputer.fit(X[:, :2])
        joblib.dump(imputer, 'ImputerModel.pkl')
        X[:, :2] = imputer.transform(X[:, :2])
    else:
        imputer = joblib.load('ImputerModel.pkl')
        X[:, :2] = imputer.transform(X[:, :2])
        
    if training:
        return X, y
    else:
        return X

In [67]:

data_preprocessing(df_big_query.loc[0:0,:].copy(), False)


[[nan nan 16.0 20.0 75.0 155.0 'MEA-0002-050-17' 1 1 1 1 1 1 1 1 1 1 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0]]


ValueError: could not convert string to float: 'MEA-0002-050-17'

In [41]:
X, y = data_preprocessing(df_big_query, training=True)

## Treina modelo na CPU

In [8]:
from sklearn.neighbors import KNeighborsClassifier as knn

In [9]:
%%time
classifier = knn(n_neighbors=1)
classifier.fit(X, y)
y_pred_cpu = classifier.kneighbors(X, 1)

CPU times: user 26.4 s, sys: 92 ms, total: 26.4 s
Wall time: 26.3 s


## Treina modelo na GPU

In [10]:
#from cuml.neighbors import NearestNeighbors as cumlNN
#import cudf as gd

In [11]:
#X1 = pd.DataFrame(X)

In [12]:
#device_data = gd.DataFrame.from_pandas(X1)

In [13]:
#%%time
#knn_cuml = cumlNN(n_neighbors=1)
#knn_cuml.fit(device_data)

#y_pred = knn_cuml.kneighbors(X, 2)

## API Server

In [51]:
from flask import Flask, request
from flask_restful import Resource, Api

app = Flask(__name__)
api = Api(app)

data = []
columns = []

class HelloWorld(Resource):
    def get(self):
        args = request.args

        
        nome = args["nome"]
        categoria = args["categoria"]
        cor = args["cor"]
        sabor = args["sabor"]
        eanIsbn = args["eanLSBN"]
        departamento = args["departamento"]
        tipoProduto = args["tipoProduto"]
        tamanho = args["tamanho"]
        precoDe = args["precoDe"]
        precoPor = args["precoPor"]
        subCategoriaNivel4 = args["subCatN4"]
        descricao = args["descricao"]
        valorGenero = args["valorGenero"]
        peso = args["peso"]
        marca = args["marca"]
        altura = args["altura"]
        largura = args["largura"]
        profundidade = args["profundidade"]
        nomeGenero = args["nomeGenero"]
        
        data = [eanIsbn, nome, categoria, subCategoriaNivel4, departamento,
                tipoProduto, cor, sabor, tamanho, precoPor, precoDe, valorGenero, 
                peso, marca, altura, largura, profundidade, descricao, nomeGenero]
        
        columns=['eanIsbn', 'nome', 'categoria', 'subCategoriaNivel4', 'departamento',
                'tipoProduto', 'cor', 'sabor', 'tamanho', 'precoPor', 'precoDe', 'valorGenero',
                 'peso', 'marca', 'altura', 'largura', 'profundidade', 'descricao', 'nomeGenero']
        
        data_dict = {k:[v] for k, v in zip(columns, data)}
        
        data = pd.DataFrame(data=data_dict)
        
        X = data_preprocessing(data, training=False)
        pred_y = classifier.predict(X)
        
#         dados_produto = df_big_query.query('skuNetshoesReferencia == @pred_y[0]')
#         response = dados_produto.loc[0, columns].to_json()
        
        return ""#response

api.add_resource(HelloWorld, '/')

if __name__ == '__main__':
    app.run(debug=True, port=8041, use_reloader=False)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Running on http://127.0.0.1:8041/ (Press CTRL+C to quit)
127.0.0.1 - - [29/Sep/2019 12:25:51] "GET /?eanLSBN=7897571148506&nome=Bicicleta&categoria=&subCatN4=&departamento=Bike&tipoProduto=Bicicletas&cor=Preto+Laranja&sabor=&tamanho=17&precoPor=&precoDe=&valorGenero=Homem&peso=16&marca=XKS&altura=20&largura=75&profundidade=155.0&descricao=Bicicleta&nomeGenero=Geanero HTTP/1.1" 500 -


[['' '' '16' '20' '75' '155.0' 1 1 1 1 1 1 1 1 1 1 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]]


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2463, in __call__
    return self.wsgi_app(environ, start_response)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2449, in wsgi_app
    response = self.handle_exception(e)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 269, in error_router
    return original_handler(e)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1866, in handle_exception
    reraise(exc_type, exc_value, tb)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/_compat.py", line 38, in reraise
    raise value.with_traceback(tb)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2446, in wsgi_app
    response = self.full_dispatch_request()
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1951, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/opt/anaconda3/lib/p

[['0' '0' '16' '20' '75' '155.0' 1 1 1 1 1 1 1 1 1 1 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]]


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2463, in __call__
    return self.wsgi_app(environ, start_response)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2449, in wsgi_app
    response = self.handle_exception(e)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 269, in error_router
    return original_handler(e)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1866, in handle_exception
    reraise(exc_type, exc_value, tb)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/_compat.py", line 38, in reraise
    raise value.with_traceback(tb)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2446, in wsgi_app
    response = self.full_dispatch_request()
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1951, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/opt/anaconda3/lib/p

In [29]:
%debug

'asd_asd'

In [47]:
df_big_query.loc[0, :].to_json()

'{"eanIsbn":"7897571148506","nome":"Bicicleta Xks Aro 29 Fd 21V Qd17 Prt La","categoria":null,"subCategoriaNivel4":null,"departamento":"Bike","tipoProduto":"Bicicletas","cor":"Preto+Laranja","sabor":null,"tamanho":"17","skuReferencia":null,"skuNetshoes":"MEA-0002-050-17","precoPor":null,"precoDe":null,"valorGenero":"Homem","peso":16.0,"marca":"XKS","altura":20.0,"largura":75.0,"profundidade":155.0,"descricao":"Bicicleta Xks Aro 29 freio \\u00e1 disco, 21 marchas \\u00e9 a combina\\u00e7\\u00e3o perfeita com seus passeios e trilhas Garantindo todo o conforto e seguran\\u00e7a","nomeGenero":"G\\u00eanero","skuNetshoesReferencia":"MEA-0002-050-17"}'

In [48]:
data

['asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd',
 'asd_asd']

In [49]:
columns

['eanIsbn',
 'nome',
 'categoria',
 'subCategoriaNivel4',
 'departamento',
 'tipoProduto',
 'cor',
 'sabor',
 'tamanho',
 'precoPor',
 'precoDe',
 'valorGenero',
 'peso',
 'marca',
 'altura',
 'largura',
 'profundidade',
 'descricao',
 'nomeGenero']

In [52]:
data_dict = {k:[v] for k, v in zip(columns, data)}

In [53]:
data = pd.DataFrame(data=data_dict)

In [54]:
data

Unnamed: 0,eanIsbn,nome,categoria,subCategoriaNivel4,departamento,tipoProduto,cor,sabor,tamanho,precoPor,precoDe,valorGenero,peso,marca,altura,largura,profundidade,descricao,nomeGenero
0,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd,asd_asd
