In [1]:
import json
import os
import tarfile
import tempfile

import awswrangler as wr
import boto3
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

In [17]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn

# S3 bucket used throughout the notebook for the dataset, the splits and the model artifacts.
bucket = "recipes-data-models-sagemaker-bucket"

# SageMaker session whose default bucket is the project bucket above.
sagemaker_session = sagemaker.Session(default_bucket=bucket)

# Execution role resolved by the SageMaker SDK
# (the author's alternative, kept for reference: 'sagemaker_execution_role').
role = get_execution_role()

# Region name taken from the session's underlying boto3 session.
aws_region = sagemaker_session.boto_session.region_name

In [2]:
# Random seed passed as `random_state` to the train/test split so the partition is reproducible.
seed = 42

## Carga de datos

In [3]:
# Load the cleaned recipes dataset from the project bucket.
# The S3 path is built from `bucket` so the bucket name is configured in one place only.
df_recipe_data = wr.s3.read_csv(path=f"s3://{bucket}/data/cleaned_recipes.csv")
df_recipe_data.head(5)

Unnamed: 0,titulo,categoria,ingredientes,elaboracion,link,total_ingredientes,titulo_link,ingredientes_limpios
0,"Buñuelos de viento fáciles, la receta tradicio...",postres,['125 gr de harina' '30 gr de mantequilla' '1/...,Otoño no es solo época de calabazas y castañas...,https://www.hogarmania.com//cocina/recetas/pos...,9.0,bunuelos,harina mantequilla agua azúcar huevos limón sa...
1,Corona de hojaldre de Navidad,postres,['2 láminas de hojaldre rectangular ' '150 g d...,"Los polvorones, los turrones, los mazapanes o ...",https://www.hogarmania.com//cocina/recetas/pos...,7.0,corona navidad,láminas hojaldre chocolate negro mantequilla g...
2,Cafés de Navidad: Gingerbread Latte y Pumpkin ...,postres,['2 cucharadas de azúcar moreno suave'\n '1/2 ...,"La temporada de invierno, junto a la época nav...",https://www.hogarmania.com//cocina/recetas/pos...,8.0,cafes navidad gingerbread pumpkin spice latte,azúcar moreno suave jengibre molido nuez mosca...
3,Mazapanes de Navidad,postres,['300 gr. de almendra molida ' '370 gr. de lec...,Mezcla en un bol la almendra molida con la lec...,https://www.hogarmania.com//cocina/recetas/pos...,7.0,mazapanes navidad,almendra molida leche condensada limón agua ac...
4,"Churros en freidora de aire, ¡más fácil imposi...",postres,['Churros congelados' 'Aceite de oliva o giras...,Comienza por precalentar la freidora de aire. ...,https://www.hogarmania.com//cocina/recetas/pos...,3.0,churros freidora aire,churros congelados aceite oliva girasol azúcar


## Preprocesamiento de datos

In [4]:
# Fit the encoder on the category column, then store the integer codes in a new column.
label_encoder = LabelEncoder().fit(df_recipe_data['categoria'])
df_recipe_data['encoded_categoria'] = label_encoder.transform(df_recipe_data['categoria'])

In [5]:
# Display which integer code the fitted encoder assigns to each category name.
{
    category: code
    for category, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

{'arroces': 0,
 'carnes': 1,
 'ensaladas': 2,
 'entrantes': 3,
 'huevos': 4,
 'pastas-pizzas': 5,
 'pescados-mariscos': 6,
 'postres': 7,
 'segundos-platos': 8,
 'sopas-cremas': 9}

In [24]:
s3_client = boto3.client("s3")

# Serialize the fitted encoder into a temporary file and read the bytes back,
# so nothing has to be left on the local disk.
with tempfile.TemporaryFile() as encoder_file:
    joblib.dump(label_encoder, encoder_file)
    encoder_file.seek(0)  # rewind: joblib leaves the cursor at the end of the file
    encoder_bytes = encoder_file.read()

# Store the serialized encoder next to the model artifacts in the project bucket.
s3_client.put_object(
    Bucket=bucket,
    Key="models/label_encoder.joblib",
    Body=encoder_bytes,
)

In [25]:
# Keep only the encoded target and the cleaned-ingredients text, then make a
# 70/30 split stratified on the target so both parts keep the class proportions.
df = df_recipe_data[['encoded_categoria', 'ingredientes_limpios']]
train_data, test_data = train_test_split(
    df,
    stratify=df['encoded_categoria'],
    test_size=0.3,
    shuffle=True,
    random_state=seed,
)

In [26]:
# Write each split to a local CSV file (without the index column) for the upload step.
for csv_name, split_frame in (("train.csv", train_data), ("test.csv", test_data)):
    split_frame.to_csv(csv_name, index=False)

In [27]:
# Upload both CSV files under the "data" prefix of the session's default bucket
# and keep the value returned by each upload.
train_data_path, test_data_path = (
    sagemaker_session.upload_data(csv_name, key_prefix="data")
    for csv_name in ("train.csv", "test.csv")
)

## Modelo propio

In [28]:
# Text feature and encoded target for the training split.
X_train = train_data['ingredientes_limpios']
y_train = train_data['encoded_categoria']

# Same columns for the held-out test split.
X_test = test_data['ingredientes_limpios']
y_test = test_data['encoded_categoria']

In [29]:
# "balanced" weights computed from the training labels, one per class,
# in the order returned by np.unique(y_train).
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train,
)

# Map each class label to its weight; this dict is passed to the classifiers' class_weight.
class_weights_dict = {
    label: weight for label, weight in zip(np.unique(y_train), class_weights)
}
class_weights_dict

{0: 2.40817843866171,
 1: 0.5618386816999132,
 2: 0.4845175766641735,
 3: 1.6195,
 4: 2.904932735426009,
 5: 1.7895027624309392,
 6: 0.6246865959498553,
 7: 0.5932234432234432,
 8: 2.024375,
 9: 2.2729824561403507}

### Naive Bayes

In [30]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
nb_model = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

# Smoothing values explored by the grid search.
parameters = {"clf__alpha": [0.1, 1e-2, 1e-3, 1e-5]}

# 5-fold cross-validated search scored with balanced accuracy; the best
# configuration is refitted on the whole training set.
nb_gs = GridSearchCV(
    estimator=nb_model,
    param_grid=parameters,
    scoring='balanced_accuracy',
    cv=5,
    refit=True,
)
nb_gs.fit(X_train, y_train)

print("Mejor score: ", nb_gs.best_score_)
print("Mejor configuración de parámetros: ", nb_gs.best_params_)

Mejor score:  0.5133797838594976
Mejor configuración de parámetros:  {'clf__alpha': 0.01}


### Logistic Regression

In [31]:
# TF-IDF features feeding a logistic-regression classifier.
log_model = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression()),
])

# Search over iteration budget and regularization strength; the class weights
# are fixed to the balanced weights computed from the training labels.
parameters = {
    "clf__max_iter": [100, 200],
    "clf__C": [3, 1, 0.5, 0.3, 0.1],
    "clf__class_weight": [class_weights_dict],
}

# 5-fold cross-validated search scored with balanced accuracy; the best
# configuration is refitted on the whole training set.
log_gs = GridSearchCV(
    estimator=log_model,
    param_grid=parameters,
    scoring='balanced_accuracy',
    cv=5,
    refit=True,
)
log_gs.fit(X_train, y_train)

print("Mejor score: ", log_gs.best_score_)
print("Mejor configuración de parámetros: ", log_gs.best_params_)

Mejor score:  0.6480098242418407
Mejor configuración de parámetros:  {'clf__C': 1, 'clf__class_weight': {0: 2.40817843866171, 1: 0.5618386816999132, 2: 0.4845175766641735, 3: 1.6195, 4: 2.904932735426009, 5: 1.7895027624309392, 6: 0.6246865959498553, 7: 0.5932234432234432, 8: 2.024375, 9: 2.2729824561403507}, 'clf__max_iter': 100}


### Support Vector Machine (SVM lineal entrenada con SGD)

In [32]:
# TF-IDF features feeding a linear SVM trained with stochastic gradient descent.
# - loss="hinge" is the SGDClassifier default; it is spelled out because hinge loss
#   is what makes this model a linear SVM.
# - random_state=seed: SGDClassifier shuffles the training data after each epoch, so
#   without a fixed seed the CV scores and the selected parameters change between runs.
sgd_model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", SGDClassifier(loss="hinge", random_state=seed)),
])

# Search over iteration budget, stopping tolerance and regularization strength;
# the class weights are fixed to the balanced weights computed from the training labels.
parameters = {
    "clf__max_iter": [1000, 2000],
    "clf__tol": [1e-3, 1e-4],
    "clf__alpha": [1e-3, 1e-4, 1e-5],
    "clf__class_weight": [class_weights_dict],
}

# 5-fold cross-validated search scored with balanced accuracy; the best
# configuration is refitted on the whole training set.
sgd_gs = GridSearchCV(sgd_model, parameters, cv=5, scoring='balanced_accuracy', refit=True)

sgd_gs.fit(X_train, y_train)

print("Mejor score: ", sgd_gs.best_score_)
print("Mejor configuración de parámetros: ", sgd_gs.best_params_)

Mejor score:  0.639340583963979
Mejor configuración de parámetros:  {'clf__alpha': 0.0001, 'clf__class_weight': {0: 2.40817843866171, 1: 0.5618386816999132, 2: 0.4845175766641735, 3: 1.6195, 4: 2.904932735426009, 5: 1.7895027624309392, 6: 0.6246865959498553, 7: 0.5932234432234432, 8: 2.024375, 9: 2.2729824561403507}, 'clf__max_iter': 2000, 'clf__tol': 0.001}


## Evaluación y almacenamiento

In [34]:
# Predict on the held-out split with the refitted logistic-regression search
# and report the balanced accuracy.
y_test_pred = log_gs.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.6664616104995379

In [79]:
# Local artifact paths; everything is written under the "models/" directory.
model_filename = "models/model.joblib"
encoder_filename = "models/label_encoder.joblib"
tar_filename = "models/model.tar.gz"

# joblib.dump and tarfile.open do not create missing parent directories, so make
# sure the output directory exists before writing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

# Persist the fitted logistic-regression grid search and the label encoder.
joblib.dump(log_gs, model_filename)
joblib.dump(label_encoder, encoder_filename)

# Bundle both artifacts into a single gzip-compressed archive. Each member keeps
# its "models/..." relative path inside the archive.
# NOTE(review): confirm the inference code expects the "models/" prefix inside the tarball.
with tarfile.open(tar_filename, 'w:gz') as tar:
    for artifact_path in (model_filename, encoder_filename):
        tar.add(artifact_path, arcname=artifact_path)

In [None]:
# Unused shell alternative for building the archive; intentionally left commented out:
#!tar -czf model.tar.gz model.joblib label_encoder.joblib

In [78]:
# Upload the serialized model and the packaged archive under the "models" prefix.
uploaded_uris = [
    sagemaker_session.upload_data(local_path, key_prefix="models")
    for local_path in ("models/model.joblib", "models/model.tar.gz")
]

# Show the value returned for the archive upload (the last one).
uploaded_uris[-1]

's3://recipes-data-models-sagemaker-bucket/models/model.tar.gz'

## Endpoint

In [13]:
import boto3
import json
# Client for invoking deployed SageMaker endpoints.
runtime = boto3.client(service_name="sagemaker-runtime")

# Request body: a single string of space-separated ingredients.
payload = {"ingredientes": 'tomate pepino aceituna'}

# Send the payload as JSON to the deployed endpoint and keep the raw response.
response = runtime.invoke_endpoint(
    EndpointName="custom-model-endpoint",
    ContentType="application/json",
    Body=json.dumps(payload),
)

In [14]:
# Decode the endpoint response. The body is a streaming object: .read() returns the
# whole payload, whereas .next() only yields the first chunk and would truncate
# (and therefore break the JSON parsing of) any larger response.
# The stream can only be consumed once, so re-run the invocation cell before re-running this one.
json.loads(response["Body"].read().decode("utf-8"))

{'predictions': [2]}