In [54]:
import awswrangler as wr
import numpy as np
import joblib
import tempfile
import boto3
import json

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.utils.class_weight import compute_class_weight


In [52]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn

# Session, execution role and region reused by later cells:
# `role` is handed to the JumpStart estimator and `aws_region` selects the
# regional JumpStart manifest bucket.
sagemaker_session = sagemaker.Session()
role = get_execution_role() # alternative left by the author: 'sagemaker_execution_role'
aws_region = sagemaker_session.boto_session.region_name

In [6]:
# Random seed; passed as `random_state` to train_test_split further down.
seed = 42

## Carga de datos

In [7]:
# Read the cleaned recipes dataset straight from S3 into a DataFrame.
recipes_csv_uri = "s3://recipes-data-models-sagemaker-bucket/data/cleaned_recipes.csv"
df_recipe_data = wr.s3.read_csv(path=recipes_csv_uri)

# Preview the first five rows.
df_recipe_data.head(n=5)

Unnamed: 0,titulo,categoria,ingredientes,elaboracion,link,total_ingredientes,titulo_link,ingredientes_limpios
0,"Buñuelos de viento fáciles, la receta tradicio...",postres,['125 gr de harina' '30 gr de mantequilla' '1/...,Otoño no es solo época de calabazas y castañas...,https://www.hogarmania.com//cocina/recetas/pos...,9.0,bunuelos,harina mantequilla agua azúcar huevos limón sa...
1,Corona de hojaldre de Navidad,postres,['2 láminas de hojaldre rectangular ' '150 g d...,"Los polvorones, los turrones, los mazapanes o ...",https://www.hogarmania.com//cocina/recetas/pos...,7.0,corona navidad,láminas hojaldre chocolate negro mantequilla g...
2,Cafés de Navidad: Gingerbread Latte y Pumpkin ...,postres,['2 cucharadas de azúcar moreno suave'\n '1/2 ...,"La temporada de invierno, junto a la época nav...",https://www.hogarmania.com//cocina/recetas/pos...,8.0,cafes navidad gingerbread pumpkin spice latte,azúcar moreno suave jengibre molido nuez mosca...
3,Mazapanes de Navidad,postres,['300 gr. de almendra molida ' '370 gr. de lec...,Mezcla en un bol la almendra molida con la lec...,https://www.hogarmania.com//cocina/recetas/pos...,7.0,mazapanes navidad,almendra molida leche condensada limón agua ac...
4,"Churros en freidora de aire, ¡más fácil imposi...",postres,['Churros congelados' 'Aceite de oliva o giras...,Comienza por precalentar la freidora de aire. ...,https://www.hogarmania.com//cocina/recetas/pos...,3.0,churros freidora aire,churros congelados aceite oliva girasol azúcar


## Preprocesamiento de datos

In [8]:
# Learn the category -> integer mapping, then store the encoded labels
# in a new `encoded_categoria` column next to the original text label.
label_encoder = LabelEncoder().fit(df_recipe_data['categoria'])
df_recipe_data['encoded_categoria'] = label_encoder.transform(df_recipe_data['categoria'])

In [9]:
# Show which integer code was assigned to each category name.
{
    category: code
    for category, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

{'arroces': 0,
 'carnes': 1,
 'ensaladas': 2,
 'entrantes': 3,
 'huevos': 4,
 'pastas-pizzas': 5,
 'pescados-mariscos': 6,
 'postres': 7,
 'segundos-platos': 8,
 'sopas-cremas': 9}

In [10]:
def store_model(model, bucket_name, key, s3_client=None):
    """Serialize ``model`` with joblib and upload the result to S3.

    Args:
        model: Object to persist; it is written with ``joblib.dump``.
        bucket_name: Name of the destination S3 bucket.
        key: Object key under which the serialized model is stored.
        s3_client: Optional S3 client to reuse; it must provide ``put_object``.
            When omitted, a new ``boto3.client('s3')`` is created, which is
            what the function always did before this parameter existed.
    """
    if s3_client is None:
        s3_client = boto3.client('s3')
    with tempfile.TemporaryFile() as fp:
        joblib.dump(model, fp)
        # Rewind so the upload starts at the first serialized byte.
        fp.seek(0)
        # Pass the open (seekable) file itself instead of fp.read(), so the
        # payload is not copied into memory a second time before the upload.
        s3_client.put_object(Body=fp, Bucket=bucket_name, Key=key)

In [11]:
# Persist the fitted label encoder to S3 as a joblib file via store_model.
bucket_name = "recipes-data-models-sagemaker-bucket"
key = "models/label_encoder.joblib"
store_model(label_encoder, bucket_name, key)

In [12]:
# Column order is deliberate: class label first, text feature second,
# as required by the built-in SageMaker models.
df = df_recipe_data[['encoded_categoria', 'ingredientes_limpios']]

# Shuffled 70/30 split, stratified on the encoded label and seeded for
# reproducibility.
train_data, test_data = train_test_split(
    df,
    test_size=0.3,
    shuffle=True,
    stratify=df['encoded_categoria'],
    random_state=seed,
)

In [13]:
# Write both splits to S3 as CSV files without the DataFrame index.
train_csv_uri = "s3://recipes-data-models-sagemaker-bucket/data/train_data.csv"
test_csv_uri = "s3://recipes-data-models-sagemaker-bucket/data/test_data.csv"

wr.s3.to_csv(df=train_data, path=train_csv_uri, index=False)
# Kept as the last expression so the cell still displays this call's result.
wr.s3.to_csv(df=test_data, path=test_csv_uri, index=False)

{'paths': ['s3://recipes-data-models-sagemaker-bucket/data/test_data.csv'],
 'partitions_values': {}}

In [62]:
# Training file for the JumpStart text-classification estimator.
# That algorithm reads a `data.csv` whose first column holds integer class
# labels and whose second column holds the text, with NO header row.
# `train_data` already has the columns in that order, so only the header has
# to be suppressed; with a header the first row ("encoded_categoria", ...)
# would be read as a sample with a non-integer label.
# `header=False` is forwarded by awswrangler to pandas.DataFrame.to_csv.
wr.s3.to_csv(
    df=train_data,
    path="s3://recipes-data-models-sagemaker-bucket/data/jumpstart/data.csv",
    index=False,
    header=False,
)

{'paths': ['s3://recipes-data-models-sagemaker-bucket/data/jumpstart/data.csv'],
 'partitions_values': {}}

## Modelo propio

In [14]:
# Features are the cleaned ingredient text; targets are the encoded categories.
X_train = train_data['ingredientes_limpios']
y_train = train_data['encoded_categoria']

X_test = test_data['ingredientes_limpios']
y_test = test_data['encoded_categoria']

In [28]:
# "balanced" weights per class, computed on the training labels only.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=train_classes,
    y=y_train,
)

# Map each encoded class to its weight so it can be passed as `class_weight`.
class_weights_dict = dict(zip(train_classes, class_weights))
class_weights_dict

{0: 2.40817843866171,
 1: 0.5618386816999132,
 2: 0.4845175766641735,
 3: 1.6195,
 4: 2.904932735426009,
 5: 1.7895027624309392,
 6: 0.6246865959498553,
 7: 0.5932234432234432,
 8: 2.024375,
 9: 2.2729824561403507}

### Naive Bayes

In [23]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
nb_model = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

# Smoothing values explored for the classifier step.
parameters = {"clf__alpha": [0.1, 1e-2, 1e-3, 1e-5]}

# 5-fold grid search scored by balanced accuracy; the best model is refit.
nb_gs = GridSearchCV(
    estimator=nb_model,
    param_grid=parameters,
    scoring='balanced_accuracy',
    cv=5,
    refit=True,
)
nb_gs.fit(X_train, y_train)

print("Mejor score: ", nb_gs.best_score_)
print("Mejor configuración de parámetros: ", nb_gs.best_params_)

Mejor score:  0.5133797838594976
Mejor configuración de parámetros:  {'clf__alpha': 0.01}


### Logistic Regression

In [31]:
# TF-IDF features feeding a logistic regression classifier.
log_model = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ]
)

# Grid over iteration budget and regularization strength C; the class weights
# are fixed to the balanced weights computed from the training labels.
parameters = {
    "clf__max_iter": [100, 200],
    "clf__C": [3, 1, 0.5, 0.3, 0.1],
    "clf__class_weight": [class_weights_dict],
}

# 5-fold grid search scored by balanced accuracy; the best model is refit.
log_gs = GridSearchCV(
    estimator=log_model,
    param_grid=parameters,
    scoring='balanced_accuracy',
    cv=5,
    refit=True,
)
log_gs.fit(X_train, y_train)

print("Mejor score: ", log_gs.best_score_)
print("Mejor configuración de parámetros: ", log_gs.best_params_)

Mejor score:  0.6480098242418407
Mejor configuración de parámetros:  {'clf__C': 1, 'clf__class_weight': {0: 2.40817843866171, 1: 0.5618386816999132, 2: 0.4845175766641735, 3: 1.6195, 4: 2.904932735426009, 5: 1.7895027624309392, 6: 0.6246865959498553, 7: 0.5932234432234432, 8: 2.024375, 9: 2.2729824561403507}, 'clf__max_iter': 100}


### Support Vector Machine

In [50]:
# TF-IDF features feeding a linear model trained with stochastic gradient
# descent. SGDClassifier shuffles the training data between epochs, so it is
# seeded with the notebook-wide `seed`; without it the reported scores and the
# selected parameters can change from run to run.
sgd_model = Pipeline([("tfidf", TfidfVectorizer()), ("clf", SGDClassifier(random_state=seed))])

# Grid over iteration budget, stopping tolerance and regularization strength;
# the class weights are fixed to the balanced weights computed earlier.
parameters = {
        "clf__max_iter": [1000, 2000, 3000],
        "clf__tol": [1e-3, 1e-4],
        "clf__alpha": [1e-3, 1e-4, 1e-5],
        "clf__class_weight": [class_weights_dict]
    }

# 10-fold grid search scored by balanced accuracy; the best model is refit.
sgd_gs = GridSearchCV(sgd_model, parameters, cv=10, scoring='balanced_accuracy', refit=True)

sgd_gs.fit(X_train, y_train)

print("Mejor score: ", sgd_gs.best_score_)
print("Mejor configuración de parámetros: ", sgd_gs.best_params_)

Mejor score:  0.6457667297211529
Mejor configuración de parámetros:  {'clf__alpha': 0.0001, 'clf__class_weight': {0: 2.40817843866171, 1: 0.5618386816999132, 2: 0.4845175766641735, 3: 1.6195, 4: 2.904932735426009, 5: 1.7895027624309392, 6: 0.6246865959498553, 7: 0.5932234432234432, 8: 2.024375, 9: 2.2729824561403507}, 'clf__max_iter': 2000, 'clf__tol': 0.001}


## Estimador

In [56]:
# JumpStart model used as the dropdown default and by the estimator below.
# NOTE(review): the id names an English uncased BERT ("en-uncased") while the
# recipe text previewed above is Spanish — consider a multilingual variant; confirm.
model_id = "tensorflow-tc-bert-en-uncased-L-12-H-768-A-12-2"

In [58]:
import IPython
from ipywidgets import Dropdown

# Download the JumpStart models manifest for the current region into the
# working directory.
boto3.client("s3").download_file(
    f"jumpstart-cache-prod-{aws_region}", "models_manifest.json", "models_manifest.json"
)
with open("models_manifest.json", "rb") as json_file:
    model_list = json.load(json_file)

# Keep only the Text Classification models (ids containing "-tc-"); the
# manifest lists one entry per model version, so ids repeat.
tc_models_all_versions = [
    model["model_id"] for model in model_list if "-tc-" in model["model_id"]
]
# De-duplicate while preserving first-seen order. dict.fromkeys replaces the
# previous list comprehension that was run only for its append side effect and
# rescanned the result list for every id.
tc_models = list(dict.fromkeys(tc_models_all_versions))

# Display the model ids in a dropdown for the user to browse.
# NOTE(review): nothing in the visible cells reads `dropdown.value` back into
# `model_id`, so changing the selection does not affect the estimator — confirm.
dropdown = Dropdown(
    value=model_id,
    options=tc_models,
    description="JumpStart Text Classification Models:",
    style={"description_width": "initial"},
    layout={"width": "max-content"},
)
display(IPython.display.Markdown("#### Select a JumpStart pre-trained model from the dropdown below"))
display(dropdown)

#### Select a JumpStart pre-trained model from the dropdown below

Dropdown(description='JumpStart Text Classification Models:', index=24, layout=Layout(width='max-content'), op…

In [68]:
from sagemaker.jumpstart.estimator import JumpStartEstimator

# S3 location for the training job output, and the S3 prefix that holds the
# JumpStart training file (data.csv) written earlier in the notebook.
s3_output_model_path = "s3://recipes-data-models-sagemaker-bucket/model/jumpstart/output"
s3_training_dataset_path = "s3://recipes-data-models-sagemaker-bucket/data/jumpstart/"

# Fine-tuning estimator for the selected JumpStart model, pinned to one model
# version. No instance type is passed (the line is commented out), so the SDK
# falls back to its default, as the log message below this cell reports.
# Hyperparameter values are given as strings.
estimator = JumpStartEstimator(
    model_id=model_id,
    model_version='3.0.0',
    #instance_type='ml.g4dn.xlarge', #ml.p3.2xlarge
    instance_count=1,
    role=role,
    base_job_name="tensorflow-bert",
    hyperparameters={"epochs": "1", "batch_size": "64"},
    output_path=s3_output_model_path
)

No instance type selected for training job. Defaulting to ml.p3.2xlarge.
INFO:sagemaker.jumpstart:No instance type selected for training job. Defaulting to ml.p3.2xlarge.


In [71]:
# Sanity check of the training split: row count, dtypes and non-null counts.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6478 entries, 3074 to 8491
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   encoded_categoria     6478 non-null   int64 
 1   ingredientes_limpios  6478 non-null   object
dtypes: int64(1), object(1)
memory usage: 151.8+ KB


In [69]:
# Start the training job, feeding the S3 prefix as the "training" channel;
# logs=True streams the job logs into the cell output.
# NOTE(review): the recorded run of this cell ended in UnexpectedStatusException
# (AlgorithmError, exit code 1, empty error message). Check that the data.csv
# under the training prefix matches the input format the algorithm expects — confirm.
estimator.fit({"training": s3_training_dataset_path}, logs=True)

INFO:sagemaker:Creating training-job with name: tf-tc-bert-en-uncased-l-12-h-768-a-12-2-2024-06-30-19-14-43-518


2024-06-30 19:14:43 Starting - Starting the training job...
2024-06-30 19:15:06 Pending - Training job waiting for capacity...
2024-06-30 19:15:33 Pending - Preparing the instances for training...
2024-06-30 19:16:06 Downloading - Downloading input data...
2024-06-30 19:16:31 Downloading - Downloading the training image.....................
2024-06-30 19:19:57 Training - Training image download completed. Training in progress..[34m2024-06-30 19:20:09.150342: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2024-06-30 19:20:09.150572: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2024-06-30 19:20:09.195665: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2024-06-30 19:20:11,642 sagemaker-training-toolkit INFO     Impo

UnexpectedStatusException: Error for Training job tf-tc-bert-en-uncased-l-12-h-768-a-12-2-2024-06-30-19-14-43-518: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/usr/local/bin/python3.9 transfer_learning.py --batch_size 64 --beta_1 0.9 --beta_2 0.999 --dropout_rate 0.2 --early_stopping False --early_stopping_min_delta 0.0 --early_stopping_patience 5 --epochs 1 --epsilon 1e-06 --initial_accumulator_value 0.1 --learning_rate 2e-05 --momentum 0.9 --optimizer adamw --regularizers_l2 0.01 --reinitialize_top_layer Auto --rho 0.95 --train_only_top_layer False --validation_split_ratio 0.2 --warmup_steps_fraction 0.1", exit code: 1