In [41]:
import awswrangler as wr
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


In [None]:
# import sagemaker
# from sagemaker import get_execution_role
# from sagemaker.sklearn.estimator import SKLearn

# sagemaker_session = sagemaker.Session()
# role = get_execution_role()
# region = sagemaker_session.boto_session.region_name

## Carga de datos

In [28]:
df_recipe_data = wr.s3.read_csv(path="s3://recipes-data-models-sagemaker-bucket/data/cleaned_recipes.csv")
df_recipe_data.head(5)

Unnamed: 0,titulo,categoria,ingredientes,elaboracion,link,total_ingredientes,titulo_link,ingredientes_limpios
0,"Buñuelos de viento fáciles, la receta tradicio...",postres,['125 gr de harina' '30 gr de mantequilla' '1/...,Otoño no es solo época de calabazas y castañas...,https://www.hogarmania.com//cocina/recetas/pos...,9.0,bunuelos,harina mantequilla agua azúcar huevos limón sa...
1,Corona de hojaldre de Navidad,postres,['2 láminas de hojaldre rectangular ' '150 g d...,"Los polvorones, los turrones, los mazapanes o ...",https://www.hogarmania.com//cocina/recetas/pos...,7.0,corona navidad,láminas hojaldre chocolate negro mantequilla g...
2,Cafés de Navidad: Gingerbread Latte y Pumpkin ...,postres,['2 cucharadas de azúcar moreno suave'\n '1/2 ...,"La temporada de invierno, junto a la época nav...",https://www.hogarmania.com//cocina/recetas/pos...,8.0,cafes navidad gingerbread pumpkin spice latte,azúcar moreno suave jengibre molido nuez mosca...
3,Mazapanes de Navidad,postres,['300 gr. de almendra molida ' '370 gr. de lec...,Mezcla en un bol la almendra molida con la lec...,https://www.hogarmania.com//cocina/recetas/pos...,7.0,mazapanes navidad,almendra molida leche condensada limón agua ac...
4,"Churros en freidora de aire, ¡más fácil imposi...",postres,['Churros congelados' 'Aceite de oliva o giras...,Comienza por precalentar la freidora de aire. ...,https://www.hogarmania.com//cocina/recetas/pos...,3.0,churros freidora aire,churros congelados aceite oliva girasol azúcar


## Preprocesamiento de datos

In [31]:
le = LabelEncoder()
df_recipe_data['encoded_categoria'] = le.fit_transform(df_recipe_data['categoria'])

In [33]:
X = df_recipe_data['ingredientes_limpios']
y = df_recipe_data['encoded_categoria']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)

In [32]:
# train_data, test_data = train_test_split(df_recipe_data, test_size=0.2, shuffle=True, stratify=df_recipe_data[['encoded_categoria']], random_state=42)

In [17]:
# wr.s3.to_csv(
#     df=train_data,
#     path="s3://recipes-data-models-sagemaker-bucket/data/train_data.csv",
#     index=False
# )

{'paths': ['s3://recipes-data-models-sagemaker-bucket/data/train_data.csv'],
 'partitions_values': {}}

In [18]:
# wr.s3.to_csv(
#     df=test_data,
#     path="s3://recipes-data-models-sagemaker-bucket/data/test_data.csv",
#     index=False
# )

{'paths': ['s3://recipes-data-models-sagemaker-bucket/data/test_data.csv'],
 'partitions_values': {}}

In [19]:
# X_train, X_test = train_data[['ingredientes_concat']], test_data[['ingredientes_concat']]
# y_train, y_test = train_data[['encoded_categoria']], test_data[['encoded_categoria']]

KeyError: "None of [Index(['ingredientes_concat'], dtype='object')] are in the [columns]"

## Pipeline

In [34]:
recetas_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

In [35]:
recetas_clf.fit(X_train, y_train)

In [36]:
recetas_clf.score(X_train, y_train)

0.6509994597514857

In [37]:
recetas_clf.score(X_test, y_test)

0.6223662884927067

In [45]:
from sklearn.linear_model import SGDClassifier

In [64]:
recetas_lr_clf = Pipeline([
    ('countvect', TfidfVectorizer(use_idf=True)),
    ('clf', SGDClassifier())
])
recetas_lr_clf.fit(X_train, y_train)
recetas_lr_clf.score(X_train, y_train)

0.852377093462993

In [65]:
recetas_lr_clf.score(X_test, y_test)

0.7433819556996218

## Estimador

In [79]:
docs_new = ['molde tarta galletas maría mantequilla', 'lubina ajo mantequilla', 'harina mantequilla agua azúcar huevos']
predicted = recetas_lr_clf.predict(docs_new)
le2.inverse_transform(predicted)

array(['postres', 'pescados-mariscos', 'postres'], dtype='<U17')

In [80]:
le.inverse_transform(predicted)

array(['postres', 'pescados-mariscos', 'postres'], dtype=object)

In [58]:
le.classes_

array(['arroces', 'carnes', 'ensaladas', 'entrantes', 'huevos',
       'pastas-pizzas', 'pescados-mariscos', 'postres', 'segundos-platos',
       'sopas-cremas'], dtype=object)

In [60]:
from sklearn.metrics import classification_report

y_pred = recetas_lr_clf.predict(X_train)
print(classification_report(y_train, y_pred, target_names=le.classes_))

                   precision    recall  f1-score   support

          arroces       0.76      0.66      0.71       307
           carnes       0.68      0.91      0.78      1318
        ensaladas       0.67      0.76      0.72      1528
        entrantes       0.74      0.17      0.28       458
           huevos       0.80      0.19      0.31       254
    pastas-pizzas       0.84      0.58      0.69       414
pescados-mariscos       0.75      0.84      0.79      1185
          postres       0.85      1.00      0.92      1248
  segundos-platos       0.72      0.49      0.58       366
     sopas-cremas       0.78      0.37      0.50       326

         accuracy                           0.74      7404
        macro avg       0.76      0.60      0.63      7404
     weighted avg       0.75      0.74      0.71      7404



In [75]:
definitive_categories = [
    "arroces",
    # "bebidas",
    "carnes",
    # "desayunos",
    "entrantes",
    "ensaladas",
    "huevos",
    "pastas-pizzas",
    # "panes",
    "pescados-mariscos",
    "postres",
    "sopas-cremas",
    "segundos-platos",
    # "otros"
]

In [76]:
le2 = LabelEncoder()
le2.fit_transform(definitive_categories)

array([0, 1, 3, 2, 4, 5, 6, 7, 9, 8])

In [77]:
le2.classes_

array(['arroces', 'carnes', 'ensaladas', 'entrantes', 'huevos',
       'pastas-pizzas', 'pescados-mariscos', 'postres', 'segundos-platos',
       'sopas-cremas'], dtype='<U17')

In [78]:
le2.inverse_transform([7])

array(['postres'], dtype='<U17')