# Categorías

In [None]:
import pandas as pd

# Load the raw categories CSV and print its column / dtype / non-null summary.
raw_categories_path = '../../data/raw/categories.csv'
df_categories = pd.read_csv(raw_categories_path)
df_categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       179 non-null    int64 
 1   category_id      179 non-null    int64 
 2   category_name    165 non-null    object
 3   parent_category  166 non-null    object
 4   created_at       169 non-null    object
dtypes: int64(2), object(3)
memory usage: 7.1+ KB


Vamos a explorar el dataset de categorías.

In [2]:
# Show the raw frame exactly as loaded from the CSV (no cleaning applied yet);
# the notebook renders a truncated view with the first and last rows.
df_categories

Unnamed: 0.1,Unnamed: 0,category_id,category_name,parent_category,created_at
0,0,1,Smartphones,ELECTRONICS,
1,1,2,Laptops,Electronics,2024-08-09T15:21:20.057392
2,2,3,Tablets,,2024-08-25T14:12:07.136934
3,3,4,Cameras,,2024-07-02T05:37:42.784417
4,4,5,Televisions,Electronics,2024-09-11T15:18:54.079818
...,...,...,...,...,...
174,174,175,Pens,Stationery,2024-07-07T19:47:40.812894
175,175,176,Art Supplies,Stationery,2023-11-30T15:41:20.408471
176,176,177,Organizers,Stationery,2024-03-27T02:24:22.144225
177,177,178,Greeting Cards,Stationery,2025-04-18T14:56:12.817964


A simple vista tenemos una columna de índices duplicada (`Unnamed: 0`), un id de categoría, un nombre que describe la categoría, una categoría padre (si no es una categoría raíz) y, por último, una fecha y hora de creación.

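Por otro lado, sería ideal limpiar las columnas de texto aplicando una función que elimine los espacios en blanco tanto al principio como al final de los strings y que unifique el uso de mayúsculas y minúsculas (por ejemplo, `ELECTRONICS` y `Electronics` son la misma categoría padre).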

Por último, sería preferible convertir la columna de fecha de creación al tipo datetime.

In [7]:
# Placeholder strings that mean "no value" once surrounding whitespace is removed.
TEXT_NULL_TOKENS = ['', 'null', 'NULL', 'NaN']
DATE_NULL_TOKENS = TEXT_NULL_TOKENS + ['unknown']


def _strip_and_nullify(series, null_tokens):
    """Strip surrounding whitespace and blank out placeholder strings.

    Parameters
    ----------
    series : pd.Series
        Column of strings (missing values allowed).
    null_tokens : list of str
        Values that, after stripping, are treated as missing.

    Returns
    -------
    pd.Series
        Stripped copy of ``series`` in which every placeholder is a missing value.
    """
    stripped = series.str.strip()
    # `mask` is used instead of `replace(tokens, None)`: depending on the pandas
    # version, `replace` with `value=None` is interpreted as "no value given"
    # and forward-fills the previous row instead of inserting a null.
    return stripped.mask(stripped.isin(null_tokens))


# Rebuild the clean frame from the raw one every time the cell runs, so the
# cell can be re-executed safely. Columns are renamed by name (not by
# position) so a change in the CSV column order cannot silently mislabel data.
df_categories_clean = (
    df_categories
    .set_index('category_id')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={
        'category_name': 'name',
        'parent_category': 'parent_category_name',
    })
    .rename_axis('id')
)

# Text columns: trim, null out placeholders, normalise casing so that e.g.
# "ELECTRONICS" and "Electronics" compare equal, and store as nullable strings.
for col in ['name', 'parent_category_name']:
    df_categories_clean[col] = (
        _strip_and_nullify(df_categories_clean[col], TEXT_NULL_TOKENS)
        .str.title()
        .astype('string')
    )

# Creation timestamp: same trimming / placeholder handling, then a single
# conversion to datetime; values that cannot be parsed become NaT.
df_categories_clean['created_at'] = pd.to_datetime(
    _strip_and_nullify(df_categories_clean['created_at'], DATE_NULL_TOKENS),
    errors='coerce',
)
df_categories_clean

Unnamed: 0_level_0,name,parent_category_name,created_at
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Smartphones,Electronics,NaT
2,Laptops,Electronics,2024-08-09 15:21:20.057392
3,Tablets,,2024-08-25 14:12:07.136934
4,Cameras,,2024-07-02 05:37:42.784417
5,Televisions,Electronics,2024-09-11 15:18:54.079818
...,...,...,...
175,Pens,Stationery,2024-07-07 19:47:40.812894
176,Art Supplies,Stationery,2023-11-30 15:41:20.408471
177,Organizers,Stationery,2024-03-27 02:24:22.144225
178,Greeting Cards,Stationery,2025-04-18 14:56:12.817964


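Lo ideal sería limpiar este dataset manteniendo solamente el id de categoría como índice y reemplazando el nombre de la categoría padre por el id de la categoría que la identifica. Las categorías padre que no existen como fila propia se añaden al final del dataset con un id nuevo.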

In [5]:
def _name_to_id(categories):
    """Return a ``{category name: id}`` dict for the rows with a non-empty name.

    The frame's index is used as the id. If the same name appears on several
    rows, the id of the last such row is kept.
    """
    names = categories['name'].dropna()
    names = names[names.str.strip() != '']
    return dict(zip(names, names.index))


def _resolve_parent_ids(categories):
    """Replace ``parent_category_name`` with a ``parent_category_id`` column.

    Parent names that do not exist as a category of their own are appended as
    new rows (no parent, no creation date) with ids continuing after the
    current maximum id, so that every non-null parent name maps to an id.

    Parameters
    ----------
    categories : pd.DataFrame
        Frame indexed by category id with the columns ``name``,
        ``parent_category_name`` and ``created_at``.

    Returns
    -------
    pd.DataFrame
        New frame (the input is not modified) with ``parent_category_id``
        (nullable ``Int64``) instead of ``parent_category_name``.
    """
    # Re-running the cell on an already resolved frame is a no-op rather than
    # a KeyError on the column that was dropped by the previous run.
    if (
        'parent_category_name' not in categories.columns
        and 'parent_category_id' in categories.columns
    ):
        return categories

    # Work on a copy so the frame produced by the previous cell is not mutated.
    categories = categories.copy()
    categories['parent_category_id'] = (
        categories['parent_category_name'].map(_name_to_id(categories)).astype('Int64')
    )

    # Parent names that are set but could not be matched to any category name.
    missing_parents = (
        categories.loc[
            categories['parent_category_id'].isnull(), 'parent_category_name'
        ]
        .dropna()
        .unique()
    )

    if len(missing_parents) > 0:
        first_new_id = int(categories.index.max()) + 1
        new_index = pd.RangeIndex(
            first_new_id, first_new_id + len(missing_parents), name='id'
        )

        # The all-missing placeholder columns are built directly on the new
        # index and with the dtypes of the existing columns, so the
        # concatenation below does not change any column dtype.
        df_new_parents = pd.DataFrame(
            {
                'name': missing_parents,
                'parent_category_name': pd.Series(
                    [None] * len(missing_parents),
                    index=new_index,
                    dtype=categories['parent_category_name'].dtype,
                ),
                'created_at': pd.Series(
                    [pd.NaT] * len(missing_parents),
                    index=new_index,
                    dtype=categories['created_at'].dtype,
                ),
                'parent_category_id': pd.Series(
                    [None] * len(missing_parents),
                    index=new_index,
                    dtype=categories['parent_category_id'].dtype,
                ),
            },
            index=new_index,
        )

        categories = pd.concat([categories, df_new_parents])

        # Map again now that the newly added parents have ids of their own.
        categories['parent_category_id'] = (
            categories['parent_category_name'].map(_name_to_id(categories)).astype('Int64')
        )

    return categories.drop(columns=['parent_category_name'])


df_categories_clean = _resolve_parent_ids(df_categories_clean)

df_categories_clean

Unnamed: 0_level_0,name,created_at,parent_category_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Smartphones,NaT,180
2,Laptops,2024-08-09 15:21:20.057392,180
3,Tablets,2024-08-25 14:12:07.136934,
4,Cameras,2024-07-02 05:37:42.784417,
5,Televisions,2024-09-11 15:18:54.079818,180
...,...,...,...
204,Collectibles,NaT,
205,Tickets & Experiences,NaT,
206,Musical Instruments,NaT,
207,Games & Virtual Goods,NaT,


Por último, exportamos el dataset limpio a un archivo que conserve toda la información (incluidos los tipos de dato) y que pueda reutilizarse en el futuro; por ejemplo, un pickle.

In [None]:
# Persist the cleaned frame as a pickle file.
clean_categories_path = '../../data/clean/categories.pkl'
df_categories_clean.to_pickle(clean_categories_path)