In [14]:
from pathlib import Path

import pandas as pd

# Every declared column is read as nullable text ('string'); the typed
# conversions happen later, in the cleaning cell.
dtype_map = dict.fromkeys(
    [
        'product_id',
        'product_name',
        'category_id',
        'brand',
        'price',
        'cost',
        'stock_quantity',
        'weight_kg',
        'dimensions',
        'description',
        'is_active',
        'created_at',
    ],
    'string',
)

# Read the raw product export, applying the text dtypes declared in dtype_map.
df_products = pd.read_csv(
    filepath_or_buffer='../../data/raw/products.csv',
    dtype=dtype_map,
)

# Preview the first rows of the raw frame.
df_products.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,category_id,brand,price,cost,stock_quantity,weight_kg,dimensions,description,is_active,created_at
0,0,1,Centralized attitude-oriented synergy,164,undefined,120.22,49.48,754.0,,194x144x13,Hotel quickly amount care meet. Likely conside...,True,2024-05-12T00:44:32.354038
1,1,2,Reactive asynchronous analyzer,9,Adidas,19.75,10.92,27.0,28.11,188x171x91,Wish white also. Natural career woman across c...,True,2024-06-13T17:16:40.512421
2,2,3,Polarized static flexibility,57,Dr. Martens,454.72,334.06,777.0,40.31,183x113x45,Senior go talk down story forget. Career make ...,True,2025-01-02T18:13:21.175439
3,3,4,Persevering homogeneous Graphic Interface,56,Tiffany & Co.,957.64,331.7,,,93x159x35,Exist little add economy discussion across. Pe...,True,2024-12-15T11:32:56.088835
4,4,5,Compatible non-volatile array,118,IKEA,540.87,311.55,,31.47,97x152x26,Stuff evening north city he sit style poor. Pe...,True,2023-12-05T09:19:11.197502


Descartamos la columna `Unnamed: 0`, que no es necesaria para nuestro análisis.

In [15]:
# Drop the 'Unnamed: 0' column (it looks like a row index that was saved
# into the CSV -- TODO confirm against the file that produced it).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column
# has already been removed.
df_products = df_products.drop(columns=['Unnamed: 0'], errors='ignore')

In [16]:
# Re-key the raw frame by product_id and convert each text column to its
# analysis dtype in a single chained expression. Values that fail to parse
# become missing (errors='coerce') instead of raising.
df_products_clean = df_products.set_index('product_id').assign(
    # DATES: trim whitespace, then parse to datetime.
    created_at=lambda frame: pd.to_datetime(
        frame['created_at'].str.strip(), errors='coerce'
    ),
    # BOOLEANS: normalise case/whitespace, translate the known tokens, and
    # store as nullable boolean (tokens outside the mapping become missing).
    is_active=lambda frame: (
        frame['is_active']
        .str.strip()
        .str.lower()
        .map({"true": True, "false": False, "1": True, "0": False})
        .astype("boolean")
    ),
    # INTEGERS: nullable Int64 so missing ids survive the cast.
    category_id=lambda frame: pd.to_numeric(
        frame['category_id'].str.strip(), errors='coerce'
    ).astype('Int64'),
    # CATEGORICAL: trimmed, title-cased brand labels.
    brand=lambda frame: (
        frame['brand'].str.strip().str.title().astype('category')
    ),
    # FLOATS: trimmed text parsed as numbers.
    **{
        name: (
            lambda frame, name=name: pd.to_numeric(
                frame[name].str.strip(), errors='coerce'
            )
        )
        for name in ('price', 'cost', 'stock_quantity', 'weight_kg')
    },
    # STRINGS: whitespace trim only.
    **{
        name: (lambda frame, name=name: frame[name].str.strip())
        for name in ('product_name', 'description', 'dimensions')
    },
)

df_products_clean.head()

Unnamed: 0_level_0,product_name,category_id,brand,price,cost,stock_quantity,weight_kg,dimensions,description,is_active,created_at
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Centralized attitude-oriented synergy,164,Undefined,120.22,49.48,754.0,,194x144x13,Hotel quickly amount care meet. Likely conside...,True,2024-05-12 00:44:32.354038
2,Reactive asynchronous analyzer,9,Adidas,19.75,10.92,27.0,28.11,188x171x91,Wish white also. Natural career woman across c...,True,2024-06-13 17:16:40.512421
3,Polarized static flexibility,57,Dr. Martens,454.72,334.06,777.0,40.31,183x113x45,Senior go talk down story forget. Career make ...,True,2025-01-02 18:13:21.175439
4,Persevering homogeneous Graphic Interface,56,Tiffany & Co.,957.64,331.7,,,93x159x35,Exist little add economy discussion across. Pe...,True,2024-12-15 11:32:56.088835
5,Compatible non-volatile array,118,Ikea,540.87,311.55,,31.47,97x152x26,Stuff evening north city he sit style poor. Pe...,True,2023-12-05 09:19:11.197502


Exportamos el dataset limpio para reutilizarlo.

In [17]:
# Persist the cleaned frame as a pickle for reuse.
# to_pickle does not create missing folders, so make sure the target
# directory exists first; otherwise the export fails when the clean-data
# folder has not been created yet.
clean_products_path = Path('../../data/clean/products.pkl')
clean_products_path.parent.mkdir(parents=True, exist_ok=True)
df_products_clean.to_pickle(clean_products_path)