# Preprocesamiento de datos

En este notebook se preparan los datos para entrenar modelos de Machine Learning,
incluyendo limpieza, selección de variables y transformación de features.

In [28]:
import pandas as pd
import sklearn as sk


In [21]:
df = pd.read_csv("../data/ecommerce.csv")
df.head()

Unnamed: 0,Order Date,Product Name,Category,Region,Quantity,Sales,Profit
0,2024-12-31,Printer,Office,North,4,3640,348.93
1,2022-11-27,Mouse,Accessories,East,7,1197,106.53
2,2022-05-11,Tablet,Electronics,South,5,5865,502.73
3,2024-03-16,Mouse,Accessories,South,2,786,202.87
4,2022-09-10,Mouse,Accessories,West,1,509,103.28


In [22]:
features = [
    'Order Date',
    'Product Name',
    'Category',
    'Region',
    'Quantity',
    'Sales'
]

target = 'Profit'

df_model = df[features + [target]]
df_model.head()

Unnamed: 0,Order Date,Product Name,Category,Region,Quantity,Sales,Profit
0,2024-12-31,Printer,Office,North,4,3640,348.93
1,2022-11-27,Mouse,Accessories,East,7,1197,106.53
2,2022-05-11,Tablet,Electronics,South,5,5865,502.73
3,2024-03-16,Mouse,Accessories,South,2,786,202.87
4,2022-09-10,Mouse,Accessories,West,1,509,103.28


In [23]:
df_model.isnull().sum()

Order Date      0
Product Name    0
Category        0
Region          0
Quantity        0
Sales           0
Profit          0
dtype: int64

In [24]:
numeric_features = df_model.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df_model.select_dtypes(include=['object']).columns.tolist()

numeric_features, categorical_features

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_features = df_model.select_dtypes(include=['object']).columns.tolist()


(['Quantity', 'Sales', 'Profit'],
 ['Order Date', 'Product Name', 'Category', 'Region'])

In [25]:
X = df_model.drop(columns=[target])
y = df_model[target]

In [30]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
numeric_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [31]:
df_model.to_csv("../data/ecommerce_prepared.csv", index=False)