In [18]:
# Confirmamos ruta del archivo
import pandas as pd
import os

file_path = r"C:\Users\Lucia\PycharmProjects\zrive-ds\src\module_2\data\groceries\feature_frame.csv"

if not os.path.exists(file_path):
    raise FileNotFoundError(f"Archivo no encontrado en: {file_path}")
else:
    print("✅ Archivo encontrado:", file_path)

✅ Archivo encontrado: C:\Users\Lucia\PycharmProjects\zrive-ds\src\module_2\data\groceries\feature_frame.csv


In [19]:
# Cargamos CSV evitando DtypeWarning
df = pd.read_csv(file_path, header=None, low_memory=False)

In [23]:
# Asignamos nombres a columnas
column_names = [
    "order_id", "aisle", "user_id", "product_id", "order_datetime",
    "order_date", "order_number", "feature_1", "feature_2", "feature_3",
    "feature_4", "feature_5", "feature_6", "feature_7", "brand",
    "feature_8", "feature_9", "feature_10", "feature_11", "feature_12",
    "feature_13", "feature_14", "feature_15", "feature_16", "feature_17",
    "feature_18", "feature_19"
]
df.columns = column_names

In [25]:
# Convertimos tipos de columnas importantes
df["order_id"] = df["order_id"].astype(str)
df["user_id"] = df["user_id"].astype(str)
df["product_id"] = df["product_id"].astype(str)

# order_number como entero
df["order_number"] = pd.to_numeric(df["order_number"], errors='coerce')

# order_datetime y order_date como datetime
df["order_datetime"] = pd.to_datetime(df["order_datetime"], format="%Y-%m-%d %H:%M:%S", errors='coerce')
df["order_date"] = pd.to_datetime(df["order_date"], format="%Y-%m-%d", errors='coerce')

In [26]:
# ANÁLISIS EXPLORATORIO DE DATOS (EDA)

In [27]:
df.shape

(2880550, 27)

In [28]:
# Hay 2880550 filas y 27 columnas, es decir, hay 2880550 productos (órdenes) y de cada producto tenemos 27 características (features). Veamos los valores nulos por columna:
df.isnull().sum()

order_id          0
aisle             0
user_id           0
product_id        0
order_datetime    1
order_date        1
order_number      1
feature_1         0
feature_2         0
feature_3         0
feature_4         0
feature_5         0
feature_6         0
feature_7         0
brand             0
feature_8         0
feature_9         0
feature_10        0
feature_11        0
feature_12        0
feature_13        0
feature_14        0
feature_15        0
feature_16        0
feature_17        0
feature_18        0
feature_19        0
dtype: int64

In [34]:
# Sólo tengo un valor nulo en las columnas: order_datetime, order_date, order_number.
# Estadísticas de columnas numéricas
df.describe() # No tiene mucho sentido para estas columnas ya que son fechas

Unnamed: 0,order_datetime,order_date,order_number
count,2880549,2880549,2880549.0
mean,2021-01-13 02:56:12.506719744,2021-01-12 12:11:14.646395904,3.289342
min,2020-10-05 16:46:19,2020-10-05 00:00:00,2.0
25%,2020-12-16 21:06:58,2020-12-16 00:00:00,2.0
50%,2021-01-22 07:36:39,2021-01-22 00:00:00,3.0
75%,2021-02-14 18:19:46,2021-02-14 00:00:00,4.0
max,2021-03-03 14:42:05,2021-03-03 00:00:00,21.0
std,,,2.140176


In [36]:
# En este caso, el dataset nos informa si un producto es comprado (1) o no (0), es decir, la variable objetivo es una variable binaria.
print(df.columns.tolist())  # Lista todas las columnas

['order_id', 'aisle', 'user_id', 'product_id', 'order_datetime', 'order_date', 'order_number', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'brand', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19']


In [37]:
# Veamos qué info aporta cada columna (sobre todo las features)
print(df.head()) # primeras filas

         order_id            aisle        user_id     product_id  \
0      variant_id     product_type       order_id        user_id   
1  33826472919172  ricepastapulses  2807985930372  3482464092292   
2  33826472919172  ricepastapulses  2808027644036  3466586718340   
3  33826472919172  ricepastapulses  2808099078276  3481384026244   
4  33826472919172  ricepastapulses  2808393957508  3291363377284   

       order_datetime order_date  order_number feature_1       feature_2  \
0                 NaT        NaT           NaN   outcome  ordered_before   
1 2020-10-05 16:46:19 2020-10-05           3.0       0.0             0.0   
2 2020-10-05 17:59:51 2020-10-05           2.0       0.0             0.0   
3 2020-10-05 20:08:53 2020-10-05           4.0       0.0             0.0   
4 2020-10-06 08:57:59 2020-10-06           2.0       0.0             0.0   

          feature_3  ...      feature_10    feature_11  feature_12  \
0  abandoned_before  ...  count_children  count_babies  count_pe

In [38]:
print(df.info()) # tipo de datos de cada columa

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880550 entries, 0 to 2880549
Data columns (total 27 columns):
 #   Column          Dtype         
---  ------          -----         
 0   order_id        object        
 1   aisle           object        
 2   user_id         object        
 3   product_id      object        
 4   order_datetime  datetime64[ns]
 5   order_date      datetime64[ns]
 6   order_number    float64       
 7   feature_1       object        
 8   feature_2       object        
 9   feature_3       object        
 10  feature_4       object        
 11  feature_5       object        
 12  feature_6       object        
 13  feature_7       object        
 14  brand           object        
 15  feature_8       object        
 16  feature_9       object        
 17  feature_10      object        
 18  feature_11      object        
 19  feature_12      object        
 20  feature_13      object        
 21  feature_14      object        
 22  feature_15      ob

In [45]:
print(df['outcome'].head())  # Muestra las primeras 5 filas


KeyError: 'outcome'