# Análisis Exploratorio de Datos

## Cargar datos

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw retail inventory dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/retail_store_inventory.csv")
df.head(n=5)

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Holiday/Promotion,Competitor Pricing,Seasonality
0,2022-01-01,S001,P0001,Groceries,North,231,127,55,135.47,33.5,20,Rainy,0,29.69,Autumn
1,2022-01-01,S001,P0002,Toys,South,204,150,66,144.04,63.01,20,Sunny,0,66.16,Autumn
2,2022-01-01,S001,P0003,Toys,West,102,65,51,74.02,27.99,10,Sunny,1,31.32,Summer
3,2022-01-01,S001,P0004,Toys,North,469,61,164,62.18,32.72,10,Cloudy,1,34.74,Autumn
4,2022-01-01,S001,P0005,Electronics,East,166,14,135,9.26,73.64,0,Sunny,0,68.95,Summer


## EDA

In [15]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                73100 non-null  object 
 1   Store ID            73100 non-null  object 
 2   Product ID          73100 non-null  object 
 3   Category            73100 non-null  object 
 4   Region              73100 non-null  object 
 5   Inventory Level     73100 non-null  int64  
 6   Units Sold          73100 non-null  int64  
 7   Units Ordered       73100 non-null  int64  
 8   Demand Forecast     73100 non-null  float64
 9   Price               73100 non-null  float64
 10  Discount            73100 non-null  int64  
 11  Weather Condition   73100 non-null  object 
 12  Holiday/Promotion   73100 non-null  int64  
 13  Competitor Pricing  73100 non-null  float64
 14  Seasonality         73100 non-null  object 
dtypes: float64(3), int64(5), object(7)
memory usage: 8.4+

### Conversión del tipo de dato de algunas columnas

In [16]:
# Parse the 'Date' column into pandas datetime values.
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [17]:
# List the distinct weather labels present in the dataset.
valores_clima = df.loc[:, 'Weather Condition'].unique()
print(valores_clima)

['Rainy' 'Sunny' 'Cloudy' 'Snowy']


In [18]:
# Encode the weather condition labels as integer codes.
weather_map = {'Sunny': 0, 'Rainy': 1, 'Cloudy': 2, 'Snowy': 3}

# Series.map turns every label that is missing from the dict into NaN (and the
# result into float), so map into a temporary first and fail loudly instead of
# letting NaN codes reach the exported training data.
weather_code = df['Weather Condition'].map(weather_map)
unmapped_rows = weather_code.isna()
if unmapped_rows.any():
    unmapped_labels = df.loc[unmapped_rows, 'Weather Condition'].unique().tolist()
    raise ValueError(
        f"Weather conditions without a code in weather_map: {unmapped_labels}"
    )

df['Weather_Code'] = weather_code

In [19]:
# Encode 'Seasonality' as the integer codes of a pandas categorical.
# NOTE(review): with inferred categories pandas orders the labels by sorting
# them, so the label -> code mapping is implicit; confirm this is what the
# consumer of 'Seasonality_Code' expects.
seasonality_categorical = df['Seasonality'].astype('category')
df['Seasonality_Code'] = seasonality_categorical.cat.codes

In [20]:
# Print the frame summary again to check the dtypes after the conversion cells.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                73100 non-null  datetime64[ns]
 1   Store ID            73100 non-null  object        
 2   Product ID          73100 non-null  object        
 3   Category            73100 non-null  object        
 4   Region              73100 non-null  object        
 5   Inventory Level     73100 non-null  int64         
 6   Units Sold          73100 non-null  int64         
 7   Units Ordered       73100 non-null  int64         
 8   Demand Forecast     73100 non-null  float64       
 9   Price               73100 non-null  float64       
 10  Discount            73100 non-null  int64         
 11  Weather Condition   73100 non-null  object        
 12  Holiday/Promotion   73100 non-null  int64         
 13  Competitor Pricing  73100 non-null  float64   

### Número de tiendas

In [21]:
# Count the distinct store identifiers (missing values are not counted).
tiendas = df['Store ID'].nunique(dropna=True)
print(f"El dataset contiene {tiendas} tiendas")

El dataset contiene 5 tiendas


### Número de productos

In [22]:
# Count the distinct product identifiers (missing values are not counted).
productos = df['Product ID'].nunique(dropna=True)
print(f"El dataset contiene {productos} tipos de productos")

El dataset contiene 20 tipos de productos


### Cantidad de registros por tienda

In [23]:
# Number of rows recorded for each store, largest count first.
# NOTE(review): this rebinds `tiendas`, which the earlier cell set to the
# integer number of stores, to a Series of per-store row counts.
tiendas = df['Store ID'].value_counts(sort=True, ascending=False)
print(tiendas)

Store ID
S001    14620
S002    14620
S003    14620
S004    14620
S005    14620
Name: count, dtype: int64


### Cantidad de registros por categoría

In [24]:
# Number of rows recorded for each product category, largest count first.
categorias = df['Category'].value_counts(sort=True, ascending=False)
print(categorias)

Category
Furniture      14699
Toys           14643
Clothing       14626
Groceries      14611
Electronics    14521
Name: count, dtype: int64


## Filtrar datos
Se filtran los datos de una sola tienda (`S001`) y un solo producto (`P0001`) para obtener un dataset más pequeño para el entrenamiento.

In [None]:
# Keep only the rows of store S001 / product P0001, as an independent copy,
# ordered chronologically.
df_train = (
    df.loc[(df['Store ID'] == 'S001') & (df['Product ID'] == 'P0001')]
    .copy()
    .sort_values('Date')
)

# Select only the numeric columns the agent will see.
cols_para_el_agente = [
    'Inventory Level',
    'Demand Forecast',
    'Price',
    'Discount',
    'Holiday/Promotion',
    'Competitor Pricing',
    'Weather_Code',
    'Seasonality_Code',
]

# The saved file holds the agent columns followed by 'Units Sold'.
cols_guardar = [*cols_para_el_agente, 'Units Sold']

# Write the reduced frame to CSV without the index column.
datos_limpios = df_train.loc[:, cols_guardar]
datos_limpios.to_csv('../data/data_train.csv', index=False)

In [26]:
# For the agent: the cleaned frame as a NumPy array; print its
# (rows, columns) shape as a sanity check.
datos_entorno = datos_limpios.to_numpy()
print(datos_entorno.shape)

(731, 9)
