In [1]:
import pandas as pd

# Parse every known column as text so nothing is coerced while reading the
# CSV; typed conversions are applied explicitly in a later cell.
text_columns = [
    'log_id',
    'product_id',
    'movement_type',
    'quantity_change',
    'reason',
    'timestamp',
    'reference_id',
    'notes',
]
dtype_map = dict.fromkeys(text_columns, 'string')

df_inventory = pd.read_csv('../../data/raw/inventory_logs.csv', dtype=dtype_map)
df_inventory.head()

Unnamed: 0.1,Unnamed: 0,log_id,product_id,movement_type,quantity_change,reason,timestamp,reference_id,notes
0,0,1,996381,IN,228.0,Adjustment,2023-08-21T04:33:50.446753,57097,
1,1,2,962832,OUT,,Return,2024-09-25T06:15:03.583328,37909,
2,2,3,995198,OUT,,Sale,2024-05-06T01:11:03.276581,18823,
3,3,4,997550,IN,260.0,Adjustment,2024-09-03T20:47:37.637720,80650,Director own ball land themselves key.
4,4,5,978028,IN,263.0,Return,2024-10-19T08:55:45.508647,90031,


Descartamos la columna `Unnamed: 0`, que no es necesaria para nuestro análisis (sus valores parecen ser un índice de fila heredado de una exportación previa).

In [2]:
# Drop the 'Unnamed: 0' column. The result is rebound instead of using
# inplace=True, and errors='ignore' is passed, so re-running this cell after
# the column is already gone no longer raises a KeyError.
df_inventory = df_inventory.drop(columns=['Unnamed: 0'], errors='ignore')

In [3]:
# Columns parsed as numbers and stored with the nullable Int64 dtype.
integer_columns = ['product_id', 'quantity_change', 'reference_id']

# Build the cleaned frame in a single chain: index by log_id, then replace each
# text column with its typed version. Every value is whitespace-stripped first;
# entries that fail to parse as a date or number are coerced to missing.
df_inventory_clean = (
    df_inventory
    .set_index('log_id')
    .assign(
        # DATES
        timestamp=lambda frame: pd.to_datetime(
            frame['timestamp'].str.strip(), errors='coerce'
        ),
        # STRINGS
        notes=lambda frame: frame['notes'].str.strip(),
        # CATEGORICAL (upper-cased / title-cased before becoming categories)
        movement_type=lambda frame: (
            frame['movement_type'].str.strip().str.upper().astype('category')
        ),
        reason=lambda frame: (
            frame['reason'].str.strip().str.title().astype('category')
        ),
        # INTEGERS (name=name binds the column name at definition time)
        **{
            name: (
                lambda frame, name=name: pd.to_numeric(
                    frame[name].str.strip(), errors='coerce'
                ).astype('Int64')
            )
            for name in integer_columns
        },
    )
)

df_inventory_clean.head()

Unnamed: 0_level_0,product_id,movement_type,quantity_change,reason,timestamp,reference_id,notes
log_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,996381,IN,228.0,Adjustment,2023-08-21 04:33:50.446753,57097,
2,962832,OUT,,Return,2024-09-25 06:15:03.583328,37909,
3,995198,OUT,,Sale,2024-05-06 01:11:03.276581,18823,
4,997550,IN,260.0,Adjustment,2024-09-03 20:47:37.637720,80650,Director own ball land themselves key.
5,978028,IN,263.0,Return,2024-10-19 08:55:45.508647,90031,


Exportamos el dataset limpio para reutilizarlo.

In [4]:
# Write the cleaned frame to disk as a pickle file so it can be reused.
clean_pickle_path = '../../data/clean/inventory_logs.pkl'
df_inventory_clean.to_pickle(clean_pickle_path)