In [1]:
import pandas as pd
import os

from datetime import date
from pathlib import Path

In [2]:
# Collect the snapshot CSV files. Directory listings come back in arbitrary
# order, so sort them: the file names start with YYYYMMDD, which makes the
# sorted order chronological and the last entry the most recent snapshot.
path = Path("data")
entries = sorted(entry for entry in path.iterdir() if entry.name.endswith(".csv"))
entries

[PosixPath('data/20231231_products.csv'),
 PosixPath('data/20240104_products.csv'),
 PosixPath('data/20240107_products.csv'),
 PosixPath('data/20240111_products.csv')]

In [3]:
# Column renames applied when a dataset is loaded: Spanish source header on
# the left, English name used in the rest of the notebook on the right.
column_name_mapping = {
    "codigo": "code",
    "marca": "brand",
    "nombre": "name",
    "precio": "price",
    "categoria": "category",
}

def load_dataset(csv_path, column_mapping=None):
    """Load one products snapshot CSV and tag every row with its snapshot date.

    The snapshot date is taken from the first eight characters of the file
    name, read as YYYYMMDD (e.g. ``20240111_products.csv``).

    Parameters
    ----------
    csv_path : str or os.PathLike
        Location of the CSV file.
    column_mapping : dict, optional
        Mapping from source column names to the names used in the result.
        When omitted, the module-level ``column_name_mapping`` is used.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus a ``date`` column holding a ``datetime.date``,
        with columns renamed according to the mapping.

    Raises
    ------
    ValueError
        If the file name does not start with a valid YYYYMMDD date.
    """
    if column_mapping is None:
        column_mapping = column_name_mapping
    # Accept plain strings as well as Path objects.
    csv_path = Path(csv_path)
    df = pd.read_csv(csv_path)
    filename = csv_path.name
    try:
        dataset_date = date(int(filename[0:4]), int(filename[4:6]), int(filename[6:8]))
    except ValueError as error:
        raise ValueError(
            f"Cannot read a YYYYMMDD date from the start of file name {filename!r}"
        ) from error
    # Add the date before renaming, so a mapping entry for "date" still applies.
    return df.assign(date=dataset_date).rename(columns=column_mapping)

# Load the most recent snapshot. File names start with YYYYMMDD, so the
# greatest path is the newest file regardless of the order of `entries`.
df = load_dataset(max(entries))

df.tail()

Unnamed: 0,id,code,brand,name,price,category,date
3126,10260,124318,IMPORTADORA SUDAMERI,RAYOVAC PILA AAAX1U,395.0,PILAS,2024-01-11
3127,141,1429,A BALANZA,MILANESA DE CERDO XKG,6699.0,PREPARADOS,2024-01-11
3128,69,127,A BALANZA,MILANESAS DE CARNE XKG,6999.0,PREPARADOS,2024-01-11
3129,70,128,A BALANZA,MUSLO DE POLLO REBOZADO XKG,3799.0,PREPARADOS,2024-01-11
3130,61,106,A BALANZA,SUPREMA DE POLLO REBOZADA XKG,3999.0,PREPARADOS,2024-01-11


In [4]:
# Count rows per product code and keep only the codes that occur more than once.
code_counts = df["code"].value_counts()
duplicated_codes = code_counts[code_counts > 1]
duplicated_codes

code
117140    2
118729    2
50003     2
102172    2
117350    2
124186    2
110560    2
Name: count, dtype: int64

In [5]:
# Show every row whose code is one of the duplicated codes.
has_duplicated_code = df["code"].isin(duplicated_codes.index)
df[has_duplicated_code]

Unnamed: 0,id,code,brand,name,price,category,date
413,1475,110560,EL ABASTECEDOR,CEREALITAS TOSTADAS ARROZ X160,1050.0,GALLETAS / TOSTADAS / GRISINES,2024-01-11
414,4902,110560,EL ABASTECEDOR,CEREALITAS TOSTADAS ARROZ X160,1050.0,GALLETAS / TOSTADAS / GRISINES,2024-01-11
2519,4359,124186,PLUSBELLE-ZORRO,PLUSBELLE SHAMPOO LAR.SALUDABLE D/P X300ML,599.0,CUIDADO DEL CABELLO,2024-01-11
2520,10160,124186,PLUSBELLE-ZORRO,PLUSBELLE SHAMPOO LAR.SALUDABLE D/P X300ML,599.0,CUIDADO DEL CABELLO,2024-01-11
2582,8231,102172,INVENTARIABLES,COLGATE CREMA DENTAL ORIG.ANTIC.X180G,2105.0,CUIDADO ORAL,2024-01-11
2583,656,102172,INVENTARIABLES,COLGATE CREMA DENTAL ORIG.ANTIC.X180G,2105.0,CUIDADO ORAL,2024-01-11
2585,5307,118729,COLGATE PALMOLIVE AR,COLGATE CREMA DENTAL T.CL.MINT X140G,4379.0,CUIDADO ORAL,2024-01-11
2586,8396,118729,COLGATE PALMOLIVE AR,COLGATE CREMA DENTAL T.CL.MINT X140G,4379.0,CUIDADO ORAL,2024-01-11
2798,10413,117140,PRECIOS JUSTOS,GLADE AROMATIZANTE L/V RPTO 175G,5199.0,DESODORANTES DE AMBIENTES,2024-01-11
2799,2946,117140,PRECIOS JUSTOS,GLADE AROMATIZANTE L/V RPTO 175G,5199.0,DESODORANTES DE AMBIENTES,2024-01-11


In [6]:
# One row per distinct category, in order of first appearance. The second
# reset_index() turns the fresh 0..n-1 index into an "index" column.
unique_categories = df["category"].drop_duplicates()
categories = unique_categories.reset_index(drop=True).reset_index()
categories

Unnamed: 0,index,category
0,0,ACEITES
1,1,ADEREZOS
2,2,APTOS PARA CELIACOS
3,3,ARROCES
4,4,AZUCAR
...,...,...
132,132,LIMPIEZA DE PISOS Y MUEBLES
133,133,LIMPIEZA DEL AUTOMOTOR
134,134,PAPELES
135,135,PILAS


In [7]:
# One row per distinct brand, in order of first appearance. The second
# reset_index() turns the fresh 0..n-1 index into an "index" column.
unique_brands = df["brand"].drop_duplicates()
brands = unique_brands.reset_index(drop=True).reset_index()
brands

Unnamed: 0,index,brand
0,0,MOLINO CAÃUELAS S.A.
1,1,PRECIOS JUSTOS
2,2,EL ABASTECEDOR
3,3,MOLINOS RIO DE LA PL
4,4,ACEITERA GENERAL DEH
...,...,...
194,194,TEXTIL CATAMARCA S.A
195,195,ROLL PACK
196,196,TEA
197,197,ENERGIZER ARGENTINA


In [8]:
# Attach the brand and category lookup ids to every product row. Both lookup
# tables carry an "index" column; the suffixes on the second merge turn the
# clashing pair into index_brand (from the left side) and index_category.
df_merged = (
    df.merge(brands, on="brand")
    .merge(categories, on="category", suffixes=("_brand", "_category"))
)
df_merged

Unnamed: 0,id,code,brand,name,price,category,date,index_brand,index_category
0,7508,114746,MOLINO CAÃUELAS S.A.,CAÃUELAS AC.OLI.INTEN.LATA.X500C,10715.0,ACEITES,2024-01-11,0,0
1,8147,112222,MOLINO CAÃUELAS S.A.,CAÃUELAS AC.OLIVA E/VIR.PETX500,9235.0,ACEITES,2024-01-11,0,0
2,4241,112537,MOLINO CAÃUELAS S.A.,CAÃUELAS ACEITE OLIVA E/VIR LATAX500,10715.0,ACEITES,2024-01-11,0,0
3,7274,121431,MOLINO CAÃUELAS S.A.,PUREZA ACEITE GIRASOL PET X1.5L,2639.0,ACEITES,2024-01-11,0,0
4,757,103100,PRECIOS JUSTOS,CAÃUELAS ACEITE GIRASOL PET X1.5L,2518.0,ACEITES,2024-01-11,1,0
...,...,...,...,...,...,...,...,...,...
3126,3565,102139,WASSINGTON S.A.C.I.F,WASSINGTON POM.MARRON X30GR,2100.0,CALZADO,2024-01-11,189,124
3127,7882,120599,WASSINGTON S.A.C.I.F,WASSINGTON POM.NEGRA CALZ.X60G,3500.0,CALZADO,2024-01-11,189,124
3128,652,102140,WASSINGTON S.A.C.I.F,WASSINGTON POM.NEGRA X30GR,2100.0,CALZADO,2024-01-11,189,124
3129,17876,150033,ENERGIZER ARGENTINA,ARMOR ALL CERA LIQUIDA ULTRA BRILLANTE X500ML,6559.0,LIMPIEZA DEL AUTOMOTOR,2024-01-11,197,133


In [9]:
# Product table: code and name plus the brand and category lookup ids.
product_columns = ["code", "name", "index_brand", "index_category"]
products = df_merged.loc[:, product_columns]
products

Unnamed: 0,code,name,index_brand,index_category
0,114746,CAÃUELAS AC.OLI.INTEN.LATA.X500C,0,0
1,112222,CAÃUELAS AC.OLIVA E/VIR.PETX500,0,0
2,112537,CAÃUELAS ACEITE OLIVA E/VIR LATAX500,0,0
3,121431,PUREZA ACEITE GIRASOL PET X1.5L,0,0
4,103100,CAÃUELAS ACEITE GIRASOL PET X1.5L,1,0
...,...,...,...,...
3126,102139,WASSINGTON POM.MARRON X30GR,189,124
3127,120599,WASSINGTON POM.NEGRA CALZ.X60G,189,124
3128,102140,WASSINGTON POM.NEGRA X30GR,189,124
3129,150033,ARMOR ALL CERA LIQUIDA ULTRA BRILLANTE X500ML,197,133


In [10]:
# Price table: one price per row, keyed by product code.
price_columns = ["code", "price"]
prices = df_merged.loc[:, price_columns]
prices

Unnamed: 0,code,price
0,114746,10715.0
1,112222,9235.0
2,112537,10715.0
3,121431,2639.0
4,103100,2518.0
...,...,...
3126,102139,2100.0
3127,120599,3500.0
3128,102140,2100.0
3129,150033,6559.0
