In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
#Semilla para reproducibilidad
np.random.seed(42)
random.seed(42)

#Muestra
n= 150_000

In [8]:
# 1.  Generación de fechas

dates= pd.date_range(start = '2024-01-01', end='2024-12-31', freq="D")
weights= np.where(
    (dates.month == 11) | (dates.month == 12),
    5.0,
    1.

)

weights = weights / weights.sum()
delivery_date= np.random.choice(dates, size=n, p=weights)

In [9]:
# 2. Días de entrega

delivery_days = np.random.gamma(shape=2.0, scale=1.5, size=n)
delivery_days= np.clip(delivery_days, 0.5, 50).round(1)

In [10]:
# 3. Valor del paquete

package_value = np.random.lognormal(mean=-0.2, sigma= 1.0, size=n)
package_value =np.clip(package_value, 0.01, 100).round(2)

In [11]:
# 4. Peso del paquete

package_weight = np.random.lognormal(mean=0.5, sigma=0.6, size=n)
package_weight = np.clip(package_weight, 0.1, 17).round(2)

In [13]:
# 5. Costo del paquete

cost_modes= np.random.choice([2, 4], size=n, p=[0.6, 0.4])
delivery_cost = cost_modes + np.random.normal(loc=0, scale= 0.3, size=n)
delivery_cost = np.clip(delivery_cost, 1, 10).round(2)

In [17]:
# 6. Zonas geográficas

regions = {
    "Tumbes": "Norte", "Piura": "Norte", "Lambayeque": "Norte", "La Libertad": "Norte", "Cajamarca": "Norte",
    "Lima": "Centro", "Callao": "Centro", "Junín": "Centro", "Pasco": "Centro", "Huánuco": "Centro", "Ancash": "Centro",
    "Arequipa": "Sur", "Cusco": "Sur", "Puno": "Sur", "Tacna": "Sur", "Moquegua": "Sur",
    "Apurímac": "Sur", "Ayacucho": "Sur", "Ica": "Sur",
    "Loreto": "Oriente", "Ucayali": "Oriente", "San Martín": "Oriente", "Madre de Dios": "Oriente",
    "Amazonas": "Sierra", "Huancavelica": "Sierra"
}

# Share  (más paquetes en Lima, Callao, Arequipa, etc.)
region_probs = {
    "Lima": 0.30, "Callao": 0.06, "Arequipa": 0.10, "La Libertad": 0.06, "Piura": 0.05,
    "Cusco": 0.04, "Junín": 0.04, "Lambayeque": 0.04, "Ica": 0.04,
    "Ancash": 0.03, "Puno": 0.03, "Ucayali": 0.02, "San Martín": 0.02,
    "Loreto": 0.02, "Tacna": 0.02, "Apurímac": 0.02, "Ayacucho": 0.02,
    "Cajamarca": 0.02, "Moquegua": 0.01, "Tumbes": 0.01,
    "Madre de Dios": 0.01, "Pasco": 0.01, "Amazonas": 0.01,
    "Huancavelica": 0.01, "Huánuco": 0.01
}

region_names = list(region_probs.keys())
region_weights = list(region_probs.values())
region = np.random.choice(region_names, size=n, p=region_weights)
zone = [regions[r] for r in region]

In [19]:
# 7. Proveedores logisticos

providers = ["LogisticX", "EnvíaloYa", "FastCargo", "PerúGo", "RapidBox"]
provider = np.random.choice(providers, size=n)

shipping_type = np.random.choice(["Normal", "Express"], size=n, p=[0.8, 0.2])

In [21]:
# 8. Customers

# IDs únicos de cliente
customer_id = [f"CUST{str(i).zfill(4)}" for i in np.random.randint(1, 2000, size=n)]

# Tipo de cliente (segmentación)
customer_type = np.random.choice(["nuevo", "frecuente", "premium"], size=n, p=[0.4, 0.4, 0.2])

In [22]:
# 9. ID del paquete

package_id = [f"PKG{str(i).zfill(6)}" for i in range(n)]

In [26]:
# 10. Estacionalidad

delivery_date = pd.Series(delivery_date)
is_peak_season = delivery_date.dt.month.isin([11, 12]).astype(int)

In [25]:
# 11. Profit

cost_per_kg = delivery_cost / package_weight
is_profitable = (
    (package_value > 3) &
    (delivery_cost < 4) &
    (cost_per_kg < 1.5)
).astype(int)

In [28]:
# Dataframe final

df = pd.DataFrame({
    "package_id": package_id,
    "customer_id": customer_id,
    "customer_type": customer_type,
    "delivery_date": delivery_date,
    "delivery_days": delivery_days,
    "package_value": package_value,
    "package_weight_kg": package_weight,
    "delivery_cost": delivery_cost,
    "region": region,
    "zone": zone,
    "provider": provider,
    "shipping_type": shipping_type,
    "is_peak_season": is_peak_season,
    "is_profitable": is_profitable
})


In [29]:
df.head(5)

Unnamed: 0,package_id,customer_id,customer_type,delivery_date,delivery_days,package_value,package_weight_kg,delivery_cost,region,zone,provider,shipping_type,is_peak_season,is_profitable
0,PKG000000,CUST1259,frecuente,2024-08-16,2.8,0.34,2.67,1.91,Arequipa,Sur,RapidBox,Normal,0,0
1,PKG000001,CUST1354,nuevo,2024-12-25,1.4,0.69,0.82,3.7,Piura,Norte,FastCargo,Normal,1,0
2,PKG000002,CUST0062,frecuente,2024-11-29,16.9,0.36,3.76,1.96,Tacna,Sur,RapidBox,Normal,1,0
3,PKG000003,CUST0237,frecuente,2024-11-13,1.7,1.25,2.05,3.83,Callao,Centro,LogisticX,Normal,1,0
4,PKG000004,CUST0419,nuevo,2024-04-05,3.2,1.1,1.32,2.18,Puno,Sur,LogisticX,Normal,0,0


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   package_id         150000 non-null  object        
 1   customer_id        150000 non-null  object        
 2   customer_type      150000 non-null  object        
 3   delivery_date      150000 non-null  datetime64[ns]
 4   delivery_days      150000 non-null  float64       
 5   package_value      150000 non-null  float64       
 6   package_weight_kg  150000 non-null  float64       
 7   delivery_cost      150000 non-null  float64       
 8   region             150000 non-null  object        
 9   zone               150000 non-null  object        
 10  provider           150000 non-null  object        
 11  shipping_type      150000 non-null  object        
 12  is_peak_season     150000 non-null  int64         
 13  is_profitable      150000 non-null  int64   

In [32]:
df.shape

(150000, 14)

In [33]:
df.describe()

Unnamed: 0,delivery_date,delivery_days,package_value,package_weight_kg,delivery_cost,is_peak_season,is_profitable
count,150000,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,2024-08-31 14:54:19.583999744,3.014115,1.34529,1.975838,2.796994,0.501433,0.041653
min,2024-01-01 00:00:00,0.5,0.01,0.12,1.0,0.0,0.0
25%,2024-06-01 00:00:00,1.4,0.41,1.1,1.93,0.0,0.0
50%,2024-11-01 00:00:00,2.5,0.81,1.65,2.29,1.0,0.0
75%,2024-12-01 00:00:00,4.0,1.6,2.48,3.9,1.0,0.0
max,2024-12-31 00:00:00,20.7,60.45,17.0,5.2,1.0,1.0
std,,2.116759,1.770419,1.300599,1.025026,0.5,0.199796


In [34]:
df.to_csv('logistics_dataset.csv', index=False)

In [35]:
from google.colab import files
files.download("logistics_dataset.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>