In [20]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Para modelos binarios y Croston:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, mean_absolute_error, recall_score, precision_score, accuracy_score
from darts import TimeSeries
from darts.utils.missing_values import fill_missing_values
from darts.models import Croston

In [21]:
classification_df = pd.read_csv('demand_classification_by_item.csv', sep=';')
raw_df = pd.read_csv('df_periodos_rellenados.csv', sep=';')
display(raw_df)
display(classification_df)

Unnamed: 0,time,item,sales,price,unit_price
0,2019-06-24,1002,24,0.72625,0.030260
1,2020-08-31,1002,224,0.28750,0.001283
2,2021-02-08,1002,40,0.28500,0.007125
3,2021-03-22,1002,80,0.28500,0.003562
4,2021-03-29,1002,16,0.28750,0.017969
...,...,...,...,...,...
3338696,2023-11-22,9998,0,,
3338697,2023-11-23,9998,0,,
3338698,2023-11-24,9998,0,,
3338699,2023-11-25,9998,0,,


Unnamed: 0,item,p,cv2,demand_type,unit_price
0,1002,128.071429,0.612206,lumpy,0.009178
1,1003,234.000000,0.213018,intermittent,1.005850
2,1006,84.652174,4.362716,lumpy,2.061909
3,1007,1.000000,0.000000,smooth,0.398125
4,1009,1.000000,0.000000,smooth,0.133375
...,...,...,...,...,...
3812,9988,312.000000,0.000000,intermittent,1.943750
3813,9989,1.000000,0.000000,smooth,0.191875
3814,9994,150.000000,0.291320,intermittent,4.370863
3815,9997,1.000000,0.000000,smooth,0.106354


### Limpieza de datos con pocas ventas

In [22]:
intermittent_ids = classification_df[classification_df.demand_type == 'intermittent']['item'].unique()
lumpy_ids = classification_df[classification_df.demand_type == 'lumpy']['item'].unique()
raw_df = raw_df[(raw_df['item'].isin(intermittent_ids)) | (raw_df['item'].isin(lumpy_ids))]

In [23]:
item_ids = raw_df.item.unique().tolist()
item_ids

[1002,
 1003,
 1006,
 1013,
 1014,
 1018,
 1019,
 1020,
 1024,
 1028,
 1030,
 1034,
 1035,
 1044,
 1051,
 1055,
 1061,
 1066,
 1070,
 1074,
 1080,
 1084,
 1085,
 1093,
 1095,
 1096,
 1098,
 1101,
 1103,
 1111,
 1114,
 1115,
 1119,
 1121,
 1122,
 1124,
 1127,
 1128,
 1133,
 1134,
 1137,
 1143,
 1153,
 1155,
 1159,
 1160,
 1166,
 1175,
 1181,
 1186,
 1187,
 1192,
 1202,
 1203,
 1207,
 1208,
 1210,
 1213,
 1214,
 1216,
 1220,
 1221,
 1228,
 1229,
 1235,
 1236,
 1242,
 1248,
 1250,
 1256,
 1262,
 1263,
 1275,
 1276,
 1277,
 1278,
 1280,
 1283,
 1284,
 1285,
 1288,
 1292,
 1303,
 1304,
 1312,
 1313,
 1316,
 1325,
 1328,
 1332,
 1334,
 1337,
 1340,
 1345,
 1346,
 1347,
 1353,
 1354,
 1356,
 1361,
 1363,
 1369,
 1371,
 1376,
 1377,
 1378,
 1379,
 1384,
 1389,
 1395,
 1397,
 1399,
 1403,
 1404,
 1405,
 1409,
 1410,
 1412,
 1414,
 1415,
 1427,
 1428,
 1429,
 1432,
 1433,
 1435,
 1439,
 1440,
 1441,
 1443,
 1444,
 1446,
 1454,
 1455,
 1458,
 1466,
 1467,
 1469,
 1471,
 1472,
 1473,
 1474,
 1475,

### Agrupación mensual

In [24]:
raw_df['time'] = pd.to_datetime(raw_df['time'], format='%Y-%m-%d')
raw_df['time'] = raw_df['time'].dt.to_period('M').dt.start_time
grouped_monthly_df = raw_df.groupby(['time', 'item',]).sum().reset_index()[['time', 'item', 'sales']]
monthly_df = pd.merge(grouped_monthly_df, classification_df, how='inner', on='item')
monthly_df

Unnamed: 0,time,item,sales,p,cv2,demand_type,unit_price
0,2018-08-01,1143,10,41.230769,0.644682,lumpy,0.259454
1,2018-08-01,1159,7,56.743590,1.449653,lumpy,0.286071
2,2018-08-01,1187,1,176.363636,0.282579,intermittent,1.667574
3,2018-08-01,1203,20,31.614286,1.941734,lumpy,0.182556
4,2018-08-01,1235,35,51.052632,0.725809,lumpy,0.080731
...,...,...,...,...,...,...,...
112655,2024-09-01,9627,100,21.764706,1.292582,lumpy,0.181902
112656,2024-09-01,9803,140,53.875000,1.263396,lumpy,0.070791
112657,2024-09-01,9877,2,322.250000,0.436130,intermittent,1.519509
112658,2024-09-01,9891,10,53.292683,0.777269,lumpy,2.532289


### Agrupación semanal

In [25]:
raw_df['time'] = pd.to_datetime(raw_df['time'], format='%Y-%m-%d')
raw_df['time'] = raw_df['time'] - pd.to_timedelta(raw_df['time'].dt.weekday, unit='D')
grouped_weekly_df = raw_df.groupby(['time', 'item',]).sum().reset_index()[['time', 'item', 'sales']]
weekly_df = pd.merge(grouped_weekly_df, classification_df, how='inner', on='item')
weekly_df

Unnamed: 0,time,item,sales,p,cv2,demand_type,unit_price
0,2018-07-30,1143,10,41.230769,0.644682,lumpy,0.259454
1,2018-07-30,1159,7,56.743590,1.449653,lumpy,0.286071
2,2018-07-30,1187,1,176.363636,0.282579,intermittent,1.667574
3,2018-07-30,1203,20,31.614286,1.941734,lumpy,0.182556
4,2018-07-30,1235,35,51.052632,0.725809,lumpy,0.080731
...,...,...,...,...,...,...,...
112655,2024-08-26,9627,100,21.764706,1.292582,lumpy,0.181902
112656,2024-08-26,9803,140,53.875000,1.263396,lumpy,0.070791
112657,2024-08-26,9877,2,322.250000,0.436130,intermittent,1.519509
112658,2024-08-26,9891,10,53.292683,0.777269,lumpy,2.532289


### Agrupación diaria

In [26]:
daily_df = pd.merge(raw_df, classification_df, how='inner', on='item')
grouped_daily_df = raw_df.groupby(['time', 'item',]).sum().reset_index()[['time', 'item', 'sales']]
grouped_daily_df

Unnamed: 0,time,item,sales
0,2018-07-30,1143,10
1,2018-07-30,1159,7
2,2018-07-30,1187,1
3,2018-07-30,1203,20
4,2018-07-30,1235,35
...,...,...,...
112655,2024-08-26,9627,100
112656,2024-08-26,9803,140
112657,2024-08-26,9877,2
112658,2024-08-26,9891,10


In [27]:
df_final_results = {}

## Mensual

In [42]:
df = monthly_df.copy()

In [43]:
results_list = []

for it in item_ids:
    item_data = df[df.item == it].sort_values('time').reset_index(drop=True)[['time', 'sales']]
    train_size = int(len(item_data)*0.8)

    if len(item_data) < 10:
        continue

    # (A) MODELO BINARIO + REGRESIÓN
    item_ts = TimeSeries.from_dataframe(
            item_data, time_col='time', value_cols='sales', fill_missing_dates=False
        )

    train_ts = item_ts[:train_size]
    test_ts  = item_ts[train_size:]

    train_df = item_data.iloc[:train_size].copy()
    test_df  = item_data.iloc[train_size:].copy()

    # Si hay pocos datos, saltamos
    if len(train_df) < 10:
        continue

    croston_model = Croston()

    try:
        croston_model.fit(train_ts)
        croston_pred = croston_model.predict(len(test_ts))
        y_pred = croston_pred.values().flatten()
        y_test = test_ts.values().flatten()
        croston_mae_val = np.mean(np.abs(y_test - y_pred))

        print(len(test_df))
        print(len(train_df))
        print(len(y_pred))
        print()

        # Pasar predicciones de Croston a DataFrame para merge
        croston_pred_df = pd.DataFrame({
            'time': test_df['time'],
            'pred_croston': y_pred
        })

    except Exception as err:
        croston_values = [np.nan]*len(test_df)
        croston_mae_val = None
        #print(err)

    
    #if precision > 0 or recall > 0:
    results_list.append({
        'item_id': it,
        'train_df': train_df,
        'test_df': test_df,
        'y_pred': y_pred,
    })

results_list

for result in results_list:
    result['test_df']['item'] = result['item_id']
    result['test_df']['y_pred'] = result['y_pred']

df_final_monthly = pd.concat(list(map(lambda x: x['test_df'], results_list)), axis=0)
df_final_monthly

12
48
12

8
32
8

13
52
13

14
52
14

5
19
5

14
56
14

9
33
9

11
40
11

10
39
10

6
23
6

9
32
9

14
55
14

10
38
10

15
58
15

9
33
9

7
25
7

6
24
6

8
30
8

9
36
9

9
35
9

5
16
5

3
11
3

10
39
10

9
32
9

11
41
11

10
37
10

10
39
10

12
47
12

14
56
14

11
42
11

10
38
10

9
34
9

11
44
11

12
45
12

8
28
8

11
44
11

3
10
3

15
59
15

11
41
11

7
26
7

6
20
6

13
50
13

4
12
4

13
52
13

13
49
13

15
59
15

15
58
15

13
51
13

8
32
8

9
32
9

6
24
6

5
17
5

9
32
9

13
52
13

5
18
5

5
17
5

15
57
15

14
52
14

4
16
4

7
24
7

5
19
5

5
16
5

5
20
5

15
56
15

5
17
5

3
10
3

5
17
5

7
27
7

6
24
6

12
44
12

5
20
5

13
49
13

13
48
13

10
40
10

10
36
10

15
58
15

9
34
9

3
10
3

10
40
10

12
44
12

10
40
10

6
21
6

4
16
4

8
29
8

4
13
4

13
51
13

12
46
12

6
22
6

4
14
4

15
58
15

10
36
10

11
44
11

15
57
15

4
13
4

14
52
14

13
51
13

9
32
9

5
19
5

9
36
9

5
20
5

5
19
5

7
25
7

8
30
8

13
52
13

10
40
10

9
33
9

7
26
7

15
57
15

8
29
8

8
28
8

5
16
5

6
22
6



Unnamed: 0,time,sales,item,y_pred
48,2023-06-01,0,1002,28.127627
49,2023-07-01,0,1002,28.127627
50,2023-08-01,0,1002,28.127627
51,2023-09-01,0,1002,28.127627
52,2023-10-01,0,1002,28.127627
...,...,...,...,...
48,2023-07-01,0,9998,35.591481
49,2023-08-01,2,9998,35.591481
50,2023-09-01,0,9998,35.591481
51,2023-10-01,0,9998,35.591481


In [30]:
df_final_results['monthly'] = df_final_monthly

In [31]:
df_final_results

{'monthly':          time  sales  item     y_pred
 48 2023-06-01      0  1002  28.127627
 49 2023-07-01      0  1002  28.127627
 50 2023-08-01      0  1002  28.127627
 51 2023-09-01      0  1002  28.127627
 52 2023-10-01      0  1002  28.127627
 ..        ...    ...   ...        ...
 48 2023-07-01      0  9998  35.591481
 49 2023-08-01      2  9998  35.591481
 50 2023-09-01      0  9998  35.591481
 51 2023-10-01      0  9998  35.591481
 52 2023-11-01      4  9998  35.591481
 
 [22998 rows x 4 columns]}

## Diario

In [32]:
df = grouped_daily_df.copy()

In [33]:
grouped_daily_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112660 entries, 0 to 112659
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   time    112660 non-null  datetime64[ns]
 1   item    112660 non-null  int64         
 2   sales   112660 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 2.6 MB


In [46]:
results_list = []

for it in item_ids:
    item_data = df[df.item == it].sort_values('time').reset_index(drop=True)[['time', 'sales']]
    train_size = int(len(item_data)*0.8)

    if len(item_data) < 10:
        continue

    # (A) MODELO BINARIO + REGRESIÓN
    item_ts = TimeSeries.from_dataframe(
            item_data, time_col='time', value_cols='sales', freq='D'
        )

    train_ts = item_ts[:train_size]
    test_ts  = item_ts[train_size:]

    train_df = item_data.iloc[:train_size].copy()
    test_df  = item_data.iloc[train_size:].copy()

    # Si hay pocos datos, saltamos
    if len(train_df) < 10:
        continue

    croston_model = Croston()

    try:
        croston_model.fit(train_ts)
        croston_pred = croston_model.predict(len(test_ts))
        y_pred = croston_pred.values().flatten()
        y_test = test_ts.values().flatten()
        croston_mae_val = np.mean(np.abs(y_test - y_pred))

        print(len(test_df))
        print(len(train_df))
        print(croston_pred)
        print()

        # Pasar predicciones de Croston a DataFrame para merge
        croston_pred_df = pd.DataFrame({
            'time': test_df['time'],
            'pred_croston': y_pred
        })

    except Exception as err:
        croston_values = [np.nan]*len(test_df)
        croston_mae_val = None
        #print(err)

    results_list.append({
        'item_id': it,
        'train_df': train_df,
        'test_df': test_df,
        'y_pred': y_pred,
    })

results_list

for result in results_list:
    print(result)
    result['test_df']['item'] = result['item_id']
    result['test_df']['y_pred'] = result['y_pred']

df_final_daily = pd.concat(list(map(lambda x: x['test_df'], results_list)), axis=0)
df_final_daily

12
48
<TimeSeries (DataArray) (time: 1749, component: 1, sample: 1)> Size: 14kB
array([[[23.56336607]],

       [[23.56336607]],

       [[23.56336607]],

       ...,

       [[23.56336607]],

       [[23.56336607]],

       [[23.56336607]]])
Coordinates:
  * time       (time) datetime64[ns] 14kB 2019-07-19 2019-07-20 ... 2024-05-01
  * component  (component) object 8B 'sales'
Dimensions without coordinates: sample
Attributes:
    static_covariates:  None
    hierarchy:          None

8
32
<TimeSeries (DataArray) (time: 1155, component: 1, sample: 1)> Size: 9kB
array([[[1.81818182]],

       [[1.81818182]],

       [[1.81818182]],

       ...,

       [[1.81818182]],

       [[1.81818182]],

       [[1.81818182]]])
Coordinates:
  * time       (time) datetime64[ns] 9kB 2020-10-03 2020-10-04 ... 2023-12-01
  * component  (component) object 8B 'sales'
Dimensions without coordinates: sample
Attributes:
    static_covariates:  None
    hierarchy:          None

13
52
<TimeSeries (DataArray)

KeyboardInterrupt: 