In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score


In [2]:

df = pd.read_csv('../data/sell-in.txt', sep='\t')
df_productos_predecir = pd.read_csv('../data/product_id_apredecir201912.txt', sep='\t')

In [3]:
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [4]:
#Filter df to contain only products that are in df_productos_predecir
product_ids_to_predict = df_productos_predecir['product_id'].unique()
df = df[df['product_id'].isin(product_ids_to_predict)]

print(f"Original df shape after filtering: {df.shape}")
print(f"Unique products in df_productos_predecir: {len(product_ids_to_predict)}")
print(f"Unique products in filtered df: {df['product_id'].nunique()}")

Original df shape after filtering: (2293481, 7)
Unique products in df_productos_predecir: 780
Unique products in filtered df: 780


In [5]:
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [6]:
pivot_df = df.pivot_table(
    index=['customer_id', 'product_id'],
    columns='periodo',
    values='tn',
    aggfunc='sum'
).reset_index()

In [7]:
pivot_df.head()

periodo,customer_id,product_id,201701,201702,201703,201704,201705,201706,201707,201708,...,201903,201904,201905,201906,201907,201908,201909,201910,201911,201912
0,10001,20001,99.43861,198.84365,92.46537,13.29728,101.00563,128.04792,101.20711,43.3393,...,130.54927,364.37071,439.90647,65.92436,144.78714,33.63991,109.05244,176.0298,236.65556,180.21938
1,10001,20002,87.64856,66.08396,75.09182,49.51494,122.40283,167.4647,156.1512,18.15133,...,220.19153,155.81927,264.55349,151.12081,103.12062,148.91108,213.36148,430.90803,547.87849,334.03714
2,10001,20003,100.21284,126.97776,114.52896,37.3464,76.66386,108.30456,87.1416,43.5708,...,125.49948,86.54509,74.71874,78.79703,105.8148,121.06458,101.61982,196.18531,135.69192,137.98717
3,10001,20004,21.73954,29.76246,42.54996,9.31694,8.33349,10.92153,15.01063,12.42259,...,25.94769,17.84712,27.99741,34.26047,16.04585,8.33349,20.57492,37.88891,27.58851,12.9402
4,10001,20005,,,,,,,,,...,5.66966,1.72238,4.25654,3.20851,5.41195,2.51269,5.66966,7.98907,11.01719,7.66693


In [8]:
# Fill NaN values following the rule: keep NaN for values before the first non-null value in each row
def fill_nans_after_first_value(row):
    # Get the time series columns (excluding customer_id and product_id)
    time_columns = row.index[2:]  # Assuming first 2 columns are customer_id and product_id
    
    # Find the first non-null index
    first_non_null_idx = None
    for idx in time_columns:
        if pd.notna(row[idx]):
            first_non_null_idx = idx
            break
    
    # If no non-null value found, return the row as is
    if first_non_null_idx is None:
        return row
    
    # Fill NaN values with 0 only after the first non-null value
    first_non_null_position = time_columns.get_loc(first_non_null_idx)
    for i in range(first_non_null_position + 1, len(time_columns)):
        col = time_columns[i]
        if pd.isna(row[col]):
            row[col] = 0
    
    return row

# Apply the function to fill NaN values
pivot_df_filled = pivot_df.apply(fill_nans_after_first_value, axis=1)

In [9]:
pivot_df_filled.head()

periodo,customer_id,product_id,201701,201702,201703,201704,201705,201706,201707,201708,...,201903,201904,201905,201906,201907,201908,201909,201910,201911,201912
0,10001.0,20001.0,99.43861,198.84365,92.46537,13.29728,101.00563,128.04792,101.20711,43.3393,...,130.54927,364.37071,439.90647,65.92436,144.78714,33.63991,109.05244,176.0298,236.65556,180.21938
1,10001.0,20002.0,87.64856,66.08396,75.09182,49.51494,122.40283,167.4647,156.1512,18.15133,...,220.19153,155.81927,264.55349,151.12081,103.12062,148.91108,213.36148,430.90803,547.87849,334.03714
2,10001.0,20003.0,100.21284,126.97776,114.52896,37.3464,76.66386,108.30456,87.1416,43.5708,...,125.49948,86.54509,74.71874,78.79703,105.8148,121.06458,101.61982,196.18531,135.69192,137.98717
3,10001.0,20004.0,21.73954,29.76246,42.54996,9.31694,8.33349,10.92153,15.01063,12.42259,...,25.94769,17.84712,27.99741,34.26047,16.04585,8.33349,20.57492,37.88891,27.58851,12.9402
4,10001.0,20005.0,,,,,,,,,...,5.66966,1.72238,4.25654,3.20851,5.41195,2.51269,5.66966,7.98907,11.01719,7.66693


In [10]:
filtered_df = pivot_df_filled[pivot_df_filled['product_id'].isin([20001, 20002, 20003, 20004, 20005, 20006, 20007, 20008, 20009, 20010, 20011, 20012, 20013, 20014, 20015, 20016, 20017, 20018, 20019, 20020])]
print(f"Filtered dataframe shape: {filtered_df.shape}")
print(filtered_df.head())

Filtered dataframe shape: (8757, 38)
periodo  customer_id  product_id     201701     201702     201703    201704  \
0            10001.0     20001.0   99.43861  198.84365   92.46537  13.29728   
1            10001.0     20002.0   87.64856   66.08396   75.09182  49.51494   
2            10001.0     20003.0  100.21284  126.97776  114.52896  37.34640   
3            10001.0     20004.0   21.73954   29.76246   42.54996   9.31694   
4            10001.0     20005.0        NaN        NaN        NaN       NaN   

periodo     201705     201706     201707    201708  ...     201903     201904  \
0        101.00563  128.04792  101.20711  43.33930  ...  130.54927  364.37071   
1        122.40283  167.46470  156.15120  18.15133  ...  220.19153  155.81927   
2         76.66386  108.30456   87.14160  43.57080  ...  125.49948   86.54509   
3          8.33349   10.92153   15.01063  12.42259  ...   25.94769   17.84712   
4              NaN        NaN        NaN       NaN  ...    5.66966    1.72238   

p

In [11]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings

warnings.filterwarnings('ignore')

# Get time series columns (excluding customer_id and product_id)
time_columns = [col for col in filtered_df.columns if col not in ['customer_id', 'product_id']]

# Initialize list to store predictions
predictions = []

# Iterate through each row (customer-product combination)
for idx, row in filtered_df.iterrows():
    # Extract time series data up to 201910
    ts_data = row[time_columns].dropna()
    
    # Only proceed if we have enough data points
    if len(ts_data) >= 3:
        try:
            # Fit SARIMAX model (using auto ARIMA-like approach with simple parameters)
            model = SARIMAX(ts_data, order=(1,1,1), seasonal_order=(1,1,1,12))
            fitted_model = model.fit(disp=False)
            
            # Predict next period (201912)
            forecast = fitted_model.forecast(steps=1)
            pred_value = max(0, forecast[0])  # Ensure non-negative prediction
            
            predictions.append({
                'customer_id': row['customer_id'],
                'product_id': row['product_id'],
                'predicted_201912': pred_value
            })
            
        except:
            # If model fails, use simple moving average as fallback
            pred_value = ts_data.tail(3).mean()
            predictions.append({
                'customer_id': row['customer_id'],
                'product_id': row['product_id'],
                'predicted_201912': pred_value
            })

# Convert to DataFrame
predictions_df = pd.DataFrame(predictions)
print(f"Generated {len(predictions_df)} predictions")
print(predictions_df.head())

Generated 8695 predictions
   customer_id  product_id  predicted_201912
0      10001.0     20001.0        197.634913
1      10001.0     20002.0        437.607887
2      10001.0     20003.0        156.621467
3      10001.0     20004.0         26.139207
4      10001.0     20005.0          8.891063


In [13]:
# Calculate error rate of predictions vs actual values for 201912
actual_201912 = filtered_df[['customer_id', 'product_id', 201912]].copy()
actual_201912 = actual_201912.rename(columns={201912: 'actual_201912'})

# Merge predictions with actual values
comparison_df = predictions_df.merge(actual_201912, on=['customer_id', 'product_id'], how='inner')

# Calculate error metrics
comparison_df['absolute_error'] = abs(comparison_df['predicted_201912'] - comparison_df['actual_201912'])
comparison_df['squared_error'] = (comparison_df['predicted_201912'] - comparison_df['actual_201912'])**2
comparison_df['percentage_error'] = abs(comparison_df['predicted_201912'] - comparison_df['actual_201912']) / (comparison_df['actual_201912'] + 1e-10) * 100

# Calculate overall error metrics
mae = comparison_df['absolute_error'].mean()
mse = comparison_df['squared_error'].mean()
rmse = np.sqrt(mse)
mape = comparison_df['percentage_error'].mean()

print(f"Error Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"\nNumber of predictions compared: {len(comparison_df)}")

Error Metrics:
Mean Absolute Error (MAE): 0.6843
Mean Squared Error (MSE): 13.6265
Root Mean Squared Error (RMSE): 3.6914
Mean Absolute Percentage Error (MAPE): 191230655284.98%

Number of predictions compared: 8695
