In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score


In [2]:

df = pd.read_csv('../data/sell-in.txt', sep='\t')
df_productos_predecir = pd.read_csv('../data/product_id_apredecir201912.txt', sep='\t')

In [3]:
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [4]:
#Filter df to contain only products that are in df_productos_predecir
product_ids_to_predict = df_productos_predecir['product_id'].unique()
df = df[df['product_id'].isin(product_ids_to_predict)]

print(f"Original df shape after filtering: {df.shape}")
print(f"Unique products in df_productos_predecir: {len(product_ids_to_predict)}")
print(f"Unique products in filtered df: {df['product_id'].nunique()}")

Original df shape after filtering: (2293481, 7)
Unique products in df_productos_predecir: 780
Unique products in filtered df: 780


In [5]:
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [6]:
pivot_df = df.pivot_table(
    index=['customer_id', 'product_id'],
    columns='periodo',
    values='tn',
    aggfunc='sum'
).reset_index()

In [7]:
pivot_df.head()

periodo,customer_id,product_id,201701,201702,201703,201704,201705,201706,201707,201708,...,201903,201904,201905,201906,201907,201908,201909,201910,201911,201912
0,10001,20001,99.43861,198.84365,92.46537,13.29728,101.00563,128.04792,101.20711,43.3393,...,130.54927,364.37071,439.90647,65.92436,144.78714,33.63991,109.05244,176.0298,236.65556,180.21938
1,10001,20002,87.64856,66.08396,75.09182,49.51494,122.40283,167.4647,156.1512,18.15133,...,220.19153,155.81927,264.55349,151.12081,103.12062,148.91108,213.36148,430.90803,547.87849,334.03714
2,10001,20003,100.21284,126.97776,114.52896,37.3464,76.66386,108.30456,87.1416,43.5708,...,125.49948,86.54509,74.71874,78.79703,105.8148,121.06458,101.61982,196.18531,135.69192,137.98717
3,10001,20004,21.73954,29.76246,42.54996,9.31694,8.33349,10.92153,15.01063,12.42259,...,25.94769,17.84712,27.99741,34.26047,16.04585,8.33349,20.57492,37.88891,27.58851,12.9402
4,10001,20005,,,,,,,,,...,5.66966,1.72238,4.25654,3.20851,5.41195,2.51269,5.66966,7.98907,11.01719,7.66693


In [8]:
# Fill NaN values following the rule: keep NaN for values before the first non-null value in each row
def fill_nans_after_first_value(row):
    # Get the time series columns (excluding customer_id and product_id)
    time_columns = row.index[2:]  # Assuming first 2 columns are customer_id and product_id
    
    # Find the first non-null index
    first_non_null_idx = None
    for idx in time_columns:
        if pd.notna(row[idx]):
            first_non_null_idx = idx
            break
    
    # If no non-null value found, return the row as is
    if first_non_null_idx is None:
        return row
    
    # Fill NaN values with 0 only after the first non-null value
    first_non_null_position = time_columns.get_loc(first_non_null_idx)
    for i in range(first_non_null_position + 1, len(time_columns)):
        col = time_columns[i]
        if pd.isna(row[col]):
            row[col] = 0
    
    return row

# Apply the function to fill NaN values
pivot_df_filled = pivot_df.apply(fill_nans_after_first_value, axis=1)

In [9]:
pivot_df_filled.head()

periodo,customer_id,product_id,201701,201702,201703,201704,201705,201706,201707,201708,...,201903,201904,201905,201906,201907,201908,201909,201910,201911,201912
0,10001.0,20001.0,99.43861,198.84365,92.46537,13.29728,101.00563,128.04792,101.20711,43.3393,...,130.54927,364.37071,439.90647,65.92436,144.78714,33.63991,109.05244,176.0298,236.65556,180.21938
1,10001.0,20002.0,87.64856,66.08396,75.09182,49.51494,122.40283,167.4647,156.1512,18.15133,...,220.19153,155.81927,264.55349,151.12081,103.12062,148.91108,213.36148,430.90803,547.87849,334.03714
2,10001.0,20003.0,100.21284,126.97776,114.52896,37.3464,76.66386,108.30456,87.1416,43.5708,...,125.49948,86.54509,74.71874,78.79703,105.8148,121.06458,101.61982,196.18531,135.69192,137.98717
3,10001.0,20004.0,21.73954,29.76246,42.54996,9.31694,8.33349,10.92153,15.01063,12.42259,...,25.94769,17.84712,27.99741,34.26047,16.04585,8.33349,20.57492,37.88891,27.58851,12.9402
4,10001.0,20005.0,,,,,,,,,...,5.66966,1.72238,4.25654,3.20851,5.41195,2.51269,5.66966,7.98907,11.01719,7.66693


In [10]:
# Group by product_id and sum the values for each period, removing customer_id column
product_summary = pivot_df_filled.drop('customer_id', axis=1).groupby('product_id').sum().reset_index()
product_summary.head()

periodo,product_id,201701,201702,201703,201704,201705,201706,201707,201708,201709,...,201903,201904,201905,201906,201907,201908,201909,201910,201911,201912
0,20001.0,934.77222,798.0162,1303.35771,1069.9613,1502.20132,1520.06539,1030.67391,1267.39462,1316.94604,...,1470.65653,1647.63848,1629.78233,1109.93769,1678.99318,1261.34529,1660.00561,1561.50552,1397.37231,1504.68856
1,20002.0,550.15707,505.88633,834.73521,522.35365,843.43785,968.15756,845.39319,619.71078,1065.34529,...,1083.62552,1287.62346,1034.98927,928.36431,1066.44999,813.78215,1090.18771,1979.53635,1423.57739,1087.30855
2,20003.0,1063.45835,752.1152,917.16548,525.82591,620.48202,744.08829,785.12398,864.21928,1465.83347,...,638.0401,565.33774,590.12515,662.38654,715.20314,635.59563,967.77116,1081.36645,948.29393,892.50129
3,20004.0,555.91614,508.20044,489.91328,512.05402,543.3667,590.50779,569.88117,1042.52979,1259.6456,...,619.77084,466.70901,603.31081,667.19411,521.71519,482.13372,786.1714,1064.69633,723.94206,637.90002
4,20005.0,494.27011,551.4306,563.89955,662.59032,515.58711,528.58883,625.84925,1068.01865,1247.8888,...,488.21387,624.9988,897.26297,876.39696,745.74978,536.668,879.52808,996.78275,606.91173,593.24443


In [17]:
# Calculate percentage difference from average of last 3 months to 201810 vs 201812 and apply to predict 201912 from 201910
def calculate_growth_prediction(df):
    """
    Calculate percentage growth from average of 201808-201810 to 201812 and apply to 201910 to predict 201912
    """
    results = []
    
    for idx, row in df.iterrows():
        product_id = row['product_id']
        # Calculate average of last 3 months before 201812
        avg_last_3_months = (row[201808] + row[201809] + row[201810]) / 3
        value_201812 = row[201812]
        value_201910 = row[201910]
        actual_201912 = row[201912]
        
        # Calculate growth coefficient from average of last 3 months to 201812
        if avg_last_3_months > 0:
            growth_coef = value_201812 / avg_last_3_months
        else:
            growth_coef = 1.0  # No growth if base value is 0
        
        # Apply growth coefficient to predict 201912 from 201910
        predicted_201912 = value_201910 * growth_coef
        
        results.append({
            'product_id': product_id,
            'avg_201808_201810': avg_last_3_months,
            'value_201812': value_201812,
            'value_201910': value_201910,
            'actual_201912': actual_201912,
            'growth_coef': growth_coef,
            'predicted_201912': predicted_201912,
            'percentage_diff_avg_1812': ((value_201812 - avg_last_3_months) / avg_last_3_months * 100) if avg_last_3_months > 0 else 0
        })
    
    return pd.DataFrame(results)

# Apply the growth prediction method
growth_predictions_df = calculate_growth_prediction(product_summary)
print("Growth-based predictions (using 3-month average):")
print(growth_predictions_df.head(10))

# Calculate error metrics
growth_predictions_df['error_rate'] = abs((growth_predictions_df['predicted_201912'] - growth_predictions_df['actual_201912']) / growth_predictions_df['actual_201912']) * 100

print(f"\nMean Error Rate (Growth Method with 3-month avg): {growth_predictions_df['error_rate'].mean():.2f}%")
print(f"Median Error Rate (Growth Method with 3-month avg): {growth_predictions_df['error_rate'].median():.2f}%")

Growth-based predictions (using 3-month average):
   product_id  avg_201808_201810  value_201812  value_201910  actual_201912  \
0     20001.0        1844.944850    1486.68669    1561.50552     1504.68856   
1     20002.0        1164.870123    1009.45458    1979.53635     1087.30855   
2     20003.0        1060.551487     769.82869    1081.36645      892.50129   
3     20004.0         898.318097     585.56477    1064.69633      637.90002   
4     20005.0         843.468433     372.63428     996.78275      593.24443   
5     20006.0         535.634923     407.75925     528.32630      417.23228   
6     20007.0         558.820223     361.82904     445.34884      390.43432   
7     20008.0         508.080200     426.32899     452.77197      195.36854   
8     20009.0         571.377270     555.27622     556.15182      495.03574   
9     20010.0         558.646867     285.02947     448.82078      359.59998   

   growth_coef  predicted_201912  percentage_diff_avg_1812  
0     0.805816     

In [18]:
growth_predictions_df.to_csv('../data/growth_predictions_3_month_avg.csv', index=False)

In [12]:
# # Calculate the total tonnage growth coefficient across all products
# total_201810 = product_summary[201810].sum()
# total_201812 = product_summary[201812].sum()

# # Calculate the overall growth coefficient
# overall_growth_coef = total_201812 / total_201810 if total_201810 > 0 else 1.0

# print(f"Total tonnage 201810: {total_201810:.2f}")
# print(f"Total tonnage 201812: {total_201812:.2f}")
# print(f"Overall growth coefficient: {overall_growth_coef:.4f}")
# print(f"Overall percentage change: {((total_201812 - total_201810) / total_201810 * 100):.2f}%")

# # Apply the reinforcement: multiply predicted_201912 by the overall growth coefficient
# growth_predictions_df['predicted_201912_reinforced'] = growth_predictions_df['predicted_201912'] * overall_growth_coef

# # Calculate error metrics for the reinforced predictions
# growth_predictions_df['error_rate_reinforced'] = abs((growth_predictions_df['predicted_201912_reinforced'] - growth_predictions_df['actual_201912']) / growth_predictions_df['actual_201912']) * 100

# print("\nComparison of predictions:")
# print("Growth-based predictions with reinforcement:")
# print(growth_predictions_df[['product_id', 'predicted_201912', 'predicted_201912_reinforced', 'actual_201912', 'error_rate', 'error_rate_reinforced']].head(10))

# print(f"\nOriginal Mean Error Rate: {growth_predictions_df['error_rate'].mean():.2f}%")
# print(f"Reinforced Mean Error Rate: {growth_predictions_df['error_rate_reinforced'].mean():.2f}%")
# print(f"Original Median Error Rate: {growth_predictions_df['error_rate'].median():.2f}%")
# print(f"Reinforced Median Error Rate: {growth_predictions_df['error_rate_reinforced'].median():.2f}%")
