In [46]:
import numpy as np
import pandas as pd
from itertools import zip_longest

## Read Data

In [54]:
HP_raw = pd.read_csv("../data/train.csv", header=0, delimiter=',')
HP_raw.shape

(15454, 11)

In [None]:
HP_raw.columns

Index(['id', 'date', 'year_week', 'product_number', 'reporterhq_id',
       'prod_category', 'specs', 'display_size', 'segment', 'sales_units',
       'inventory_units'],
      dtype='object')

In [None]:
HP_raw.head()

Unnamed: 0,id,date,year_week,product_number,reporterhq_id,prod_category,specs,display_size,segment,sales_units,inventory_units
0,202119-6909,2021-05-15,202119,6909,93,Arale,21274,13.3,Premium,2.0,35.0
1,202120-6909,2021-05-22,202120,6909,93,Arale,21274,13.3,Premium,0.0,70.0
2,202121-6909,2021-05-29,202121,6909,93,Arale,21274,13.3,Premium,3.0,137.0
3,202122-6909,2021-06-05,202122,6909,93,Arale,21274,13.3,Premium,0.0,274.0
4,202123-6909,2021-06-12,202123,6909,93,Arale,21274,13.3,Premium,0.0,333.0


## Basic Description

In [None]:
HP_raw.describe(include='all')

Unnamed: 0,id,date,year_week,product_number,reporterhq_id,prod_category,specs,display_size,segment,sales_units,inventory_units
count,15454,15454,15454.0,15454.0,15454.0,15454,15454.0,15454.0,15454,14963.0,14961.0
unique,7709,213,,,,7,,,3,,
top,202252-158907,2023-05-06,,,,Clover,,,Core,,
freq,5,277,,,,4235,,,7651,,
mean,,,202240.709202,127194.308076,42.14074,,112262.542837,15.163543,,25.974938,71.726222
std,,,61.596666,77503.015241,34.910293,,63619.857944,1.422376,,29.392055,66.145326
min,,,201915.0,6909.0,3.0,,967.0,11.0,,0.0,0.0
25%,,,202218.0,58233.0,15.0,,58987.0,14.0,,3.0,26.0
50%,,,202239.0,116466.0,24.0,,109271.0,15.6,,16.0,56.0
75%,,,202304.0,196413.0,78.0,,157621.0,15.6,,38.0,97.0


In [None]:
HP_raw[HP_raw.prod_category == "Arale"].product_number.unique()

array([  6909,  17766,  25662,  58233,  65142,  81921,  93765, 138180,
       143115, 201348, 213192, 214179, 230958, 231945, 247737])

In [None]:
HP_raw[HP_raw.sales_units >= HP_raw.inventory_units]

Unnamed: 0,id,date,year_week,product_number,reporterhq_id,prod_category,specs,display_size,segment,sales_units,inventory_units
10,202129-6909,2021-07-24,202129,6909,93,Arale,21274,13.3,Premium,91.0,62.0
28,202148-6909,2021-12-04,202148,6909,93,Arale,21274,13.3,Premium,81.0,48.0
40,202208-6909,2022-02-26,202208,6909,93,Arale,21274,13.3,Premium,13.0,5.0
41,202209-6909,2022-03-05,202209,6909,93,Arale,21274,13.3,Premium,2.0,2.0
46,202214-6909,2022-04-09,202214,6909,93,Arale,21274,13.3,Premium,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
15445,202316-247737,2023-04-22,202316,247737,3,Arale,10637,13.3,Premium,6.0,0.0
15447,202316-247737,2023-04-22,202316,247737,24,Arale,10637,13.3,Premium,6.0,2.0
15448,202317-247737,2023-04-29,202317,247737,3,Arale,10637,13.3,Premium,12.0,0.0
15451,202318-247737,2023-05-06,202318,247737,3,Arale,10637,13.3,Premium,22.0,0.0


## Imputation

In [None]:
HP_raw.isna().sum()

id                   0
date                 0
year_week            0
product_number       0
reporterhq_id        0
prod_category        0
specs                0
display_size         0
segment              0
sales_units        491
inventory_units    493
dtype: int64

## Functions Predict

In [76]:
def LMSalgorithm(x, K, L, init):  
    L = int(L)
    total_error = 0
    iterations = 0
    Nr = x@x
    if Nr == 0:
        Nr = Nr +0.0000000001
    mu = 2/Nr*K
    i = 0
    if init == 0:
        h = np.zeros(L)
    elif init == 1:
        h = 0.5*np.ones(L)
    else:
        h = -0.5*np.ones(L)
    x_n = x[i:i+L]
    while (i+L < len(x)):
        d = x[i+L]
        x_n = x[i:i+L]
        y = x_n@h
        if y%2 < 0.5:
            y = y+1
        y = int(y)
        #y = round(y)
        e = d-y
        if len(x)-i-L == 1:
            total_error = total_error + e*e
            iterations = iterations+1
        h = h+mu*x_n*e
        i = i+1
    #pred = round(x_n@h)
    return np.sqrt(total_error/iterations), x_n@h


In [77]:
def LMSalgorithm_adaptativeMU(x, K, L, init):
    L = int(L)
    total_error = 0
    iterations = 0
    i = 0
    if init == 0:
        h = np.zeros(L)
    elif init == 1:
        h = 0.5*np.ones(L)
    else:
        h = -0.5*np.ones(L)
    x_n = x[i:i+L]
    while (i+L < len(x)):
        d = x[i+L]
        x_n = x[i:i+L]
        y = x_n@h
        if y%2 < 0.5:
            y = y+1
        y = int(y)
        # y = round(y)
        e = d-y
        if len(x)-i-L == 1:
            total_error = total_error + e*e
            iterations = iterations+1
        Nr = x_n@x_n
        if Nr == 0:
            Nr = Nr+0.0000000001
        mu = 2/Nr*K
        h = h+mu*x_n*e
        i = i+1
    #pred = round(x_n@h)
    return np.sqrt(total_error/iterations), x_n@h

In [78]:
possible_K = np.array([1/2, 1/3, 1/5, 1/10, 1/50, 1/100])
possible_L = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
initialization = np.array([0,1,2])

def IPA_prediction(x):
    min_error = 100000000
    for k in range(len(possible_K)):
        for l in range(len(possible_L)):
            for i in range(len(initialization)):
                [rmse, pred] = LMSalgorithm(x, possible_K[k], possible_L[l], initialization[i]) 
                if rmse < min_error:
                    min_error = rmse
                    best_L = possible_L[l]
                    best_K = possible_K[k]
                    best_algorithm = LMSalgorithm
                    best_init = initialization[i]
                [rmse, pred] = LMSalgorithm_adaptativeMU(x, possible_K[k], possible_L[l], initialization[i])
                if rmse < min_error:
                    min_error = rmse
                    best_L = possible_L[l]
                    best_K = possible_K[k]
                    best_algorithm = LMSalgorithm_adaptativeMU
                    best_init = initialization[i]
    return best_L, best_K, best_algorithm , best_init


## Preprocessing

In [50]:
def remove_duplicates(df):
    ''' Returns pandas.DataFrame without duplicate elements.'''
    df = df.copy()
    to_drop = []
    sales = 0.
    inventory = 0.

    for idx, (id, next_id)  in enumerate(zip_longest(df.id, df.id[1:])):
        item = df.iloc[idx]

        if id == next_id:
            sales += item['sales_units']
            inventory += item['inventory_units']
            to_drop.append(idx)
        
        else:
            df.loc[idx, 'sales_units'] += sales
            df.loc[idx, 'inventory_units'] += inventory
            sales = 0.
            inventory = 0.

    df = df.drop(to_drop)
    df = df.reset_index(drop=True)
    return df

def remove_nan(df):
    ''' Returns pandas.DataFrame without nan elements.'''
    inv = 'inventory_units'
    for idx in range(1, len(df)):
        item = df.iloc[idx]

        if np.isnan(item['sales_units']):
            df.loc[idx, 'sales_units'] = max(0., df.loc[idx-1, inv] - item[inv])

        if np.isnan(item['inventory_units']):
            df.loc[idx, 'inventory_units'] = max(0., df.loc[idx-1, inv]-item['sales_units'])

    return df

In [51]:
def create_characteristics_and_inventory_matrix(df):
    ''' Returns a DataFrame of the characteristics of each product, a Matrix of the different inventories for each product,
        and a dictionary relating each product with its index in both the DataFrame and the Matrix.'''
    inv_matrix = []
    product_dict = {}
    prev = 0
    i = 0
    chars = pd.DataFrame(columns=list(df.columns[2:-2])+['avg_sales', 'avg_inventory'])

    for idx, (num1, num2) in enumerate(zip_longest(df.product_number, df.product_number[1:])):
        if num1 != num2:

            sales = np.array(df.sales_units[prev:idx+1])
            inventory = np.array(df.inventory_units[prev:idx+1])
            inv_matrix.append(inventory)

            item = df.iloc[idx]
            chars.loc[i] = [item[c] for c in chars.columns[:-2]] + [np.average(sales), np.average(inventory)]
            
            product_dict[str(num1)] = i
            i += 1
            prev = idx+1

    return chars, inv_matrix, product_dict

In [67]:
df = remove_duplicates(HP_raw)
df = remove_nan(df)
[chars, inv_matrix, product_dict] = create_characteristics_and_inventory_matrix(df)

In [9]:
#imputation 2
df = pd.read_csv("../data/preprocessed_data2.csv", header=0, delimiter=',')

In [68]:
[chars, inv_matrix, product_dict] = create_characteristics_and_inventory_matrix(df)

## IPA with neat dataset

In [20]:
IPA_data = pd.read_csv("../data/preprocessed_data_with_week_num.csv", header=0, delimiter=',')
IPA_data.shape

(7709, 10)

In [7]:
IPA_data.head()

Unnamed: 0,id,week_num,product_number,reporterhq_id,prod_category,specs,display_size,segment,sales_units,inventory_units
0,202119-6909,123,6909,93,Arale,21274,13.3,Premium,2.0,35.0
1,202120-6909,124,6909,93,Arale,21274,13.3,Premium,0.0,70.0
2,202121-6909,125,6909,93,Arale,21274,13.3,Premium,3.0,137.0
3,202122-6909,126,6909,93,Arale,21274,13.3,Premium,0.0,274.0
4,202123-6909,127,6909,93,Arale,21274,13.3,Premium,0.0,333.0


In [21]:
x = np.array(IPA_data.inventory_units[202:302])
d = IPA_data.inventory_units[302]
print(d)

179.0


In [25]:
best = IPA_prediction(x)
print(best[0], best[1], best[2])

9 0.1 <function LMSalgorithm at 0x0000022E4E1D10D0>


In [27]:
[rmse , pred]=best[2](x, best[1], best[0], best[3])
print(pred)

156.91024848316312


In [79]:
total=0
for i in range(len(inv_matrix)):
    x = inv_matrix[i][:-13]
    d = inv_matrix[i][-13:]
    for j in range(13):
        best = IPA_prediction(x)
        [rmse , pred]=best[2](x, best[1], best[0], best[3])
        total = total + abs(pred-d[j])**2 
        x = np.block([x, pred])

print(np.sqrt(total/(len(inv_matrix)*13)))
               

107.3474403788884


In [70]:
res_matrix = [[],[],[],[],[],[],[],[],[],[],[],[],[]]
for i in range(len(inv_matrix)):
    x = inv_matrix[i][:]
    for j in range(13):
        best = IPA_prediction(x)
        [rmse , pred]=best[2](x, best[1], best[0], best[3])
        res_matrix[j].append(pred)
        x = np.block([x, pred])



In [71]:
new_prod_dict = dict(sorted(product_dict.items()))

In [72]:
week = 202319
final_dict = {}
for i in range(13):
    for item in new_prod_dict:
        final_dict[str(week+i)+"-"+item] = res_matrix[i][new_prod_dict[item]]


In [73]:
final_id = []
final_items = []
for item in final_dict:
    final_id.append(item)
    final_items.append(final_dict[item])

In [74]:
df_final = pd.DataFrame({'id': final_id, 'inventory_units': final_items})

In [75]:
df_final.to_csv('../submissions/submission7.csv', index = False)