In [4]:
import numpy as np
import pandas as pd
from itertools import zip_longest

In [3]:
HP_raw = pd.read_csv("../data/train.csv", header=0, delimiter=',')

In [8]:
def remove_duplicates(df):
    ''' Returns pandas.DataFrame without duplicate elements.'''
    df = df.copy()
    to_drop = []
    sales = 0.
    inventory = 0.

    for idx, (id, next_id)  in enumerate(zip_longest(df.id, df.id[1:])):
        item = df.iloc[idx]

        if id == next_id:
            sales += item['sales_units']
            inventory += item['inventory_units']
            to_drop.append(idx)
        
        else:
            df.loc[idx, 'sales_units'] += sales
            df.loc[idx, 'inventory_units'] += inventory
            sales = 0.
            inventory = 0.

    df = df.drop(to_drop)
    df = df.reset_index(drop=True)
    return df

def remove_nan(df):
    ''' Returns pandas.DataFrame without nan elements.'''
    inv = 'inventory_units'
    for idx in range(1, len(df)):
        item = df.iloc[idx]

        if np.isnan(item['sales_units']):
            df.loc[idx, 'sales_units'] = max(0., df.loc[idx-1, inv] - item[inv])

        if np.isnan(item['inventory_units']):
            df.loc[idx, 'inventory_units'] = max(0., df.loc[idx-1, inv]-item['sales_units'])

    return df

In [9]:
def create_characteristics_and_inventory_matrix(df):
    ''' Returns a DataFrame of the characteristics of each product, a Matrix of the different inventories for each product,
        and a dictionary relating each product with its index in both the DataFrame and the Matrix.'''
    inv_matrix = []
    product_dict = {}
    prev = 0
    i = 0
    chars = pd.DataFrame(columns=list(df.columns[2:-2])+['avg_sales', 'avg_inventory'])

    for idx, (num1, num2) in enumerate(zip_longest(df.product_number, df.product_number[1:])):
        if num1 != num2:

            sales = np.array(df.sales_units[prev:idx+1])
            inventory = np.array(df.inventory_units[prev:idx+1])
            inv_matrix.append(inventory)

            item = df.iloc[idx]
            chars.loc[i] = [item[c] for c in chars.columns[:-2]] + [np.average(sales), np.average(inventory)]
            
            product_dict[str(num1)] = i
            i += 1
            prev = idx+1

    return chars, inv_matrix, product_dict

In [10]:
df = remove_duplicates(HP_raw)
df = remove_nan(df)
[chars, inv_matrix, product_dict] = create_characteristics_and_inventory_matrix(df)

In [23]:
#create groups
group1 = []
group2 = []
group3 = []

for item in df.iterrows():
    if item[1][8] == "Premium":
        group1.append(product_dict[item[1][0][7:]])
    elif item[1][8] == "Core":
        group2.append(product_dict[item[1][0][7:]])
    else:
        group3.append(product_dict[item[1][0][7:]])

IPA: THE SEQUEL

In [57]:
def LMSalgorithm_groups(x, K, L):
  L = int(L)
  total_error = 0
  iterations = 0
  N = len(x)
  min_l = len(x[0])
  for k in range (len(x)):
    if len(x[k]) < min_l:
      min_l = len(x[k])
  Nr = 0
  for k in range (len(x)):
    aux = x[k]@x[k]
    if (aux > Nr):
      Nr = aux
  if Nr == 0:
    Nr = Nr +0.0000000001
  mu = 2/Nr*K
  i = 0
  h = np.zeros(L)
  while (i+L < min_l):
    for k in range(len(x)):
      d = x[k][-min_l+i+L]
      x_n = x[k][-min_l+i:-min_l+i+L]
      y = x_n@h
      e = d-y
      total_error = total_error + e*e*1/(len(x)-i-L)**2
      iterations = iterations+1
      h = h+mu*x_n*e
    i = i+1
  pred = []
  for k in range (len(x)):
    x_n = x[k][-L:]
    pred.append(x_n@h)
  return np.sqrt(total_error/iterations), pred

In [53]:
def LMSalgorithm_adaptativeMU_groups(x, K, L):
  L = int(L)
  N = len(x)
  min_l = len(x[0])
  for k in range (len(x)):
    if len(x[k]) < min_l:
      min_l = len(x[k])
  total_error = 0
  iterations = 0
  i = 0
  h = np.zeros(L)
  while (i+L < min_l):
    for k in range(len(x)):
      d = x[k][-min_l+i+L]
      x_n = x[k][-min_l+i:-min_l+i+L]
      y = x_n@h
      e = d-y
      total_error = total_error + e*e*1/(min_l-i-L)**2
      iterations = iterations+1
      Nr = x_n@x_n
      if Nr == 0:
        Nr = Nr+0.0000000001
      mu = 2/Nr*K
      h = h+mu*x_n*e
    i = i+1
  pred = []
  for k in range (len(x)):
    x_n = x[k][-L:]
    pred.append(x_n@h)
  return np.sqrt(total_error/iterations), pred

In [32]:
possible_K = np.array([1/2, 1/3, 1/5, 1/10, 1/50, 1/100])
possible_L = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

def IPA_prediction(x):
  min_error = 100000000
  for k in range(len(possible_K)):
    for l in range(len(possible_L)):
      [rmse, pred] = LMSalgorithm_groups(x, possible_K[k], possible_L[l]) 
      if rmse < min_error:
        min_error = rmse
        best_L = possible_L[l]
        best_K = possible_K[k]
        best_algorithm = LMSalgorithm_groups
      [rmse, pred] = LMSalgorithm_adaptativeMU_groups(x, possible_K[k], possible_L[l])
      if rmse < min_error:
        min_error = rmse
        best_L = possible_L[l]
        best_K = possible_K[k]
        best_algorithm = LMSalgorithm_adaptativeMU_groups
  return best_L, best_K, best_algorithm

In [60]:
total = 0
num_groups = 3
groups = [group1, group2, group3]
for i in range(num_groups):
    x = []
    d = []
    for j in groups[i]:
        x.append(inv_matrix[j][:-13])
        d.append(inv_matrix[j][-13:])
    for j in range(13):
        best = IPA_prediction(x)
        [rmse , pred]=best[2](x, best[1], best[0])
        for k in range (len(groups[i])):
            total = total - abs(pred[k]-d[k][j])**2
            np.append(x[k], pred[k])


In [63]:
print(np.sqrt(-total/(1300)))

844.43349448009
