# Sub Research Questions

### Imports

#### Packages

In [6]:
# General Packages
import ast
import datetime
import os

import numpy as np
import pandas as pd

#EDA Packages

#Waste Analysis

#Promotion Analysis
import matplotlib.pyplot as plt

#### Data

In [8]:
# Read the four raw tables from CSV files in the current working directory.
inventory, products, promotions, transactions = map(
    pd.read_csv,
    ("inventory.csv", "products.csv", "promotions.csv", "transactions.csv"),
)

## Exploratory Data Analysis

In [None]:
# Finding the quantity of a product

def count_inventory(colname, df_inventory):
    """Sum the newest batch quantity each time a row holds more batches than the previous row.

    Args:
        colname: Name of the column to scan. Every cell must be a sequence of
            batches, and the last element of a batch is read as its quantity.
        df_inventory: DataFrame containing that column, in chronological row order.

    Returns:
        The accumulated quantity, or 0 when the batch count never grows.
        Presumably each growth corresponds to a delivery -- TODO confirm.
    """
    column = df_inventory[colname]
    sizes = [len(batches) for batches in column]
    # The first row is compared against an empty stock (size 0).
    previous_sizes = [0] + sizes[:-1]
    return sum(
        batches[-1][-1]
        for batches, size, previous in zip(column, sizes, previous_sizes)
        if size > previous
    )

## Waste Analysis

In [None]:
# Reference table mapping each 0-based day index to its calendar date in 2018
# (day 0 is 1 January 2018).
dayofyear = list(range(365))
year_start = datetime.datetime(2018, 1, 1)
date = [year_start + datetime.timedelta(days=offset) for offset in dayofyear]
df_date = pd.DataFrame({"day": dayofyear, "date": date})

# Every column from the third onwards is read from CSV as text; parse each cell
# back into the Python literal it spells out (lists of tuples, per the original note).
for colname in inventory.columns[2:]:
    inventory[colname] = inventory[colname].map(ast.literal_eval)

In [None]:
# Product whose inventory, sales and waste are analysed in the cells below.
product = "Biologische prei"
# Show every transaction row recorded for that product.
transactions.loc[transactions["description"] == product]

In [None]:
def date_converter(row):
    """Return the 2018 calendar timestamp for the day offset stored in ``row["day"]``.

    Offset 0 maps to 1 January 2018; each further unit adds one day.
    """
    year_start = datetime.datetime(2018, 1, 1)
    offset = datetime.timedelta(days=row["day"])
    return year_start + offset

# Translate every row's day offset into a calendar timestamp column.
converted_dates = inventory.apply(date_converter, axis=1)
inventory["date"] = converted_dates

def week_of_year(row):
    """Return the ISO week number of ``row["date"]``.

    Only the year, month and day of the value are used. Because ISO weeks are
    used, the last days of December can belong to week 1 of the next year.
    """
    stamp = row["date"]
    calendar_day = datetime.date(stamp.year, stamp.month, stamp.day)
    _, week_nr, _ = calendar_day.isocalendar()
    return week_nr

# Add the ISO week number of each row's date as an extra "week" column.
week_numbers = inventory.apply(week_of_year, axis=1)
inventory["week"] = week_numbers

In [None]:
# Keep only the bookkeeping columns plus the selected product's column. The
# explicit copy makes filter_inventory an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
filter_inventory = inventory[
    ["day", "before or after delivery", "date", "week", product]
].copy()
# Restrict the promotion table to discounts for the chosen product.
promotions_filter = promotions[promotions["description"] == product]


def check_discount(row):
    """Return the product's discount in the week of ``row``, or 0 if there is none.

    When several promotion rows exist for the same week, the first one is used.
    """
    matching = promotions_filter.loc[
        promotions_filter["week"] == row["week"], "discount"
    ]
    if matching.empty:
        return 0
    return matching.values[0]


# Add the discount that applies in each row's week as an extra column.
filter_inventory["discount"] = filter_inventory.apply(check_discount, axis=1)


In [None]:
def total_inv(row):
    """Return the summed second field of every batch of ``product`` in this row.

    The second field of a batch is treated as its item count; an empty batch
    list yields 0.
    """
    return sum(batch[1] for batch in row[product])


# Add the total number of items in stock to every row of the filtered frame.
filter_inventory["total inventory"] = filter_inventory.apply(total_inv, axis=1)

In [None]:
# Collect the first field of every batch in every inventory snapshot; the
# original notes call this value the batch's best-before date.
best_before = [
    batch[0]
    for element_list in filter_inventory[product]
    for batch in element_list
]
# Sorted array of the distinct best-before values.
bb_dates = np.unique(best_before)
bb_dates

In [None]:
# Start the waste table with one row per distinct best-before value: these are
# the points in time at which waste can occur.
df_waste = pd.DataFrame(bb_dates, columns=["best before"])
df_waste

In [None]:
def input_inv(row):
    """Return the quantity of the first batch whose best-before equals the row's.

    Snapshots in ``filter_inventory[product]`` are scanned in order, so the
    value comes from the earliest snapshot containing such a batch. Returns
    ``None`` when no batch matches.
    """
    target = row["best before"]
    quantities = (
        batch[1]
        for element_list in filter_inventory[product]
        for batch in element_list
        if batch[0] == target
    )
    return next(quantities, None)


# Quantity recorded for each best-before value (the amount received with that
# expiry date, per the original note).
df_waste["amount"] = df_waste.apply(input_inv, axis=1)
df_waste

In [None]:
# Only keep rows whose expiry day falls inside the same year (day index < 365).
# The explicit copy detaches the result from the unfiltered frame, so the
# columns added in later cells do not raise SettingWithCopyWarning.
df_waste = df_waste[df_waste["best before"] < 365].copy()
df_waste

In [None]:
# Parse the transaction day into timestamps, then keep only the purchases of
# the selected product.
transactions["day"] = pd.to_datetime(transactions["day"])
filter_transactions = transactions.loc[transactions["description"] == product]

# Calendar date of day index 3, which the original notes treat as the first restock.
first_restock = df_date.loc[df_date["day"] == 3, "date"].values[0]
# Show the purchases made before the first restock.
filter_transactions.loc[filter_transactions["day"] < first_restock]
# PROBLEM (original note): if the product was not bought on some day, that day
# has a zero, which causes an error a few cells further down.

In [None]:
# Number of purchases of the selected product on each day that has at least one.
daily_purchases = filter_transactions.groupby("day")["product_id"].count()
# Running total of those purchases over time.
purchases_per_day = daily_purchases.cumsum()
# Per-day counts for plotting, with the index re-expressed as day-of-year / 7,
# i.e. a fractional week position instead of a date.
purchases_per_day_plotting = daily_purchases.copy()
purchases_per_day_plotting.index = purchases_per_day_plotting.index.dayofyear / 7
purchases_per_day_plotting

In [None]:
# Reads the year of the first purchase day; the value is unused, but the lookup
# fails early when there are no purchases at all.
purchases_per_day.index[0].year
# ISO week number for each day on which the product was purchased.
weeknr = [
    datetime.date(ind.year, ind.month, ind.day).isocalendar()[1]
    for ind in purchases_per_day.index
]

In [None]:
# Cumulative purchases keyed by 0-based day index (day 0 = 1 January, matching
# df_date). Days without any purchase inherit the running total of the previous
# day, and days before the first purchase count as 0. The original integer
# lookup on the date-indexed series was positional, so it was only correct when
# the product sold every single day, and it wrapped to the last row for day 0.
# NOTE(review): assumes all purchases fall within one calendar year -- confirm.
purchases_by_day = pd.Series(
    purchases_per_day.values,
    index=purchases_per_day.index.dayofyear - 1,
)
purchases_by_day = purchases_by_day.reindex(range(365)).ffill().fillna(0)

# Cumulative purchases up to (and including) the day before each best-before day.
# Relies on df_waste having been limited to best-before days below 365.
df_waste["purchases"] = [
    purchases_by_day.loc[i - 1] if i > 0 else 0 for i in df_waste["best before"]
]
df_waste

In [None]:
# Purchases made since the previous best-before date: the row-to-row difference
# of the cumulative count. The first row has no predecessor and becomes NaN.
df_waste["prev purchases"] = df_waste["purchases"] - df_waste["purchases"].shift(1)
df_waste

In [None]:
length = df_waste.shape[0]  # number of rows in the waste dataframe

# Stock left from each batch at its best-before date: the amount received minus
# the purchases made since the previous best-before date.
remaining = df_waste["amount"] - df_waste["prev purchases"]
if length:
    # The first row has no previous best-before date, so all purchases up to it
    # count against the first batch. `remaining` is a fresh Series, so this
    # write is not a chained assignment on df_waste.
    remaining.iloc[0] = df_waste["amount"].iloc[0] - df_waste["purchases"].iloc[0]
df_waste["remaining"] = remaining

# Walk through the batches in order. A negative balance means more items were
# bought than the batch held, so the shortfall is carried into the next batch;
# a non-negative balance is left as it is and the next batch starts fresh.
lst_waste = []
for left_over in df_waste["remaining"]:
    if lst_waste and lst_waste[-1] < 0:
        lst_waste.append(lst_waste[-1] + left_over)
    else:
        lst_waste.append(left_over)
df_waste["waste"] = lst_waste

# Calendar date of each best-before day (day 0 = 1 January 2018). Computed
# inline instead of redefining date_converter, so the day-based date_converter
# defined earlier in the notebook stays usable when cells are re-run.
df_waste["date"] = [
    datetime.datetime(2018, 1, 1) + datetime.timedelta(days=float(day))
    for day in df_waste["best before"]
]
df_waste["week"] = df_waste.apply(week_of_year, axis=1)
# Negative balances are not waste. Use .loc so the write reaches df_waste; the
# previous chained form assigned into a temporary copy and had no effect.
df_waste.loc[df_waste["waste"] < 0, "waste"] = 0
df_waste

In [None]:
# Waste cannot be negative: keep positive values, replace everything else with 0.
df_waste["waste nn"] = [i if i > 0 else 0 for i in df_waste["waste"]]
# Weekly totals. numeric_only=True keeps the datetime "date" column out of the
# sum; without it pandas >= 2.0 raises a TypeError for that column.
waste_group = df_waste.groupby("week").sum(numeric_only=True)
# NOTE(review): "purchases" is a cumulative count, so its weekly sum adds up
# running totals; verify that the difference below really is weekly purchases.
waste_group["weekly purchases"] = waste_group["purchases"].diff()


## Promotions Analysis

In [None]:
# Parse the raw day and time columns into timestamps.
transactions['day'] = pd.to_datetime(transactions['day'])
transactions['time'] = pd.to_datetime(transactions['time'])
# Week of the year as a number, using strftime's %U convention (weeks start on
# Sunday; days before the first Sunday of the year are week 0).
# NOTE(review): this differs from the ISO week numbers used by week_of_year.
transactions['week'] = pd.to_numeric(transactions['day'].dt.strftime('%U'))
# Hour of the day as a zero-padded two-character string.
transactions['hour'] = transactions['time'].dt.strftime('%H')

In [None]:
# how much does each customer (incl. repeats) spend per trip?

In [None]:
# Total purchase price per customer id.
# Recorded finding: on average, each customer spends 31.64 per trip.
spend_per_customer = transactions.groupby('customer')['purchase_price'].sum()
# Plot a random sample of 100 customers to keep the scatter readable.
spent_each = spend_per_customer.sample(n=100)
plt.scatter(spent_each.index, spent_each)
plt.show()

In [None]:
# What percentage of customers is paying by card and is a regular customer?

In [None]:
# Transactions hold one row per purchased product, so the customer and bank
# account values repeat; reduce them to one row per distinct pair.
customers = transactions.loc[:, ["customer", "bank acount"]]
unique_customers = customers.drop_duplicates()
# A positive bank account number means the customer paid with an account.
has_account = unique_customers['bank acount'].gt(0)
bankact = unique_customers[has_account]
# Fraction (0-1) of customers that paid with a bank account.
num_bank_act = bankact.shape[0] / unique_customers.shape[0]
# One row per customer; basis for "at what time do customers come most often?".
cust_freq = transactions.drop_duplicates(subset=['customer'])
# All transaction rows paid with a bank account; basis for "what percentage of
# the items bought with a bank account are on discount?".
bankact_full = transactions[transactions['bank acount'].gt(0)]

In [None]:
# Keep one row per customer id, then select the rows whose bank account occurs
# under more than one customer id.
bankact_full = bankact_full.drop_duplicates(subset=['customer'])
shares_account = bankact_full.duplicated(subset=['bank acount'], keep=False)
repeat_customers = bankact_full[shares_account]
repeat_customers
# repeat_customers['bank acount'].value_counts().mean()
#
# Recorded finding: of the customers that visit more than once, on average they
# visited 38 times in a year.
# Summed purchase price of the retained rows, per bank account.
repeat_customers.groupby('bank acount')['purchase_price'].sum()

In [None]:
# Transactions without a bank account number (presumably the cash payments).
cash_full = transactions[transactions['bank acount'].isnull()]
# NOTE(review): `discounts` is not defined anywhere in the visible notebook --
# presumably a promotions table that also carries a 'category' column; confirm
# where it is created before running this cell.
discounts_amount = discounts[['category', 'description', 'discount']].copy()
# discounts_amount[discounts_amount.duplicated(keep=False) == True]
# Summary statistics of the discount, per category.
discounts_category = discounts_amount.groupby('category').describe()
# discounts_amount = discounts_amount.groupby(['category', 'description', 'discount']).size()
# Mean discount per category.
discounts_category['discount']['mean']

In [None]:
# Discount statistics per product description, restricted to the 'snack' category.
is_snack = discounts_amount['category'] == 'snack'
discounts_description = discounts_amount[is_snack]
discounts_description.groupby('description').describe()

In [None]:
# (week, product) pairs that were discounted, and the distinct (week, product)
# pairs bought by bank-account customers.
discounts_week = discounts[['week', 'product_id']].copy()
bankact_week = bankact_full[['week', 'product_id']].copy()
bankact_unique_week = bankact_week.drop_duplicates().reset_index(drop=True)
# Stack both tables with a fresh 0..n-1 index. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
result = pd.concat([bankact_unique_week, discounts_week], ignore_index=True)
# Rows that occur again further down the stacked table, i.e. bank-account
# purchases that also appear in the discount list (plus any pairs repeated
# within the discount list itself).
result[result.duplicated(keep='last')]

In [None]:
# Which item appears most often among the discounts?

In [None]:
# What share of items in each category is bought by customers paying by bank account versus by cash?

In [None]:
# Share of rows per category for bank-account payers. The closing parenthesis
# was misplaced in the original, which made the cell a syntax error.
bankact_full['category'].value_counts() / len(bankact_full)
# Share of rows per category for cash payers (the value the cell displays).
cash_full['category'].value_counts() / len(cash_full)

In [None]:
# Number of products in each category, most frequent category first.
category_product = products['category'].value_counts()
# Look up two individual products by id; only the last lookup is displayed.
products.loc[products['product_id'] == 'f_4']
products.loc[products['product_id'] == 'v_9']

In [None]:
# How often each product id appears in the discount table.
# NOTE(review): `discounts` is not defined in the visible notebook -- confirm its source.
discounts['product_id'].value_counts()

In [None]:
# Number of discount rows per category.
category_discount = discounts['category'].value_counts()
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
types = ['meat', 'bread', 'vegetable', 'fruit', 'snack']
ax.set_title('Percentage of discounted products per category per week')
# Average number of discounts per week (52 weeks) as a percentage of the
# products in the same category. The division aligns both series on the
# category label; the original positional lookups ([0], [1], ...) followed the
# frequency order of value_counts, which need not match `types` nor be the same
# in both series. A label missing from the data shows up as an empty bar.
# NOTE(review): assumes the category labels in the data are exactly those in `types`.
percent_discounted = ((category_discount / 52) / category_product * 100).reindex(types)
ax.bar(types, percent_discounted.values)
plt.savefig('percentproducts.png',bbox_inches='tight', dpi=600)
# try box plot instead

## Promotions and Transactions

In [None]:
# Reset the week column to an empty-string placeholder; it is filled in by the
# next cell.
transactions['week'] = ""
# Make sure the day column holds timestamps.
transactions["day"] = pd.to_datetime(transactions["day"])

In [None]:
# Week of the year for every transaction, assigned as one whole column. The
# previous row-by-row write through transactions['week'][n] was a chained
# assignment (slow, warns, and has no effect under copy-on-write) and only
# worked when the frame had the default 0..n-1 index.
transactions['week'] = [day.weekofyear for day in transactions['day']]

In [None]:
# Save the transactions, including the new week column, next to the notebook.
transactions.to_csv(path_or_buf="transactions_with weeks.csv")

In [None]:
# Group the transactions by week and product for the per-week sales counts.
t = transactions.groupby(by=['week', 'product_id'])

In [None]:
# Number of transaction rows per (week, product), flattened back into columns;
# the count is stored under the column name 'day'.
t2 = t['day'].count().reset_index()
# Flag column, filled in by the next cell.
t2['on sale'] = False
# Drop the final row. NOTE(review): the reason is not visible here -- confirm.
t2 = t2.iloc[:-1]

In [None]:
# Flag every (week, product) row of t2 whose product was on promotion in that
# week. A set of promoted pairs gives one O(1) membership test per row. The
# previous nested loops located rows through a running counter, which is only
# correct when both tables list the same weeks in the same order, wrote through
# chained indexing, and rebound the global `product` used by the waste analysis.
promoted_pairs = set(zip(promotions['week'], promotions['product_id']))
t2 = t2.assign(
    **{
        'on sale': [
            pair in promoted_pairs
            for pair in zip(t2['week'], t2['product_id'])
        ]
    }
)


In [9]:
#t_delete = t2[(t2['on sale'] == True) & (t2['week'] == 1)]
#p_delete = promotions[promotions['week'] == 1]

In [None]:
#t_delete.count()
#p_delete.count()

In [None]:
# Copy t2 and expose the per-week count under a descriptive column name.
t3 = t2.copy()
t3['num of transactions'] = t3['day']
# drop() returns a new frame; keep the result so the old 'day' column is
# actually removed from t3.
t3 = t3.drop(['day'], axis=1)
t3

In [None]:
# Weekly figures for product b_1, and the subset of weeks in which it was on sale.
b_1 = t3.loc[t3['product_id'] == "b_1"]
x0 = b_1.loc[b_1['on sale'].eq(True)]

In [None]:
label_size = 14

# Weekly transaction counts for b_1 as a connected line with round markers;
# weeks in which it was on sale are overlaid as square markers.
plt.plot(b_1['week'], b_1['num of transactions'], "-o")
plt.plot(x0['week'], x0['num of transactions'], "s")
plt.title('week vs num of transactions of b_1', fontsize=label_size)
plt.xlabel('week', fontsize=label_size)
plt.ylabel('num of transac', fontsize=label_size)
plt.grid(True)
plt.show()