#### First create an empty dataframe with rows for each day, then fill it with the sales data

In [1]:
import os
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
from utils import add_delimiters

# Create a dataframe with rows for every day in the years 2020 to 2022, add additional date columns and 
df = pd.DataFrame(pd.date_range('2020-01-01', '2022-12-31', ), columns=['date'])
df['day_of_week'] = df['date'].dt.day_name()
df['month'] = df['date'].dt.month_name()
df['year'] = df['date'].dt.year
df['week_of_year'] = df['date'].dt.isocalendar().week

df

Unnamed: 0,date,day_of_week,month,year,week_of_year
0,2020-01-01,Wednesday,January,2020,1
1,2020-01-02,Thursday,January,2020,1
2,2020-01-03,Friday,January,2020,1
3,2020-01-04,Saturday,January,2020,1
4,2020-01-05,Sunday,January,2020,1
...,...,...,...,...,...
1091,2022-12-27,Tuesday,December,2022,52
1092,2022-12-28,Wednesday,December,2022,52
1093,2022-12-29,Thursday,December,2022,52
1094,2022-12-30,Friday,December,2022,52


#### Define the specific names of the relevant rows/columns/variables in the raw data

In [2]:
category_index = "Unnamed: 0"
sales_date_index = "Unnamed: 1"
total_count_index = "Unnamed: 2"
total_sales_index = "Unnamed: 3"

years = ["2021", "2022"]
months = ["01jan", "02feb", "03mar", "04apr", "05may", "06jun", "07jul", "08aug", "09sep", "10oct", "11nov", "12dec"]

coffeeproducts_text = open("coffeeproducts.txt",'r', encoding="utf-8").readlines()
coffeeproducts = ['Getränke', 'Coffee']
for row in coffeeproducts_text:
    coffeeproducts.append(str(row.strip()))

#print("all_categories", coffeeproducts)

coffeeproducts_list_sales = []
coffeeproducts_list_count = []
# create new columns in the df for each category
for coffeeproduct in coffeeproducts:
    df[f"{coffeeproduct}_sales"] = 0.0
    coffeeproducts_list_sales.append(f"{coffeeproduct}_sales")
    df[f"{coffeeproduct}_count"] = 0.0
    coffeeproducts_list_count.append(f"{coffeeproduct}_count")
    
coffeeproducts_list_sales = coffeeproducts_list_sales[2:]
coffeeproducts_list_count = coffeeproducts_list_count[2:]



#### Load the sales data for all years, write it to its corresponding date into the dataframe

In [3]:
for year in years:
    for month in months:
        data_dir = os.path.join('raw_data', year, month)
        data_names = os.listdir(data_dir)
        for day_file in data_names:
            data = pd.read_csv(add_delimiters(os.path.join(data_dir, day_file), delimiter=';'), sep=';', encoding="utf-8")
            data_date = data[sales_date_index][1]
            data_date = pd.to_datetime(data_date, format='%d.%m.%Y').strftime('%Y-%m-%d')
            # merge the main_categories and all_categories together
            for category in coffeeproducts:
                try:
                    df.loc[df['date'] == data_date, f"{category}_sales"] = data[data[category_index] == category][total_sales_index].values[0]
                    df.loc[df['date'] == data_date, f"{category}_count"] = data[data[category_index] == category][total_count_index].values[0]
                except IndexError:
                    print(f"No {category} sold on {data_date}")

# sum up all the coffee products
df["Coffee_sales"] = df[coffeeproducts_list_sales].apply(pd.to_numeric, errors='coerce').sum(axis=1)
df["Coffee_count"] = df[coffeeproducts_list_count].apply(pd.to_numeric, errors='coerce').sum(axis=1)

df.info()

No Coffee sold on 2021-01-02
No Extra Shot sold on 2021-01-02
No Oat Cappu klein sold on 2021-01-02
No Oat Flat sold on 2021-01-02
No Oat Cappu groß sold on 2021-01-02
No Oat Latte klein sold on 2021-01-02
No Oat Latte groß sold on 2021-01-02
No Hafer Cappuccino Groß sold on 2021-01-02
No Hafer Cappuccino sold on 2021-01-02
No Hafer Flat White sold on 2021-01-02
No Caffè Latte sold on 2021-01-02
No Caffè Latte Groß sold on 2021-01-02
No Hafer Espresso Macchiato sold on 2021-01-02
No Hafer Caffè Latte sold on 2021-01-02
No Hafer Caffè Latte Groß sold on 2021-01-02
No Decaf Caffè Latte Groß * sold on 2021-01-02
No Decaf Cappuccino * sold on 2021-01-02
No Decaf Flat White * sold on 2021-01-02
No Decaf Cappuccino Groß * sold on 2021-01-02
No Decaf Caffè Latte * sold on 2021-01-02
No Flat White auf Eis sold on 2021-01-02
No Hafer Flat White auf ice sold on 2021-01-02
No Hafer Espresso Macchiato dopio sold on 2021-01-02
No Decaf Espresso sold on 2021-01-02
No Decaf Flat White auf Ice sold on

In [4]:
df = df[df["Coffee_sales"] != 0]
#df = df.reset_index(drop=True)
df.head()

Unnamed: 0,date,day_of_week,month,year,week_of_year,Getränke_sales,Getränke_count,Coffee_sales,Coffee_count,Cappuccino_sales,...,Filterkaffee Free_sales,Filterkaffee Free_count,Café Latte Free_sales,Café Latte Free_count,Flat White Free_sales,Flat White Free_count,Americano Free_sales,Americano Free_count,_sales,_count
367,2021-01-02,Saturday,January,2021,53,692.9,486,596.4,183.0,230.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
369,2021-01-04,Monday,January,2021,1,429.2,297,371.9,117.0,192.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
370,2021-01-05,Tuesday,January,2021,1,392.8,309,378.0,117.0,144.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
372,2021-01-07,Thursday,January,2021,1,493.2,290,414.2,128.0,179.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
373,2021-01-08,Friday,January,2021,1,557.5,350,483.5,150.0,185.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# print the top k maximal values of Getränke_sales and the corresponding date
k = 15
var = 'Getränke'
# first convert the sales to float
df[f'{var}_sales'] = df[f'{var}_sales'].astype(float)
df[f'{var}_count'] = df[f'{var}_count'].astype(int)
print(df.nlargest(k, f'{var}_sales')[['date', f'{var}_sales']])
print(df.nlargest(k, f'{var}_count')[['date', f'{var}_count']])

In [5]:
# Manually correct certain dates, where the sales data for two or more days is entered in one day

vars = ['Getränke', 'Coffee']
sal_cnt = ['sales', 'count']

for var in vars:
    # first convert the sales to float
    df[f'{var}_sales'] = df[f'{var}_sales'].astype(float)
    df[f'{var}_count'] = df[f'{var}_count'].astype(int)

    for sal in sal_cnt:
        # 2021-10-16 has all data for 2021-10-16 and 2021-10-17,
        print(df[df["date"] == "2021-10-16"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2021-10-17"][["date", f'{var}_{sal}']])
        combined_sales = df[df["date"] == "2021-10-16"][f'{var}_{sal}'].values[0]
        print("combined_sales", combined_sales)
        getr_sales_on_2021_10_16 = combined_sales * 0.58
        print("getr_sales_on_2021_10_16", getr_sales_on_2021_10_16)
        getr_sales_on_2021_10_17 = combined_sales - getr_sales_on_2021_10_16
        print("getr_sales_on_2021_10_17", getr_sales_on_2021_10_17)
        df.loc[df['date'] == "2021-10-16", f'{var}_{sal}'] = getr_sales_on_2021_10_16
        df.loc[df['date'] == "2021-10-17", f'{var}_{sal}'] = getr_sales_on_2021_10_17
        print(df[df["date"] == "2021-10-16"][["date", f'{var}_{sal}']])
        print("\n ---------------------------- \n")

        # 2022-02-13 has all data for 2022-02-13 and 2022-02-14
        print(df[df["date"] == "2022-02-13"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2022-02-14"][["date", f'{var}_{sal}']])
        combined_sales = df[df["date"] == "2022-02-13"][f'{var}_{sal}'].values[0]
        getr_sales_on_2022_02_13 = combined_sales * 0.739
        getr_sales_on_2022_02_14 = combined_sales - getr_sales_on_2022_02_13
        df.loc[df['date'] == "2022-02-13", f'{var}_{sal}'] = getr_sales_on_2022_02_13
        df.loc[df['date'] == "2022-02-14", f'{var}_{sal}'] = getr_sales_on_2022_02_14
        print(df[df["date"] == "2022-02-13"][f'{var}_{sal}'])
        print("\n ---------------------------- \n")

        # 2021-07-31 has all data for 2021-07-31 and 2021-08-01
        print(df[df["date"] == "2021-07-31"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2021-08-01"][["date", f'{var}_{sal}']])
        combined_sales = df[df["date"] == "2021-07-31"][f'{var}_{sal}'].values[0]
        getr_sales_on_2021_07_31 = combined_sales * 0.618
        getr_sales_on_2021_08_01 = combined_sales - getr_sales_on_2021_07_31
        df.loc[df['date'] == "2021-07-31", f'{var}_{sal}'] = getr_sales_on_2021_07_31
        df.loc[df['date'] == "2021-08-01", f'{var}_{sal}'] = getr_sales_on_2021_08_01
        print(df[df["date"] == "2021-07-31"][f'{var}_{sal}'])
        print("\n ---------------------------- \n")

        # 2022-01-08 has all data for 2022-01-08 and 2022-01-09
        print(df[df["date"] == "2022-01-08"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2022-01-09"][["date", f'{var}_{sal}']])
        combined_sales = df[df["date"] == "2022-01-08"][f'{var}_{sal}'].values[0]
        getr_sales_on_2022_01_08 = combined_sales * 0.523
        getr_sales_on_2022_01_09 = combined_sales - getr_sales_on_2022_01_08
        df.loc[df['date'] == "2022-01-08", f'{var}_{sal}'] = getr_sales_on_2022_01_08
        df.loc[df['date'] == "2022-01-09", f'{var}_{sal}'] = getr_sales_on_2022_01_09
        print(df[df["date"] == "2022-01-08"][f'{var}_{sal}'])
        print("\n ---------------------------- \n")

        # 2022-07-25 has all data for 2022-07-25 and 2022-07-26
        print(df[df["date"] == "2022-07-25"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2022-07-26"][["date", f'{var}_{sal}']])
        combined_sales = df[df["date"] == "2022-07-25"][f'{var}_{sal}'].values[0]
        getr_sales_on_2022_07_25 = combined_sales * 0.469
        getr_sales_on_2022_07_26 = combined_sales - getr_sales_on_2022_07_25
        df.loc[df['date'] == "2022-07-25", f'{var}_{sal}'] = getr_sales_on_2022_07_25
        df.loc[df['date'] == "2022-07-26", f'{var}_{sal}'] = getr_sales_on_2022_07_26
        print(df[df["date"] == "2022-07-25"][f'{var}_{sal}'])
        print("\n ---------------------------- \n")

        # 2022-08-11 has all data for 2022-08-11 and 2022-08-12
        print(df[df["date"] == "2022-08-11"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2022-08-12"][["date", f'{var}_{sal}']])
        combined_sales = df[df["date"] == "2022-08-11"][f'{var}_{sal}'].values[0]
        getr_sales_on_2022_08_11 = combined_sales * 0.486
        getr_sales_on_2022_08_12 = combined_sales - getr_sales_on_2022_08_11
        df.loc[df['date'] == "2022-08-11", f'{var}_{sal}'] = getr_sales_on_2022_08_11
        df.loc[df['date'] == "2022-08-12", f'{var}_{sal}'] = getr_sales_on_2022_08_12
        print(df[df["date"] == "2022-08-11"][f'{var}_{sal}'])
        print("\n ---------------------------- \n")

        # 2021-12-13 has all data for 2021-12-13 and 2021-12-14 and 2021-12-15
        print(df[df["date"] == "2021-12-13"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2021-12-14"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2021-12-15"][["date", f'{var}_{sal}']])
        combined_sales = df[df["date"] == "2021-12-13"][f'{var}_{sal}'].values[0]
        getr_sales_on_2021_12_13 = combined_sales * 0.2895
        getr_sales_on_2021_12_14 = combined_sales * 0.377
        getr_sales_on_2021_12_15 = combined_sales - getr_sales_on_2021_12_13 - getr_sales_on_2021_12_14
        df.loc[df['date'] == "2021-12-13", f'{var}_{sal}'] = getr_sales_on_2021_12_13
        df.loc[df['date'] == "2021-12-14", f'{var}_{sal}'] = getr_sales_on_2021_12_14
        df.loc[df['date'] == "2021-12-15", f'{var}_{sal}'] = getr_sales_on_2021_12_15
        print(df[df["date"] == "2021-12-13"][f'{var}_{sal}'])
        print("\n ---------------------------- \n")

        # 2021-12-22 has all data for 2021-12-22 and 2021-12-23
        print(df[df["date"] == "2021-12-22"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2021-12-23"][["date", f'{var}_{sal}']])
        combined_sales = df[df["date"] == "2021-12-22"][f'{var}_{sal}'].values[0]
        getr_sales_on_2021_12_22 = combined_sales * 0.516
        getr_sales_on_2021_12_23 = combined_sales - getr_sales_on_2021_12_22
        df.loc[df['date'] == "2021-12-22", f'{var}_{sal}'] = getr_sales_on_2021_12_22
        df.loc[df['date'] == "2021-12-23", f'{var}_{sal}'] = getr_sales_on_2021_12_23
        print(df[df["date"] == "2021-12-22"][f'{var}_{sal}'])
        print("\n ---------------------------- \n")

        # 2021-11-09 has all data for 2021-11-09 and 2021-11-10
        print(df[df["date"] == "2021-11-09"][["date", f'{var}_{sal}']])
        print(df[df["date"] == "2021-11-10"][["date", f'{var}_{sal}']])
        combined_sales = df[df["date"] == "2021-11-09"][f'{var}_{sal}'].values[0]
        getr_sales_on_2021_11_09 = combined_sales * 0.546
        getr_sales_on_2021_11_10 = combined_sales - getr_sales_on_2021_11_09
        df.loc[df['date'] == "2021-11-09", f'{var}_{sal}'] = getr_sales_on_2021_11_09
        df.loc[df['date'] == "2021-11-10", f'{var}_{sal}'] = getr_sales_on_2021_11_10
        print(df[df["date"] == "2021-11-09"][f'{var}_{sal}'])
        print("\n ---------------------------- \n")
    
    df[f'{var}_count'] = df[f'{var}_count'].astype(int)


          date  Getränke_sales
654 2021-10-16          3386.2
Empty DataFrame
Columns: [date, Getränke_sales]
Index: []
combined_sales 3386.2
getr_sales_on_2021_10_16 1963.9959999999999
getr_sales_on_2021_10_17 1422.204
          date  Getränke_sales
654 2021-10-16        1963.996

 ---------------------------- 

          date  Getränke_sales
774 2022-02-13          2512.8
Empty DataFrame
Columns: [date, Getränke_sales]
Index: []
774    1856.9592
Name: Getränke_sales, dtype: float64

 ---------------------------- 

          date  Getränke_sales
577 2021-07-31          2499.2
Empty DataFrame
Columns: [date, Getränke_sales]
Index: []
577    1544.5056
Name: Getränke_sales, dtype: float64

 ---------------------------- 

          date  Getränke_sales
738 2022-01-08          2227.0
Empty DataFrame
Columns: [date, Getränke_sales]
Index: []
738    1164.721
Name: Getränke_sales, dtype: float64

 ---------------------------- 

          date  Getränke_sales
936 2022-07-25          2112.3
Emp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the do

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 687 entries, 367 to 1080
Columns: 175 entries, date to _count
dtypes: UInt32(1), datetime64[ns](1), float64(4), int32(2), int64(1), object(166)
memory usage: 937.2+ KB


In [7]:
# drop all rows where no sales were made/are not available
#df = df[df["Getränke_sales"] != 0]
# save the df as a csv file
df.to_csv('processed_data/sales_2021-2022_NEW_CATEGORIES.csv', index=False)

# play around with some plots

In [None]:
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

fig, ax = plt.subplots(figsize=(30, 8))
ax.plot(df["date"].astype("datetime64"), df["Getränke_sales"].astype("float"))
# add a vline where the day of the week is Saturday
#x.vlines(df[df["day_of_week"] == "Saturday"]["date"].astype("datetime64"), 0, 3000, colors='r', linestyles='dashed')
# add a legend
ax.legend(["Getränke_sales"])

# plot an histogram where on the x axis are the seven days of the week and on the y axis the sales, avereaged over all months, in two subplots for each year
fig, ax = plt.subplots(1, 2, figsize=(20, 10))

for year in df["year"].unique():
    for day in days:
        ax[int(year) - 2021].bar(day, df[(df["day_of_week"] == day) & (df["year"] == year)]["Getränke_sales"].astype("float").mean(), label=day)
        ax[int(year) - 2021].set_title(year)
        ax[int(year) - 2021].legend()

# plot the mean sales of each week in the year
fig, ax = plt.subplots(figsize=(30, 8))
for week in df["week_of_year"].unique():
    ax.bar(week, df[df["week_of_year"] == week]["Getränke_sales"].astype("float").mean(), label=week)



In [None]:
# plot a bar graph where on the x axis are the weeks of the year and on the y axis the sales, each bar is split for 2021 and 2022
fig, ax = plt.subplots(figsize=(30, 8))

width = 0.3
for week in df["week_of_year"].unique():
    ax.bar(week - width, df[(df["week_of_year"] == week) & (df["year"] == 2021)]["Getränke_sales"].astype("float").mean(), width, color="orange")
    ax.bar(week + width, df[(df["week_of_year"] == week) & (df["year"] == 2022)]["Getränke_sales"].astype("float").mean(), width, color="lightblue")
    ax.bar(week, df[df["week_of_year"] == week]["Getränke_sales"].astype("float").mean(), width, color="darkgrey")
ax.legend(["2021", "2022", "Mean"])

plt.show()



In [None]:
# plot the sales for each monday in the year
fig, ax = plt.subplots(figsize=(30, 8))
for week in df[df["day_of_week"] == "Monday"]["week_of_year"].unique():
    ax.bar(week, df[(df["week_of_year"] == week) & (df["day_of_week"] == "Monday")]["Getränke_sales"].astype("float"), label=week)
plt.title("Mondays sales")

# plot the sales for each tuesday in the year
fig, ax = plt.subplots(figsize=(30, 8))
for week in df[df["day_of_week"] == "Tuesday"]["week_of_year"].unique():
    ax.bar(week, df[(df["week_of_year"] == week) & (df["day_of_week"] == "Tuesday")]["Getränke_sales"].astype("float"), label=week)
plt.title("Tuesdays sales")

# plot the sales for each wednesday in the year
fig, ax = plt.subplots(figsize=(30, 8))
for week in df[df["day_of_week"] == "Wednesday"]["week_of_year"].unique():
    ax.bar(week, df[(df["week_of_year"] == week) & (df["day_of_week"] == "Wednesday")]["Getränke_sales"].astype("float"), label=week)
plt.title("Wednesdays sales")

# plot the sales for each thursday in the year
fig, ax = plt.subplots(figsize=(30, 8))
for week in df[df["day_of_week"] == "Thursday"]["week_of_year"].unique():
    ax.bar(week, df[(df["week_of_year"] == week) & (df["day_of_week"] == "Thursday")]["Getränke_sales"].astype("float"), label=week)
plt.title("Thursdays sales")

# plot the sales for each friday in the year
fig, ax = plt.subplots(figsize=(30, 8))
for week in df[df["day_of_week"] == "Friday"]["week_of_year"].unique():
    ax.bar(week, df[(df["week_of_year"] == week) & (df["day_of_week"] == "Friday")]["Getränke_sales"].astype("float"), label=week)
plt.title("Fridays sales")