# shop results

## goal

* find factors influencing this shop's results
* predict results

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

## Load data

In [2]:
from datetime import datetime
from os import path, scandir

daily_datadir = "./data/CaisseJour/"

# Every entry of `daily_datadir` is expected to be a sub-directory of report
# files. Stray non-directory entries (e.g. OS metadata files) are skipped so
# that `scandir` is never called on a plain file, and both listings are sorted
# because `scandir` yields entries in arbitrary order.
datadirs = sorted(
    path.join(daily_datadir, d.name) for d in scandir(daily_datadir) if d.is_dir()
)
data_files = sorted(
    path.join(datadir, file.name)
    for datadir in datadirs
    for file in scandir(datadir)
    if file.is_file()
)

In [3]:
def parse_caisse(filename, keywords=("Chiffre", "TVA", "nombre", "moyen", "ticket")):
    """Parse one "caisse jour" (daily till) report into a flat dict.

    The file is read as Windows-1252 text, line by line:

    * a line containing "à" whose first space-separated token is a
      ``day/month/year`` date sets ``data["date"]``;
    * any other line containing one of ``keywords`` (case-sensitive substring
      match) is split on ";" and stored as ``first field -> second field``,
      both stripped of surrounding whitespace.

    Lines that match neither rule, or that cannot be parsed, are skipped.

    Args:
        filename (string): file to parse
        keywords (iterable of str): substrings marking the one-line
            "label;value" entries to keep

    Returns:
        dict: ``"date"`` (a ``datetime``, when a date line was found) plus one
        string entry per kept "label;value" line. Later lines overwrite
        earlier ones that share the same label.
    """
    data = {}
    with open(filename, "rb") as fd:
        for raw_line in fd:
            # Undecodable bytes are dropped rather than aborting the whole file.
            line = raw_line.decode("Windows-1252", errors="ignore")
            if "à" in line:
                # try with date
                try:
                    parts = [int(d) for d in line.split(" ")[0].split("/")]
                    data["date"] = datetime(parts[2], parts[1], parts[0])
                except (ValueError, IndexError):
                    # "à" in cocktail name, or a leading number that is not a
                    # complete/valid d/m/y date: not a date line, skip it.
                    continue
            elif any(keyword in line for keyword in keywords):
                fields = line.split(";")
                if len(fields) < 2:
                    # Keyword present but no "label;value" structure.
                    continue
                data[fields[0].strip()] = fields[1].strip()
        #TODO: add small tables
    return data

In [4]:
# One row per parsed report file; the frame is indexed by report date while
# the "date" column itself is kept.
parsed_reports = [parse_caisse(report_file) for report_file in data_files]
daily = pd.DataFrame(parsed_reports).set_index("date", drop=False)

In [5]:
# Preview the first rows of the parsed daily reports.
daily.head()

Unnamed: 0_level_0,Chiffre d'Affaires HT,Chiffre d'Affaires TTC,Coefficient moyen,Nom TVA,Nombre moyen de produits / Ticket,TVA Collecté,TVA Vente 10%,TVA Vente 20%,Ticket moyen TTC,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-11-09,"545,26 €","633,90 €",0,Taux TVA,16,"88,64 €",10 %,20 %,"10,06 €",2018-11-09
2018-11-03,"242,34 €","285,80 €",0,Taux TVA,16,"43,46 €",10 %,20 %,"10,99 €",2018-11-03
2018-11-10,"1370,21 €","1616,00 €",0,Taux TVA,18,"245,79 €",10 %,20 %,"11,14 €",2018-11-10
2018-11-22,"153,58 €","182,00 €",0,Taux TVA,12,"28,42 €",10 %,20 %,"7,00 €",2018-11-22
2018-11-02,"394,43 €","459,70 €",0,Taux TVA,15,"65,27 €",10 %,20 %,"10,95 €",2018-11-02


## Calendar

In [6]:
# Calendar window = span of the available daily reports. Series.min()/max()
# skip missing dates (NaT), which the builtin min()/max() do not handle
# reliably.
# To pin a fixed window instead, set e.g.
#   start_date = datetime(2018, 9, 1); end_date = datetime(2019, 9, 1)
start_date = daily["date"].min()
end_date = daily["date"].max()

# One row per calendar day in the window, plus its date components.
calendar = pd.DataFrame({"date": pd.date_range(start_date, end_date)})
calendar["day"] = calendar["date"].dt.day
calendar["month"] = calendar["date"].dt.month
calendar["year"] = calendar["date"].dt.year
calendar["wod"] = calendar["date"].dt.weekday  # Monday=0 ... Sunday=6

### Holidays

French public holidays, downloaded as CSV from https://date.nager.at/PublicHoliday/DownloadCSV/FR/2018

In [7]:
# NOTE(review): this rebinds `data_files`, which previously listed the daily
# report files; re-running the cell that builds `daily` after this one would
# parse the calendar CSVs instead.
datadir = "./data/calendars"
data_files = [path.join(datadir, file.name) for file in scandir(datadir)]

# Stack every holiday CSV into one table. `ignore_index=True` gives the result
# a unique 0..n-1 index; without it each file keeps its own 0-based labels and
# any later label-aligned column assignment mixes rows from different files.
holidays = pd.concat(
    [pd.read_csv(file) for file in data_files],
    ignore_index=True,
)

# reformat date: "YYYY-MM-DD" strings -> datetime64, row for row
holidays["Date"] = pd.to_datetime(holidays["Date"], format="%Y-%m-%d")

In [8]:
# Preview the first rows of the combined public-holiday table.
holidays.head()

Unnamed: 0,Date,LocalName,Name,CountryCode,Fixed,Global,LaunchYear
0,2018-01-01,Jour de l'an,New Year's Day,FR,True,True,1967.0
1,2018-03-30,Vendredi saint,Good Friday,FR,False,False,
2,2018-04-02,Lundi de Pâques,Easter Monday,FR,False,True,1642.0
3,2018-05-01,Fête du premier mai,Labour Day,FR,True,True,
4,2018-05-08,Fête de la Victoire,Victory in Europe Day,FR,True,True,


In [9]:
# Flag each calendar day that appears in the public-holiday table
# (vectorised membership test instead of rebuilding a Python list per row).
calendar["public holidays"] = calendar["date"].isin(holidays["Date"])

In [10]:
# Move "date" from a column into the index so the calendar can be joined by date.
# Not idempotent: re-running this cell fails because the "date" column is gone.
calendar = calendar.set_index("date")

In [11]:
# Preview the first rows of the date-indexed calendar.
calendar.head()

Unnamed: 0_level_0,day,month,year,wod,public holidays
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-08-29,29,8,2018,2,False
2018-08-30,30,8,2018,3,False
2018-08-31,31,8,2018,4,False
2018-09-01,1,9,2018,5,False
2018-09-02,2,9,2018,6,False


Join the daily results with the calendar data.

In [12]:
# the pandas way: add the calendar columns to the daily results, matching on the date index
daily = daily.join([calendar])

# the spark.sql way
# TODO: not implemented yet

daily.head()

Unnamed: 0_level_0,Chiffre d'Affaires HT,Chiffre d'Affaires TTC,Coefficient moyen,Nom TVA,Nombre moyen de produits / Ticket,TVA Collecté,TVA Vente 10%,TVA Vente 20%,Ticket moyen TTC,date,day,month,year,wod,public holidays
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-08-29,"88,99 €","105,60 €",0,Taux TVA,13,"16,61 €",10 %,20 %,"7,04 €",2018-08-29,29,8,2018,2,False
2018-08-30,"115,37 €","134,10 €",0,Taux TVA,27,"18,73 €",10 %,20 %,"19,16 €",2018-08-30,30,8,2018,3,False
2018-08-31,"91,39 €","108,30 €",0,Taux TVA,21,"16,91 €",10 %,20 %,"10,83 €",2018-08-31,31,8,2018,4,False
2018-09-01,"196,80 €","231,70 €",0,Taux TVA,22,"34,90 €",10 %,20 %,"14,48 €",2018-09-01,1,9,2018,5,False
2018-09-05,"56,00 €","67,20 €",0,Taux TVA,100,"11,20 €",,20 %,"67,20 €",2018-09-05,5,9,2018,2,False


## Weather

from meteofrance

## Data exploration