In [394]:
import pandas as pd
import os
import glob
from datetime import datetime

# ETL

## Merging original data

In [395]:
# Load every transaction export (CSV and Excel) found in the raw-data folder
# and stack them into a single frame, `dataRaw`.
path = '../../data/original_untidy/Transactions_v1/'

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the merged frame reproducible between runs.
files = sorted(glob.glob(path + "*.csv")) + sorted(glob.glob(path + "*.xlsx"))

# Collect the per-file frames and concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration.)
frames = []
for f in files:
    ext = os.path.splitext(f)[1]
    # skiprows=5 skips the first five lines of each file
    # (presumably the report preamble above the header row -- confirm).
    if ext == '.csv':
        frames.append(pd.read_csv(f, skiprows=5))
    elif ext == '.xlsx' and 'MSTR Definitions (1).xlsx' not in f:
        # The "MSTR Definitions" workbook is excluded from the merge.
        frames.append(pd.read_excel(f, skiprows=5))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# input files were found.
dataRaw = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# The exports leave the second column of each ID/name pair unnamed; give those
# columns explicit names. "Customer" originally holds the ID, and the unnamed
# column next to it holds the customer name.
dataRaw = dataRaw.rename(columns={
    "Customer": "Customer ID",
    "Unnamed: 7": "Customer",
    "Unnamed: 10": "Product Type ID",
    "Unnamed: 12": "Vendor ID",
    "Unnamed: 16": "Product ID",
})
dataRaw.head()


Unnamed: 0,Year,Month,House,Account Status,Premise,Chain,Customer ID,Customer,Beverage Type,Product Type,Product Type ID,Vendor,Vendor ID,Brand,Size,Product,Product ID,Metrics,STD. Cases
0,2018,Nov-2018,ELP,Active,OFF,EMPLOYEES,730100375,"ALARCON, EDWARD C",SPIRITS,BOURBON,53,SSW-DEVILS RIVER,4536,DEVILS RIVER,750 ML,DEVILS RIVER SM BATCH TX BBN 6PK 750M,80848,,2.0
1,2018,Nov-2018,ELP,Active,OFF,EMPLOYEES,730100375,"ALARCON, EDWARD C",SPIRITS,FLAVORED,152,PROXIMO SPIRITS,1748,JC AUTH MARGARITA'S,1.75 LIT / 1.5L,JC AUTH RED SANGRIA MARG 1.75L,51239,,0.3
2,2018,Nov-2018,ELP,Active,OFF,EMPLOYEES,730100375,"ALARCON, EDWARD C",SPIRITS,FLAVORED,152,PROXIMO SPIRITS,1748,JC AUTH MARGARITA'S,1.75 LIT / 1.5L,JC AUTH WHT SANGRIA MARG 1.75L,51237,,0.3
3,2018,Nov-2018,ELP,Active,OFF,EMPLOYEES,730100375,"ALARCON, EDWARD C",SPIRITS,RTD MIXERS,333,PROXIMO SPIRITS,1748,JC GOLDEN MARGARITA,1.75 LIT / 1.5L,JC GOLDEN ROSE MARG 1.75L,47544,,0.2
4,2018,Nov-2018,ELP,Active,OFF,EMPLOYEES,700030388,"CERVERA, RICARDO",SPIRITS,BOURBON,53,BROWN FORMAN,5,JACK DANIELS SINGLE BARREL,750 ML,JACK DANIELS SNGL BRL BBN CF 750M,5473,,2.0


## Transformation

In [396]:
# Merge with the STD Cases table.
# Bottles-per-case and liters-per-case follow the TTB conversion tables:
# https://www.ttb.gov/distilled-spirits/conversion-tables
# Invariant: Case Liters == Size Liters * Case Bottles for every row.
cases = pd.DataFrame({
    "Size":         ['750 ML', '1.75 LIT / 1.5L', '100 ML', '375 ML', '1.0 LITER', '200/187 ML', '50 ML'],
    # '1.0 LITER' was previously entered as .1 L / 9 L per case, which broke
    # the invariant above; a 12-bottle case of 1 L bottles holds 12 L.
    "Size Liters":  [0.750, 1.75, 0.1, 0.375, 1.0, 0.2, 0.05],
    "Case Bottles": [12, 6, 60, 24, 12, 48, 120],
    "Case Liters":  [9, 10.50, 6, 9, 12, 9.6, 6],
})

# Left join keeps every transaction; a Size that is missing from `cases` ends
# up with NaN in the lookup columns. validate= guards against a duplicated
# Size accidentally being added to the lookup table.
data = pd.merge(dataRaw, cases, how='left', on="Size", validate="many_to_one")

# Fix Month: some rows carry a two-digit year (e.g. "Jan-19"); expand it to
# four digits so a single format string parses every row. Four-digit years do
# not match the pattern and are left untouched.
data['Month'] = data['Month'].str.replace(r'-(\d{2})$', r'-20\1', regex=True)
data['Month'] = pd.to_datetime(data['Month'], format="%b-%Y")

# Fix STD. Cases: values wrapped in parentheses, "(1.5)", are rewritten with
# a leading minus sign, "-1.5", before numeric conversion.
data['STD. Cases'] = pd.to_numeric(
    data['STD. Cases'].astype(str).str.replace(r'\((.*)\)', r'-\1', regex=True)
)

# Calculate totals
data['Liters'] = data['Case Liters'] * data['STD. Cases']
data['Bottles'] = data['Case Bottles'] * data['STD. Cases']

# Adding a 'Transactions' column: a constant 1 on every row.
data['Transactions'] = 1

# Done
data.head()

Unnamed: 0,Year,Month,House,Account Status,Premise,Chain,Customer ID,Customer,Beverage Type,Product Type,...,Product,Product ID,Metrics,STD. Cases,Size Liters,Case Bottles,Case Liters,Liters,Bottles,Transactions
0,2018,2018-11-01,ELP,Active,OFF,EMPLOYEES,730100375,"ALARCON, EDWARD C",SPIRITS,BOURBON,...,DEVILS RIVER SM BATCH TX BBN 6PK 750M,80848,,2.0,0.75,12,9.0,18.0,24.0,1
1,2018,2018-11-01,ELP,Active,OFF,EMPLOYEES,730100375,"ALARCON, EDWARD C",SPIRITS,FLAVORED,...,JC AUTH RED SANGRIA MARG 1.75L,51239,,0.3,1.75,6,10.5,3.15,1.8,1
2,2018,2018-11-01,ELP,Active,OFF,EMPLOYEES,730100375,"ALARCON, EDWARD C",SPIRITS,FLAVORED,...,JC AUTH WHT SANGRIA MARG 1.75L,51237,,0.3,1.75,6,10.5,3.15,1.8,1
3,2018,2018-11-01,ELP,Active,OFF,EMPLOYEES,730100375,"ALARCON, EDWARD C",SPIRITS,RTD MIXERS,...,JC GOLDEN ROSE MARG 1.75L,47544,,0.2,1.75,6,10.5,2.1,1.2,1
4,2018,2018-11-01,ELP,Active,OFF,EMPLOYEES,700030388,"CERVERA, RICARDO",SPIRITS,BOURBON,...,JACK DANIELS SNGL BRL BBN CF 750M,5473,,2.0,0.75,12,9.0,18.0,24.0,1


## Check nulls

In [397]:
# Count the missing values in each column of the transformed frame.
cols = data.columns
data.loc[:, cols].isnull().sum()

Year                    0
Month                   0
House                   0
Account Status          0
Premise                 0
Chain                   0
Customer ID             0
Customer                0
Beverage Type           0
Product Type            0
Product Type ID         0
Vendor                  0
Vendor ID               0
Brand                   0
Size                    0
Product                 0
Product ID              0
Metrics            110350
STD. Cases              0
Size Liters             0
Case Bottles            0
Case Liters             0
Liters                  0
Bottles                 0
Transactions            0
dtype: int64

## Saving

In [398]:
# Persist the tidy table as CSV and as a pickle.
path = '../../data/tidy_data/Transactions_v1/'

# to_csv / to_pickle do not create missing folders; make sure the target
# directory exists so a fresh checkout can run the notebook end to end.
os.makedirs(path, exist_ok=True)

# The CSV is written with the default index column, so read it back with
# index_col=0 to avoid an extra "Unnamed: 0" column.
data.to_csv(path + "Transactions.csv")
data.to_pickle(path + "Transactions.pkl")