# FILTERING THE DATASET BY OUR PRODUCTS:

In [4]:
# Importing packages:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [5]:
# Location of the raw transactions file (directory + file name) and the
# field separator that is passed to the CSV reader:

file_path, file_name = "../../data/01_raw/", "b2-transactions.csv"
sep = ";"

In [93]:
# Search prefixes for the selected products. Each entry is later matched
# against the beginning of the lowercase product description in order to
# discover which product ids correspond to it:

list_of_products = [
    'croissant',
    'croissant petit',
    'tarta mousse 3 chocolates',
    'tarta de manzana 2º',
    'palmera',
    'tarta opera',
    'postre fresas y mascarpone',
    'milhojas frambuesa 2º',
    'tortel',
    'baguette',
]

In [106]:
# Load the raw transactions into a dataframe. Only the first `max_rows`
# rows of the file are read.

csv_location = file_path + file_name
max_rows = 1000000
df = pd.read_csv(csv_location, sep=sep, nrows=max_rows)

In [107]:
# According to what we saw in the previous notebook, some cleaning is needed:
#   1. discard every row that has a missing value in any column;
#   2. discard the 'Unnamed: 0' column (presumably a leftover index column
#      from a previous export -- TODO confirm).
# The steps are chained and return a new frame instead of mutating in place.
# errors='ignore' makes the cell safe to re-run: once 'Unnamed: 0' has been
# removed, a second execution no longer raises KeyError.

df = df.dropna(how='any').drop(columns='Unnamed: 0', errors='ignore')

In [108]:
# Most of the descriptions are in uppercase, so a lowercase copy of the
# column is added to make the text matching below case-insensitive:

df = df.assign(description_lower=df['description'].str.lower())

In [109]:
# Build two dictionaries keyed by each search prefix in list_of_products:
#   rel_prod_list_ids     -> unique product_id values of the rows whose
#                            lowercase description starts with the prefix
#   rel_prod_list_descrip -> unique lowercase descriptions of those same rows
#
# This is the final version of both the list of selected products and the way
# they are looked up in the dataframe. We started by searching inside the
# description with ".contains", but "startswith" turned out to be better suited
# for the job. The values in the product list were also adjusted so that each
# one captures more names.
#
# Note that prefixes may overlap: every description that starts with
# 'croissant petit' also starts with 'croissant', so it is counted under both.

rel_prod_list_ids = dict()
rel_prod_list_descrip = dict()

for product in list_of_products:
    # Filter the rows once per product and reuse the result for both
    # dictionaries (the mask used to be computed twice).
    matching_rows = df.loc[df['description_lower'].str.startswith(product)]
    rel_prod_list_ids[product] = matching_rows['product_id'].unique()
    rel_prod_list_descrip[product] = matching_rows['description_lower'].unique()

In [110]:
# Minor additional check: for product_id 107, show the first row found
# for each distinct description it appears with.

product_107_rows = df.loc[df['product_id'] == 107]
product_107_rows.groupby('description').first()

Unnamed: 0_level_0,product_id,order_date,section,store,units_ordered,description_lower
description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CROISSANT ALMENDRA LARGO,107.0,3/9/2019 0:00:00,0,BmUP,0,croissant almendra largo
CROISSANT VACIOS,107.0,16/6/2009 0:00:00,0,BmUP,300,croissant vacios


In [111]:
# Display the dictionary that maps each search prefix to the unique
# product_id values whose descriptions start with it:

rel_prod_list_ids

{'croissant': array([ 102.,  103.,  105.,  107.,  101.,  132., 5001.,  100.,  214.,
         189.,  198.,  197., 9999.,  513.,  512.,  112.]),
 'croissant petit': array([103., 102.]),
 'tarta mousse 3 chocolates': array([9999.,  453.]),
 'tarta de manzana 2º': array([462.]),
 'palmera': array([ 140.,  182.,  190., 9999.,  141.]),
 'tarta opera': array([ 9999.,   414.,   426.,   427.,   403., 14998.,   402.]),
 'postre fresas y mascarpone': array([4511., 9999.]),
 'milhojas frambuesa 2º': array([459.]),
 'tortel': array([ 112., 3352., 9999., 3375.]),
 'baguette': array([ 115., 8739., 9999.])}

In [112]:
# The same, but with the matched descriptions of a single product
# (only as a sanity check). Index 5 of list_of_products is 'tarta opera'.

product_to_inspect = list_of_products[5]
rel_prod_list_descrip[product_to_inspect]

array(['tarta opera del 2 escrito sobre la trta felicidades rafael ',
       'tarta opera del 2º con cartel "felicidades"',
       'tarta opera 5º con cartel "felicidades lili" y adornada con frutas naturales',
       'tarta opera 2º', 'tarta opera del 5º',
       'tarta opera del 4º con cartel " felicidades dolly "',
       'tarta opera del 2º con cartel " felicidades raul "',
       'tarta opera del 4º con cartel que ponga "felicidades gaës" (ojo que la letra e lleva dieresis)',
       'tarta opera del 2º escrito encima " happy birthay nano  aba y papa "',
       'tarta opera 2º escrito en un cartel felicidades 18',
       'tarta opera 3º',
       'tarta opera 32 rac. escrito sobre la tarta " felicidades jose. feliz 60 cumpleaños"ccccccccccc',
       'tarta opera del 4  con cartel escrito- hugo y mar, muchas felicidades de vuestra familia-',
       'tarta opera 5º', 'tarta opera 2º felicidades alejandra',
       'tarta opera del 6º escrito feliz cumpleaños',
       'tarta opera 10 ra

In [105]:
# Additional check: every distinct raw description recorded under
# product_id 450.

df.loc[df['product_id'] == 450, 'description'].unique()

array(['POSTRE MOUSSE TRES CHOCOLATES',
       'POSTRE VIRUTA CHOCOLATE  RECTANGULAR',
       'POSTRES  MILHOJAS  NATA CREMA',
       'POSTRE MILHOJAS  FRAMBUESA RECTANGULAR',
       'POSTRE MILHOJASM  FRAMBUESA',
       'POSTRE MILHOJAS FRAMBUESA  rectangulares',
       'POSTRE MILHOJAS FRAMBUESA  RECTANGULAR',
       'POSTRE MILHOJAS  CON FRUTAS  SURTIDAS  RECTANGULAR',
       'POSTRES  YOGUR  Y MUESLI', 'POSTRE  YEMA  TOSTADA',
       'POSTRE   CHEESCAKE', 'POSTRE  SELVA NEGRA',
       'POSTRE MILHOJAS  FRAMBUESA',
       'POSTRE RECTANGULAR MANZANA  PRALINE',
       'POSTRE  MILHOJAS FRAMBUESA  RECTANGULAR',
       'POSTRES  MILHOJAS  FRAMBUESA  RECTANGULAR',
       'POSTRE MANZANA PRALINE', 'POSTRE  YOGUR', 'POSTRE  CHESE',
       'POSTRE  SELVA  NEGRA', 'POSTRE  VIRUTA CHOCOLATE  RECTANGULAR',
       'POSTRE MILHOJAS  FRUTAS  SURTIDAS', 'POSTRE  CHEESECAKE',
       'POSTRE  SAN MARCOS', 'POSTRE MILHOJAS   FRAMBUESA',
       'POSTRE MOUSSE TRES CHOCOLATES  CON CANUTILLO BLANCO Y C

In [114]:
# In the end, we reach the following conclusions: one product id is chosen
# for each selected product.

dict_of_products = {
    'croissant': 100,  # many doubts here; I think I would take the sum of 100+101+102
    'croissant petit': 103,
    'tarta mousse 3 chocolates': 9999,  # this one is practically made to order only
    'tarta de manzana 2º': 462,
    'palmeras de trufa': 182,  # plain "palmeras": 140
    'tarta opera': 414,  # 9999 as well; most of them are made to order
    'postre fresas y mascarpone': 4511,
    'milhojas frambuesa 2º': 459,
    'torteles': 112,
    'baguette': 115,
}

In [115]:
# Display the final product -> product id mapping for a visual check:
dict_of_products

{'croissant': 100,
 'croissant petit': 103,
 'tarta mousse 3 chocolates': 9999,
 'tarta de manzana 2º': 462,
 'palmeras de trufa': 182,
 'tarta opera': 414,
 'postre fresas y mascarpone': 4511,
 'milhojas frambuesa 2º': 459,
 'torteles': 112,
 'baguette': 115}