# Instacart (Marianos) Prices: File Concatenation & Initial processing (to intermediate)

In [194]:
# Imports and path setup for processing the Instacart (Marianos) grocery price data
import numpy as np
import os
import pandas as pd 
import seaborn as sns
import sys
import re
# import warnings
# warnings.filterwarnings('ignore')

# Put the `src` directory (two levels above the current working directory) on
# the import path so the project import below resolves.
# Guarded so that re-running this cell does not append duplicate entries.
src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from d01_data.utils import read_multiple_csv_and_concat

In [195]:
# Allow up to 500 columns in DataFrame displays so wide frames are not truncated.
pd.options.display.max_columns = 500

## Concatenation

Use a project helper function to read in all of the raw Marianos files (153 files) and concatenate them into a single DataFrame.

In [196]:
# Glob pattern selecting the raw files that the project helper reads and concatenates.
raw_files_glob = '../../data/01_raw/grocery_data_insta/prod_aile*'
groceries = read_multiple_csv_and_concat(raw_files_glob)

In [197]:
# Preview the first five rows of the concatenated data.
groceries.head(n=5)

Unnamed: 0,product,unit_price,item_size,prod_aile
0,"Halls Defense Dietary Supplement Drops, Assort...",$1.79,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy"
1,Halls Suppressant/Oral Anesthetic Halls Relief...,$1.79,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy"
2,Kroger Co. Mucus Relief Expectorant & Cough Su...,$9.29,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy"
3,Ricola Sugar Free Lemon Mint Herb Throat Drops,$2.29,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy"
4,Benadryl Allergy Ultratabs Tablets,$4.99,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy"


## Split up the item_size column

### Price Per LB

In [198]:
# Keep a handle on the raw-HTML column; the extraction cells below iterate over it.
item_size = groceries.loc[:, 'item_size']

In [199]:
# Pull the per-pound price text (optional digits, a dot, two decimals, then
# "/lb") out of each item's raw HTML. Entries that are not strings (e.g. NaN)
# or that contain no such text are recorded as np.nan, so the list stays
# aligned row-for-row with item_size.
per_lb_final = []
per_lb_pattern = r'\d*\.[0-9]{2}\/lb'
for item in item_size:
    # Explicit checks replace the former bare `except:`, which also hid
    # unrelated errors (and KeyboardInterrupt).
    found = re.search(per_lb_pattern, item) if isinstance(item, str) else None
    per_lb_final.append(found.group() if found is not None else np.nan)

In [200]:
# Attach the extracted per-pound prices as a new column.
groceries = groceries.assign(price_per_lb=per_lb_final)

### Measure Words

In [204]:
# Flag items whose raw HTML contains the measure word "each".
# The letter look-arounds keep the pattern from firing inside longer words
# (e.g. "peach", "bleach"), which a plain substring search for 'each' matches
# because the HTML also carries the full item name.
measure_word_pattern = r'(?<![A-Za-z])each(?![A-Za-z])'
measure_word_list = []
for item in item_size:
    # Non-string entries (e.g. NaN) and non-matching HTML both yield np.nan;
    # this replaces the former bare `except:` that swallowed every error.
    found = re.search(measure_word_pattern, item) if isinstance(item, str) else None
    measure_word_list.append(found.group() if found is not None else np.nan)

In [205]:
# Attach the detected measure word (or NaN) as a new column.
groceries = groceries.assign(measure_words_main_price=measure_word_list)

In [206]:
# 'unit_price' becomes 'main_price'; rebinding instead of mutating in place.
groceries = groceries.rename(columns={'unit_price': 'main_price'})

### Item Weight/ Count/ Vol

In [207]:
# Capture the text that follows the leading "." inside an aria-label attribute,
# up to a closing `">` that is immediately followed by a digit.
# Written as a raw string so `\.` and `\d` reach the regex engine without
# triggering Python's invalid-escape-sequence warning; the pattern itself is
# character-for-character the same as before.
item_weight_pattern = r'aria-label="\.(.*)">\d+'
item_weight_count_vol = []
for item in item_size:
    # Non-string entries (e.g. NaN) and non-matching HTML both yield np.nan;
    # this replaces the former bare `except:` that swallowed every error.
    found = re.search(item_weight_pattern, item) if isinstance(item, str) else None
    item_weight_count_vol.append(found.group(1) if found is not None else np.nan)

In [208]:
# Attach the extracted weight/count/volume text as a new column.
groceries = groceries.assign(item_weight_count_vol=item_weight_count_vol)

In [211]:
# Preview the first five rows with the newly extracted columns.
groceries.head(n=5)

Unnamed: 0,product,main_price,item_size,prod_aile,price_per_lb,item_weight_count_vol,measure_words_main_price
0,"Halls Defense Dietary Supplement Drops, Assort...",$1.79,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy",,30 count,
1,Halls Suppressant/Oral Anesthetic Halls Relief...,$1.79,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy",,30 count,
2,Kroger Co. Mucus Relief Expectorant & Cough Su...,$9.29,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy",,14 count,
3,Ricola Sugar Free Lemon Mint Herb Throat Drops,$2.29,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy",,19 count,
4,Benadryl Allergy Ultratabs Tablets,$4.99,"<li class=""item-card"" data-radium=""true""><div ...","Cold, Flu & Allergy",,24 count,


## Let's continue to look at the item_size column

In [212]:
# Inspect the raw HTML stored for the row labelled 10.
groceries['item_size'].loc[10]

'<li class="item-card" data-radium="true"><div style="position: relative;"><a href="/store/items/item_230364003?context=low_stock_item" data-bypass="true" tabindex="0" data-radium="true" style="cursor: pointer; text-decoration: none; color: inherit;"><div class="item-card-contents" data-radium="true"><div class="media"><div class="item-card-image-wrapper"><img class="no-aliasing-image item-image" src="https://d2d8wwwkmhfcva.cloudfront.net/310x/filters:fill(FFF,true):format(jpg)/d2lnr5mha7bycj.cloudfront.net/product-image/file/large_49e539cd-49b0-4daf-983e-55d0fa0c701e.png" alt=""></div></div><div id="itemInfo-item_230364003" class="item-info" data-radium="true"><div class="item-name item-row" data-radium="true"><div><div style="display: flex;"><div class="item-price" style="flex: 1 1 0%;"><span class=""><span>$5.99</span></span></div></div></div><span class="full-item-name" data-radium="true" style="display: -webkit-box; -webkit-line-clamp: 2; -webkit-box-orient: vertical; max-height: 

It looks like we have pulled all the information we can from this column. Let's drop it for now. 

In [213]:
# Remove the raw-HTML column now that its contents have been extracted.
groceries = groceries.drop(columns='item_size')

In [216]:
# Stamp every row with the same collection metadata (constant string values).
groceries = groceries.assign(
    date_collected='2019-08-28',
    store='Marianos',
    location='60615',
)

In [218]:
# Preview the first five rows of the frame that is about to be saved.
groceries.head(n=5)

Unnamed: 0,product,main_price,prod_aile,price_per_lb,item_weight_count_vol,measure_words_main_price,date_collected,store,location
0,"Halls Defense Dietary Supplement Drops, Assort...",$1.79,"Cold, Flu & Allergy",,30 count,,2019-08-28,Marianos,60615
1,Halls Suppressant/Oral Anesthetic Halls Relief...,$1.79,"Cold, Flu & Allergy",,30 count,,2019-08-28,Marianos,60615
2,Kroger Co. Mucus Relief Expectorant & Cough Su...,$9.29,"Cold, Flu & Allergy",,14 count,,2019-08-28,Marianos,60615
3,Ricola Sugar Free Lemon Mint Herb Throat Drops,$2.29,"Cold, Flu & Allergy",,19 count,,2019-08-28,Marianos,60615
4,Benadryl Allergy Ultratabs Tablets,$4.99,"Cold, Flu & Allergy",,24 count,,2019-08-28,Marianos,60615


## Save to Intermediate 

In [219]:
# Write the processed frame to the intermediate data folder without the DataFrame index.
# NOTE(review): the 'Unnamed: 0' column visible in the previews is still written out —
# confirm downstream consumers expect it.
intermediate_csv_path = '../../data/02_intermediate/grocery_prices_marianos.csv'
groceries.to_csv(intermediate_csv_path, index=False)