# Data Concat & Intermediate Cleaning - HMART

In [2]:
# imports and path setup for cleaning the raw HMART grocery price data
import numpy as np
import os
import pandas as pd 
import seaborn as sns
import sys
import re
# import warnings
# warnings.filterwarnings('ignore')

# Make the project's src/ directory importable. The path is built from the current
# working directory, two levels up, so it depends on where the kernel was started.
src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
sys.path.append(src_dir)

from d01_data.utils import read_multiple_csv_and_concat

# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [3]:
# Path pattern (note the trailing wildcard) handed to the project helper which,
# going by its name, reads every matching CSV and concatenates them.
raw_hmart_glob = '../../data/01_raw/grocery_prices_hmart/hmart_prices*'
hmart_prices = read_multiple_csv_and_concat(raw_hmart_glob)

In [4]:
# Row count of the concatenated raw frame.
len(hmart_prices)

1200

In [6]:
# Column names present right after loading.
hmart_prices.columns

Index(['Unnamed: 0', 'product', 'main_price', 'full_item_desc',
       'prod_aile_count'],
      dtype='object')

In [7]:
# 'Unnamed: 0' is presumably the row index that was saved with the raw CSVs
# (TODO confirm); it is not a product field, so it is removed.
# Rebinding with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: a second execution no longer raises KeyError.
hmart_prices = hmart_prices.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
# Display the frame after the 'Unnamed: 0' column has been dropped.
hmart_prices

Unnamed: 0,product,main_price,full_item_desc,prod_aile_count
0,Dragon Bean Curd Stick 50/6oz,$2.79,"<li class=""item-card"" data-radium=""true""><div ...",Baking Supplies & Decor
1,Choripdong Roasted Laver,$7.83,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed
2,Choripdong Roasted Laver,$7.83,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed
3,Choripdong Roasted Laver With Grape Seed Oil,$7.27,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed
4,"Wel-Pac Seaweed, Fueru Wakame, Dried",$2.79,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed
5,Nagai Seaweed Nori Sheets Roasted - 10 CT,$4.47,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed
6,Seasoned Laver,$3.35,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed
7,Haitai Green Laver,$3.91,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed
8,Haio Roasted Jaerae Seaweed,$6.71,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed
9,Haioreum Haio Yaki Sushi Nori,$3.35,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed


## Split up the item column 

### Price Per LB

In [10]:
# The raw item-card HTML is the text searched for a per-pound price below.
per_lb = hmart_prices.loc[:, 'full_item_desc']

In [11]:
# Pull the "<number>.<two digits>/lb" snippet out of each item's raw HTML.
# Items without such a snippet, and non-string cells, get NaN.
per_lb_final = []
per_lb_pattern = r'\d*\.[0-9]{2}\/lb'
for item in per_lb:
    # Handle the two expected "no value" cases explicitly rather than with a bare
    # except, so unrelated errors (and KeyboardInterrupt) are not swallowed.
    match = re.search(per_lb_pattern, item) if isinstance(item, str) else None
    per_lb_final.append(match.group() if match is not None else np.nan)

In [12]:
# Inspect the extracted per-pound prices.
# NOTE(review): this echoes the entire list (one entry per row); a slice or a
# non-null count would keep the saved output short.
per_lb_final

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan

In [13]:
# Attach the extracted per-pound price strings (NaN where none was found).
price_per_lb_column = 'price_per_lb'
hmart_prices[price_per_lb_column] = per_lb_final

### Measure Words

In [17]:
# Raw item-card HTML, searched by the extraction loops that follow.
full_item_desc = hmart_prices.loc[:, 'full_item_desc']

In [18]:
# Record the measure word 'each' for every item whose raw HTML contains it;
# all other items (including non-string cells) get NaN.
measure_word_pattern = 'each'
measure_word_list = []
for item in full_item_desc:
    # Explicit checks replace the bare except, so only the two expected
    # "no value" cases map to NaN and real errors still surface.
    match = re.search(measure_word_pattern, item) if isinstance(item, str) else None
    measure_word_list.append(match.group() if match is not None else np.nan)

In [19]:
# Attach the measure word found for each item (NaN where none was found).
measure_words_column = 'measure_words_main_price'
hmart_prices[measure_words_column] = measure_word_list

### Item Weight/ Count/ Vol

In [20]:
# Capture the text between 'aria-label=".' and the next '">' that is followed by
# a digit; the displayed results look like "6 ounce" or "2.04 ounce".
# The pattern is the same regex as before, now written as a raw string: the old
# non-raw literal relied on invalid escape sequences (\. and \d), which newer
# Python versions warn about.
item_weight_count_vol_pattern = r'aria-label="\.(.*)">\d+'
item_weight_count_vol = []
for item in full_item_desc:
    # Explicit checks replace the bare except, so only "not a string" and
    # "no match" map to NaN and real errors still surface.
    match = (
        re.search(item_weight_count_vol_pattern, item)
        if isinstance(item, str)
        else None
    )
    item_weight_count_vol.append(match.group(1) if match is not None else np.nan)

In [21]:
# Attach the extracted weight / count / volume text (NaN where none was found).
weight_count_vol_column = 'item_weight_count_vol'
hmart_prices[weight_count_vol_column] = item_weight_count_vol

In [22]:
# Preview the first rows with the three extracted columns added.
hmart_prices.head()

Unnamed: 0,product,main_price,full_item_desc,prod_aile_count,price_per_lb,measure_words_main_price,item_weight_count_vol
0,Dragon Bean Curd Stick 50/6oz,$2.79,"<li class=""item-card"" data-radium=""true""><div ...",Baking Supplies & Decor,,,6 ounce
1,Choripdong Roasted Laver,$7.83,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed,,,2.04 ounce
2,Choripdong Roasted Laver,$7.83,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed,,,2.04 ounce
3,Choripdong Roasted Laver With Grape Seed Oil,$7.27,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed,,,2.4 ounce
4,"Wel-Pac Seaweed, Fueru Wakame, Dried",$2.79,"<li class=""item-card"" data-radium=""true""><div ...",Dried Seaweed,,,2 ounce


In [23]:
# The raw HTML is not carried into the intermediate file; the fields taken from
# it live in price_per_lb, measure_words_main_price and item_weight_count_vol.
# Rebinding with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: a second execution no longer raises KeyError.
hmart_prices = hmart_prices.drop(columns=['full_item_desc'], errors='ignore')

In [24]:
# Preview the first rows after the raw HTML column has been removed.
hmart_prices.head()

Unnamed: 0,product,main_price,prod_aile_count,price_per_lb,measure_words_main_price,item_weight_count_vol
0,Dragon Bean Curd Stick 50/6oz,$2.79,Baking Supplies & Decor,,,6 ounce
1,Choripdong Roasted Laver,$7.83,Dried Seaweed,,,2.04 ounce
2,Choripdong Roasted Laver,$7.83,Dried Seaweed,,,2.04 ounce
3,Choripdong Roasted Laver With Grape Seed Oil,$7.27,Dried Seaweed,,,2.4 ounce
4,"Wel-Pac Seaweed, Fueru Wakame, Dried",$2.79,Dried Seaweed,,,2 ounce


In [26]:
# Constant columns stamped onto every row, added in this order.
collection_metadata = {
    'date_collected': '2019-08-30',
    'store': 'HMART',
    'location': '60043',
}
for column_name, constant_value in collection_metadata.items():
    hmart_prices[column_name] = constant_value

In [27]:
# Write the cleaned frame to the intermediate data folder, leaving the index out.
intermediate_csv_path = '../../data/02_intermediate/grocery_prices_hmart.csv'
hmart_prices.to_csv(intermediate_csv_path, index=False)