In [3]:
#! /usr/bin/env python3
"""
Created on Oct 24 2018

Text preprocessing and sequential labeling.

To-do list:
    - Pull the data again with PySpark to recover the original item_name when it is needed for feature engineering.

@author: Ray

"""

import sys
import os
import ast # real json-like string
import pandas as pd
import gc
sys.path.append('../../brand_recognition_bio_FE/preprocessing')
from clean_helpers import clean_name

pd.options.display.max_columns = 100
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 1000


def _extract_attr_tag(tagging_dict, attr):
    """
    Return the value stored for an attribute in a parsed shopee tagging dict.

    The attribute key is not spelled consistently in the raw data, so the
    requested key is tried first, then 'Brand', then ' brand'.
    The result is either the literal string 'no value' or the first element
    of the first tuple of the tagged list (e.g. [('Asus', '1')] -> 'Asus').
    An error raised while reading the last candidate key is propagated.
    """
    candidate_keys = (attr, 'Brand', ' brand')
    last = len(candidate_keys) - 1
    for pos, key in enumerate(candidate_keys):
        try:
            value = tagging_dict[key]
            if value != 'no value':
                return value[0][0]
            return value
        except Exception:
            # Missing key or unexpected value shape: try the next spelling.
            if pos == last:
                raise


def filter_data_for_ner_task(df, out_col, FLAG, attr = ' Brand', category = None):
    """
    Keep only the rows whose attribute value literally appears in the title.

    parameters:
    --------------
    df: DataFrame. For 'shopee' it needs a 'title' column and the tagging
        dict (as a string) in the column at position 9; for 'lazada' it needs
        'item_name' and 'brand' columns. It is modified in place ('keep' and
        'attr_tags' columns are added, 'title' / 'attr_tags' are renamed and,
        for lazada, 'brand' is dropped).
    out_col: list of str. Output column names; out_col[0] is the new name of
        'title' and out_col[1] the new name of the extracted attribute.
    FLAG: str. Data source, either 'shopee' or 'lazada'.
    attr: str. Key of the attribute inside the shopee tagging dict.
    category: str or None. Used to name the '<category>_missing.csv' file that
        lists the rows without an attribute value. When None, a module-level
        variable called `category` is used if it exists (this is what the
        notebook loop relies on); if there is none, the file is not written.

    returns:
    --------------
    (filtered DataFrame restricted to out_col,
     number of rows filtered out,
     list of titles whose attribute is 'no value')

    raises:
    --------------
    AssertionError if FLAG is neither 'shopee' nor 'lazada'.
    """
    if FLAG not in ('shopee', 'lazada'):
        # Raised explicitly (instead of `assert False`) so the check survives `python -O`.
        raise AssertionError("FLAG must be 'shopee' or 'lazada', got {!r}".format(FLAG))

    # lazada rows carry the title in 'item_name', shopee rows in 'title'.
    title_col = 'title' if FLAG == 'shopee' else 'item_name'

    num_filter = 0
    keep = []
    attr_tags = []
    sku_need_to_replenish_attribute = []
    for _, row in df.iterrows():
        title = row[title_col]
        if FLAG == 'shopee':
            # The tagging dict sits in the 10th column whatever its name is
            # (it is named after the category); .iloc makes the positional access explicit.
            tagging_dict = ast.literal_eval(row.iloc[9])
            attr_tag = _extract_attr_tag(tagging_dict, attr)
            if attr_tag == 'no value':
                num_filter += 1
                keep.append(0)
                attr_tags.append(0)
                sku_need_to_replenish_attribute.append(title)
                continue
        else:
            attr_tag = row['brand']

        # A non-string value (e.g. NaN) can never be found in the title.
        found = (isinstance(attr_tag, str)
                 and isinstance(title, str)
                 and attr_tag.lower() in title.lower())
        if found:
            keep.append(1)
            attr_tags.append(attr_tag)
        else:
            num_filter += 1
            keep.append(0)
            attr_tags.append(0)

    #-----------------
    # save sku without attribute value
    #-----------------
    if category is None:
        # Backward compatibility: the original code read a notebook-level `category`.
        category = globals().get('category')
    if category is not None:
        missing_path = '../data/processed/{}_missing.csv'.format(category)
        os.makedirs(os.path.dirname(missing_path), exist_ok = True)
        df[df[title_col].isin(sku_need_to_replenish_attribute)].to_csv(missing_path, index = False)

    #-----------------
    # output
    #-----------------
    df['keep'] = keep
    df['attr_tags'] = attr_tags

    if FLAG == 'lazada':
        # the raw brand column is replaced by the renamed 'attr_tags' column below
        df.drop(['brand'], axis = 1, inplace = True)

    df.rename(columns = {'title': out_col[0], 'attr_tags': out_col[1]}, inplace = True)
    df = df[df['keep'] == 1]
    df = df[out_col]
    gc.collect()
    return df, num_filter, sku_need_to_replenish_attribute

def sequence_labeling_w_bio_encoding(row, NORMALIZED = False):
    '''
    BIO encoding is a distant supervision approach to automatically generate training data for training machine-learning based model. 
    
        # B-B: 2 (first token of the brand name)
        # I-B: 1 (following tokens of the brand name)
        # O: 0   (token that is not part of the brand name)
    Reference for distant supervision approach: http://deepdive.stanford.edu/distant_supervision
    Reference for BIO : Attribute Extraction from Product Titles in eCommerce.

    Every complete, case-insensitive occurrence of the brand token sequence in
    the cleaned title is labeled; a partial occurrence is left as O.
   
    parameters:
    --------------
    row: DataFrame holding one item (only the first row is read) with the
        columns 'item_name', 'what_brand_name' and 'eval_set'.
    NORMALIZED: bool. If truthy, the output tokens are lower-cased.

    returns:
    --------------
    DataFrame with one row per title token and the columns
    ['item_name', 'tokens', 'label', 'eval_set'], where item_name is the cleaned title.
    '''
    # Values that are constant for the whole item: computed once, not once per token.
    cleaned_name = clean_name(row['item_name'].iloc[0])
    eval_set = row['eval_set'].iloc[0]

    # Empty tokens (from repeated spaces) are dropped on both sides.
    brand = [b.lower() for b in row.what_brand_name.iloc[0].split(' ') if b != '']
    title = [t for t in cleaned_name.split(' ') if t != '']
    lowered = [t.lower() for t in title]

    #---------------
    # sequential labeling with BIO encoding
    #---------------
    tagging = [0] * len(title) # O: 0 by default
    n_brand = len(brand)
    pos = 0
    # An empty brand can never match (and would otherwise never advance `pos`).
    while n_brand > 0 and pos < len(title):
        if lowered[pos:pos + n_brand] == brand:
            tagging[pos] = 2 # B-B: 2
            for inner in range(pos + 1, pos + n_brand):
                tagging[inner] = 1 # I-B: 1
            # continue right after the match so that a brand repeating its own first
            # word (e.g. 'bumble and bumble') is not re-opened in the middle
            pos += n_brand
        else:
            pos += 1

    #---------------------------
    # output
    #---------------------------
    word_list = lowered if NORMALIZED else title
    df = pd.DataFrame({'tokens': word_list,
                'label': tagging,
                'eval_set': [eval_set] * len(title),
                'item_name': [cleaned_name] * len(title)})[['item_name','tokens','label','eval_set']]
    return df

# Beauty-Lips
# Beauty-face
# Women top
# Women dress


In [2]:
os.listdir('/data/ner_task/dress/shopee_data_tagging_result/')

['lips', 'dress', 'face', 'women_top', 'mobile']

In [3]:
# shopee = pd.read_csv('/data/ner_task/dress/shopee_data_tagging_result/dress/dress_ID_attribute_tagging_v2.csv')
# shopee

In [4]:
# for ix, row in shoope_mobile.sample(100).iterrows():
#     print (row[9])

In [5]:
os.listdir('/data/ner_task/dress/shopee_data_tagging_result/')

['lips', 'dress', 'face', 'women_top', 'mobile']

In [6]:
# Build one BIO-tagged training file per category found in the shopee tagging folder.
base_path = '../data/processed'
# The output folder must exist before filter_data_for_ner_task writes its '<category>_missing.csv' into it.
os.makedirs(base_path, exist_ok = True)

for category in os.listdir('/data/ner_task/dress/shopee_data_tagging_result/')[:]:
    #--------------------------
    # setting
    #--------------------------
    print ('category : {}'.format(category))
    v2_files = [i for i in os.listdir('/data/ner_task/dress/shopee_data_tagging_result/{}'.format(category)) if 'v2' in i]
    if not v2_files:
        print ('no v2 tagging file for category {}, skipped'.format(category))
        continue
    file_name = v2_files[0]
    date_path = '/data/ner_task/dress/shopee_data_tagging_result/{}/{}'.format(category, file_name)
    # output column
    output_column = ['item_name', 'brand', 'eval_set']
    # train/val ratio
    val_size = 0.1
    #--------------------------
    # loading data
    #--------------------------
    shopee = pd.read_csv(date_path)
    print ('# training sku : {}'.format(len(shopee[shopee.eval_set == 'train'])))
    print ('# testing sku : {}'.format(len(shopee[shopee.eval_set == 'test'])))

    #--------------------------
    # preprocessing data
    #--------------------------
    # drop duplicates: make the result trustworthy, do not count the same title twice
    shopee.drop_duplicates(subset = ['title'], inplace = True)
    shopee = shopee.reset_index(drop = True)
    #--------------------------
    # filter: to get high-quality data, remove the sku whose attribute value does not appear in its title.
    #--------------------------
    attr = ' Brand'
    shopee, num_filter_s, sku_need_to_replenish_attribute = filter_data_for_ner_task(shopee, FLAG = 'shopee', out_col = output_column, attr =  attr)
    # reset_index returns a fresh frame, so the assignments below never write into a slice
    shopee = shopee.reset_index(drop = True)
    print ('eval_set-distribution', shopee.eval_set.value_counts())
    #--------------------------
    # data configuration
    #--------------------------
    # validation set = the last `val_size` share of the training rows (rows are kept in file order)
    is_train = shopee.eval_set == 'train'
    n = int(is_train.sum() * val_size)
    print ('number of validating sku : {}'.format(n))
    val_item_name = set(shopee.loc[is_train, 'item_name'].tail(n))
    # single .loc assignment instead of chained `shopee['eval_set'].iloc[ix] = ...`,
    # which does not update the frame under pandas Copy-on-Write
    shopee.loc[is_train & shopee.item_name.isin(list(val_item_name)), 'eval_set'] = 'val'

    num_total = len(shopee)
    num_test = len(shopee[shopee.eval_set == 'test'])
    print ('num_sku_removed from shopee: {}'.format(num_filter_s))
    print ('# of total sku : {}'.format(num_total))
    print ('# training sku : {}'.format(len(shopee[shopee.eval_set == 'train'])))
    print ('# validating sku : {}'.format(len(shopee[shopee.eval_set == 'val'])))
    print ('# testing sku : {}'.format(num_test))
    print ('testing ration : {}'.format(num_test / num_total if num_total else float('nan')))
    if num_total == 0:
        # nothing survived the filter: there is nothing to tag or to save
        print ('no sku left for category {}, skipped'.format(category))
        continue
    #--------------------------
    # BIO tagging
    #--------------------------
    df = shopee[output_column].rename(columns = {'brand':'what_brand_name'})
    gc.collect()
    # Iterate over the groups explicitly: the labeling function reads the 'item_name'
    # column, which groupby.apply no longer passes along in recent pandas versions.
    df = pd.concat(
        [sequence_labeling_w_bio_encoding(group, NORMALIZED = False) for _, group in df.groupby('item_name')],
        ignore_index = True)

    #-------------------------
    # post-processing: remove the sku that could not be tagged by the sequential labeling.
    #-------------------------
    # This is rare. It happens when the brand is only a substring of a title token:
    # e.g. title 'buy 1 get 1 free kasus' with brand 'asus' passes the substring filter
    # but no token equals the brand, so every label is 0.
    label_mean = df.groupby('item_name').label.mean()
    no_label_item_name = label_mean[label_mean == 0].index.tolist()
    df = df[~df.item_name.isin(no_label_item_name)]

    gc.collect()
    #--------------------------
    # save
    #--------------------------
    df.to_csv(os.path.join(base_path,'{}_training.csv'.format(category)) , index = False)
    print ('sucessfully saving the {} data '.format(category))

category : lips
# training sku : 112415
# testing sku : 28104
eval_set-distribution train    35291
test      8857
Name: eval_set, dtype: int64
number of validating sku : 3529
num_sku_removed from shopee: 62685
# of total sku : 44148
# training sku : 31762
# validating sku : 3529
# testing sku : 8857
testing ration : 0.20062063966657606
sucessfully saving the lips data
category : dress
# training sku : 302864
# testing sku : 75716
eval_set-distribution train    3498
test      492
Name: eval_set, dtype: int64
number of validating sku : 349
num_sku_removed from shopee: 332061
# of total sku : 3990
# training sku : 3149
# validating sku : 349
# testing sku : 492
testing ration : 0.12330827067669173
sucessfully saving the dress data
category : face
# training sku : 211920
# testing sku : 52990
eval_set-distribution train    58844
test     15121
Name: eval_set, dtype: int64
number of validating sku : 5884
num_sku_removed from shopee: 138161
# of total sku : 73965
# training sku : 52960
# val

In [7]:
assert 1==0

AssertionError: 

In [None]:
8857/44148

In [None]:
df

In [None]:
sku_need_to_replenish_attribute

In [None]:
#shopee.eval_set.value_counts()

In [None]:
"""
drop by itemid:
lips
num_sku_removed from shopee: 509
# of total sku : 491
# training sku : 416
# validating sku : 75
# testing sku : 0
category : dress
number of validating sku : 100
num_sku_removed from shopee: 997
# of total sku : 3
# training sku : 3
# validating sku : 0
# testing sku : 0
category : face
number of validating sku : 100
num_sku_removed from shopee: 618
# of total sku : 382
# training sku : 333
# validating sku : 49
# testing sku : 0
category : women_top
number of validating sku : 100
num_sku_removed from shopee: 988
# of total sku : 12
# training sku : 10
# validating sku : 2
# testing sku : 0
category : mobile
number of validating sku : 100
num_sku_removed from shopee: 279
# of total sku : 721
# training sku : 643
# validating sku : 78
# testing sku : 0

drop by itemname:
lips
num_sku_removed from shopee: 470
# of total sku : 439
# training sku : 394
# validating sku : 45
# testing sku : 0
category : dress
number of validating sku : 98
num_sku_removed from shopee: 982
# of total sku : 3
# training sku : 3
# validating sku : 0
# testing sku : 0
category : face
number of validating sku : 95
num_sku_removed from shopee: 604
# of total sku : 355
# training sku : 317
# validating sku : 38
# testing sku : 0
category : women_top
number of validating sku : 97
num_sku_removed from shopee: 967
# of total sku : 12
# training sku : 10
# validating sku : 2
# testing sku : 0
category : mobile
number of validating sku : 98
num_sku_removed from shopee: 275
# of total sku : 713
# training sku : 638
# validating sku : 75
# testing sku : 0
"""

In [None]:
shopee.drop_duplicates('item_name').shape

In [None]:
shopee.reset_index(drop = True)

In [None]:
pd.read_csv('/data/ner_task/data_for_brand_detection_model/face_training.csv')

In [None]:
'women_top_training.csv'[:-13]

# Check the original brand column built by ko.wen: does she provide multiple brands?

In [4]:
# Load the raw shopee tagging file (v2) of the dress category for manual inspection.
# NOTE: this rebinds the notebook-level `df` used by the cells above.
date_path = '/data/ner_task/dress/shopee_data_tagging_result/dress/dress_ID_attribute_tagging_v2.csv'
df = pd.read_csv(date_path)

In [5]:
df

Unnamed: 0,itemid,title,image_urls,l1,l2,l3,1_gram,2_gram,3_gram,dresse,item_ctime,eval_set
0,103565,dress import hijau sifon onlineshop murah cd-252,"['f987aa28f068a9cbbd9439cd298216e5', '0717ea95c740cef9ea1a94d53260b8df', '18272a42c0230146987dd9108702181d', '4d3a63ae02de8b273144f4a5a65d7fa4', 'd65085ef595dbffe26a1d720b15c49ea', 'dc2eb02f21853dc6de0714ea5209f947', '108c10816e45a56615d7deef2880dfda']",Women Clothes,Dress,Casual Dress,"[('dress', '2'), ('sifon', '1'), ('casual', '1'), ('import', '1')]","[('import casual', '1'), ('sifon import', '1'), ('casual dress', '1'), ('dress sifon', '1')]","[('sifon import casual', '1'), ('dress sifon import', '1'), ('import casual dress', '1')]","{' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': [('Chiffon', '6')], ' Color Family': [('green', '1')], ' Fashion Trend': 'no value'}",2015-05-21 21:06:49,train
1,193241,londgress brukat,['9ff95c8c47b92890fe57b7563b7e9a19'],Women Clothes,Dress,Casual Dress,"[('brukat', '3'), ('dress', '2'), ('brokat', '1')]","[('dress brukat', '2')]",[],"{' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': [('Brocade', '1')], ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': 'no value', ' Fashion Trend': 'no value'}",2015-06-17 14:00:51,train
2,254350,floral blue dress by zalora,"['be22750cd3e28cbf7355b6a8276cc4b3', '4898c36118f5b7d1f87888a1f1fcd743', '18e5e892d3fb4653330a9a88878c85b3']",Women Clothes,Dress,Casual Dress,"[('dress', '1'), ('zalora', '1')]","[('zalora dress', '1')]",[],"{' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': [('Floral', '1')], ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': [('Floral', '1'), ('Blue', '1')], ' Fashion Trend': 'no value'}",2015-06-30 10:53:59,train
3,385458,white dior dress,['9d7c249a9fbdc0fcd068613606580906'],Women Clothes,Dress,Casual Dress,"[('dior', '1')]",[],[],"{' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': [('White', '1')], ' Fashion Trend': 'no value'}",2015-07-19 02:26:30,train
4,505033,maxi dress jeans button,['9405bc1c5a2e1e92ae355464a9c0133e'],Women Clothes,Dress,Maxi Dress,"[('jeans', '2'), ('button', '2'), ('maxi', '1'), ('dress', '1')]","[('button maxi', '1'), ('maxi jeans', '1'), ('dress jeans', '1'), ('jeans button', '1')]","[('dress jeans button', '1'), ('button maxi jeans', '1')]","{' Dress Length': [('Maxi', '5')], ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': [('Maxi Dress', '5')], ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': 'no value', ' Fashion Trend': 'no value'}",2015-07-30 19:57:57,train
5,528784,little black sparkling dress,"['68aec3dc0f20f18f27a1ab9a201c769a', 'aa63a61f6b2214d5b91695e013986235']",Women Clothes,Dress,Casual Dress,"[('dress', '1'), ('little', '1'), ('black', '1')]","[('little black', '1'), ('black dress', '1')]","[('little black dress', '1')]","{' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': [('Black', '5')], ' Fashion Trend': 'no value'}",2015-08-01 17:43:37,train
6,550030,dress princess grey gatsuone,"['dc17ec7b215fd84e6cbaa8443aa3399f', '7792bb4b5d23cd49a74252f2a4747e09', '911e685366de69ae2e90b72ee28bc96d', 'e44696f34207d152087bb623f0a462fc']",Women Clothes,Dress,Casual Dress,"[('dress', '3'), ('wanita', '1'), ('casual', '1')]","[('dress casual', '1'), ('dress wanita', '1'), ('casual dress', '1')]","[('dress casual dress', '1')]","{' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': [('Grey', '1')], ' Fashion Trend': 'no value'}",2015-08-03 16:08:29,train
7,550070,dress dora light grey black gatsuone,"['9f06e90dbd2447508bfd121966f07b19', '9de2aecaa499c8ba47c362d69443a969', 'd5bfdca55496a7e9b57cf3f2865e95c4', '4a92ecf62892664833709a5b9926f33b']",Women Clothes,Dress,Casual Dress,"[('dress', '1'), ('casual', '1')]","[('casual dress', '1')]",[],"{' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': [('Grey', '1'), ('Black', '1')], ' Fashion Trend': 'no value'}",2015-08-03 16:10:31,train
8,550361,dress calista blue polkadot gatsuone,"['885347f16d9b087b5e13e1443441b958', '004217a262f0ab2f2b5cdb2326c736b1', 'a33c18de42aecbc17c6310d49201e19b', 'f14bb68d4afd6da94b46d67785eb5a3a']",Women Clothes,Dress,Casual Dress,"[('dress', '4'), ('casual', '2'), ('dres', '1'), ('polkadot', '1')]","[('casual dress', '2'), ('polkadot dress', '1'), ('dres casual', '1')]","[('dres casual dress', '1')]","{' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': [('Blue', '1')], ' Fashion Trend': 'no value'}",2015-08-03 16:33:28,train
9,550384,dress clarisa blue denim gatsuone,"['7da9cd9655da79f737c595740adbab77', '09c7072ee807d5b4966d3c1ae420a477', '0df8e717c11dfa5196b3cbe92ca1747b', '290b2dd243730641ba1104ee812b66f5']",Women Clothes,Dress,Casual Dress,"[('dress', '3'), ('dres', '1'), ('casual', '1'), ('clarisa', '1')]","[('clarisa dress', '1'), ('casual dress', '1'), ('dres casual', '1')]","[('dres casual dress', '1')]","{' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': [('Denim', '1')], ' Color Family': [('Blue', '1')], ' Fashion Trend': 'no value'}",2015-08-03 16:36:53,train


In [8]:
# Debugging cell: print the tagging dict of the first rows and extract their brand value.
attr = ' Brand'
# the attribute key is not spelled consistently in the raw data
candidate_keys = (attr, 'Brand', ' brand')
for ix, row in df.head().iterrows():
    title = row['title'] # str
    # the tagging dict is stored as a string in the 10th column (named after the category);
    # .iloc makes the positional access explicit on a label-indexed row
    tagging_dict = ast.literal_eval(row.iloc[9])
    print ('tagging_dict',tagging_dict)
    #--------------
    # preprocessing for the tagging field in raw shopee data
    #--------------
    for pos, key in enumerate(candidate_keys):
        try:
            value = tagging_dict[key]
            if value != 'no value':
                attr_tag = value[0][0] # str
            else:
                attr_tag = value # str
            break
        except Exception:
            # missing key or unexpected value shape: try the next spelling,
            # and let the error of the last candidate surface
            if pos == len(candidate_keys) - 1:
                raise
    

tagging_dict {' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': [('Chiffon', '6')], ' Color Family': [('green', '1')], ' Fashion Trend': 'no value'}
tagging_dict {' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': [('Brocade', '1')], ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': 'no value', ' Fashion Trend': 'no value'}
tagging_dict {' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': [('Floral', '1')], ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' Clothing Material': 'no value', ' Color Family': [('Floral', '1'), ('Blue', '1')], ' Fashion Trend': 'no value'}
tagging_dict {' Dress Length': 'no value', ' Collar Type': 'no value', ' Pattern': 'no value', ' Brand': 'no value', 'Dress Type': 'no value', ' Sleeves': 'no value', ' 

In [9]:
attr_tag

'no value'