In [14]:
#! /usr/bin/env python3
"""
Created on Oct 24 2018

Text preprocessing and sequential labeling.

TO do list in the future:
    - pull data again using pyspark for getting original item_name
    
@author: Ray

"""

import sys
import os
import ast # real json-like string
import pandas as pd
import gc
import sys
sys.path.append('../../brand_recognition_bio_FE/preprocessing')
from clean_helpers import clean_name

def filter_data_for_ner_task(df, out_col, FLAG, attr = ' Brand'):
    """
    Return reliable data that we generally make sure attribute you assigned exist in the title.
    """
    num_filter = 0
    keep = []
    attr_tags = []
    for ix, row in df.iterrows():
        if FLAG == 'shopee':
            title = row['title'] # str
            tagging_dict = ast.literal_eval(row['mobile']) # dict
            attr_tag = tagging_dict[attr][0][0] # str
            if attr_tag.lower() in title.lower():
                keep.append(1)
                attr_tags.append(attr_tag)
            else:
                num_filter += 1
                keep.append(0)
                attr_tags.append(0)
        elif FLAG == 'lazada':
            title = row['item_name'] # str
            attr_tag = row['brand'] # str
            if attr_tag.lower() in title.lower():
                keep.append(1)
                attr_tags.append(attr_tag)
            else:
                num_filter += 1
                keep.append(0)
                attr_tags.append(0)
            
        else:
            # if the assertion fails, Python uses ArgumentExpression as the argument for the AssertionError. 
            assert False,  'the FLAG only accecpt shopee and lazada lah fuck u !!' # condition, AssertionError
    #-----------------
    # output
    #-----------------
    df['keep'] = keep
    df['attr_tags'] = attr_tags
    df.rename(columns = {'title': out_col[0], 'attr_tags':out_col[1]}, inplace = True)
    df = df[df.keep == 1]
    df = df[out_col]
    gc.collect()
    return df, num_filter

def sequence_labeling_w_bio_encoding(row, NORMALIZED = False):
    '''
    BIO encoding is a distant supervision approach to automatically generate training data for training machine- earning based model. 
    
        # B-B: 2
        # I-B: 1
        # O: 0
    Reference for distant supervision approach: http://deepdive.stanford.edu/distant_supervision
    Reference for BIO : Attribute Extraction from Product Titles in eCommerce.
    Assumption:
        - We assume that one sku only has one brand name.(kind of non-realistic)
    parameters:
    --------------
    df: DataFrame
    if_assumption: str. if True, we assume we only have one-single brand_word in one item_name. 
    Otherwise, we can have multiple token with positive lable in one item_name.
    '''

    # initialize variables
    word_list = []
    tagging = [] # multi-class label, {0:not part of the brand name, 1: intermediate part of the brand name, 2:beginning of the brand name}
    item_name = []
    val = [] 
    #---------------
    # sequential labeling with BIO encoding
    #---------------
    brand_started = False
    b_ix = 0
    brand = row.what_brand_name.iloc[0].split(' ')
    title = clean_name(row['item_name'].iloc[0]).split(' ')
    # filter
    title = [t for t in title if '' != t]
    for w_ix, word in enumerate(title):
        if word.lower() == brand[0].lower():
            tagging.append(2) # B-B: 2
            brand_started = True
            b_ix += 1
        elif (len(brand) > 1) and (brand_started):
            if b_ix >= len(brand):
                # for avoiding . For example, if 'BUMBLE AND BUMBLE by Bumble and Bumble: QUENCHING CONDITIONER 8.5 OZ'
                tagging.append(0) # O: 0
                brand_started = False  
                b_ix = 0                
            else:
                if word.lower() == brand[b_ix].lower():
                    tagging.append(1) # I-B: 1
                    b_ix += 1
                    if b_ix == len(brand):
                        # go back to orginal state because we already marked what we want
                        brand_started = False
                        b_ix = 0
                else:
                    tagging.append(0) # O: 0
                    brand_started = False     
                    # if we need to modified the labeling we priviously marked.
                    if b_ix < len(brand):
                        go_back_to_modified = 0
                        for i in range(b_ix):
                            #print ('w_ix', w_ix) # w_ix 對應的不是整個 tagging的list: 兩個解法, 1.groupby 2.w_ix要一直被加上
                            go_back_to_modified += 1
                            #print ('go back', w_ix - go_back_to_modified)
                            tagging[w_ix - go_back_to_modified] = 0 # O: 0
                        # Once removing privous labeling, we update b_ix to zero
                        b_ix = 0         
        else:
            brand_started = False
            tagging.append(0) # O: 0
        #---------------------------
        # for output dataframe
        #---------------------------
        if NORMALIZED == True:
            word_list.append(word.lower())
        else:
            word_list.append(word)
        item_name.append(clean_name(row['item_name'].iloc[0]))
        val.append(row['eval_set'].iloc[0])
    #---------------------------
    # output
    #---------------------------
    df = pd.DataFrame({'tokens':word_list, 
                'label': tagging,
                'eval_set': val,
                'item_name':item_name})[['item_name','tokens','label','eval_set']]
    return df
 

In [15]:
    #--------------------------
    # setting
    #--------------------------
    output_column = ['item_name', 'brand', 'eval_set']

    #--------------------------
    # loading data
    #--------------------------
    lazada_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_brand.csv')
    shoope_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_tagging.csv')

    #--------------------------
    # preprocessing data
    #--------------------------
    shoope_mobile.drop_duplicates(subset = ['itemid'], inplace = True)
    lazada_mobile.brand = lazada_mobile.brand.apply(lambda x : x.lower())
    lazada_mobile.rename(columns = {'title':'item_name'}, inplace = True)


    #--------------------------
    # data configuration: take Lazada data as testing and shoppe as training data.
    #-------------------------- 

    lazada_mobile['eval_set'] = ['test' for i in range(len(lazada_mobile))]
    shoope_mobile['eval_set'] = ['train' for i in range(len(shoope_mobile))]
    #--------------------------
    # filter: In order to get high-quality data, we remove the sku that his attribute name do not exist in title from our shopee data.
    #--------------------------
    attr = ' Brand'
    lazada_mobile, num_filter_l = filter_data_for_ner_task(lazada_mobile, FLAG = 'lazada', out_col = output_column, attr =  attr)
    shoope_mobile, num_filter_s = filter_data_for_ner_task(shoope_mobile, FLAG = 'shopee', out_col = output_column, attr =  attr)
    print ('num_sku_removed from shopee: {}'.format(num_filter_s))
    print ('num_sku_removed from lazada: {}'.format(num_filter_l))


num_sku_removed from shopee: 39873
num_sku_removed from lazada: 18004


In [16]:
shoope_mobile.shape

(269916, 3)

In [17]:
lazada_mobile.shape

(12404, 4)

In [18]:
lazada_mobile

Unnamed: 0,item_name,brand,brand.1,eval_set
0,Samsung Galaxy J2 Prime REFRESH SM-G532 - Abso...,samsung,samsung,test
1,Samsung Galaxy J1 Ace 2016 SM-J111 - 8GB - Hitam,samsung,samsung,test
2,Samsung Galaxy J2 Prime REFRESH SM-G532 - Meta...,samsung,samsung,test
3,Samsung Galaxy J1 Ace 2016 SM-J111 - 8GB - Putih,samsung,samsung,test
4,Samsung Galaxy J2 Prime SM-G532 - Silver,samsung,samsung,test
5,Samsung Galaxy J2 Pro - SMJ250 - Gold,samsung,samsung,test
6,Samsung Galaxy J1 Ace 2016 SM-J111 - 8GB - Biru,samsung,samsung,test
7,SamsungGalaxy J7 Core - SM-J701 -Gold,samsung,samsung,test
8,Samsung Guru Music SM-B310E - Biru,samsung,samsung,test
9,Samsung Phyton B310E - Dual SIM,samsung,samsung,test


In [None]:
df1.head() # is_brand --> tag

In [None]:
shoope_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_tagging.csv')

In [None]:
shoope_mobile.shape

In [None]:
shoope_mobile.head()

In [None]:
c = 0
for ix, row in shoope_mobile.iterrows():
    title = row['title'] # str
    tagging_dict = ast.literal_eval(row['mobile']) # dict
    attr_tag = tagging_dict[' Brand'][0][0] # str
    if attr_tag.lower() in title.lower():
        pass
    else:
        print ('title', title)
        print ('attr_tag', attr_tag)
        if 'iphone' in title.lower():
            c+=1
print (c)

In [None]:
def filter_data_for_ner_task(df, out_col, attr = ' Brand'):
    """
    Return reliable data that we generally make sure attribute you assigned exist in the title.
    """
    num_filter = 0
    keep = []
    attr_tags = []
    for ix, row in df.iterrows():
        title = row['title'] # str
        tagging_dict = ast.literal_eval(row['mobile']) # dict
        attr_tag = tagging_dict[attr][0][0] # str
        if attr_tag.lower() in title.lower():
            keep.append(1)
            attr_tags.append(attr_tag)
        else:
            num_filter += 1
            keep.append(0)
            attr_tags.append(0)
    #-----------------
    # output
    #-----------------
    df['keep'] = keep
    df['attr_tags'] = attr_tags
    df.rename(columns = {'title': out_col[0], 'attr_tags':out_col[1]}, inplace = True)
    df[df.keep == 1]
    df = df[out_col]
    gc.collect()
    return df, num_filter

In [None]:
# df, num_filter = filter_data_for_ner_task(shoope_mobile.head(100), attr = ' Brand')
output_column = ['item_name', 'brand']
df, num_filter = filter_data_for_ner_task(shoope_mobile.head(100), out_col = output_column, attr =  ' Brand')

In [None]:
df

In [None]:
df.shape

In [None]:
39873 / shoope_mobile.shape[0]

In [None]:
df.head()

In [None]:
#--------------------------
# loading data
#--------------------------
lazada_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_brand.csv')
shoope_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_tagging.csv')

#--------------------------
# preprocessing data
#--------------------------
shoope_mobile.drop_duplicates(subset = ['itemid'], inplace = True)
lazada_mobile.brand = lazada_mobile.brand.apply(lambda x : x.lower())
lazada_mobile.rename(columns = {'title':'item_name'}, inplace = True)

brand_list_lazada = set(lazada_mobile.brand.unique())
# brand_list_shopee = set(shoope_mobile.brand.unique())
# print ('difference betwee target and source dataset', brand_list_lazada.difference(brand_list_shopee))


In [None]:
lazada_mobile.head()

In [None]:
#--------------------------
# setting
#--------------------------
output_column = ['item_name', 'brand']
#--------------------------
# loading data
#--------------------------
lazada_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_brand.csv')
shoope_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_tagging.csv')

#--------------------------
# preprocessing data
#--------------------------
shoope_mobile.drop_duplicates(subset = ['itemid'], inplace = True)
lazada_mobile.brand = lazada_mobile.brand.apply(lambda x : x.lower())
lazada_mobile.rename(columns = {'title':'item_name'}, inplace = True)


#--------------------------
# data configuration: take Lazada data as testing and shoppe as training data.
#-------------------------- 

lazada_mobile['eval_set'] = ['test' for i in range(len(lazada_mobile))]
shoope_mobile['eval_set'] = ['train' for i in range(len(shoope_mobile))]

# 
df = pd.concat([lazada_mobile[output_column], shoope_mobile[output_column]], axis = 0)


In [None]:
shoope_mobile.head()

In [10]:
lazada_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_brand.csv')

In [12]:
lazada_mobile.columns.tolist()

['brand', 'title', 'price', 'page_num', 'item_url', 'image_url']

In [13]:
lazada_mobile.head()

Unnamed: 0,brand,title,price,page_num,item_url,image_url
0,Samsung,Samsung Galaxy J2 Prime REFRESH SM-G532 - Abso...,1255000.00Rp,1,https://www.lazada.co.id/products/samsung-gala...,https://id-live-01.slatic.net/original/975bb08...
1,Samsung,Samsung Galaxy J1 Ace 2016 SM-J111 - 8GB - Hitam,1029000.00Rp,1,https://www.lazada.co.id/products/samsung-gala...,https://id-test-11.slatic.net/p/2/samsung-gala...
2,Samsung,Samsung Galaxy J2 Prime REFRESH SM-G532 - Meta...,1255000.00Rp,1,https://www.lazada.co.id/products/samsung-gala...,https://id-live-01.slatic.net/original/aa5584e...
3,Samsung,Samsung Galaxy J1 Ace 2016 SM-J111 - 8GB - Putih,1029000.00Rp,1,https://www.lazada.co.id/products/samsung-gala...,https://id-test-11.slatic.net/p/2/samsung-gala...
4,Samsung,Samsung Galaxy J2 Prime SM-G532 - Silver,1255000.00Rp,1,https://www.lazada.co.id/products/samsung-gala...,https://id-test-11.slatic.net/p/2/samsung-gala...


In [3]:
#! /usr/bin/env python3
"""
Created on Oct 24 2018

It provides text preprocessing and 
@author: Ray

"""

import sys
import ast # real json-like string
import pandas as pd
import gc
import sys
import os
sys.path.append('../../brand_recognition_bio_FE/preprocessing')
sys.path.append('../../brand_recognition_bio_FE/py_model')
from preprocessing import sequence_labeling_w_bio_encoding
import logging

def filter_data_for_ner_task(df, out_col, attr = ' Brand'):
    """
    Return reliable data that we generally make sure attribute you assigned exist in the title.
    """
    num_filter = 0
    keep = []
    attr_tags = []
    for ix, row in df.iterrows():
        title = row['title'] # str
        tagging_dict = ast.literal_eval(row['mobile']) # dict
        attr_tag = tagging_dict[attr][0][0] # str
        if attr_tag.lower() in title.lower():
            keep.append(1)
            attr_tags.append(attr_tag)
        else:
            num_filter += 1
            keep.append(0)
            attr_tags.append(0)
    #-----------------
    # output
    #-----------------
    df['keep'] = keep
    df['attr_tags'] = attr_tags
    df.rename(columns = {'title': out_col[0], 'attr_tags':out_col[1]}, inplace = True)
    df = df[df.keep == 1]
    df = df[out_col]
    gc.collect()
    return df, num_filter

if __name__ == '__main__':
    #--------------------------
    # setting
    #--------------------------
    output_column = ['item_name', 'brand', 'eval_set']

    #--------------------------
    # loading data
    #--------------------------
    lazada_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_brand.csv')
    shoope_mobile = pd.read_csv('/data/ner_task/mobile/mobile_ID_attribute_tagging.csv')

    #--------------------------
    # preprocessing data
    #--------------------------
    shoope_mobile.drop_duplicates(subset = ['itemid'], inplace = True)
    lazada_mobile.brand = lazada_mobile.brand.apply(lambda x : x.lower())
    lazada_mobile.rename(columns = {'title':'item_name'}, inplace = True)


    #--------------------------
    # data configuration: take Lazada data as testing and shoppe as training data.
    #-------------------------- 

    lazada_mobile['eval_set'] = ['test' for i in range(len(lazada_mobile))]
    shoope_mobile['eval_set'] = ['train' for i in range(len(shoope_mobile))]
    #--------------------------
    # filter: In order to get high-quality data, we remove the sku that his attribute name do not exist in title from our shopee data.
    #--------------------------
    attr = ' Brand'
    shoope_mobile, num_filter = filter_data_for_ner_task(shoope_mobile, out_col = output_column, attr =  attr)
    print ('num_sku_removed: {}'.format(num_filter))

    df = pd.concat([lazada_mobile[output_column], shoope_mobile[output_column]], axis = 0)
    del lazada_mobile, shoope_mobile
    gc.collect()

    #--------------------------
    # BIO tagging
    #--------------------------
    df.rename(columns = {'brand':'what_brand_name'}, inplace = True)


num_sku_removed: 39873


In [4]:
df

Unnamed: 0,item_name,what_brand_name,eval_set
0,Samsung Galaxy J2 Prime REFRESH SM-G532 - Abso...,samsung,test
1,Samsung Galaxy J1 Ace 2016 SM-J111 - 8GB - Hitam,samsung,test
2,Samsung Galaxy J2 Prime REFRESH SM-G532 - Meta...,samsung,test
3,Samsung Galaxy J1 Ace 2016 SM-J111 - 8GB - Putih,samsung,test
4,Samsung Galaxy J2 Prime SM-G532 - Silver,samsung,test
5,Samsung Galaxy J2 Pro - SMJ250 - Gold,samsung,test
6,Samsung Galaxy J1 Ace 2016 SM-J111 - 8GB - Biru,samsung,test
7,SamsungGalaxy J7 Core - SM-J701 -Gold,samsung,test
8,Samsung Guru Music SM-B310E - Biru,samsung,test
9,Samsung Phyton B310E - Dual SIM,samsung,test


In [5]:
df.head(300).groupby('item_name').apply(lambda x: sequence_labeling_w_bio_encoding(x, NORMALIZED = False)).reset_index(drop = True)

KeyError: 'item_name'

In [8]:
df1 = pd.read_csv('../data/processed/mobile_training_v2.csv')

In [9]:
df1

Unnamed: 0,item_name,tokens,label,eval_set
0,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Samsung,2,test
1,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Galaxy,0,test
2,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,J1,0,test
3,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Ace,0,test
4,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,2016,0,test
5,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,SM-J111F,0,test
6,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,8GB,0,test
7,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,-,0,test
8,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,White,0,test
9,Blackview BV8000 Pro RAM 6GB 64GB IP68 Wa...,Blackview,2,test
