In [1]:
import pandas as pd
import jieba
import re
import jieba.analyse
from helper import *
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
# Initialise the jieba tokenizer before any text is segmented.
jieba.initialize() 
# Alternative dictionary, currently disabled.
#jieba.set_dictionary("dict.txt.big")
# Register the project's custom vocabulary from user_dict.txt.
jieba.load_userdict("user_dict.txt")
# Disabled: stop words are filtered explicitly inside pre_processing instead.
#jieba.analyse.set_stop_words("stopwords.txt")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Zhijun\AppData\Local\Temp\jieba.cache
Loading model cost 1.267 seconds.
Prefix dict has been built succesfully.


## Step 1: Read Data

In [2]:
# Load the raw training and test sets; both files are read as UTF-8.
train, test = (
    pd.read_csv(csv_path, encoding = 'utf-8')
    for csv_path in ('train.csv', 'test.csv')
)

### Step 1.1: Remove anomalies in data

Drop rows whose product name is the error placeholder `#NAME?` or `#ERROR!`.

In [3]:
# Keep only rows whose product name is not one of the error placeholders.
is_placeholder = train['Product Name'].isin(['#NAME?', '#ERROR!'])
train = train[~is_placeholder]

## Step 2: Data Transformation

In [4]:
# Number of rows (of any event type) per (product, category, query) triple.
# The count column is named through reset_index(name=...) rather than by
# writing into `columns.values`, which mutates the Index's backing array
# behind pandas' back and is not a supported way to rename a column.
num_of_impression = (
    train.groupby(['Product Name', 'Category', 'Query'])
    .size()
    .reset_index(name = 'num_of_impressions')
)

In [5]:
# Number of 'Click' rows per (product, category, query) triple.
# The count column is named through reset_index(name=...) rather than by
# writing into `columns.values`, which mutates the Index's backing array
# behind pandas' back and is not a supported way to rename a column.
num_of_click = (
    train[train.Event == 'Click']
    .groupby(['Product Name', 'Category', 'Query'])
    .size()
    .reset_index(name = 'num_of_clicks')
)

In [6]:
# Attach both per-triple counts to every row. Triples that never received a
# click have no match in num_of_click, so the missing values left by the
# left join are filled with 0.
group_keys = ['Product Name', 'Category', 'Query']
train = (
    train
    .merge(num_of_impression, on = group_keys, how = 'left')
    .merge(num_of_click, on = group_keys, how = 'left')
    .fillna(0)
)
train.head()

Unnamed: 0,Product Name,Category,Query,Event,Date,num_of_impressions,num_of_clicks
0,--- X 10 --- 七色 多層次搭配 圓下擺 LAYERED 素面 無袖背心 打底,Male Fashion,無袖,Impression,31/7/17,1,0.0
1,︱IBIT︱Gymshark 熱銷款 運動T恤 健身T恤 圓領短T 運動短T 健身鯊魚,Male Fashion,gymshark,Impression,31/7/17,1,0.0
2,︱IBIT︱Gymshark 超高彈性 短褲 運動短褲 跑步短褲 深蹲褲 訓練短褲,Male Fashion,gymshark,Impression,31/7/17,1,0.0
3,::另類情侶兄弟姊妹殼::電力滿格/不足黑白趣味浮雕手機軟殼i5/i5s/i5se/i6/i...,Mobile & Gadgets,軟殼,Click,31/7/17,1,1.0
4,：新舊手機商場：Iphone6 16金 （需要看細圖密我）,Mobile & Gadgets,iphone6 系列,Impression,30/7/17,1,0.0


In [7]:
# Click-through ratio: clicks divided by impressions for the row's triple.
train['click_per_impre'] = train['num_of_clicks'].div(train['num_of_impressions'])

## Step 3: Split dataset by product categories

In [8]:
# One frame per product category. reset_index() keeps the old row labels in
# an extra 'index' column.
# NOTE: 'Female Fastion' is spelled as it appears in the data; the filter
# matches 3064 rows in the recorded output, so do not "correct" it.
mg_train, ff_train, mf_train = (
    train[train['Category'] == category_label].reset_index()
    for category_label in ('Mobile & Gadgets', 'Female Fastion', 'Male Fashion')
)
for category_frame in (mg_train, ff_train, mf_train):
    print(category_frame.shape)

(4169, 9)
(3064, 9)
(3070, 9)


## Step 4: Clean text data

In [9]:
# Stop words loaded from stopwords.txt; pre_processing uses this list (plus a
# single space) as the default for its `stopwords` argument, so this cell must
# run before pre_processing is defined.
# `stopwordslist` is presumably provided by `helper` via the star import.
stopword_list = stopwordslist('stopwords.txt')

In [10]:
def disambiguation_mg(text):
    """Normalise device names and measurements in 'Mobile & Gadgets' text.

    The text is lower-cased and the single space inside common model names
    and quantities is removed so that each becomes one token, e.g.
    "iphone 6s" -> "iphone6s", "j7 prime" -> "j7prime", "5.5 吋" -> "5.5吋".

    Each rule is applied as one capture-group substitution. Matched
    substrings are never reused as regex patterns, so characters such as
    '.' in "5.5 吋" cannot act as wildcards and rewrite unrelated text
    (e.g. turn "5x5 吋" into "5.5吋").

    Parameters
    ----------
    text : str
        Raw or already tokenised product name / query.

    Returns
    -------
    str
        The lower-cased text with the gaps described above closed.
    """
    text = re.sub('i phone', 'iphone', text.lower())

    # plus, prime, edge (eg: "j7 prime" --> "j7prime")
    for word in ['plus', 'edge', 'prime']:
        text = re.sub('([0-9a-z三星samsung]+) (' + word + ')', r'\1\2', text)

    # iphone (eg: "iphone 6s" --> "iphone6s")
    text = re.sub('(iphone) ([0-9seplus]+)', r'\1\2', text)

    # ipad (eg: "ipad mini" --> "ipadmini")
    text = re.sub('(ipad) ([0-9miniproair]+)', r'\1\2', text)

    # ipod (eg: "ipod touch" --> "ipodtouch")
    text = re.sub('(ipod) ([touchnano]+)', r'\1\2', text)

    # 'note 4/5/6', or 'pro 4/5/6'
    for word in ['note', 'pro']:
        text = re.sub('(' + word + ') ([0-9]+)', r'\1\2', text)

    # unit word (eg: "5000 mah" --> "5000mah")
    for word in ['角', '入', '万', '天', '号', '代', '年', '元', '折', 'mah', 'cm', '吋', '寸', '毫安', '公分', '色']:
        text = re.sub('([0-9.]+) (' + word + ')', r'\1\2', text)

    return text

In [11]:
def pre_processing(text, mode, cate_id, stopwords = [' '] + stopword_list):
    """Tokenise a product name or query and return it as a space-joined string.

    Steps: replace emoji/symbol characters with '??', lower-case, apply the
    category-specific disambiguation, segment with jieba (search mode), drop
    stop words, drop tokens that are strict substrings of a later token, and
    join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw product name or query.
    mode : str
        'simplified' filters stop words and converts each kept token with
        Converter('zh-hans'); 'traditional' filters stop words only. Any
        other value skips the stop-word filter.
    cate_id : str
        Category id. 'mg' applies disambiguation_mg before segmentation and
        again on the result; other ids apply no extra rules.
    stopwords : iterable of str
        Tokens to discard. The default is built once, when the function is
        defined, from the module-level stopword_list.

    Returns
    -------
    str
        Space-joined tokens, or the stripped original text when no token
        survives the filtering.
    """
    # replace emoji
    try:
        # UCS-4
        highpoints = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])|([\U000025A0-\U000025FF])|([\U00002500-\U0000257F])|([\U00002B50])|([\U000010E6])|(\U0000F8FF)')
    except re.error:
        # UCS-2
        highpoints = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])|([\u25A0-\u25FF])|([\u2500-\u257F])|([\u2B50])|([\u10e6])|(\uf8ff)')

    # for english character: convert to lower case
    res = highpoints.sub(u'??', text).lower()

    if cate_id == 'mg':
        res = disambiguation_mg(res)

    # split words
    word_list = jieba.lcut_for_search(res, HMM=True)

    # remove punctuation & stopwords (set lookup instead of scanning the list
    # once per token)
    stopword_set = frozenset(stopwords)
    if mode == 'simplified':
        converter = Converter('zh-hans')
        word_list = [converter.convert(ele) for ele in word_list if ele not in stopword_set]
    elif mode == 'traditional':
        word_list = [ele for ele in word_list if ele not in stopword_set]

    # remove substring: a token is dropped (every occurrence of it) when it is
    # a strict substring of some token that appears later in the list
    redundant = {
        word
        for idx, word in enumerate(word_list)
        if any(word != later and word in later for later in word_list[idx + 1:])
    }
    word_list = [word for word in word_list if word not in redundant]

    # combine words as string; fall back to the stripped input when nothing
    # is left (the result used to be assigned to word_list and then discarded)
    if len(word_list) < 1:
        text = text.strip()
    else:
        text = ' '.join(word_list)

    # joining can bring tokens next to each other again (e.g. "iphone" "6s"),
    # so the disambiguation rules are applied a second time
    if cate_id == 'mg':
        text = disambiguation_mg(text)

    return text

In [12]:
def data_prep(df, cate_id):
    """Add cleaned text columns for the product name and the query.

    Four columns are added to `df`, in this order: Product_Name_s,
    Product_Name_t, Query_s, Query_t. The `_s` columns come from
    pre_processing in 'simplified' mode and the `_t` columns from
    'traditional' mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Product Name' and 'Query' columns. It is modified in
        place.
    cate_id : str
        Category id forwarded to pre_processing (e.g. 'mg', 'ff', 'mf').

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the four columns added.
    """
    # The stop-word list is no longer re-read here: pre_processing already
    # carries it as its default argument and the local copy was never used.
    # Each transform depends on a single column, so Series.apply is used
    # instead of a row-wise DataFrame.apply (faster, and it still returns a
    # Series when the frame is empty).
    for source_col, target_prefix in (('Product Name', 'Product_Name'), ('Query', 'Query')):
        for suffix, mode in (('s', 'simplified'), ('t', 'traditional')):
            df[target_prefix + '_' + suffix] = df[source_col].apply(
                lambda value: pre_processing(value, mode = mode, cate_id = cate_id)
            )

    return df


In [13]:
# Clean the text columns of each category frame with its own category id.
mg_train, ff_train, mf_train = (
    data_prep(category_frame, category_id)
    for category_frame, category_id in (
        (mg_train, 'mg'),
        (ff_train, 'ff'),
        (mf_train, 'mf'),
    )
)

## Step 5: Save processed data

In [14]:
# Write each processed frame to its own CSV, without the row index and
# encoded as utf_8_sig.
for processed_frame, out_path in (
    (mg_train, 'mg_train.csv'),
    (ff_train, 'ff_train.csv'),
    (mf_train, 'mf_train.csv'),
):
    processed_frame.to_csv(out_path, index = False, encoding = 'utf_8_sig')
