In [69]:
#! /usr/bin/env python3
"""
Created on Aug 21 2018

In order to capture similarity between tokens and surrounding tokens.
@author: Ray

"""

import nltk, re, string, collections
from nltk.util import ngrams # function for making ngrams
import pandas as pd
import numpy as np
from datetime import datetime # for the newest version control
import os
import time
import multiprocessing as mp # for speeding up some process
import logging
from nltk import tag # for pos_tagging
from nltk.corpus import wordnet # for geting pos of wordnet
from nltk.stem import WordNetLemmatizer
import gc
import multiprocessing # for parallelling apply() in panda
from multiprocessing import Pool

def get_the_preceding_word(row, window_size = 1):
    '''
    Get the preceding word given the token. 
    It's a helper function to compute the sequential feature of the word.
    '''
    try:
        the_former_ix = row.item_name.split().index(row.tokens) - window_size
        if the_former_ix < 0:
            return -1 # It means the former word is non-existent. # -1 is bettern than missing value
        else:
            return row.item_name.split()[the_former_ix]
    except Exception:
        pass # It will make missing value on this feature but it's fine

def get_the_succeeding_word(row, window_size = 1):
    '''
    Get the succeeding word given the token. 
    It's a helper function to compute the sequential feature of the word.
    '''
    try:
        the_latter_ix = row.item_name.split().index(row.tokens) + window_size
        if the_latter_ix >= len(row.item_name.split()):
            return -1 # It means the latter word is non-existent. 
        else:
            return row.item_name.split()[the_latter_ix]
    except Exception:
        pass # It will make missing value on this feature but it's fine

def succeeding_2_gram_given_current_token(row, esBigramFreq):
    if row.the_succeeding_word_given_current_token_w_1 == -1:
        return -1
    else:
        key = (row.tokens.lower(), row.the_succeeding_word_given_current_token_w_1.lower())
        return esBigramFreq[key]
    return row

def preceding_2_gram_given_current_token(row, esBigramFreq):
    if row.the_preceding_word_given_current_token_w_1 == -1:
        return -1
    else:
        key = (row.tokens.lower(), row.the_preceding_word_given_current_token_w_1.lower())
        return esBigramFreq[key]
    return row

def preceding_3_gram_given_current_token(row, esTrigramFreq):
    if row.the_preceding_word_given_current_token_w_1 != -1 and row.the_preceding_word_given_current_token_w_2 != -1:
        key = (row.tokens.lower(), row.the_preceding_word_given_current_token_w_1.lower(),
               row.the_preceding_word_given_current_token_w_2.lower())
        return esTrigramFreq[key]
    else:
        return -1
    return row

def succeeding_3_gram_given_current_token(row, esTrigramFreq):
    if row.the_succeeding_word_given_current_token_w_1 != -1 and row.the_succeeding_word_given_current_token_w_2 != -1:
        key = (row.tokens.lower(), row.the_succeeding_word_given_current_token_w_1.lower(),
               row.the_succeeding_word_given_current_token_w_2.lower())
        return esTrigramFreq[key]
    else:
        return -1
    return row

def preceding_4_gram_given_current_token(row, esFgramFreq):
    if (row.the_preceding_word_given_current_token_w_1 != -1) \
    and (row.the_preceding_word_given_current_token_w_2 != -1) \
    and (row.the_preceding_word_given_current_token_w_3 != -1):
        key = (row.tokens.lower(), 
               row.the_preceding_word_given_current_token_w_1.lower(),
               row.the_preceding_word_given_current_token_w_2.lower(),
               row.the_preceding_word_given_current_token_w_3.lower())
        return esFgramFreq[key]
    else:
        return -1
    return row

def succeeding_4_gram_given_current_token(row, esFgramFreq):
    if (row.the_succeeding_word_given_current_token_w_1 != -1) \
    and (row.the_succeeding_word_given_current_token_w_2 != -1) \
    and (row.the_succeeding_word_given_current_token_w_3 != -1):
        key = (row.tokens.lower(), 
               row.the_succeeding_word_given_current_token_w_1.lower(),
               row.the_succeeding_word_given_current_token_w_2.lower(),
               row.the_succeeding_word_given_current_token_w_3.lower())
        return esFgramFreq[key]
    else:
        return -1
    return row

def preceding_5_gram_given_current_token(row, esFivegramFreq):
    if (row.the_preceding_word_given_current_token_w_1 != -1) \
    and (row.the_preceding_word_given_current_token_w_2 != -1) \
    and (row.the_preceding_word_given_current_token_w_3 != -1) \
    and (row.the_preceding_word_given_current_token_w_4 != -1):
        key = (row.tokens.lower(), 
               row.the_preceding_word_given_current_token_w_1.lower(),
               row.the_preceding_word_given_current_token_w_2.lower(),
               row.the_preceding_word_given_current_token_w_3.lower(),
               row.the_preceding_word_given_current_token_w_4.lower())
        return esFivegramFreq[key]
    else:
        return -1
    return row

def succeeding_5_gram_given_current_token(row, esFivegramFreq):
    if (row.the_succeeding_word_given_current_token_w_1 != -1) \
    and (row.the_succeeding_word_given_current_token_w_2 != -1) \
    and (row.the_succeeding_word_given_current_token_w_3 != -1) \
    and (row.the_succeeding_word_given_current_token_w_4 != -1):
        key = (row.tokens.lower(), 
               row.the_succeeding_word_given_current_token_w_1.lower(),
               row.the_succeeding_word_given_current_token_w_2.lower(),
               row.the_succeeding_word_given_current_token_w_3.lower(),
               row.the_succeeding_word_given_current_token_w_4.lower())
        return esFivegramFreq[key]
    else:
        return -1
    return row

def succeeding_strip_2_gram_given_current_token(row, esBigramFreq):
    if row.the_succeeding_word_given_current_token_w_2 == -1:
        return -1
    else:
        key = (row.tokens.lower(), row.the_succeeding_word_given_current_token_w_2.lower())
        return esBigramFreq[key]
    return row

def preceding_strip_2_gram_given_current_token(row, esBigramFreq):
    if row.the_preceding_word_given_current_token_w_2 == -1:
        return -1
    else:
        key = (row.tokens.lower(), row.the_preceding_word_given_current_token_w_2.lower())
        return esBigramFreq[key]
    return row

def succeeding_strip_3_gram_given_current_token(row, esBigramFreq):
    if row.the_succeeding_word_given_current_token_w_3 == -1:
        return -1
    else:
        key = (row.tokens.lower(), row.the_succeeding_word_given_current_token_w_3.lower())
        return esBigramFreq[key]
    return row

def preceding_strip_3_gram_given_current_token(row, esBigramFreq):
    if row.the_preceding_word_given_current_token_w_3 == -1:
        return -1
    else:
        key = (row.tokens.lower(), row.the_preceding_word_given_current_token_w_3.lower())
        return esBigramFreq[key]
    return row

def succeeding_strip_4_gram_given_current_token(row, esBigramFreq):
    if row.the_succeeding_word_given_current_token_w_4 == -1:
        return -1
    else:
        key = (row.tokens.lower(), row.the_succeeding_word_given_current_token_w_4.lower())
        return esBigramFreq[key]
    return row

def preceding_strip_4_gram_given_current_token(row, esBigramFreq):
    if row.the_preceding_word_given_current_token_w_4 == -1:
        return -1
    else:
        key = (row.tokens.lower(), row.the_preceding_word_given_current_token_w_4.lower())
        return esBigramFreq[key]
    return row

In [75]:
# preprocessed_data_path
input_base_path = '../brand_detector/data/preprocessed'
T = 1
#--------------------
# laod data including label
#--------------------	
if T == 1:
    name = 'tv_and_laptop' 
    df = pd.read_csv(os.path.join(input_base_path, 'tv_and_laptop.csv'))
elif T == 2:
    name = 'personal_care_and_beauty'
    df = pd.read_csv(os.path.join(input_base_path, 'personal_care_and_beauty.csv'))
elif T == 3:
    name = 'beauty_amazon'
    df = pd.read_csv(os.path.join(input_base_path, 'beauty_amazon.csv'))
elif T == 4:
    name = 'tv_laptop_amazon'
    df = pd.read_csv(os.path.join(input_base_path, 'tv_laptop_amazon.csv'))
else:
    pass
tokenized = [t.lower() for t in df.tokens.tolist()]

#----------------------------
# n-grame generator
#----------------------------
esBigrams = ngrams(tokenized, 2) # generater
esTrigrams = ngrams(tokenized, 3) # generater
esFgrams = ngrams(tokenized, 4) # generater
esFivegrams = ngrams(tokenized, 5) # generater

#----------------------------
# get the frequency of each bigram in our corpus
#----------------------------
esBigramFreq = collections.Counter(esBigrams)
esTrigramFreq = collections.Counter(esTrigrams)
esFgramFreq = collections.Counter(esFgrams)
esFivegramFreq = collections.Counter(esFivegrams)



In [76]:
#-------------------------
# drop itemname and tokens with nan
#-------------------------
df.dropna(subset = ['item_name', 'tokens'], axis = 0, inplace = True)
#--------------------------
# conver type
#--------------------------
df['tokens'] = df.tokens.astype(str)



# wo swifter

In [63]:
# #--------------------------
# # preprocessing for contextual information
# #--------------------------
# s = time.time()
# df['the_preceding_word_given_current_token_w_1'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 1), axis = 1)
# df['the_succeeding_word_given_current_token_w_1'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 1), axis = 1)
# df['the_preceding_word_given_current_token_w_2'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 2), axis = 1)
# df['the_succeeding_word_given_current_token_w_2'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 2), axis = 1)
# df['the_preceding_word_given_current_token_w_3'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 3), axis = 1)
# df['the_succeeding_word_given_current_token_w_3'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 3), axis = 1)
# df['the_preceding_word_given_current_token_w_4'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 4), axis = 1)
# df['the_succeeding_word_given_current_token_w_4'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 4), axis = 1)
# # increase the window_size
# df['the_preceding_word_given_current_token_w_5'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 5), axis = 1)
# df['the_succeeding_word_given_current_token_w_5'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 5), axis = 1)
# df['the_preceding_word_given_current_token_w_6'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 6), axis = 1)
# df['the_succeeding_word_given_current_token_w_6'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 6), axis = 1)
# df['the_preceding_word_given_current_token_w_7'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 7), axis = 1)
# df['the_succeeding_word_given_current_token_w_7'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 7), axis = 1)
# e = time.time()
# print (e-s)


# w swifter.. slower than wo swifter. It need u to modify ur code for vectorizing

In [64]:
# import swifter

# #--------------------------
# # preprocessing for contextual information
# #--------------------------
# s = time.time()
# df['the_preceding_word_given_current_token_w_1'] = df.swifter.apply(lambda x: get_the_preceding_word(x, window_size = 1), axis = 1)
# df['the_succeeding_word_given_current_token_w_1'] = df.swifter.apply(lambda x: get_the_succeeding_word(x, window_size = 1), axis = 1)
# df['the_preceding_word_given_current_token_w_2'] = df.swifter.apply(lambda x: get_the_preceding_word(x, window_size = 2), axis = 1)
# df['the_succeeding_word_given_current_token_w_2'] = df.swifter.apply(lambda x: get_the_succeeding_word(x, window_size = 2), axis = 1)
# df['the_preceding_word_given_current_token_w_3'] = df.swifter.apply(lambda x: get_the_preceding_word(x, window_size = 3), axis = 1)
# df['the_succeeding_word_given_current_token_w_3'] = df.swifter.apply(lambda x: get_the_succeeding_word(x, window_size = 3), axis = 1)
# df['the_preceding_word_given_current_token_w_4'] = df.swifter.apply(lambda x: get_the_preceding_word(x, window_size = 4), axis = 1)
# df['the_succeeding_word_given_current_token_w_4'] = df.swifter.apply(lambda x: get_the_succeeding_word(x, window_size = 4), axis = 1)
# # increase the window_size
# df['the_preceding_word_given_current_token_w_5'] = df.swifter.apply(lambda x: get_the_preceding_word(x, window_size = 5), axis = 1)
# df['the_succeeding_word_given_current_token_w_5'] = df.swifter.apply(lambda x: get_the_succeeding_word(x, window_size = 5), axis = 1)
# df['the_preceding_word_given_current_token_w_6'] = df.swifter.apply(lambda x: get_the_preceding_word(x, window_size = 6), axis = 1)
# df['the_succeeding_word_given_current_token_w_6'] = df.swifter.apply(lambda x: get_the_succeeding_word(x, window_size = 6), axis = 1)
# df['the_preceding_word_given_current_token_w_7'] = df.swifter.apply(lambda x: get_the_preceding_word(x, window_size = 7), axis = 1)
# df['the_succeeding_word_given_current_token_w_7'] = df.swifter.apply(lambda x: get_the_succeeding_word(x, window_size = 7), axis = 1)
# e = time.time()
# print (e-s)


# multiprocessing

In [77]:
#---------------
# setting 
#---------------
num_partitions = 10
cpu_rate = 0.5
num_cores = int(multiprocessing.cpu_count() * cpu_rate)
num_cores = 10
def parallelize_dataframe(df, func):
    df1,df2,df3,df4,df5,df6,df7,df8,df9,df10 = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, [df1,df2,df3,df4,df5,df6,df7,df8,df9,df10]))
    pool.close()
    pool.join()
    return df

def speed_up_func_for_preprocessing(df):
    '''
    Put the columns u need to apply()
    
    data: DataFrame
    '''
    df['the_preceding_word_given_current_token_w_1'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 1), axis = 1)
    df['the_succeeding_word_given_current_token_w_1'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 1), axis = 1)
    df['the_preceding_word_given_current_token_w_2'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 2), axis = 1)
    df['the_succeeding_word_given_current_token_w_2'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 2), axis = 1)
    df['the_preceding_word_given_current_token_w_3'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 3), axis = 1)
    df['the_succeeding_word_given_current_token_w_3'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 3), axis = 1)
    df['the_preceding_word_given_current_token_w_4'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 4), axis = 1)
    df['the_succeeding_word_given_current_token_w_4'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 4), axis = 1)
    # increase the window_size
    df['the_preceding_word_given_current_token_w_5'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 5), axis = 1)
    df['the_succeeding_word_given_current_token_w_5'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 5), axis = 1)
    df['the_preceding_word_given_current_token_w_6'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 6), axis = 1)
    df['the_succeeding_word_given_current_token_w_6'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 6), axis = 1)
    df['the_preceding_word_given_current_token_w_7'] = df.apply(lambda x: get_the_preceding_word(x, window_size = 7), axis = 1)
    df['the_succeeding_word_given_current_token_w_7'] = df.apply(lambda x: get_the_succeeding_word(x, window_size = 7), axis = 1)
    return df

def speed_up_func_for_feature_engineering(df):
    '''
    Put the columns u need to apply()
    
    data: DataFrame
    '''
    # succeeding_2_gram_given_current_token
    df['succeeding_2_gram_given_current_token'] = df.apply(lambda x: succeeding_2_gram_given_current_token(x, esBigramFreq = esBigramFreq), axis = 1) 
    # preceding_2_gram_given_current_token
    df['preceding_2_gram_given_current_token'] = df.apply(lambda x: preceding_2_gram_given_current_token(x, esBigramFreq = esBigramFreq), axis = 1) 
    # succeeding_3_gram_given_current_token
    df['succeeding_3_gram_given_current_token'] = df.apply(lambda x: succeeding_3_gram_given_current_token(x, esTrigramFreq = esTrigramFreq), axis = 1) 
    # preceding_3_gram_given_current_token
    df['preceding_3_gram_given_current_token'] = df.apply(lambda x: preceding_3_gram_given_current_token(x, esTrigramFreq = esTrigramFreq), axis = 1) 
    # succeeding_4_gram_given_current_token
    df['succeeding_4_gram_given_current_token'] = df.apply(lambda x: succeeding_4_gram_given_current_token(x, esFgramFreq = esFgramFreq), axis = 1) 
    # preceding_4_gram_given_current_token
    df['preceding_4_gram_given_current_token'] = df.apply(lambda x: preceding_4_gram_given_current_token(x, esFgramFreq = esFgramFreq), axis = 1) 
    # succeeding_5_gram_given_current_token
    df['succeeding_5_gram_given_current_token'] = df.apply(lambda x: succeeding_5_gram_given_current_token(x, esFivegrams = esFivegrams), axis = 1) 
    # preceding_5_gram_given_current_token
    df['preceding_5_gram_given_current_token'] = df.apply(lambda x: preceding_5_gram_given_current_token(x, esFivegrams = esFivegrams), axis = 1) 
    # succeeding_strip_2_gram_given_current_token
    df['succeeding_strip_2_gram_given_current_token'] = df.apply(lambda x: succeeding_strip_2_gram_given_current_token(x, esBigramFreq = esBigramFreq), axis = 1) 
    # preceding_strip_2_gram_given_current_token
    df['preceding_strip_2_gram_given_current_token'] = df.apply(lambda x: preceding_strip_2_gram_given_current_token(x, esBigramFreq = esBigramFreq), axis = 1) 
    # succeeding_strip_3_gram_given_current_token
    df['succeeding_strip_3_gram_given_current_token'] = df.apply(lambda x: succeeding_strip_3_gram_given_current_token(x, esBigramFreq = esBigramFreq), axis = 1) 
    # preceding_strip_3_gram_given_current_token
    df['preceding_strip_3_gram_given_current_token'] = df.apply(lambda x: preceding_strip_3_gram_given_current_token(x, esBigramFreq = esBigramFreq), axis = 1) 
    # succeeding_strip_4_gram_given_current_token
    df['succeeding_strip_4_gram_given_current_token'] = df.apply(lambda x: succeeding_strip_4_gram_given_current_token(x, esBigramFreq = esBigramFreq), axis = 1) 
    # preceding_strip_4_gram_given_current_token
    df['preceding_strip_4_gram_given_current_token'] = df.apply(lambda x: preceding_strip_4_gram_given_current_token(x, esBigramFreq = esBigramFreq), axis = 1) 
    return df

In [78]:
s = time.time()
df = parallelize_dataframe(df, speed_up_func_for_preprocessing)
e = time.time()
print (e-s)


3.9531095027923584


In [79]:
df.shape

(41443, 18)

In [80]:
df_parell = parallelize_dataframe(df, speed_up_func_for_feature_engineering)


TypeError: ("'generator' object is not subscriptable", 'occurred at index 0')