In [1]:
import pandas as pd
import numpy as np
import os
import time

# Example: computing TF-IDF by hand
### Reference: https://medium.freecodecamp.org/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3

In [None]:
def computeTF(wordDict, bow):
    '''
    Compute the term frequency (TF) of every word for a single document.

    TF is measured relative to the document itself, so the same word gets a
    different value when the item_name (document) changes.

    args:
    ---------
    wordDict: dict mapping word -> number of occurrences in this document
    bow: list of the document's tokens; its length is the normaliser

    return:
    ---------
    dict mapping word -> count / len(bow). When bow is empty every word gets
    0.0 instead of raising ZeroDivisionError.
    '''
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}

def computeIDF(docList, smoothing = False):
    '''
    Compute the inverse document frequency (IDF) of every word in a corpus.

    IDF depends only on the corpus, so for a fixed set of documents a word's
    IDF is the same whichever document it is later combined with.

    args:
    ---------
    docList: list of dicts, one per item_name, mapping word -> count in that
        document. Documents do not need to share the same keys.
    smoothing: bool, if true use log10(1 + N / df) instead of log10(N / df)

    return:
    ---------
    dict mapping word -> IDF, where N is the number of documents and df is the
    number of documents in which the word has a count > 0. A word that occurs
    in no document (df == 0) gets 0.0, and an empty docList gives {}.
    '''
    import math  # local import: the notebook's import cell does not import math

    N = len(docList) # total number of item_name

    # Document frequency over the union of all documents' vocabularies, so a
    # word that first appears in a later document no longer raises KeyError.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    idfDict = {}
    for word, val in docFreq.items():
        # val: number of item_name containing this word
        if val == 0:
            # N / 0 is undefined; 0.0 makes the resulting tf-idf weight vanish.
            idfDict[word] = 0.0
        elif smoothing:
            idfDict[word] = math.log10(1.0 + N / float(val))
        else:
            idfDict[word] = math.log10(N / float(val))

    return idfDict

def computeTFIDF(tfBow, idfs):
    '''
    Combine one document's term frequencies with the corpus IDF table.

    args:
    ---------
    tfBow: dict mapping word -> term frequency in the document
    idfs: dict mapping word -> inverse document frequency; must contain every
        word of tfBow

    return:
    ---------
    dict mapping word -> term frequency * inverse document frequency
    '''
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

#-------------
# data
#-------------
docA = "The cat sat on my face"
docB = "The dog sat on my bed"
# Tokenise on single spaces to get each document's bag of words.
bowA, bowB = (doc.split(" ") for doc in (docA, docB))
for label, bow in (('bowA', bowA), ('bowB', bowB)):
    print (label, bow)

In [None]:
# Vocabulary: every distinct token that occurs in either document.
wordSet = set(bowA) | set(bowB)

# Start each document's count table with a zero entry per vocabulary word,
# so both tables share exactly the same keys.
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}
print ('wordDictA', wordDictA)
print ('wordDictB', wordDictB)

# Tally how often each token occurs in its own document.
for bow, wordDict in ((bowA, wordDictA), (bowB, wordDictB)):
    for word in bow:
        wordDict[word] += 1

In [None]:
# Raw counts as a frame, one row per document. The frame is built but not
# rendered, because only the last expression of a cell is displayed.
pd.DataFrame([wordDictA, wordDictB])
# Term frequency of every vocabulary word within each document.
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)
df = pd.DataFrame([tfBowA, tfBowB], index = ['s1', 's2']).add_suffix('_TF')
df

In [None]:
# Inverse document frequency over both documents (smoothed variant).
idfs = computeIDF([wordDictA, wordDictB], smoothing = True)
# Weight each document's term frequencies with the shared IDF table.
tfidfBowA, tfidfBowB = (computeTFIDF(tfBow, idfs) for tfBow in (tfBowA, tfBowB))
df = pd.DataFrame([tfidfBowA, tfidfBowB]).add_suffix('_TFIDF')
df

# sklearn
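# Note: the tf-idf values computed by scikit-learn's TfidfTransformer and TfidfVectorizer differ slightly from the standard textbook definition: by default scikit-learn uses idf(t) = ln((1 + n) / (1 + df(t))) + 1 (or ln(n / df(t)) + 1 with smooth_idf=False) and then L2-normalises each document row.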

# TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer takes raw strings as input, which is not suitable for us
# since our tokens are already cleaned.
tfidf = TfidfVectorizer()
response = tfidf.fit_transform([docA, docB]) # sparse matrix, one row per document
# get_feature_names() was removed in newer scikit-learn releases; use its
# replacement when available and fall back on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = list(tfidf.get_feature_names_out())
else:
    vocabulary = tfidf.get_feature_names()
print ('vocabulary', vocabulary)
# Walk the (row, column) pairs of the non-zero entries so that every value is
# read from the document it belongs to (previously row 0 was always read, and
# the undefined name `feature_names` raised NameError).
for row, col in zip(*response.nonzero()):
    print ('doc', row, vocabulary[col], ' - ', response[row, col])

# TfidfTransformer
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
# Transformer that turns a term-count matrix into tf-idf weights, created with
# idf smoothing switched off. Ending the cell on the name displays its repr.
transformer = TfidfTransformer(smooth_idf=False)
transformer


TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True)

In [None]:
# Toy term-count matrix: six documents (rows) by three terms (columns).
counts = [
    [3, 0, 1],
    [2, 0, 0],
    [3, 0, 0],
    [4, 0, 0],
    [3, 2, 0],
    [3, 0, 2],
]

tfidf = transformer.fit_transform(counts)
tfidf  # not rendered: only the last expression of the cell is displayed

# Dense view of the tf-idf weights.
tfidf.toarray()

# real_case

In [6]:
# preprocessed_data_path
input_base_path = '../brand_recognition_bio/data/preprocessed'
T = 3
#--------------------
# laod data including label
#--------------------
if T == 1:
    name = 'tv_and_laptop' 
    df = pd.read_csv(os.path.join(input_base_path, 'tv_and_laptop.csv'))
elif T == 2:
    name = 'personal_care_and_beauty'
    df = pd.read_csv(os.path.join(input_base_path, 'personal_care_and_beauty.csv'))
elif T == 3:
    name = 'beauty_amazon'
    df = pd.read_csv(os.path.join(input_base_path, 'beauty_amazon.csv')) # 40649 x 87402
elif T == 4:
    name = 'tv_laptop_amazon'
    df = pd.read_csv(os.path.join(input_base_path, 'tv_laptop_amazon.csv')) # 16103 x 8324
else:
    pass

In [7]:
# Back-of-envelope ratio built from the sizes noted next to the
# tv_laptop_amazon (16103 x 8324) and beauty_amazon (40649 x 87402) loaders,
# with the first dimension divided by 5 and 32 respectively -- presumably the
# per-partition workload under the two partition settings; TODO confirm.
16103 /5 * 8324 / (40649/32 * 87402)

0.24146133741941755

In [8]:
# Drop every row that lacks an item name or a token.
df.dropna(axis = 0, subset = ['item_name', 'tokens'], inplace = True)

# Force the tokens to str and lower-case them in a single pass.
df['tokens'] = df.tokens.astype(str).str.lower()

# The vocabulary is the set of distinct (lower-cased) tokens.
wordSet = df['tokens'].unique()
print (len(wordSet))

40649


In [5]:
# Number of distinct item names in the loaded data.
df.item_name.nunique()

8324

In [None]:
# Scratch check of the partition arithmetic: with 10 partitions of n item
# names each, show the offset at which the last partition would start.
item_name_ls = list(df['item_name'].unique())
item_name_num = df['item_name'].nunique()
num_partitions = 10
n = item_name_num // num_partitions
(num_partitions - 1) * n

In [None]:
from billiard import Pool

def parallelize_dataframe(df, func, name, num_partitions = None, num_cores = None):
    '''
    Speed up DataFrame.apply() by running func on partitions of df in a
    process pool and concatenating the results.

    df is split by item_name, so all rows of one item_name always land in the
    same partition. With n = number of item names // num_partitions, the first
    num_partitions - 1 partitions take n item names each (in order of first
    appearance) and the last partition takes the remainder.

    args:
    ---------
    df: DataFrame with an item_name column
    func: callable taking a DataFrame and returning a DataFrame; it is sent to
        worker processes, so it must be picklable
    name: str, dataset name; only used to pick the default settings
    num_partitions: int or None, number of partitions. None keeps the original
        defaults: 32 for 'beauty_amazon', 5 otherwise.
    num_cores: int or None, number of worker processes. None uses
        num_partitions.

    return:
    ---------
    DataFrame, the concatenation of func(partition) in partition order

    raises:
    ---------
    ValueError if num_partitions or num_cores is not positive
    '''
    if num_partitions is None:
        num_partitions = 32 if name == 'beauty_amazon' else 5
    if num_cores is None:
        num_cores = num_partitions
    if num_partitions < 1 or num_cores < 1:
        raise ValueError('num_partitions and num_cores must be positive')

    item_name_ls = list(df.item_name.unique())
    item_name_num = df.item_name.nunique()
    n = item_name_num // num_partitions

    # Slice boundaries over item_name_ls: partition i covers [i*n, (i+1)*n),
    # except the last one, which runs to the end of the list.
    bounds = [i * n for i in range(num_partitions)] + [len(item_name_ls)]
    chunks = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        part = df[df.item_name.isin(item_name_ls[start:stop])]
        # With fewer item names than partitions (n == 0) the leading slices
        # are empty; do not ship empty frames to the workers.
        if len(part) > 0:
            chunks.append(part)

    if not chunks:
        # Nothing to distribute: apply func directly so the result keeps the
        # shape func would normally produce.
        return func(df)

    pool = Pool(num_cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()

    return pd.concat(results)

def speed_up_func_for_feature_engineering(df):
    '''
    Per-partition worker: build the term-count row of every item_name in df.

    Reads the module-level vocabulary `wordSet`.

    args:
    ---------
    df: DataFrame with item_name and tokens columns

    return:
    ---------
    DataFrame, the per-item_name results of get_count_metrix with the group
    index reset into columns
    '''
    def _count_terms(group):
        return get_count_metrix(group, wordSet)

    per_item = df.groupby('item_name').apply(_count_terms)
    return per_item.reset_index()

def get_count_metrix(df, wordSet):
    '''
    Count how often each vocabulary word occurs in the tokens column of df.

    Called once per item_name, the stacked results form a matrix with one row
    per item_name and one column per vocabulary word.

    args:
    ---------
    df: DataFrame with a tokens column
    wordSet: iterable of vocabulary words; every token of df must be in it,
        otherwise KeyError is raised

    return:
    ---------
    one-row DataFrame with a column per vocabulary word holding its count
    '''
    # every vocabulary word starts at zero so all rows share the same columns
    term_counts = {term: 0 for term in wordSet}
    for token in df['tokens'].tolist():
        term_counts[token] += 1
    return pd.DataFrame([term_counts])

# Time the parallel construction of the per-item_name count matrix.
s = time.time()
# Serial equivalent on a 100-row sample, kept for debugging:
#df_count_ = df.head(100).groupby('item_name').apply(lambda x: get_count_metrix(x, wordSet)).reset_index()
df_count = parallelize_dataframe(df, speed_up_func_for_feature_engineering, name)
e = time.time()

# elapsed wall-clock seconds
print (e - s)

In [None]:
df_count.reset_index(inplace=True, drop=True)
# Select the vocabulary columns by name. Slicing by position ([:, 1:]) only
# dropped item_name, so the 'level_1' helper column that reset_index() adds
# after the groupby-apply was treated as an all-zero vocabulary term.
term_columns = [col for col in df_count.columns if col not in ('item_name', 'level_1')]
counts = df_count[term_columns].values
tfidf = transformer.fit_transform(counts)
# output: one row per item_name, one tf-idf column per vocabulary word
tf_idf_df = pd.DataFrame(tfidf.toarray(), columns = term_columns)
output = pd.concat([df_count[['item_name']], tf_idf_df], axis = 1).set_index('item_name')

In [None]:
# Display the tf-idf table, indexed by item_name.
output

In [None]:
# For every (item_name, token) row of df, look up the tf-idf weight stored in
# `output`; .loc is label-based, so rows are addressed by item_name and
# columns by token.
tf_idf = [output.loc[item, token] for item, token in zip(df.item_name, df.tokens)]
df['tf_idf'] = tf_idf

In [6]:
file_path = '../brand_detector/features/tv_and_laptop/tf_idf.h5'
# Report whether the feature file is already on disk.
if os.path.exists(file_path):
    print ('f')

f
