# Chapter 23

Vocabulary Implementation

Determine whether a review is negative or positive feedback.

In [1]:
import numpy as np
import pandas as pd
import torch
import sys
import string
from collections import Counter


# Module-level vocabulary state shared by the helper functions below.
# initializeVocabulary() fills it with the token<->index maps and the <UNK> bookkeeping.
vocab = dict()

# Raw review data set, read from a path relative to the current working directory.
df = pd.read_csv(
    r'../data/Womens Clothing E-Commerce Reviews.csv',
)


def initializeVocabulary():
    """Reset the global ``vocab`` so that it contains only the ``<UNK>`` token.

    After the call ``vocab`` holds:
      * ``'t_2_i'``       -- token -> index map
      * ``'i_2_t'``       -- index -> token map
      * ``'addUnk'``      -- always ``True``
      * ``'unkToken'``    -- the unknown-token string ``'<UNK>'``
      * ``'unkTokenIdx'`` -- the index ``addToken`` assigned to ``'<UNK>'``
    """
    unknown = '<UNK>'
    # The two maps must exist (and be empty) before addToken() can write into them.
    vocab.update({'t_2_i': {}, 'i_2_t': {}})
    vocab.update({
        'addUnk': True,
        'unkToken': unknown,
        'unkTokenIdx': addToken(unknown),
    })

def addToken(token):
    """Return the index of ``token``, registering it in ``vocab`` first if it is new.

    A new token receives the next free index (the current size of the
    token -> index map) and is recorded in both ``vocab['t_2_i']`` and
    ``vocab['i_2_t']``.
    """
    tokenToIndex = vocab['t_2_i']
    if token in tokenToIndex:
        return tokenToIndex[token]

    newIdx = len(tokenToIndex)
    tokenToIndex[token] = newIdx
    vocab['i_2_t'][newIdx] = token
    return newIdx

def addManyTokens(tokens):
    """Add every token in ``tokens`` via ``addToken`` and return their indices as a list.

    The returned list has one entry per input token, in input order.
    """
    return list(map(addToken, tokens))

def lookUpToken(token):
    """Return the vocabulary index of ``token``.

    When an unknown-token index is registered (``vocab['unkTokenIdx'] >= 0``),
    tokens missing from the vocabulary map to that index. Otherwise a missing
    token raises ``KeyError``.
    """
    unkIdx = vocab['unkTokenIdx']
    tokenToIndex = vocab['t_2_i']
    if unkIdx >= 0:
        return tokenToIndex.get(token, unkIdx)
    return tokenToIndex[token]

def lookUpIndex(idx):
    """Return the token stored at vocabulary index ``idx``.

    Raises:
        KeyError: if ``idx`` is not present in ``vocab['i_2_t']``.
    """
    indexToToken = vocab['i_2_t']
    if idx in indexToToken:
        return indexToToken[idx]
    raise KeyError("the index (%d) is not there" % idx)

def vocabularyFromDataFrame(df,cutoff=25):
    """Rebuild the global vocabulary from the ``"Review Text"`` column of ``df``.

    The vocabulary is first reset with ``initializeVocabulary``. Reviews are
    split on single spaces, and every token that occurs strictly more than
    ``cutoff`` times is added with ``addToken``.

    Args:
        df: data frame with a ``"Review Text"`` column; missing reviews are
            treated as empty strings.
        cutoff: a token is kept only if its count is greater than this value.
    """
    initializeVocabulary()
    reviews = df["Review Text"].fillna('').apply(str)
    # NOTE: ``word not in string.punctuation`` is a substring test. It drops the
    # empty string and stand-alone punctuation, but keeps words with punctuation
    # attached (e.g. "great!").
    wordCounts = Counter(
        word
        for review in reviews
        if review
        for word in review.split(" ")
        if word not in string.punctuation
    )
    frequentWords = (word for word, count in wordCounts.items() if count > cutoff)
    for word in frequentWords:
        addToken(word)

def vectorize(review):
    """One-hot encode ``review`` as a ``(vocabulary size, number of tokens)`` matrix.

    The review is converted with ``str`` and split on single spaces. Tokens for
    which ``token in string.punctuation`` holds (the empty string and
    stand-alone punctuation) are skipped. Each remaining token becomes one
    column with a single 1 at the row returned by ``lookUpToken``.

    Args:
        review: the review text (any object; it is passed through ``str``).

    Returns:
        A float64 NumPy array of shape ``(len(vocab['t_2_i']), nTokens)``.
        When the review yields no tokens (e.g. an empty or punctuation-only
        string) the result has zero columns instead of raising
        ``UnboundLocalError``.
    """
    vocabSize = len(vocab['t_2_i'])
    tokenIdxes = [
        lookUpToken(token)
        for token in str(review).split(" ")
        if token not in string.punctuation
    ]
    # Allocate the whole matrix once instead of growing it with np.hstack,
    # which re-copies every previous column for each new token.
    xF = np.zeros((vocabSize, len(tokenIdxes)))
    if tokenIdxes:
        # Column t gets its 1 in the row of the t-th token.
        xF[tokenIdxes, np.arange(len(tokenIdxes))] = 1
    return xF

#vocabularyFromDataFrame(df,cutoff=25)

#lookUpToken('this')

#lookUpIndex(8)


In [2]:


#xF = vectorize(df['Review Text'][1])

# Balanced training subset for binary sentiment:
#   positive = rating of 3 or higher, negative = rating below 3.
# At most the first 300 rows of each class are kept.
smallDf_pos = df.loc[df['Rating'] >= 3].head(300)
smallDf_neg = df.loc[df['Rating'] < 3].head(300)
# Positive rows first, then negative rows; the original row labels are kept.
df_small = pd.concat([smallDf_pos, smallDf_neg], axis=0)

#df_small


In [3]:
# Build the vocabulary from the training subset only; a token must occur
# more than 10 times to receive its own index (everything else maps to <UNK>).
vocabularyFromDataFrame(df_small,cutoff=10)

# Seed NumPy before drawing the initial weights so that re-running this cell
# (or Restart & Run All) reproduces the same starting point and loss curve.
SEED = 42
np.random.seed(SEED)

numFeatures = len(vocab['t_2_i'])  # size of each one-hot input vector
hiddenUnits = 10                   # size of the recurrent memory
# Initial memory: a float64 column of zeros (float64 matches the NumPy-created weights).
h0 = torch.zeros((hiddenUnits,1),dtype=torch.float64)
# Trainable weights, drawn uniformly from [0, 1).
Wx = torch.tensor(np.random.uniform(0,1,(hiddenUnits,numFeatures)),requires_grad=True)  # input  -> hidden
Wh = torch.tensor(np.random.uniform(0,1,(hiddenUnits,hiddenUnits)),requires_grad=True)  # hidden -> hidden
Wy = torch.tensor(np.random.uniform(0,1,(1,hiddenUnits)),requires_grad=True)  # hidden -> single output unit

def stepForward(xt,Wx,Wh,Wy,prevMemory):
    """Run one time step of the recurrent network.

    Computes ``ht = tanh(Wx @ x + Wh @ prevMemory)`` and
    ``yt_hat = sigmoid(Wy @ ht)``, where ``x`` is ``xt`` turned into a column.

    Args:
        xt: 1-D NumPy array holding the input for this step (in this notebook,
            one one-hot column of the review matrix).
        Wx: input-to-hidden weight tensor.
        Wh: hidden-to-hidden weight tensor.
        Wy: hidden-to-output weight tensor.
        prevMemory: hidden state from the previous step, as a column tensor.

    Returns:
        ``(ht, yt_hat)``: the new hidden state and the sigmoid output.
    """
    # Turn the 1-D input into a column so it can be multiplied by Wx.
    inputColumn = torch.from_numpy(xt[:, None])
    preActivation = torch.matmul(Wx, inputColumn) + torch.matmul(Wh, prevMemory)
    ht = torch.tanh(preActivation)
    # Sigmoid because the network has a single output unit.
    logit = torch.matmul(Wy, ht)
    return ht, torch.sigmoid(logit)

def fullForwardRNN(X,Wx,Wh,Wy,prevMemory):
    """Run the recurrent network over a whole sequence and return the final output.

    Each column ``X[:, t]`` is fed to ``stepForward`` in order, and the hidden
    state is threaded from one step to the next.

    Args:
        X: 2-D NumPy array with one column per time step.
        Wx, Wh, Wy: weight tensors, passed through to ``stepForward``.
        prevMemory: initial hidden state (column tensor).

    Returns:
        The output tensor of the last time step. For a sequence with zero
        columns the prediction is computed from the initial memory alone,
        ``sigmoid(Wy @ prevMemory)``, so the caller always receives a tensor
        (previously the Python int ``0`` was returned, which ``torch.log2``
        in the loss cannot consume).
    """
    numSteps = X.shape[1]
    if numSteps == 0:
        # No input was seen: fall back to what the untouched memory predicts.
        return torch.sigmoid(torch.matmul(Wy, prevMemory))

    y_hat = None
    for t in range(numSteps):
        # Only the last step's output is kept; the memory carries the history.
        prevMemory, y_hat = stepForward(X[:,t],Wx,Wh,Wy,prevMemory)
    return y_hat

def computeLoss(y,y_hat):
    """Mean binary cross-entropy (in bits, i.e. using log base 2).

    Args:
        y: sequence of labels; a label equal to 1 is treated as positive,
            anything else as negative.
        y_hat: sequence of predicted probabilities (tensors), aligned with ``y``.

    Returns:
        The summed per-sample losses divided by ``len(y)``.
    """
    def sampleLoss(label, prob):
        # Probability the model assigned to the true class.
        likelihood = prob if label == 1 else 1 - prob
        return -torch.log2(likelihood)

    total = sum(sampleLoss(label, prob) for label, prob in zip(y, y_hat))
    return total / len(y)

def updateParams(Wx,Wh,Wy,dWx,dWh,dWy,lr):
    """Apply one gradient-descent step to the three weight tensors in place.

    Each weight ``W`` becomes ``W - lr * dW``. The update runs under
    ``torch.no_grad()`` so that it is not recorded by autograd.

    Returns:
        ``(Wx, Wh, Wy)`` -- the same tensor objects that were passed in.
    """
    with torch.no_grad():
        for weight, grad in ((Wx, dWx), (Wh, dWh), (Wy, dWy)):
            weight.sub_(lr * grad)
    return Wx, Wh, Wy

def trainRNN(train_df,Wx,Wh,Wy,prevMemory,lr,nepoch):
    """Train the recurrent network with full-batch gradient descent.

    Every epoch runs each review in ``train_df`` through the network, computes
    the mean loss over all reviews, back-propagates it, and applies one
    gradient-descent step to ``Wx``, ``Wh`` and ``Wy`` (updated in place).

    Args:
        train_df: data frame with ``'Review Text'`` and ``'Rating'`` columns.
            A rating of 3 or higher is labelled 1 (positive), otherwise 0.
        Wx, Wh, Wy: weight tensors created with ``requires_grad=True``.
        prevMemory: initial hidden state used at the start of every review.
        lr: learning rate.
        nepoch: number of passes over ``train_df``.

    Returns:
        ``(Wx, Wh, Wy, losses)`` where ``losses`` holds one detached loss
        tensor per epoch.
    """
    losses = []
    weights = (Wx,Wh,Wy)
    for epoch in range(nepoch):
        y,y_hat = [],[]
        for rv,rt in zip(train_df['Review Text'],train_df['Rating']):
            X = vectorize(rv)
            y_hat.append(fullForwardRNN(X,Wx,Wh,Wy,prevMemory))
            y.append(1 if rt >= 3 else 0)

        loss = computeLoss(y,y_hat)
        loss.backward()
        # Store a detached value: keeping the attached tensor would hold on to
        # this epoch's autograd graph and force callers to detach before
        # converting the history to NumPy or plotting it.
        losses.append(loss.detach())
        print("Loss after epoch=%d: %f" %(epoch,loss.item()))
        sys.stdout.flush()

        Wx,Wh,Wy = updateParams(Wx,Wh,Wy,Wx.grad,Wh.grad,Wy.grad,lr)
        # Gradients accumulate across backward() calls, so clear them for the next epoch.
        for weight in weights:
            weight.grad.zero_()
    return Wx,Wh,Wy,losses

# Train on the balanced subset: 50 epochs, learning rate 0.01, zero initial memory.
Wx,Wh,Wy,losses = trainRNN(df_small,Wx,Wh,Wy,prevMemory=h0,lr=0.01,nepoch=50)

#r = df_small['Review Text'].iloc[6]
#y = df_small['Rating'].iloc[6]

#X = vectorize(r)

#y_hat = fullForwardRNN(X,Wx,Wh,Wy,h0)

#print(y_hat)

#print(y)

#print(r)

Loss after epoch=0: 4.155557
Loss after epoch=1: 4.105794
Loss after epoch=2: 4.056082
Loss after epoch=3: 4.006422
Loss after epoch=4: 3.956820
Loss after epoch=5: 3.907278
Loss after epoch=6: 3.857801
Loss after epoch=7: 3.808393
Loss after epoch=8: 3.759058
Loss after epoch=9: 3.709801
Loss after epoch=10: 3.660627
Loss after epoch=11: 3.611541
Loss after epoch=12: 3.562549
Loss after epoch=13: 3.513657
Loss after epoch=14: 3.464872
Loss after epoch=15: 3.416201
Loss after epoch=16: 3.367650
Loss after epoch=17: 3.319227
Loss after epoch=18: 3.270942
Loss after epoch=19: 3.222801
Loss after epoch=20: 3.174815
Loss after epoch=21: 3.126994
Loss after epoch=22: 3.079346
Loss after epoch=23: 3.031885
Loss after epoch=24: 2.984620
Loss after epoch=25: 2.937564
Loss after epoch=26: 2.890731
Loss after epoch=27: 2.844132
Loss after epoch=28: 2.797783
Loss after epoch=29: 2.751697
Loss after epoch=30: 2.705892
Loss after epoch=31: 2.660382
Loss after epoch=32: 2.615185
Loss after epoch=33:

In [17]:
# Spot check: run the network on a one-word review and print the model output.
# The network's output is the sigmoid of a single unit, so the value lies in (0, 1).
X1 = vectorize("great")
y1_hat = fullForwardRNN(X1,Wx,Wh,Wy,prevMemory=h0)
print(y1_hat)

tensor([[0.7095]], dtype=torch.float64, grad_fn=<SigmoidBackward0>)
