In [1]:
#config

# define the data to be used
dataDict = {
    "sentence":[
        "Avengers is a great movie.",
        "I love Avengers it is great.",
        "Avengers is a bad movie.",
        "I hate Avengers.",
        "I didnt like the Avengers movie.",
        "I think Avengers is a bad movie.",
        "I love the movie.",
        "I think it is great."
    ],
    "sentiment":[
        "good",
        "good",
        "bad",
        "bad",
        "bad",
        "bad",
        "good",
        "good"
    ]
}
# define a list of stopwords
stopWrds = ["is", "a", "i", "it"] 
# define model training parameters
epochs = 30
batchSize = 10
# define number of dense units
denseUnits = 50

In [2]:
#data preprocessing

# import the necessary packages
import re

def preprocess(sentDf, stopWords, key="sentence"):
    # loop over all the sentences
    for num in range(len(sentDf[key])):
        # lowercase the string and remove punctuation
        sentence = sentDf[key][num]
        sentence = re.sub(
            r"[^a-zA-Z0-9]", " ", sentence.lower()
        ).split()
        # define a list for processed words
        newWords = list()
        # loop over the words in each sentence and filter out the
        # stopwords
        for word in sentence:
            if word not in stopWords:
                # append word if not a stopword    
                newWords.append(word)
        # replace sentence with the list of new words   
        sentDf[key][num] = newWords
    
    # return the preprocessed data
    return sentDf


def prepare_tokenizer(df, sentKey="sentence", outputKey="sentiment"):
    # counters for tokenizer indices
    wordCounter = 0
    labelCounter = 0
    # create placeholder dictionaries for tokenizer
    textDict = dict()
    labelDict = dict()
    # loop over the sentences
    for entry in df[sentKey]:
        # loop over each word and
        # check if encountered before
        for word in entry:
            if word not in textDict.keys():
                textDict[word] = wordCounter
                # update word counter if new
                # word is encountered
                wordCounter += 1
    
    # repeat same process for labels  
    for label in df[outputKey]:
        if label not in labelDict.keys():
            labelDict[label] = labelCounter
            labelCounter += 1
    
    # return the dictionaries 
    return (textDict, labelDict)

In [3]:
#bow

#The Bag-of-Words model is a simple method for extracting features from text data.

def calculate_bag_of_words(text, sentence):
    # create a dictionary for frequency check
    freqDict = dict.fromkeys(text, 0)
    # loop over the words in sentences
    for word in sentence:
        # update word frequency
        freqDict[word]=sentence.count(word)
    # return dictionary 
    return freqDict

In [4]:
#tf wrapper

# import the necessary packages
from tensorflow.keras.preprocessing.text import Tokenizer 

def tensorflow_wrap(df):
    # create the tokenizer for sentences
    tokenizerSentence = Tokenizer()
    # create the tokenizer for labels
    tokenizerLabel = Tokenizer()
    # fit the tokenizer on the documents
    tokenizerSentence.fit_on_texts(df["sentence"])
    # fit the tokenizer on the labels
    tokenizerLabel.fit_on_texts(df["sentiment"])
    # create vectors using tensorflow
    encodedData = tokenizerSentence.texts_to_matrix(
        texts=df["sentence"], mode="count")
    # add label column
    labels = df["sentiment"]
    # correct label vectors
    for i in range(len(labels)):
        labels[i] = tokenizerLabel.word_index[labels[i]] - 1
    # return data and labels
    return (encodedData[:, 1:], labels.astype("float32"))

In [5]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

def build_shallow_net():
    # define the model
    model = Sequential()
    model.add(Dense(denseUnits, input_dim=10, activation="relu"))
    model.add(Dense(denseUnits, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    # compile the keras model
    model.compile(loss="binary_crossentropy", optimizer="adam",
        metrics=["accuracy"]
    )
    # return model
    return model

In [6]:
import pandas as pd

# convert the input data dictionary to a pandas data frame
df = pd.DataFrame.from_dict(dataDict)

In [7]:
df.shape

(8, 2)

In [8]:
# preprocess the data frame and create data dictionaries
preprocessedDf = preprocess(sentDf=df, stopWords=stopWrds)

In [9]:
preprocessedDf

Unnamed: 0,sentence,sentiment
0,"[avengers, great, movie]",good
1,"[love, avengers, great]",good
2,"[avengers, bad, movie]",bad
3,"[hate, avengers]",bad
4,"[didnt, like, the, avengers, movie]",bad
5,"[think, avengers, bad, movie]",bad
6,"[love, the, movie]",good
7,"[think, great]",good


In [10]:
(textDict, labelDict) = prepare_tokenizer(df)

In [11]:
textDict,labelDict

({'avengers': 0,
  'great': 1,
  'movie': 2,
  'love': 3,
  'bad': 4,
  'hate': 5,
  'didnt': 6,
  'like': 7,
  'the': 8,
  'think': 9},
 {'good': 0, 'bad': 1})

In [12]:
# create an empty list for vectors
freqList = list()

In [13]:
for sentence in df["sentence"]:
    # create entries for each sentence and update the vector list   
    entryFreq = calculate_bag_of_words(text=textDict,
        sentence=sentence)
    freqList.append(entryFreq)

In [14]:
freqList

[{'avengers': 1,
  'great': 1,
  'movie': 1,
  'love': 0,
  'bad': 0,
  'hate': 0,
  'didnt': 0,
  'like': 0,
  'the': 0,
  'think': 0},
 {'avengers': 1,
  'great': 1,
  'movie': 0,
  'love': 1,
  'bad': 0,
  'hate': 0,
  'didnt': 0,
  'like': 0,
  'the': 0,
  'think': 0},
 {'avengers': 1,
  'great': 0,
  'movie': 1,
  'love': 0,
  'bad': 1,
  'hate': 0,
  'didnt': 0,
  'like': 0,
  'the': 0,
  'think': 0},
 {'avengers': 1,
  'great': 0,
  'movie': 0,
  'love': 0,
  'bad': 0,
  'hate': 1,
  'didnt': 0,
  'like': 0,
  'the': 0,
  'think': 0},
 {'avengers': 1,
  'great': 0,
  'movie': 1,
  'love': 0,
  'bad': 0,
  'hate': 0,
  'didnt': 1,
  'like': 1,
  'the': 1,
  'think': 0},
 {'avengers': 1,
  'great': 0,
  'movie': 1,
  'love': 0,
  'bad': 1,
  'hate': 0,
  'didnt': 0,
  'like': 0,
  'the': 0,
  'think': 1},
 {'avengers': 0,
  'great': 0,
  'movie': 1,
  'love': 1,
  'bad': 0,
  'hate': 0,
  'didnt': 0,
  'like': 0,
  'the': 1,
  'think': 0},
 {'avengers': 0,
  'great': 1,
  'movie':

In [15]:
# create an empty data frame for the vectors
finalDf = pd.DataFrame() 

In [16]:
# loop over the vectors and concat them
for vector in freqList:
    vector = pd.DataFrame([vector])
    finalDf = pd.concat([finalDf, vector], ignore_index=True)

In [17]:
# add label column to the final data frame
finalDf["label"] = df["sentiment"]

In [18]:
finalDf

Unnamed: 0,avengers,great,movie,love,bad,hate,didnt,like,the,think,label
0,1,1,1,0,0,0,0,0,0,0,good
1,1,1,0,1,0,0,0,0,0,0,good
2,1,0,1,0,1,0,0,0,0,0,bad
3,1,0,0,0,0,1,0,0,0,0,bad
4,1,0,1,0,0,0,1,1,1,0,bad
5,1,0,1,0,1,0,0,0,0,1,bad
6,0,0,1,1,0,0,0,0,1,0,good
7,0,1,0,0,0,0,0,0,0,1,good


In [19]:
# convert label into corresponding vector
for i in range(len(finalDf["label"])):
    finalDf["label"][i] = labelDict[finalDf["label"][i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finalDf["label"][i] = labelDict[finalDf["label"][i]]


In [20]:
shallowModel = build_shallow_net()
print("[Info] Compiling model...")
# fit the Keras model on the dataset
shallowModel.fit(
    finalDf.iloc[:,0:10],
    finalDf.iloc[:,10].astype("float32"),
    epochs=epochs,
    batch_size=batchSize
)

[Info] Compiling model...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f619c25fa00>

In [21]:
# create dataset using TensorFlow
trainX, trainY = tensorflow_wrap(df)
# initialize the new model for tf wrapped data
tensorflowModel = build_shallow_net()
print("[Info] Compiling model with tensorflow wrapped data...")
# fit the keras model on the tensorflow dataset
tensorflowModel.fit(
    trainX,
    trainY,
    epochs=epochs,
    batch_size=batchSize
)

[Info] Compiling model with tensorflow wrapped data...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f61947ac1f0>