In [8]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from numpy.random import seed # for reproducible results
seed(1)
from tensorflow import set_random_seed # TensorFlow has its own random number generator
set_random_seed(2)

In [2]:
# read data
df = pd.read_csv('Musical_instruments_reviews.csv')
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [3]:
Feature_Columns = ['reviewText', 'overall']
new_df = df[Feature_Columns]
new_df = new_df.dropna()

# convert to binary target
def to_binary(df):
    replacements = {1:0,
                    2:0,
                    3:0,
                    4:1,
                    5:1}
    
    #if statement so it applies the replace just the first time
    if df['overall'].unique().sum() > 1:
        df['overall'].replace(replacements, inplace=True)    
    return df

new_df = to_binary(new_df)

# balance data
target_numbers = new_df['overall'].value_counts()
#extract portion of positives equal to the size of negatives and shuffle
balanced_df = new_df.loc[new_df['overall']==1].sample(n=target_numbers[0], random_state=0)
#append the negatives and shuffle
balanced_df = balanced_df.append(new_df.loc[new_df['overall']==0]).sample(frac=1)

print(balanced_df.head())

                                             reviewText  overall
8588  I can't say much about this product.  I tried ...      0.0
7259  These are fine but didn't fit well; but they a...      0.0
5129  I bought this pedal years ago, and like most g...      0.0
8452  This delay is really nice, especially for the ...      1.0
4439  This system works just okay.  Try to turn it u...      0.0


In [4]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

from keras.preprocessing.text import Tokenizer

# convert reviews to list
review_list = list(balanced_df['reviewText'])

# separate words into tokens
t = Tokenizer()
t.fit_on_texts(review_list)
vocab_size = len(t.word_index) + 1

# creates hashs 
encoded_docs = t.texts_to_sequences(review_list)

# padd sequences (generates matrix with each word tag)
max_length = 1000
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# target
labels = list(balanced_df['overall'])

Using TensorFlow backend.


The model is a simple binary classification model. Importantly, the output from the Embedding layer will be 4 vectors of 8 dimensions each

In [5]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 50)          564050    
_________________________________________________________________
flatten_1 (Flatten)          (None, 50000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 50001     
Total params: 614,051
Trainable params: 614,051
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
# fit the model
model.fit(padded_docs, labels, epochs=10, verbose=1)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 100.000000


In [7]:
# make prediction in particular
text = ['I dont like it', 'I like it']

# pre-process
text_encoded = t.texts_to_sequences(text)
text_padded = pad_sequences(text_encoded, maxlen=max_length, padding='post')

model.predict(text_padded)

array([[0.577113  ],
       [0.59269047]], dtype=float32)

# Important points

It is important to remember that the data must always be balanced. This reduces the bias but forces us to drop important information of the predominating class.

The model gives good results with sentences that contain the same words of the training subset, but does poorly with words that were not learned. This is one drawback of having a small dataset.



# Conclusion

- It would be more suitable to use a machine learning algorithm, like KNN, Logistic Regression or Naive Bayes.
- Another option is to train the neural network with a bigger dataset and then use it to classify this particular dataset.