In [27]:
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
import json
from sklearn.model_selection import train_test_split

In [18]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [19]:
datasets = ['Electronics', 'Cell_Phones_and_Accessories', 'Luxury_Beauty']

for i in datasets:
    input_file = f'/content/gdrive/My Drive/fashion/{i}.json'
    output_file = f'/content/gdrive/My Drive/fashion/{i}.csv'

    with open(input_file, "r", encoding="utf-8") as input_json, open(output_file, "w", encoding="utf-8") as output_csv:
        csv_writer = csv.writer(output_csv)
        flag = 0
        line_count = 0 #delete?
        for line in input_json:
            if line_count < 1000: #delete
                dic = json.loads(line)
                if flag == 0:
                    csv_writer.writerow(dic.keys())
                    flag = 1
                csv_writer.writerow(dic.values())
                line_count += 1 #delte
            else: #delete
                break #delete

print("Done")

Done


In [69]:
df_e = pd.read_csv('/content/gdrive/My Drive/fashion/Electronics.csv', usecols=['overall', 'reviewText'])
df_cpaa = pd.read_csv('/content/gdrive/My Drive/fashion/Cell_Phones_and_Accessories.csv', usecols=['overall', 'reviewText'])
df_lb = pd.read_csv('/content/gdrive/My Drive/fashion/Luxury_Beauty.csv', usecols=['overall', 'reviewText'])

In [77]:
df_train = pd.concat([df_e, df_cpaa], axis=0, ignore_index=True)
df_test = df_lb
df_train = df_train.dropna()
df_test = df_test.dropna()

df_train = df_train[df_train["overall"] != '3']
df_train["label"] = df_train["overall"].apply(lambda rating : 1 if str(rating) > '3' else 0)

df_test = df_test[df_test["overall"] != '3']
df_test["label"] = df_test["overall"].apply(lambda rating : 1 if str(rating) > '3' else 0)

In [78]:
X = pd.DataFrame(df_train['reviewText'])
y = pd.DataFrame(df_train['label'])

train_X, val_X, trian_y, val_y = train_test_split(X, y, random_state=50, shuffle=True, train_size=0.2)

In [79]:
print(f'First review = {df_train.loc[0, "reviewText"]}')
print(f'First review has length = {len(df_train.loc[0, "reviewText"])}\n ')
print(f'First review overall rating = {df_train.loc[0, "overall"]}')
print(f'First review binary rating = {df_train.loc[0, "label"]}')

First review = This was the first time I read Garcia-Aguilera.  I came upon the name of this book on Live with Regis and Kelly. This book was exactly what I was looking for ... it hit the spot.  I really enjoyed this book because it was well written. Once I started this book it kept me coming back for more. It had culture, family, friendship and romance. I was looking for a little more romance when I picked this book but in the end it turned out to be just right.  I love the main chartachter Margarita (aka Daisy). I've never been to Miami but the way Daisy told the story I certainly felt I'd been there.
Also after going through all of Daisy's perils ... I closed the book with a feeling I had grown emotionally as well.
First review has length = 712
 
First review overall rating = 5.0
First review binary rating = 1


**Train:**

In [129]:
max_tokens = 1000
output_sequence_length = 100
pad_to_max_tokens = True

df_train['reviewText'] = df_train['reviewText'].fillna('').astype(str)

encoder = tf.keras.layers.TextVectorization(max_tokens=max_tokens, output_sequence_length=output_sequence_length, pad_to_max_tokens=pad_to_max_tokens)

text_ds = tf.data.Dataset.from_tensor_slices(df_train['reviewText']).batch(128)
encoder.adapt(text_ds)
vocab = np.array(encoder.get_vocabulary())

train_ds = tf.data.Dataset.from_tensor_slices((df_train['reviewText'], df_train['label'])).batch(128)
train_ds = train_ds.map(lambda x, y: (encoder(x), y))

AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

**Test:**

In [130]:
df_test['reviewText'] = df_test['reviewText'].fillna('').astype(str)

text_test_ds = tf.data.Dataset.from_tensor_slices(df_test['reviewText']).batch(128)
test_ds = text_test_ds.map(lambda x: encoder(x))

AUTOTUNE = tf.data.AUTOTUNE
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [131]:
for text_batch, label_batch in train_ds.take(1):
    for i in range(5):
        print("Review:", text_batch.numpy()[i])
        print("Label:", label_batch.numpy()[i])
        print("---")

Review: [ 10  12   2  90  59   4  58   1   4  99   1   2 883   9  10  31  17 522
  19   1   5   1  10  31  12 241  53   4  12 178  11   3 896   2   1   4
  51 402  10  31  65   3  12  66 334 306   4 200  10  31   3 613  54 385
  96  11  64   3  43   1 319   1   5   1   4  12 178  11   6 104  64   1
  40   4 956  10  31  16  13   2 288   3 534  63   7  39  38 192   4  32
   2 369   1   1   1   1 149 180 107   7]
Label: 1
---
Review: [ 23  19  34   9 684   1 157   4 146  10   8   6 382  58   1   7 123 186
   1   1  52 544   1   1   1   2  27   1   9   2   1   1 134   5   1   1
   9   1   1   5   1   1   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]
Label: 1
---
Review: [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [143]:
embedding_dimension = 128
embedding_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(vocab),
                              output_dim=embedding_dimension,
                              input_length=100,
                              name="embedding"),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

embedding_model.compile(optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

embedding_model.summary()


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          128000    
                                                                 
 bidirectional (Bidirection  (None, 100, 256)          263168    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 lstm_6 (LSTM)               (None, 64)                82176     
                                                                 
 dense_5 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                      

In [144]:
embedding_weights = embedding_model.get_layer('embedding').get_weights()[0]
print(f'Dimension of the embedding vector: \n{embedding_weights.shape}')

Dimension of the embedding vector: 
(1000, 128)


In [145]:
print(vocab[500:550])

['date' 'chargers' 'car' 'break' 'working' 'wonderful' 'war' 'wants'
 'update' 'top' 'told' 'today' 'show' 'short' 'seems' 'room' 'putting'
 'paint' 'paid' 'opened' 'often' 'multiple' 'live' 'light' 'itself' 'held'
 'forward' 'expect' 'dad' 'assistant' '700' '20' 'young' 'understand'
 'turned' 'thank' 'stop' 'someone' 'silver' 'sienna' 'service' 'reader'
 'quinn' 'point' 'past' 'others' 'ones' 'okay' 'oem' 'novels']


In [146]:
def get_word_index(word, encoder):
    try:
        return encoder.get_vocabulary().index(word)
    except ValueError:
        return None

words = ['dad', 'today']

for word in words:
    word_index = get_word_index(word, encoder)
    if word_index is not None:
        word_vector = embedding_weights[word_index]
        print(f'{word}: {np.round(word_vector, 3)}')
    else:
        print(f'"{word}" not in vocabulary')


dad: [ 0.015  0.042  0.046  0.022 -0.042 -0.027  0.001 -0.009  0.036  0.003
 -0.012 -0.049 -0.048  0.032 -0.006  0.027  0.003  0.031 -0.021  0.037
 -0.033  0.048 -0.039 -0.011  0.045 -0.003 -0.036 -0.029 -0.023 -0.007
 -0.009 -0.026  0.002 -0.018 -0.047 -0.003 -0.023 -0.006 -0.027 -0.017
  0.046 -0.014 -0.025 -0.049 -0.046 -0.045  0.003  0.017 -0.022  0.05
  0.038 -0.047  0.021 -0.048 -0.033  0.019  0.015  0.037 -0.029 -0.029
 -0.019 -0.003 -0.016 -0.02  -0.02  -0.04   0.049 -0.022 -0.022  0.024
  0.047  0.025  0.017  0.014 -0.021 -0.001  0.01  -0.041  0.002 -0.006
  0.039 -0.022  0.044  0.031 -0.047  0.016 -0.028 -0.001  0.021 -0.001
 -0.037  0.02   0.008 -0.025 -0.012 -0.015 -0.047 -0.049 -0.015  0.017
  0.025  0.006  0.015  0.003  0.003 -0.037 -0.032 -0.035 -0.034 -0.005
  0.041  0.032  0.045  0.018 -0.031  0.011  0.044  0.031  0.025  0.02
  0.035  0.046  0.021  0.033 -0.001  0.044  0.006 -0.006]
today: [-0.025 -0.047 -0.049 -0.032 -0.046  0.017 -0.04  -0.039 -0.039 -0.049
  0.012 -

In [147]:
embedding_model.compile(optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

embedding_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          128000    
                                                                 
 bidirectional (Bidirection  (None, 100, 256)          263168    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 lstm_6 (LSTM)               (None, 64)                82176     
                                                                 
 dense_5 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                      

In [149]:
embedding_model.fit(train_ds, epochs=10, verbose=1)

Epoch 1/10

KeyboardInterrupt: ignored

In [141]:
predictions = embedding_model.predict(test_ds)



In [142]:
print(predictions)

[[0.8410684 ]
 [0.629067  ]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.8410719 ]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.841071  ]
 [0.84107196]
 [0.84107214]
 [0.84105265]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107155]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.8405561 ]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.8410718 ]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.8409079 ]
 [0.8361913 ]
 [0.8410677 ]
 [0.84107214]
 [0.84107167]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.8410719 ]
 [0.84107053]
 [0.8410721 ]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.8410708 ]
 [0.84107214]
 [0.8410717 ]
 [0.84107214]
 [0.84107196]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.8410686 ]
 [0.8410721 ]
 [0.84107214]
 [0.84107214]
 [0.84107214]
 [0.8410721 ]
 [0.84