## Amazon_us_reviews 

## Load Dataset & Feature selection

In [None]:
!pip3 install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 4.7 MB 4.3 MB/s 
[?25h

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# Load the 'train' split of the Watches subset of amazon_us_reviews.
# with_info must be a real boolean: the previous string 'True' only worked
# because any non-empty string is truthy (so 'False' would have behaved the
# same way). With with_info enabled, tfds.load returns (dataset, info).
ds, ds_info = tfds.load(
    name='amazon_us_reviews/Watches_v1_00',
    split='train',
    with_info=True,
)

In [None]:
# Feature selection: keep only the columns used later in the notebook and
# flatten them out of the nested 'data' record of each example.
def _select_features(example):
    """Return a flat dict holding the selected fields of example['data']."""
    record = example['data']
    wanted = (
        'customer_id',
        'product_id',
        'product_category',
        'product_title',
        'star_rating',
        'total_votes',
        'review_headline',
    )
    return {key: record[key] for key in wanted}


ds = ds.map(_select_features)

# Preview the first five flattened examples as a DataFrame.
tfds.as_dataframe(ds.take(5))

Unnamed: 0,customer_id,product_category,product_id,product_title,review_headline,star_rating,total_votes
0,b'30756603',b'Watches',b'B00DKYC7TK',b'Ritche 22mm Black Stainless Steel Bracelet Watch Band Strap Pebble Time/Pebble Classic',b'This provided a nice upgraded look for my Pebble',4,1
1,b'45902750',b'Watches',b'B004VRD6FY',"b""Timex Men's Expedition Metal Field Watch""",b'Case diameter smaller than specs',3,1
2,b'44191588',b'Watches',b'B0000C9ZBY',"b""Skagen Men's 233LTTM Titanium Mesh Watch""",b'Three Stars',3,0
3,b'5043150',b'Watches',b'B005OCVYGI',b'Quiksilver Mens Watch Slam',"b'bad buy, the band was broken'",1,1
4,b'51201224',b'Watches',b'B0021AEDQY',"b""Bulova Men's 98A110 Mechanical Hand-Wind Automatic Silver White Dial Watch""",b'Great Watch!',5,0


## Split train/test datasets

In [None]:
# Seed TensorFlow globally and shuffle with a fixed op-level seed.
# reshuffle_each_iteration=False is passed so the shuffled order is not
# regenerated on every pass over `ds`; the take/skip split below relies on
# both subsets seeing the same order.
tf.random.set_seed(42)
ds = ds.shuffle(960_872, seed=42, reshuffle_each_iteration=False)

In [None]:
# The first 768000 shuffled examples form the training set; everything after
# them forms the test set.
n_train = 768000
train, test = ds.take(n_train), ds.skip(n_train)

# Report the size of each subset.
for subset in (train, test):
    print(subset.__len__())

tf.Tensor(768000, shape=(), dtype=int64)
tf.Tensor(192872, shape=(), dtype=int64)


## Feature preprocessing

### Numerical Features


*   star_rating
*   total_votes

In [None]:
# star_rating: Normalization layer whose statistics are learned (via adapt)
# from the star_rating values of the training set only.
star_rating_normalization_layer = tf.keras.layers.experimental.preprocessing.Normalization(
    axis=None
)
star_rating_normalization_layer.adapt(
    train.map(lambda example: example['star_rating'])
)

In [None]:
# Sanity check: print the normalized star_rating of five training examples.
for example in train.take(5).as_numpy_iterator():
    normalized = star_rating_normalization_layer(example['star_rating'])
    print(normalized)

In [None]:
# total_votes: Normalization layer whose statistics are learned (via adapt)
# from the total_votes values of the training set only.
total_votes_normalization_layer = tf.keras.layers.experimental.preprocessing.Normalization(
    axis=None
)
total_votes_normalization_layer.adapt(
    train.map(lambda example: example['total_votes'])
)

In [None]:
# Sanity check: print the normalized total_votes of three training examples.
# Fixed the misspelled method name (`as_numpy_iteratore`), which raised
# AttributeError before the loop body could run.
for example in train.take(3).as_numpy_iterator():
    print(total_votes_normalization_layer(example['total_votes']))

### Categorical Features


*   customer_id
*   product_id
*   product_category

In [None]:
# customer_id: string -> integer index lookup (no mask token), with the
# vocabulary learned from the training set.
customer_id_layer = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)
customer_id_layer.adapt(train.map(lambda example: example['customer_id']))

# One 32-dimensional embedding row per index the lookup layer can emit.
customer_id_embedding = tf.keras.layers.Embedding(
    input_dim=customer_id_layer.vocabulary_size(),
    output_dim=32,
)

In [None]:
# Disabled sanity check, kept for reference: chains the customer_id lookup and
# embedding layers and prints the embeddings produced for two sample ids.
# customer_id_model = tf.keras.Sequential([customer_id_layer,customer_id_embedding])
# print(customer_id_model(['-2','1']))

In [None]:
# product_id: string -> integer index lookup (no mask token), with the
# vocabulary learned from the training set.
product_id_layer = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)
product_id_layer.adapt(train.map(lambda example: example['product_id']))

# One 32-dimensional embedding row per index the lookup layer can emit.
product_id_embedding = tf.keras.layers.Embedding(
    input_dim=product_id_layer.vocabulary_size(),
    output_dim=32,
)

In [None]:
# Chain each id lookup with its embedding so a raw id string maps directly to
# its embedding vector.
customer_id_model = tf.keras.Sequential(
    layers=[customer_id_layer, customer_id_embedding]
)
product_id_model = tf.keras.Sequential(
    layers=[product_id_layer, product_id_embedding]
)

In [None]:
# product_category: string -> integer index lookup (no mask token), with the
# vocabulary learned from the training set.
product_category_layer = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)
product_category_layer.adapt(
    train.map(lambda example: example['product_category'])
)

# One 32-dimensional embedding row per index the lookup layer can emit.
product_category_embedding = tf.keras.layers.Embedding(
    input_dim=product_category_layer.vocabulary_size(),
    output_dim=32,
)

### Textual features


*   product_title
*   review_headline

In [None]:
# product_title: tokenize titles with a TextVectorization layer whose
# vocabulary is learned from the training set.
product_title_layer = tf.keras.layers.experimental.preprocessing.TextVectorization()
product_title_layer.adapt(train.map(lambda example: example['product_title']))

# 32-dimensional embedding with one row per vocabulary entry.
product_title_embedding = tf.keras.layers.Embedding(
    input_dim=product_title_layer.vocabulary_size(),
    output_dim=32,
)

# Text pipeline: tokens -> per-token embeddings -> average pooling.
# NOTE(review): AveragePooling1D() with default arguments pools over local
# windows of the token axis, not the whole sequence. If a single fixed-size
# vector per title is intended, GlobalAveragePooling1D may be what is wanted
# here - confirm against the downstream model.
product_title_model = tf.keras.Sequential(
    layers=[
        product_title_layer,
        product_title_embedding,
        tf.keras.layers.AveragePooling1D(),
    ]
)

In [None]:
# Disabled sanity check, kept for reference: feeds one single-example batch of
# product titles through product_title_model and prints the result.
# for row in train.batch(1).map(lambda x:x['product_title']).take(1):
#   print(product_title_model(row))

In [None]:
# review_headline: tokenize headlines with a TextVectorization layer whose
# vocabulary is learned from the training set.
review_headline_layer = tf.keras.layers.experimental.preprocessing.TextVectorization()
review_headline_layer.adapt(
    train.map(lambda example: example['review_headline'])
)

# 32-dimensional embedding with one row per vocabulary entry.
review_headline_embedding = tf.keras.layers.Embedding(
    input_dim=review_headline_layer.vocabulary_size(),
    output_dim=32,
)

# Text pipeline: tokens -> per-token embeddings -> average pooling.
# NOTE(review): AveragePooling1D() with default arguments pools over local
# windows of the token axis, not the whole sequence. If a single fixed-size
# vector per headline is intended, GlobalAveragePooling1D may be what is
# wanted here - confirm against the downstream model.
review_headline_model = tf.keras.Sequential(
    layers=[
        review_headline_layer,
        review_headline_embedding,
        tf.keras.layers.AveragePooling1D(),
    ]
)