In [None]:
# Environment setup: uncomment and run these lines when the packages are not installed yet.
# NOTE(review): only tensorflow is version-pinned here; consider pinning nvtabular and
# merlin-models as well so that a re-run installs the same versions.
#!pip install nvtabular
#!pip install merlin-models
#!pip install tensorflow==2.10

In [None]:
from merlin.core.dispatch import get_lib
import nvtabular as nvt
from merlin.schema.tags import Tags
import numpy as np
import tensorflow
import merlin.models.tf as mm

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored there
# (such as the reviews CSV loaded in the next cell) can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the beer reviews CSV inside the mounted Google Drive
file_path = "/content/drive/MyDrive/DH2465-DH2655/BeerBrain (Aka cringe DL modeler för lätta VC pengar)/beer_reviews.csv"
# Read the CSV with whichever dataframe library merlin's get_lib() returns
# (presumably cudf when a GPU stack is available and pandas otherwise — confirm).
data = get_lib().read_csv(file_path)

In [None]:
# Remove every row that has a missing value in any column
data = data.dropna()

In [None]:
# Currently disabled. If enabled, this would keep only the first row for each beer_beerid,
# i.e. a single review per beer.
# NOTE(review): delete this cell if the experiment is not coming back.
#data.drop_duplicates(subset=["beer_beerid"], inplace=True)

In [None]:
# Give every distinct beer_style an integer id (its category code) in a new column
data["beer_style_code"] = data["beer_style"].astype("category").cat.codes

In [None]:
# Columns that the NVTabular workflow built later in this notebook does not consume
unused_columns = [
    'review_aroma', 'review_appearance', 'review_palate', 'review_taste',
    'beer_abv', 'review_time', 'brewery_id',
    "brewery_name", "beer_name", "beer_style",
]
data = data.drop(columns=unused_columns)

In [None]:
# Split by user: all reviews written by one review_profilename end up entirely in either
# the training set or the test set.
# NOTE(review): users in the test partition never appear in training, so a user-id encoding
# fitted on the training set will presumably treat all of them as unseen — confirm that
# this cold-start evaluation is what is intended.
unique_review_profilename = data["review_profilename"].unique()

# Shuffle with a fixed seed so the user split is identical on every Restart & Run All
SPLIT_SEED = 42
np.random.default_rng(SPLIT_SEED).shuffle(unique_review_profilename)

# The first 80% of the shuffled users go to training, the remaining 20% to testing
n_train_users = int(len(unique_review_profilename) * 0.8)
train_users = unique_review_profilename[:n_train_users]
test_users = unique_review_profilename[n_train_users:]

In [None]:
# Row masks: which reviews were written by a training user and which by a test user
is_train_row = data["review_profilename"].isin(train_users)
is_test_row = data["review_profilename"].isin(test_users)

# Select the two partitions from the full dataframe
train = data[is_train_row]
test = data[is_test_row]

In [None]:
# Wrap both dataframes as NVTabular datasets; the training data is requested in 2 partitions,
# the validation data uses the default partitioning.
train_ds = nvt.Dataset(train, npartitions=2)
valid_ds = nvt.Dataset(test)

# Show both dataset objects as the cell output
train_ds, valid_ds



(<merlin.io.dataset.Dataset at 0x7a148bfd0eb0>,
 <merlin.io.dataset.Dataset at 0x7a148bfd13c0>)

In [None]:
# Group all rows of the same review_profilename into the same partition.
# shuffle_by_keys returns a new Dataset rather than modifying the one it is called on,
# so the result must be assigned back; otherwise the shuffle is silently discarded.
train_ds = train_ds.shuffle_by_keys('review_profilename')
valid_ds = valid_ds.shuffle_by_keys('review_profilename')



<merlin.io.dataset.Dataset at 0x7a148bfd0e80>

In [None]:
# Categorify the integer beer style codes.
# freq_threshold=10: per the NVTabular Categorify docs, values seen fewer than 10 times are
# mapped to a shared infrequent-category id — confirm against the installed version.
styles = ['beer_style_code'] >> nvt.ops.Categorify(freq_threshold=10)

In [None]:
def rating_to_binary(col):
    """Binarise a column of ratings.

    Args:
        col: Column of numeric ratings that supports element-wise comparison
            and ``astype`` (e.g. a pandas Series or a numpy array).

    Returns:
        A float column of the same length: 1.0 where the rating is strictly
        greater than 3.5, 0.0 everywhere else.
    """
    # Keep the single-argument signature: the function is passed to nvt.ops.LambdaOp
    is_positive = col > 3.5
    return is_positive.astype('float')

In [None]:
# One op chain per detailed rating column: binarise it with rating_to_binary, then rename
# the result to binary_<column>.
# NOTE(review): these four columns are dropped from `data` in an earlier cell and these ops
# are not part of the workflow built below, so this cell currently has no effect on training.
binary_ratings_ops = []
for col in ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']:
    binarised = col >> nvt.ops.LambdaOp(rating_to_binary)
    binary_ratings_ops.append(binarised >> nvt.ops.Rename(name=f'binary_{col}'))

In [None]:
# Tag every binarised detailed-rating op chain as a context feature
binary_ratings_tagged = [
    rating_op >> nvt.ops.AddTags(tags=[Tags.CONTEXT])
    for rating_op in binary_ratings_ops
]

In [None]:
# Training label: review_overall binarised with rating_to_binary (1.0 when the rating is
# above 3.5, else 0.0) and exposed under the name binary_target
binary_target = (
    ['review_overall']
    >> nvt.ops.LambdaOp(rating_to_binary)
    >> nvt.ops.Rename(name='binary_target')
)


In [None]:
# Categorify the reviewer name and tag it as the user-id categorical feature
userId = (
    ['review_profilename']
    >> nvt.ops.Categorify()
    >> nvt.ops.AddTags(tags=[Tags.USER_ID, Tags.CATEGORICAL, Tags.USER])
)

# Categorify the beer id and tag it as the item-id categorical feature
beerId = (
    ['beer_beerid']
    >> nvt.ops.Categorify()
    >> nvt.ops.AddTags(tags=[Tags.ITEM_ID, Tags.CATEGORICAL, Tags.ITEM])
)

# Mark the binarised overall rating as the binary-classification target
binary_target = (
    binary_target
    >> nvt.ops.AddTags(tags=[Tags.TARGET, Tags.BINARY_CLASSIFICATION])
)


In [None]:
# Combine the user id, beer id, beer style and binary target op chains into one workflow;
# the detailed-rating op chains defined earlier are not included.
workflow = nvt.Workflow(userId + beerId + styles + binary_target)

In [None]:
# Fit the workflow on the training set only and transform it in the same call
train_transformed = workflow.fit_transform(train_ds)
# Apply the already-fitted workflow to the validation set (no re-fitting on validation data)
valid_transformed = workflow.transform(valid_ds)
# Materialise the transformed validation data and preview its first rows
valid_transformed.compute().head()



Unnamed: 0,review_profilename,beer_beerid,beer_style_code,binary_target
0,2,12334,53,1.0
1,2,2,61,1.0
2,2,621,41,0.0
3,2,5602,80,1.0
4,2,5602,80,1.0


In [None]:
# Building blocks of the DLRM model, created in the same order as in the model signature
dlrm_bottom = mm.MLPBlock([512, 64])
dlrm_top = mm.MLPBlock([512, 64, 32])
target_task = mm.BinaryClassificationTask('binary_target')

# The model reads its input features from the schema of the transformed training data
model = mm.DLRMModel(
    train_transformed.schema,
    embedding_dim=64,
    bottom_block=dlrm_bottom,
    top_block=dlrm_top,
    prediction_tasks=target_task,
)

# Stage 1: 10 epochs with Adam at a learning rate of 5e-3
opt = tensorflow.optimizers.Adam(learning_rate=5e-3)
model.compile(optimizer=opt)
model.fit(
    train_transformed,
    validation_data=valid_transformed,
    batch_size=512,
    epochs=10,
)

# Stage 2: lower the learning rate on the same optimizer and train for 6 more epochs
model.optimizer.learning_rate = 1e-3
model.fit(
    train_transformed,
    validation_data=valid_transformed,
    batch_size=512,
    epochs=6,
)

Epoch 1/10
   6/2335 [..............................] - ETA: 1:40 - loss: 0.6727 - precision: 0.6327 - recall: 0.8814 - binary_accuracy: 0.6006 - auc: 0.4942 - regularization_loss: 0.0000e+00 - loss_batch: 0.6727



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7a14b0175240>