In [2]:
from merlin.core.dispatch import get_lib
import nvtabular as nvt
from merlin.schema.tags import Tags
import numpy as np

  warn(f"Tensorflow dtype mappings did not load successfully due to an error: {exc.msg}")
  warn(f"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}")
  warn(f"Triton dtype mappings did not load successfully due to an error: {exc.msg}")


In [3]:
data = get_lib().read_csv("../data/beer_reviews.csv")


In [4]:
# Remvoe the rows with missing values
data = data.dropna()

In [5]:
# Create mapping for the beer names to the beer ids based on the collums "beer_name" and "beer_beerid"
beer_name_to_id = {beer_name: beer_id for beer_name, beer_id in zip(data["beer_name"], data["beer_beerid"])}


In [6]:
# Integer encode the beer_style column
data["beer_style_code"] = data["beer_style"].astype("category")
data["beer_style_code"] = data["beer_style_code"].cat.codes

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1518478 entries, 0 to 1586613
Data columns (total 14 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1518478 non-null  int64  
 1   brewery_name        1518478 non-null  object 
 2   review_time         1518478 non-null  int64  
 3   review_overall      1518478 non-null  float64
 4   review_aroma        1518478 non-null  float64
 5   review_appearance   1518478 non-null  float64
 6   review_profilename  1518478 non-null  object 
 7   beer_style          1518478 non-null  object 
 8   review_palate       1518478 non-null  float64
 9   review_taste        1518478 non-null  float64
 10  beer_name           1518478 non-null  object 
 11  beer_abv            1518478 non-null  float64
 12  beer_beerid         1518478 non-null  int64  
 13  beer_style_code     1518478 non-null  int8   
dtypes: float64(6), int64(3), int8(1), object(4)
memory usage: 163.6+ M

In [8]:
# Nomralize all columns starting with "review_" from 0-5 to 0-1
for col in data.columns:
    if col.startswith("review_") and col != "review_profilename":
        data[col] = data[col] / 5.0

# Normalize the beer_abv column from 0-80 to 0-1
data["beer_abv"] = data["beer_abv"] / 80.0

In [9]:
# Target = beer_id
# Features = beer_style_code, brewery_id, review_appearance, review_aroma, review_palate, review_taste, review_overall, beer_abv
# Partition keys = review_time, review_profilename

# Drop the columns we don't need
data = data.drop(columns=['beer_abv', 'review_time', 'brewery_id', "brewery_name", "beer_name", "beer_style"])

In [10]:
# Use the review_profilename as the partition key for the training and test set
# Get the unique values of the review_profilename column
unique_review_profilename = data["review_profilename"].unique()

# Shuffle the unique values
np.random.shuffle(unique_review_profilename)

train_users = unique_review_profilename[:int(len(unique_review_profilename) * 0.8)]
test_users = unique_review_profilename[int(len(unique_review_profilename) * 0.8):]

In [11]:
# Use the train_users to create the training set as a pandas dataframe
train = data[data["review_profilename"].isin(train_users)]

# Use the test_users to create the test set as a pandas dataframe
test = data[data["review_profilename"].isin(test_users)]


In [12]:
train_ds = nvt.Dataset(train, npartitions=2)
valid_ds = nvt.Dataset(test)

train_ds, valid_ds



(<merlin.io.dataset.Dataset at 0x1695e1dc0>,
 <merlin.io.dataset.Dataset at 0x1695e1250>)

In [13]:
train_ds.shuffle_by_keys('review_profilename')
valid_ds.shuffle_by_keys('review_profilename')



<merlin.io.dataset.Dataset at 0x17a1c5190>

In [14]:
styles = ['beer_style_code'] >> nvt.ops.Categorify(freq_threshold=10)

In [15]:
def rating_to_binary(col):
    return (col > 3).astype('int8')

In [16]:
binary_ratings_ops = [col >> nvt.ops.LambdaOp(rating_to_binary) >> nvt.ops.Rename(name=f'binary_{col}') for col in ['review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste']]

In [17]:
binary_ratings_tagged = []
for op in binary_ratings_ops:
    binary_ratings_tagged.append(op >> nvt.ops.AddTags(tags=[Tags.TARGET, Tags.BINARY_CLASSIFICATION]))

In [18]:
userId = ['review_profilename'] >> nvt.ops.Categorify() >> nvt.ops.AddTags(tags=[Tags.USER_ID, Tags.CATEGORICAL, Tags.USER])
beerId = ['beer_beerid'] >> nvt.ops.Categorify() >> nvt.ops.AddTags(tags=[Tags.ITEM_ID, Tags.CATEGORICAL, Tags.ITEM])

In [19]:
workflow = nvt.Workflow(userId + beerId + styles + binary_ratings_tagged)

In [20]:
train_ds.head()

Unnamed: 0,review_overall,review_aroma,review_appearance,review_profilename,review_palate,review_taste,beer_beerid,beer_style_code
0,0.3,0.4,0.5,stcules,0.3,0.3,47986,65
1,0.6,0.5,0.6,stcules,0.6,0.6,48213,51
2,0.6,0.5,0.6,stcules,0.6,0.6,48215,59
3,0.6,0.6,0.7,stcules,0.5,0.6,47969,61
4,0.8,0.9,0.8,johnmichaelsen,0.8,0.9,64883,9


In [21]:
train_transformed = workflow.fit_transform(train_ds)
valid_transformed = workflow.transform(valid_ds)
valid_transformed.compute().head()



Unnamed: 0,review_profilename,beer_beerid,beer_style_code,binary_review_overall,binary_review_aroma,binary_review_appearance,binary_review_palate,binary_review_taste
0,2,656,41,0,0,0,0,0
1,2,2,60,0,0,0,0,0
2,2,5651,81,0,0,0,0,0
3,2,5651,81,0,0,0,0,0
4,2,5651,81,0,0,0,0,0


In [22]:
train_transformed.schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.freq_threshold,properties.max_size,properties.cat_path,properties.domain.min,properties.domain.max,properties.domain.name,properties.embedding_sizes.cardinality,properties.embedding_sizes.dimension
0,review_profilename,"(Tags.ID, Tags.CATEGORICAL, Tags.USER)","DType(name='int64', element_type=<ElementType....",False,False,,0.0,0.0,.//categories/unique.review_profilename.parquet,0.0,26328.0,review_profilename,26329.0,478.0
1,beer_beerid,"(Tags.ID, Tags.CATEGORICAL, Tags.ITEM)","DType(name='int64', element_type=<ElementType....",False,False,,0.0,0.0,.//categories/unique.beer_beerid.parquet,0.0,46306.0,beer_beerid,46307.0,512.0
2,beer_style_code,(Tags.CATEGORICAL),"DType(name='int64', element_type=<ElementType....",False,False,,10.0,0.0,.//categories/unique.beer_style_code.parquet,0.0,106.0,beer_style_code,107.0,22.0
3,binary_review_overall,"(Tags.TARGET, Tags.BINARY_CLASSIFICATION)","DType(name='int8', element_type=<ElementType.I...",False,False,,,,,,,,,
4,binary_review_aroma,"(Tags.TARGET, Tags.BINARY_CLASSIFICATION)","DType(name='int8', element_type=<ElementType.I...",False,False,,,,,,,,,
5,binary_review_appearance,"(Tags.TARGET, Tags.BINARY_CLASSIFICATION)","DType(name='int8', element_type=<ElementType.I...",False,False,,,,,,,,,
6,binary_review_palate,"(Tags.TARGET, Tags.BINARY_CLASSIFICATION)","DType(name='int8', element_type=<ElementType.I...",False,False,,,,,,,,,
7,binary_review_taste,"(Tags.TARGET, Tags.BINARY_CLASSIFICATION)","DType(name='int8', element_type=<ElementType.I...",False,False,,,,,,,,,


In [24]:
import tensorflow
import merlin.models.tf as mm

model = mm.DLRMModel(
    train_transformed.schema,
    embedding_dim=64,
    bottom_block=mm.MLPBlock([128, 64]),
    top_block=mm.MLPBlock([128, 64, 32]),
    prediction_tasks=mm.BinaryClassificationTask('rating_binary')
)

opt = tensorflow.optimizers.Adam(learning_rate=5e-3)
model.compile(optimizer=opt)
model.fit(train_transformed, validation_data=valid_transformed, batch_size=1024, epochs=5)

model.optimizer.learning_rate = 1e-3
model.fit(train_transformed, validation_data=valid_transformed, batch_size=1024, epochs=3)

ImportError: cannot import name 'to_dlpack' from 'tensorflow.python' (/opt/homebrew/Caskroom/miniforge/base/envs/BeerBrain/lib/python3.9/site-packages/tensorflow/python/__init__.py)

In [None]:
print(tensorflow.__version__)

2.14.0


In [None]:
import tensorflow as tf
from tensorflow.experimental.dlpack import to_dlpack, from_dlpack
to_dlpack(tf.constant([1, 2, 3]))

<capsule object "dltensor" at 0x2a484af40>