# Summary

- There're 18 duplicates in the 100k MovieLens dataset from TF.
- The list can be found below
- I've tried to set up a protoype to get rid of the duplicates, but it doesn't look like it went well (see example below)

Before transformation:
{'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>, 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1681'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>}

After transformation:
{'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1681'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>, 'movie_genres': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([4], dtype=int32)>}

In [67]:
duplicates_list

array([b'Fly Away Home (1996)', b'Sliding Doors (1998)',
       b'That Darn Cat! (1997)', b'Butcher Boy, The (1998)',
       b'Ice Storm, The (1997)', b'Hurricane Streets (1998)',
       b'Kull the Conqueror (1997)', b'Chairman of the Board (1998)',
       b"Ulee's Gold (1997)", b'Designated Mourner, The (1997)',
       b'Deceiver (1997)', b'Desperate Measures (1998)',
       b'Substance of Fire, The (1996)', b'Nightwatch (1997)',
       b'Chasing Amy (1997)', b'Body Snatchers (1993)',
       b'Hugo Pool (1997)', b'Money Talks (1997)'], dtype=object)

# Modules

In [1]:
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
from tensorflow.keras import layers

2024-09-10 00:32:11.245650: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-10 00:32:11.279350: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-10 00:32:11.279387: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-10 00:32:11.280409: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-10 00:32:11.285965: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-10 00:32:11.286734: I tensorflow/core/platform/cpu_feature_guard.cc:1

# Data

In [52]:
# Features of all the available movies.
movies = tfds.load("movielens/100k-movies", split="train")

In [53]:
len(movies)

1682

In [10]:
df = tfds.as_dataframe(movies)

# Analysis

In [12]:
# Identify duplicate rows in movie_title. With keep=False it returns all duplicates as TRUE and filters to return only those
duplicate_titles = df[df['movie_title'].duplicated(keep=False)]


duplicate_titles['movie_title']

2                 b'Fly Away Home (1996)'
154               b'Sliding Doors (1998)'
215              b'That Darn Cat! (1997)'
234            b'Butcher Boy, The (1998)'
335              b'Ice Storm, The (1997)'
344           b'Hurricane Streets (1998)'
436          b'Kull the Conqueror (1997)'
457       b'Chairman of the Board (1998)'
516                 b"Ulee's Gold (1997)"
527     b'Designated Mourner, The (1997)'
536       b'Chairman of the Board (1998)'
540                    b'Deceiver (1997)'
557            b'Butcher Boy, The (1998)'
599          b'Desperate Measures (1998)'
602      b'Substance of Fire, The (1996)'
626                  b'Nightwatch (1997)'
651                 b'Chasing Amy (1997)'
657              b'Body Snatchers (1993)'
719      b'Substance of Fire, The (1996)'
731                  b'Nightwatch (1997)'
740                   b'Hugo Pool (1997)'
768                 b'Chasing Amy (1997)'
793                 b"Ulee's Gold (1997)"
820          b'Kull the Conqueror 

In [16]:
duplicates_list = duplicate_titles['movie_title'].unique()
duplicates_list

array([b'Fly Away Home (1996)', b'Sliding Doors (1998)',
       b'That Darn Cat! (1997)', b'Butcher Boy, The (1998)',
       b'Ice Storm, The (1997)', b'Hurricane Streets (1998)',
       b'Kull the Conqueror (1997)', b'Chairman of the Board (1998)',
       b"Ulee's Gold (1997)", b'Designated Mourner, The (1997)',
       b'Deceiver (1997)', b'Desperate Measures (1998)',
       b'Substance of Fire, The (1996)', b'Nightwatch (1997)',
       b'Chasing Amy (1997)', b'Body Snatchers (1993)',
       b'Hugo Pool (1997)', b'Money Talks (1997)'], dtype=object)

In [18]:
len(duplicates_list)

18

# Preprocessing prototype (TF)

In [57]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd

# Load the movielens dataset
movies = tfds.load("movielens/100k-movies", split="train")

# Convert the TensorFlow Dataset to a pandas DataFrame
df = tfds.as_dataframe(movies)

# Remove duplicate rows based on the 'movie_title' column
df_unique = df.drop_duplicates(subset='movie_title', keep='first')

# Step 1: Handle the `movie_genres` column using `tf.ragged.constant`
movie_genres_ragged = tf.ragged.constant(df_unique['movie_genres'].tolist())

# Step 2: Convert other columns (non-list) to TensorFlow tensors
df_unique_no_genres = df_unique.drop(columns=['movie_genres'])

def pandas_to_tf_dataset(dataframe, genres_ragged):
    dict_data = {}
    
    # Convert non-list columns to TensorFlow tensors
    for col in dataframe.columns:
        dict_data[col] = tf.convert_to_tensor(dataframe[col].values)
    
    # Add the `movie_genres` as a ragged tensor
    dict_data['movie_genres'] = genres_ragged
    
    return tf.data.Dataset.from_tensor_slices(dict_data)

# Create the dataset
movies_unique = pandas_to_tf_dataset(df_unique_no_genres, movie_genres_ragged)

# Inspect the first few unique movies
for movie in movies_unique.take(5):
    print(movie)


{'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1681'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>, 'movie_genres': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([4], dtype=int32)>}
{'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1457'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Love Is All There Is (1996)'>, 'movie_genres': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([4, 7], dtype=int32)>}
{'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'500'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Fly Away Home (1996)'>, 'movie_genres': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 3], dtype=int32)>}
{'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'838'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'In the Line of Duty 2 (1987)'>, 'movie_genres': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>}
{'movie_id': <tf.Tensor: shape=(), dtype=string, n

In [58]:
df_test = tfds.as_dataframe(movies_unique)

In [59]:
# Identify duplicate rows in movie_title. With keep=False it returns all duplicates as TRUE and filters to return only those
duplicate_titles_test = df_test[df_test['movie_title'].duplicated(keep=False)]


duplicate_titles_test['movie_title']

Series([], Name: movie_title, dtype: object)

## Comparision

## Length of tf ds

In [61]:
len(movies)

1682

In [60]:
len(movies_unique)

1664

In [65]:
# Inspect movies tf ds (without manipulation)
for movie in movies.take(5):
    print(movie)

{'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>, 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1681'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>}
{'movie_genres': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([4, 7])>, 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1457'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Love Is All There Is (1996)'>}
{'movie_genres': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([1, 3])>, 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'500'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Fly Away Home (1996)'>}
{'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>, 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'838'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'In the Line of Duty 2 (1987)'>}
{'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([7])>, 'movie_id': <tf.Tensor: shape

In [66]:
# Inspect movies tf ds (after manipulation)
for movie in movies_unique.take(5):
    print(movie)

{'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1681'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>, 'movie_genres': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([4], dtype=int32)>}
{'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1457'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Love Is All There Is (1996)'>, 'movie_genres': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([4, 7], dtype=int32)>}
{'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'500'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Fly Away Home (1996)'>, 'movie_genres': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 3], dtype=int32)>}
{'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'838'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'In the Line of Duty 2 (1987)'>, 'movie_genres': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>}
{'movie_id': <tf.Tensor: shape=(), dtype=string, n