<a href="https://colab.research.google.com/github/mmaghajani/recommender-with-tf-sample/blob/main/recomm_movielens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## MovieLens Recommender System

In [1]:
!pip install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 3.9MB 32.1MB/s 
[?25h

In [4]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [3]:
# Load the 'train' split of MovieLens 100k together with its DatasetInfo
# metadata (with_info=True makes tfds.load return a (dataset, info) pair).
_loaded = tfds.load(name='movielens/100k-ratings', split='train', with_info=True)
ratings_dataset, ratings_datasets_info = _loaded

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…






HBox(children=(FloatProgress(value=0.0, description='Generating splits...', max=1.0, style=ProgressStyle(descr…

HBox(children=(FloatProgress(value=0.0, description='Generating train examples...', max=100000.0, style=Progre…

HBox(children=(FloatProgress(value=0.0, description='Shuffling movielens-train.tfrecord...', max=100000.0, sty…

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m


In [5]:
# Fail fast, with a readable message, if the loaded object is not a
# tf.data.Dataset; the cells below call Dataset methods (take, map, shuffle).
assert isinstance(ratings_dataset, tf.data.Dataset), (
    f'expected a tf.data.Dataset, got {type(ratings_dataset).__name__}'
)

In [6]:
# Number of examples in the loaded 'train' split.
len(ratings_dataset)

100000

In [10]:
# Keep a five-example slice around for quick inspection in the next cells.
ratings_dataset_head = ratings_dataset.take(5)

# Print each example as a plain dict of NumPy values.
head_examples = ratings_dataset_head.as_numpy_iterator()
for example in head_examples:
    print(example)

{'bucketized_user_age': 45.0, 'movie_genres': array([7]), 'movie_id': b'357', 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)", 'raw_user_age': 46.0, 'timestamp': 879024327, 'user_gender': True, 'user_id': b'138', 'user_occupation_label': 4, 'user_occupation_text': b'doctor', 'user_rating': 4.0, 'user_zip_code': b'53211'}
{'bucketized_user_age': 25.0, 'movie_genres': array([ 4, 14]), 'movie_id': b'709', 'movie_title': b'Strictly Ballroom (1992)', 'raw_user_age': 32.0, 'timestamp': 875654590, 'user_gender': True, 'user_id': b'92', 'user_occupation_label': 5, 'user_occupation_text': b'entertainment', 'user_rating': 2.0, 'user_zip_code': b'80525'}
{'bucketized_user_age': 18.0, 'movie_genres': array([4]), 'movie_id': b'412', 'movie_title': b'Very Brady Sequel, A (1996)', 'raw_user_age': 24.0, 'timestamp': 882075110, 'user_gender': True, 'user_id': b'301', 'user_occupation_label': 17, 'user_occupation_text': b'student', 'user_rating': 4.0, 'user_zip_code': b'55439'}
{'bucketized_use

In [9]:
# Confirm that take(5) produced a five-element dataset.
len(ratings_dataset_head)

5

In [11]:
# Render the five-example slice as a DataFrame; passing the DatasetInfo lets
# tfds decorate the columns using the dataset's feature metadata.
tfds.as_dataframe(ratings_dataset_head, ratings_datasets_info)

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,raw_user_age,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,45.0,7 (Drama),b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",46.0,879024327,True,b'138',4 (doctor/health care),b'doctor',4.0,b'53211'
1,25.0,4 (Comedy) 14 (Romance),b'709',b'Strictly Ballroom (1992)',32.0,875654590,True,b'92',5 (entertainment),b'entertainment',2.0,b'80525'
2,18.0,4 (Comedy),b'412',"b'Very Brady Sequel, A (1996)'",24.0,882075110,True,b'301',17 (student),b'student',4.0,b'55439'
3,50.0,5 (Crime) 7 (Drama),b'56',b'Pulp Fiction (1994)',50.0,883326919,True,b'60',4 (doctor/health care),b'healthcare',4.0,b'06472'
4,50.0,10 (Horror) 16 (Thriller),b'895',b'Scream 2 (1997)',55.0,891409199,True,b'197',18 (technician/engineer),b'technician',3.0,b'75094'


### Feature Selection

In [12]:
def _select_rating_features(rating):
    """Project one rating example down to the five fields this notebook keeps.

    Args:
        rating: a dict-like example from the ratings dataset.

    Returns:
        A dict with only 'user_id', 'movie_id', 'movie_title',
        'user_rating' and 'timestamp'.
    """
    return dict(
        user_id=rating['user_id'],
        movie_id=rating['movie_id'],
        movie_title=rating['movie_title'],
        user_rating=rating['user_rating'],
        timestamp=rating['timestamp'],
    )


# Drop every feature except the selected ones.
ratings_dataset = ratings_dataset.map(_select_rating_features)

# Preview the first five reduced examples.
ratings_preview = ratings_dataset.take(5)
tfds.as_dataframe(ds=ratings_preview, ds_info=ratings_datasets_info)

Unnamed: 0,movie_id,movie_title,timestamp,user_id,user_rating
0,b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",879024327,b'138',4.0
1,b'709',b'Strictly Ballroom (1992)',875654590,b'92',2.0
2,b'412',"b'Very Brady Sequel, A (1996)'",882075110,b'301',4.0
3,b'56',b'Pulp Fiction (1994)',883326919,b'60',4.0
4,b'895',b'Scream 2 (1997)',891409199,b'197',3.0


### What is Dataset Metadata?

In [13]:
# Display the DatasetInfo object returned by tfds.load(..., with_info=True).
ratings_datasets_info

tfds.core.DatasetInfo(
    name='movielens',
    full_name='movielens/100k-ratings/0.1.0',
    description="""
    This dataset contains a set of movie ratings from the MovieLens website, a movie
    recommendation service. This dataset was collected and maintained by [GroupLens]
    (https://grouplens.org/), a research group at the University of Minnesota. There
    are 5 versions included: "25m", "latest-small", "100k", "1m", "20m". In all
    datasets, the movies data and ratings data are joined on "movieId". The 25m
    dataset, latest-small dataset, and 20m dataset contain only movie data and
    rating data. The 1m dataset and 100k dataset contain demographic data in
    addition to movie and rating data.
    
    - "25m": This is the latest stable version of the MovieLens dataset. It is
    recommended for research purposes.
    - "latest-small": This is a small subset of the latest version of the MovieLens
    dataset. It is changed and updated over time by GroupLens.
    - "10

In [16]:
# Example count of the 'train' split as recorded in the dataset metadata.
ratings_datasets_info.splits['train'].num_examples

100000

### Train/Test Split

In [17]:
# Shuffle the ratings once, deterministically, before splitting them.
SEED = 42
tf.random.set_seed(SEED)

# Size the shuffle buffer from the dataset metadata instead of hard-coding
# 100_000: a buffer covering the whole dataset gives a full shuffle, and this
# stays true if a different MovieLens variant is loaded above.
shuffle_buffer_size = ratings_datasets_info.splits['train'].num_examples

ratings_dataset_shuffled = ratings_dataset.shuffle(
    buffer_size=shuffle_buffer_size,
    seed=SEED,
    # Keep the shuffled order fixed across iterations so that take()/skip()
    # on this dataset select disjoint, stable subsets.
    reshuffle_each_iteration=False,
)

In [18]:
# The first NUM_TRAIN_EXAMPLES shuffled ratings form the training set;
# everything after them forms the test set.
NUM_TRAIN_EXAMPLES = 80_000

ratings_trainset = ratings_dataset_shuffled.take(NUM_TRAIN_EXAMPLES)
ratings_testset = ratings_dataset_shuffled.skip(NUM_TRAIN_EXAMPLES)

In [20]:
# Sizes of the train and test subsets, shown as a (train, test) tuple.
len(ratings_trainset), len(ratings_testset)

(80000, 20000)