In [1]:
# Packages
import json
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer, TFBertModel
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read in data
credits = pd.read_csv('data/tmdb_5000_credits.csv')
movies = pd.read_csv('data/tmdb_5000_movies.csv')
movies = movies.merge(credits, on='title')
movies = movies.dropna()

In [3]:
# Cleaning data
def col_clean(c, subkey='name'):
    ls =[]
    c = json.loads(c)
    for genre in c:
        ls.append(genre[subkey])
    return ls


movies['genres'] = movies['genres'].apply(col_clean)
movies['keywords'] = movies['keywords'].apply(col_clean)
movies['production_companies'] = movies['production_companies'].apply(col_clean)
movies['production_countries'] = movies['production_countries'].apply(col_clean)
movies['spoken_languages'] = movies['spoken_languages'].apply(col_clean)
movies['characters'] = movies['cast'].apply(col_clean, subkey='character')
movies['cast'] = movies['cast'].apply(col_clean)

In [4]:
movies['keywords_strings'] = [' '.join(i) for i in movies['keywords']]
movies['genres_strings'] = [' '.join(i) for i in movies['genres']]
movies['characters_strings'] = [' '.join(i[:10]) for i in movies['characters']]
movies['cast_strings'] = [' '.join(i[:10]) for i in movies['cast']]

In [5]:
movies['content'] = movies['keywords_strings'] + ' ' + movies['overview'] + ' ' + movies['genres_strings'] + ' ' + movies['characters_strings'] + ' ' + movies['cast_strings']

In [6]:
content = movies['content']

In [7]:
content

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6 bri...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4779    salesclerk loser aftercreditsstinger Convenien...
4787    dating divorce sex scene sex comedy anti roman...
4797    home invasion Recently dumped by his girlfiren...
4802    distrust garage identity crisis time travel ti...
4807     When ambitious New York attorney Sam is sent ...
Name: content, Length: 1494, dtype: object

In [8]:
# Step 2: Extract features using BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

batch_size = 16  # reduce batch size to avoid OOM error
max_length = 128 # set maximum sequence length
encoded_content = []
for i in range(0, len(content), batch_size):
    batch = content[i:i+batch_size]
    encoded_batch = tokenizer.batch_encode_plus(batch, add_special_tokens=True, truncation=True, max_length=max_length, padding='longest', return_tensors='tf')
    encoded_content.append(encoded_batch)

embeddings = []
for batch in encoded_content:
    input_ids = batch['input_ids']
    token_type_ids = batch['token_type_ids']
    embeddings_batch = model(input_ids, token_type_ids=token_type_ids)[0][:, 0, :]
    embeddings.append(embeddings_batch)

embeddings = tf.concat(embeddings, axis=0).numpy()


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [9]:
# Step 3: Use PCA for feature reduction
pca = PCA(n_components=100)
pca_embeddings = pca.fit_transform(embeddings)

In [10]:
# Step 4: Calculate similarity scores
similarity_matrix = cosine_similarity(pca_embeddings)


In [34]:
# Step 5: Build the recommender system 299, 78, 69, 14
from random import randint
content_id = randint(0,len(movies))
num_recommendations = 10
similar_content_ids = np.argsort(-similarity_matrix[content_id])[:num_recommendations+1]
similar_content_ids = similar_content_ids[similar_content_ids != content_id]

base_movie = movies.iloc[content_id]['title']
print(f'Content_id:{content_id}')
print(f'Recommendations similar to {base_movie}:')
print('')
for i in similar_content_ids:
    print('- ' + movies.iloc[i]['title'])

Content_id:14
Recommendations similar to The Avengers:

- Avengers: Age of Ultron
- X-Men: Days of Future Past
- Ant-Man
- X-Men: Apocalypse
- Suicide Squad
- Man of Steel
- Captain America: The First Avenger
- Batman v Superman: Dawn of Justice
- Captain America: The Winter Soldier
- Fantastic Four
