In [235]:
import pandas as pd
from ast import literal_eval
from tqdm import tqdm
from abc import ABC, abstractmethod
from typing import List

In [202]:
tqdm.pandas()

# Data exploration for content-based movies recommendations

## 1. Data loading and processing

### Ratings

In [203]:
ratings = pd.read_csv("../data/ratings_small.csv")

In [204]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [236]:
ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

### Movies

#### Metadata

In [205]:
metadata = pd.read_csv("../data/movies_metadata.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [206]:
metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [207]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [208]:
# drop 3 incorrect rows
metadata = metadata.drop([19730, 29503, 35587])

# genres column
metadata['genres'] = metadata.progress_apply(lambda row: [x['name'] for x in literal_eval(row['genres'])], axis=1)

# collection column
metadata['belongs_to_collection'].fillna('{\'name\': \'\'}', inplace=True)
metadata['collection'] = metadata.progress_apply(lambda row: literal_eval(row['belongs_to_collection'])['name'], axis=1)

# select interesting columns
col = ['id', 'title', 'collection', 'overview', 'genres', 'runtime', 'vote_average', 'vote_count', 'release_date']
metadata = metadata[col]

100%|██████████| 45463/45463 [00:01<00:00, 24443.88it/s]
100%|██████████| 45463/45463 [00:01<00:00, 40950.07it/s]


In [209]:
metadata.head()

Unnamed: 0,id,title,collection,overview,genres,runtime,vote_average,vote_count,release_date
0,862,Toy Story,Toy Story Collection,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",81.0,7.7,5415.0,1995-10-30
1,8844,Jumanji,,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995-12-15
2,15602,Grumpier Old Men,Grumpy Old Men Collection,A family wedding reignites the ancient feud be...,"[Romance, Comedy]",101.0,6.5,92.0,1995-12-22
3,31357,Waiting to Exhale,,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",127.0,6.1,34.0,1995-12-22
4,11862,Father of the Bride Part II,Father of the Bride Collection,Just when George Banks has recovered from his ...,[Comedy],106.0,5.7,173.0,1995-02-10


#### credits

In [210]:
creds = pd.read_csv('./../data/credits.csv')

In [211]:
creds.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [212]:
creds['cast'] = creds.progress_apply(lambda row: [x['name'] for x in literal_eval(row['cast'])[0:10]], axis=1)
creds['crew'] = creds.progress_apply(lambda row: [x['name'] for x in literal_eval(row['crew'])[0:10]], axis=1)

100%|██████████| 45476/45476 [00:19<00:00, 2349.48it/s]
100%|██████████| 45476/45476 [00:13<00:00, 3330.97it/s]


In [213]:
creds.head()

Unnamed: 0,cast,crew,id
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...",862
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",8844
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...",15602
3,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...",31357
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...",11862


#### keywords

In [214]:
keywords = pd.read_csv('../data/keywords.csv')

In [215]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [216]:
keywords['keywords'] = keywords.progress_apply(lambda row: [x['name'] for x in literal_eval(row['keywords'])[0:10]], axis=1)

100%|██████████| 46419/46419 [00:02<00:00, 18290.29it/s]


In [217]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."


#### Merge data

In [226]:
metadata['id'] = metadata['id'].astype('float64')
movies = pd.merge(metadata, creds, on='id')
movies = pd.merge(movies, keywords, on='id').dropna()

In [228]:
movies.head()

Unnamed: 0,id,title,collection,overview,genres,runtime,vote_average,vote_count,release_date,cast,crew,keywords
0,862.0,Toy Story,Toy Story Collection,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",81.0,7.7,5415.0,1995-10-30,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...","[jealousy, toy, boy, friendship, friends, riva..."
1,8844.0,Jumanji,,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995-12-15,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...","[board game, disappearance, based on children'..."
2,15602.0,Grumpier Old Men,Grumpy Old Men Collection,A family wedding reignites the ancient feud be...,"[Romance, Comedy]",101.0,6.5,92.0,1995-12-22,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...","[fishing, best friend, duringcreditsstinger, o..."
3,31357.0,Waiting to Exhale,,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",127.0,6.1,34.0,1995-12-22,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...","[based on novel, interracial relationship, sin..."
4,11862.0,Father of the Bride Part II,Father of the Bride Collection,Just when George Banks has recovered from his ...,[Comedy],106.0,5.7,173.0,1995-02-10,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...","[baby, midlife crisis, confidence, aging, daug..."


In [227]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45558 entries, 0 to 46627
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            45558 non-null  float64
 1   title         45558 non-null  object 
 2   collection    45558 non-null  object 
 3   overview      45558 non-null  object 
 4   genres        45558 non-null  object 
 5   runtime       45558 non-null  float64
 6   vote_average  45558 non-null  float64
 7   vote_count    45558 non-null  float64
 8   release_date  45558 non-null  object 
 9   cast          45558 non-null  object 
 10  crew          45558 non-null  object 
 11  keywords      45558 non-null  object 
dtypes: float64(4), object(8)
memory usage: 4.5+ MB


## 2. Validation framework

In [238]:
class binary_recommender(ABC):

    def __init__(self):
        pass

    @abstractmethod
    def recommend(
        self,
        user_ratings: pd.DataFrame, # columns: movie_id, rating
        movies_data: pd.DataFrame = movies
    ) -> List[float]:
    # returns list of recommended movies ids
        pass

    def validate(
        self
    ):
        pass