In [1]:
# Importing libraries
import os
from ast import literal_eval
from urllib.parse import quote_plus

import joblib
import numpy as np
import pandas as pd
import psycopg2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### DB Setup

In [2]:
%%capture --no-display capture_output
# Loading SQL
%load_ext sql

In [3]:
%%capture --no-display capture_output
# Loading schemadisplay
%load_ext schemadisplay_magic

In [4]:
%%capture --no-display capture_output
# Configuring SQL so it will load files using Pandas
%config SqlMagic.autopandas=True

In [5]:
%%capture --no-display capture_output
# Register a psycopg2 typecaster so DECIMAL values come back as Python floats
# rather than decimals; a None value is passed through unchanged.
DEC2FLOAT = psycopg2.extensions.new_type(
    psycopg2.extensions.DECIMAL.values,
    'DEC2FLOAT',
    lambda value, curs: None if value is None else float(value),
)
psycopg2.extensions.register_type(DEC2FLOAT)

In [6]:
# Database connection settings.
# Every value can be overridden through an environment variable of the same
# name, so real credentials never have to be stored in the notebook itself.
# The fallbacks are the values this notebook previously hard-coded.
DB_ENGINE = os.environ.get('DB_ENGINE', 'postgresql')
DB_HOST = os.environ.get('DB_HOST', 'db')
DB_PORT = int(os.environ.get('DB_PORT', '5432'))  # kept as an int, as before
DB_NAME = os.environ.get('DB_NAME', 'recommenderdb')
DB_USER = os.environ.get('DB_USER', 'postgres')
DB_PWD = os.environ.get('DB_PWD', 'letmein')  # fallback only; set DB_PWD for real deployments

In [7]:
# Creating connection string.
# The user name and password are percent-encoded so that reserved URL
# characters in them (for example '@', ':' or '/') cannot corrupt the URL.
DB_ML_SETUP_CONNECTION = '{engine}://{user}:{pwd}@{host}:{port}/{name}'.format(
    engine=DB_ENGINE,
    user=quote_plus(DB_USER),
    pwd=quote_plus(DB_PWD),
    host=DB_HOST,
    port=DB_PORT,
    name=DB_NAME,
)

In [8]:
# Report the connection target with the password masked, so the secret does
# not end up in the saved notebook output.
print("Connecting with connection string : {engine}://{user}:***@{host}:{port}/{name}".format(
    engine=DB_ENGINE, user=DB_USER, host=DB_HOST, port=DB_PORT, name=DB_NAME))

%sql $DB_ML_SETUP_CONNECTION

Connecting with connection string : postgresql://postgres:letmein@db:5432/recommenderdb


In [9]:
%%sql

SELECT version();

 * postgresql://postgres:***@db:5432/recommenderdb
1 rows affected.


Unnamed: 0,version
0,PostgreSQL 13.4 (Debian 13.4-4.pgdg110+1) on x...


In [10]:
%%sql

SHOW search_path;

 * postgresql://postgres:***@db:5432/recommenderdb
1 rows affected.


Unnamed: 0,search_path
0,"""$user"", public"


In [11]:
%%sql

-- List the tables that live in the recommender schema.
-- (A separate "schemaname != 'pg_catalog'" test is unnecessary: the equality
-- below already excludes every other schema.)
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname = 'recommender';

 * postgresql://postgres:***@db:5432/recommenderdb
3 rows affected.


Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,recommender,credits,postgres,,True,False,True,False
1,recommender,movies_metadata,postgres,,True,False,True,False
2,recommender,keywords,postgres,,True,False,True,False


### Movies Metadata

In [12]:
%%sql movies_metadata <<

SELECT * FROM recommender.movies_metadata;

 * postgresql://postgres:***@db:5432/recommenderdb
34953 rows affected.
Returning data to local variable movies_metadata


In [13]:
movies_metadata.sample()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
14450,False,,0.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,56619.0,tt0098481,cs,Tma/Svetlo/Tma,A human body gradually reconstructs itself as ...,...,1989-01-01,$0.00,8.0,"[{'iso_639_1': 'cs', 'name': 'Český'}]",Released,,Darkness/Light/Darkness,False,7.4,28.0


In [14]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [15]:
movies_metadata.shape

(34953, 24)

In [16]:
movies_metadata.dtypes

adult                       bool
belongs_to_collection     object
budget                   float64
genres                    object
homepage                  object
id                       float64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                   object
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

### Keywords

In [17]:
%%sql keywords <<

SELECT * FROM recommender.keywords;

 * postgresql://postgres:***@db:5432/recommenderdb
34946 rows affected.
Returning data to local variable keywords


In [18]:
keywords.dtypes

id          float64
keywords     object
dtype: object

In [19]:
keywords.sample()

Unnamed: 0,id,keywords
387,10449.0,"[{'id': 1157, 'name': 'wife husband relationsh..."


### Credits

In [20]:
%%sql credits <<

SELECT * FROM recommender.credits;

 * postgresql://postgres:***@db:5432/recommenderdb
34932 rows affected.
Returning data to local variable credits


In [21]:
credits.dtypes

cast     object
crew     object
id      float64
dtype: object

In [22]:
credits.sample()

Unnamed: 0,cast,crew,id
27500,[],"[{'credit_id': '52fe4a859251416c750e4e6b', 'de...",140149.0


### Make soup out of metadata

In [23]:
# Combine metadata, credits and keywords into one frame, matching rows on 'id'.
df = (
    movies_metadata
    .merge(credits, on='id')
    .merge(keywords, on='id')
)

In [24]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords'],
      dtype='object')

In [25]:
# Parse the stringified Python literals in these columns into real objects.
# Only string values are parsed; anything else is left untouched, so running
# this cell a second time (when the columns already hold parsed objects) does
# not raise.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [26]:
df.sample()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
5074,False,,0.0,"[{'id': 10752, 'name': 'War'}, {'id': 28, 'nam...",,8737.0,tt0060218,en,Cast a Giant Shadow,An American Army officer is recruited by the y...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Outnumbered - unarmed - unprepared - they stun...,Cast a Giant Shadow,False,5.9,8.0,"[{'cast_id': 1, 'character': 'Col. David 'Mick...","[{'credit_id': '55c89cd79251415cf40004f2', 'de...","[{'id': 536, 'name': 'israel'}, {'id': 5907, '..."


In [27]:
df.iloc[6567]['crew'][0]

{'credit_id': '52fe4453c3a36847f808f4a5',
 'department': 'Directing',
 'gender': 0,
 'id': 17268,
 'job': 'Director',
 'name': 'Ronny Yu',
 'profile_path': '/uOCTwD8l6WDvo1kMAEvnF5pBnAH.jpg'}

In [28]:
type(df.iloc[6567]['crew'][0])

dict

In [29]:
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Crew entries, each expected to carry 'job' and 'name' keys.
        Any value that is not a list or tuple (for example None or NaN for a
        movie without crew data) is treated as having no crew.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when no director is found.
    """
    if not isinstance(x, (list, tuple)):
        return np.nan
    for crew_member in x:
        # Skip malformed entries instead of failing on a missing 'job' key.
        if isinstance(crew_member, dict) and crew_member.get('job') == 'Director':
            return crew_member.get('name', np.nan)
    return np.nan

In [30]:
# Derive the 'director' column by running get_director over each row's crew value.
df['director'] = df['crew'].apply(get_director)

In [31]:
def generate_list(x):
    """Collect the 'name' value of every element of a list.

    Returns a new list holding each element's 'name' when ``x`` is a list;
    any other kind of input yields an empty list.
    """
    if not isinstance(x, list):
        return []

    collected = []
    for entry in x:
        collected.append(entry['name'])
    return collected

In [32]:
# Reduce each of these columns to a plain list of names via generate_list.
for column in ('cast', 'keywords', 'genres'):
    df[column] = df[column].apply(generate_list)

In [33]:
df[['title', 'cast', 'director', 'keywords', 'genres']].sample()

Unnamed: 0,title,cast,director,keywords,genres
32256,Right on Track,"[Beverley Mitchell, Brie Larson, Jon Lindstrom...",Duwayne Dunham,"[sister sister relationship, biography, tv mov...","[Drama, Family, TV Movie]"


In [34]:
# Sanitize data to prevent ambiguity: spaces are removed and text is lowercased
def sanitize(x):
    """Collapse names into lowercase tokens without spaces.

    A list is processed element by element and returned as a new list, a
    single string is returned in collapsed form, and any other value (for
    example the NaN that marks a missing director) becomes ''.
    """
    if isinstance(x, list):
        return [entry.replace(" ", "").lower() for entry in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: nothing usable to sanitize
    return ''

In [35]:
# Run sanitize over every text feature that goes into the soup.
text_features = ('cast', 'director', 'genres', 'keywords')
for feature in text_features:
    df[feature] = df[feature].apply(sanitize)

In [None]:
# Order the rows by release date, newest first.
df = df.sort_values(by='release_date', ascending=False)

In [None]:
df.head()

In [None]:
df.tail()

In [36]:
df[['id', 'title', 'cast', 'director', 'keywords', 'genres']].sample()

Unnamed: 0,id,title,cast,director,keywords,genres
5669,34697.0,Butterfly,"[stacykeach, piazadora, orsonwelles, edmcmahon...",mattcimber,"[nudity, seduction]","[crime, drama]"


In [37]:
# Build the "soup" string out of the desired metadata of one movie
def create_soup(x):
    """Join keywords, cast, director and genres into one space-separated string.

    ``x`` is indexed by the keys 'keywords', 'cast', 'director' and 'genres';
    'director' must be a string and the other three iterables of strings.
    The four sections always appear in that order, separated by single
    spaces, so an empty section leaves an extra space behind.
    """
    sections = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(sections)

In [38]:
# Create the new soup feature: create_soup is applied to each row (axis=1)
# and its return value is stored in the 'soup' column
df['soup'] = df.apply(create_soup, axis=1)

In [39]:
df.iloc[26643]['soup']

' samuelhui karlmaka lesliecheung ninalichi conanlee hachi-jan luyan waltertsotat-wah ellenchan dannyleesau-yin roycheung mariacordero deborahgrant markhoughton melvinwong fennieyuen liuchia-liang action'

In [40]:
# Token-count vectorizer for the soup text, configured with scikit-learn's 'english' stop-word setting.
count = CountVectorizer(stop_words='english')

In [41]:
# Fit the vectorizer on the soup column and transform it into a count matrix (one row per df row).
count_matrix = count.fit_transform(df['soup'])

In [42]:
# Convert the counts to 32-bit floats before the similarity computation.
count_matrix = count_matrix.astype(np.float32)

In [43]:
# Cosine similarity of every row of count_matrix against every other row.
# NOTE(review): the result has one row and one column per movie, so its size
# grows quadratically with the number of movies - confirm it fits in memory.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [44]:
# Give df a fresh positional index; with the default arguments the previous index is kept as a column.
df = df.reset_index()

In [45]:
# Reverse lookup: movie title -> index value of that movie in df.
# NOTE(review): nothing here enforces unique titles, so a lookup may return several values - verify.
indices = pd.Series(df.index, index=df['title'])

In [46]:
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the movie to find recommendations for.
    cosine_sim : 2-D array-like
        Pairwise similarity scores; row ``i`` holds the scores of movie ``i``
        against every movie.
    df : pandas.DataFrame
        Frame with a 'title' column, positionally aligned with ``cosine_sim``.
    indices : pandas.Series
        Maps a title to the position of that movie in ``df`` / ``cosine_sim``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Obtain the index of the movie that matches the title
    idx = indices[title]
    # Several movies can share one title, in which case the lookup yields a
    # Series of positions rather than a scalar; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Rank positions by descending similarity. The stable sort keeps tied
    # scores in ascending position order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the queried movie explicitly instead of assuming it sorts first:
    # another movie with an identical soup ties with it at the top score.
    ranked = ranked[ranked != idx]

    # Get the movie indices of the top_n most similar movies
    movie_indices = ranked[:top_n]

    # Return the titles of the most similar movies
    return df['title'].iloc[movie_indices]

In [47]:
content_recommender('The Lion King')

9480                          The Lion King 1½
9245            The Lion King 2: Simba's Pride
29376                              Cheburashka
31257            Tom and Jerry: The Magic Ring
27580                     The Little Matchgirl
24527                      The Seventh Brother
29004                          Superstar Goofy
30041                                  My Love
30966    Pokémon: Arceus and the Jewel of Life
32271                   Puff, the Magic Dragon
Name: title, dtype: object

In [48]:
# Persist the similarity matrix so it can be loaded later without recomputing it.
joblib_file = "mlmodels/cosine_similarity_model.pkl"
# Create the target directory first; dumping into a missing directory fails.
os.makedirs(os.path.dirname(joblib_file), exist_ok=True)
joblib.dump(cosine_sim, joblib_file)

['mlmodels/cosine_similarity_model.pkl']