In [1]:
# Importing libraries
import os
import pickle
from ast import literal_eval

import blosc
import numpy as np
import pandas as pd
import psycopg2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### DB Setup

In [2]:
%%capture --no-display capture_output
# Loading SQL
%load_ext sql

In [3]:
%%capture --no-display capture_output
# Loading schemadisplay
%load_ext schemadisplay_magic

In [4]:
%%capture --no-display capture_output
# Configuring SQL so it will load files using Pandas
%config SqlMagic.autopandas=True

In [5]:
%%capture --no-display capture_output
# Set to return floats rather than decimals
# Build a psycopg2 typecaster that covers the same type OIDs as the built-in
# DECIMAL caster: every non-NULL value is converted with float(), while NULL
# is passed through as None.
DEC2FLOAT = psycopg2.extensions.new_type(
    psycopg2.extensions.DECIMAL.values,
    'DEC2FLOAT',
    lambda value, curs: float(value) if value is not None else None)
# Register the typecaster; no connection/cursor scope is passed here.
psycopg2.extensions.register_type(DEC2FLOAT)

In [6]:
# Defining local variables
# Connection settings are read from environment variables so that credentials
# do not have to be stored in the notebook. Each fallback is the value this
# notebook previously hard-coded, so it keeps working unchanged when nothing
# is set in the environment.
DB_ENGINE = os.environ.get('DB_ENGINE', 'postgresql')
DB_HOST = os.environ.get('DB_HOST', 'db')
# Environment values are always strings; keep the port an int as before.
DB_PORT = int(os.environ.get('DB_PORT', '5432'))
DB_NAME = os.environ.get('DB_NAME', 'recommenderdb')
DB_USER = os.environ.get('DB_USER', 'postgres')
# NOTE(review): the fallback password should only ever match a local
# development database; set DB_PWD explicitly everywhere else.
DB_PWD = os.environ.get('DB_PWD', 'letmein')

In [7]:
# Creating connection string
# URL layout: engine://user:password@host:port/database
DB_ML_SETUP_CONNECTION = f'{DB_ENGINE}://{DB_USER}:{DB_PWD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'

In [8]:
print("Connecting with connection string : {}".format(DB_ML_SETUP_CONNECTION))

%sql $DB_ML_SETUP_CONNECTION

Connecting with connection string : postgresql://postgres:letmein@db:5432/recommenderdb


In [9]:
%%sql

SELECT version();

 * postgresql://postgres:***@db:5432/recommenderdb
1 rows affected.


Unnamed: 0,version
0,PostgreSQL 13.4 (Debian 13.4-4.pgdg110+1) on x...


In [10]:
%%sql

SHOW search_path;

 * postgresql://postgres:***@db:5432/recommenderdb
1 rows affected.


Unnamed: 0,search_path
0,"""$user"", public"


In [11]:
%%sql

SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname = 'recommender';

 * postgresql://postgres:***@db:5432/recommenderdb
3 rows affected.


Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,recommender,keywords,postgres,,True,False,True,False
1,recommender,movies_metadata,postgres,,True,False,True,False
2,recommender,credits,postgres,,True,False,True,False


### Prepare Movies Metadata

In [12]:
%%sql movies_metadata <<

SELECT * FROM recommender.movies_metadata;

 * postgresql://postgres:***@db:5432/recommenderdb
19984 rows affected.
Returning data to local variable movies_metadata


In [13]:
movies_metadata.sample()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
12374,False,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.thebandsvisit.com,5259.0,tt1032856,he,ביקור התזמורת,Once-not long ago-a small Egyptian police band...,...,2007-05-19,$0.00,83.0,"[{'iso_639_1': 'ar', 'name': 'العربية'}, {'iso...",Released,,The Band's Visit,False,7.0,40.0


In [14]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [15]:
movies_metadata.shape

(19984, 24)

In [16]:
movies_metadata.dtypes

adult                       bool
belongs_to_collection     object
budget                   float64
genres                    object
homepage                  object
id                       float64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                   object
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

### Prepare Keywords

In [17]:
%%sql keywords <<

SELECT * FROM recommender.keywords;

 * postgresql://postgres:***@db:5432/recommenderdb
19964 rows affected.
Returning data to local variable keywords


In [18]:
keywords.dtypes

id          float64
keywords     object
dtype: object

In [19]:
keywords.sample()

Unnamed: 0,id,keywords
9267,23985.0,[]


### Prepare Credits

In [20]:
%%sql credits <<

SELECT * FROM recommender.credits;

 * postgresql://postgres:***@db:5432/recommenderdb
19964 rows affected.
Returning data to local variable credits


In [21]:
credits.dtypes

cast     object
crew     object
id      float64
dtype: object

In [22]:
credits.sample()

Unnamed: 0,cast,crew,id
331,"[{'cast_id': 8, 'character': 'Juliet Miller', ...","[{'credit_id': '52fe4545c3a36847f80c4b43', 'de...",9905.0


### Make soup out of metadata

In [23]:
df = movies_metadata.merge(credits, on='id')

In [24]:
df = df.merge(keywords, on='id')

In [25]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords'],
      dtype='object')

In [26]:
features = ['cast', 'crew', 'keywords', 'genres']


def _parse_literal(value):
    """Turn a stringified Python literal into the object it describes.

    Only strings are handed to ast.literal_eval. Any other value (a missing
    value, or a list produced by an earlier run of this cell) is returned
    unchanged, so the cell neither fails on missing data nor on re-execution.

    Raises:
        ValueError / SyntaxError: propagated from literal_eval when a string
            is not a valid Python literal.
    """
    return literal_eval(value) if isinstance(value, str) else value


# Convert the stringified columns into real Python objects.
for feature in features:
    df[feature] = df[feature].apply(_parse_literal)

In [27]:
df.sample()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
5495,False,"{'id': 176097, 'name': 'Barbershop Collection'...",12000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,10611.0,tt0303714,en,Barbershop,A day in the life of a barbershop on the south...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Everyone's gettin' lined up.,Barbershop,False,6.2,139.0,"[{'cast_id': 1, 'character': 'Calvin Palmer', ...","[{'credit_id': '52fe43939251416c75015c49', 'de...","[{'id': 928, 'name': 'hairdresser'}, {'id': 19..."


In [28]:
df.iloc[6567]['crew'][0]

{'credit_id': '52fe4635c3a36847f80f2b1f',
 'department': 'Directing',
 'gender': 0,
 'id': 18598,
 'job': 'Director',
 'name': 'Ishirô Honda',
 'profile_path': '/4B0Q9uj7nCxmGgnmyh9QYhDrxF.jpg'}

In [29]:
type(df.iloc[6567]['crew'][0])

dict

In [30]:
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        x: A list (or tuple) of crew-member dicts; entries are looked up by
            their 'job' and 'name' keys.

    Returns:
        The 'name' of the first entry whose 'job' equals 'Director', or
        np.nan when x is not a list/tuple, holds no such entry, or the
        matching entry has no 'name'.
    """
    # Same guard as generate_list: a value that is not a sequence of entries
    # (e.g. a missing value) simply has no director instead of raising.
    if not isinstance(x, (list, tuple)):
        return np.nan
    for crew_member in x:
        # .get() tolerates entries that lack a 'job' key.
        if isinstance(crew_member, dict) and crew_member.get('job') == 'Director':
            return crew_member.get('name', np.nan)
    return np.nan

In [31]:
df['director'] = df['crew'].apply(get_director)

In [32]:
def generate_list(x):
    """Collect the 'name' value of every element of x.

    Args:
        x: Expected to be a list of dicts that each have a 'name' key.

    Returns:
        A list with one 'name' per element, or an empty list when x is not
        a list.
    """
    if not isinstance(x, list):
        return []
    return [entry['name'] for entry in x]

In [33]:
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)
df['genres'] = df['genres'].apply(generate_list)

In [34]:
df[['title', 'cast', 'director', 'keywords', 'genres']].sample()

Unnamed: 0,title,cast,director,keywords,genres
6429,Flight of the Intruder,"[Danny Glover, Willem Dafoe, Brad Johnson, Ros...",John Milius,"[bomber, vietnam war, u.s. navy, aviation, com...","[Action, Adventure, Drama, Thriller]"


In [35]:
# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase
def sanitize(x):
    """Normalise names by removing spaces and lowercasing.

    Args:
        x: Either a list of strings or a single string; anything else is
            treated as "no value".

    Returns:
        A list of normalised strings for list input, a normalised string for
        string input, and an empty string otherwise.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither list nor string, e.g. a missing director.
    return ''

In [36]:
for feature in ['cast', 'director', 'genres', 'keywords']:
    df[feature] = df[feature].apply(sanitize)

In [37]:
# Sort in place by release_date, descending; rows with a missing
# release_date are placed first (na_position='first').
df.sort_values(by=['release_date'], inplace=True, ascending=False, na_position='first')

In [38]:
df[['id', 'title', 'release_date']]

Unnamed: 0,id,title,release_date
727,365371.0,War Stories Our Mother Never Told Us,
752,215107.0,Vermont Is for Lovers,
3514,94214.0,"Jails, Hospitals & Hip-Hop",
3685,207731.0,Boricua's Bond,
5932,99885.0,Divine Intervention,
...,...,...,...
18895,159900.0,"Ella Lola, a la Trilby",1898-01-01
19224,94570.0,The Kiss,1896-04-01
18994,104396.0,Dickson Experimental Sound Film,1894-08-31
17571,105158.0,Edison Kinetoscopic Record of a Sneeze,1894-01-09


In [39]:
df[['id', 'title', 'cast', 'director', 'keywords', 'genres']].sample()

Unnamed: 0,id,title,cast,director,keywords,genres
3367,1880.0,Red Dawn,"[patrickswayze, c.thomashowell, leathompson, d...",johnmilius,"[guerrilla, colorado, invasion, anti-communism...","[action, thriller]"


In [40]:
#Function that creates a soup out of the desired metadata
def create_soup(x):
    """Build the space-separated text "soup" for one row.

    Args:
        x: A row-like mapping with list-valued 'keywords', 'cast' and
            'genres' entries and a string 'director' entry.

    Returns:
        One string: keywords, cast, director and genres, in that order,
        joined by single spaces.
    """
    pieces = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(pieces)

In [41]:
# Create the new soup feature
df['soup'] = df.apply(create_soup, axis=1)

In [42]:
df.iloc[5842]['soup']

"informant jumpingfromarooftop surveillancefootage benstiller owenwilson snoopdogg fredwilliamson vincevaughn juliettelewis jasonbateman carmenelectra willferrell amysmart branderoderick mollysims saraswain georgecheung chrispenn mattwalsh g.t.holme jeffreylorenzo harmarsuperstar pattonoswalt brigetteromanek paulmichaelglaser davidsoul danfinnerty jernardburks omarj.dorsey pramodkumar rodtate richardedson raymondma terrycrews richienathanson davidpressman scottl.schwartz judahfriedlander akerinsuksawatpremwattana ambermead darlenatejeiro harryo'reilly tangieambrose deloresgilbeaux kimberlybrickland minnielagrimas rachaelharris davidburton larrychang tonsuckhasem henryt.yamada charlesedwardtownsend nancyanderson jasonyribar tycediorio katiepantenburg timothyanderson kimberlywyatt kristynabbadini kevinalexanderstea adrianarmas gabrielpaige tarawilson brittanyperry-russell taneemccall nadineellis chadazadan jasonbeitel brandonhenschel markmeismer mattsergott lisajoannthompson christianvin

### Prepare the model

In [43]:
count = CountVectorizer(stop_words='english')

In [44]:
count_matrix = count.fit_transform(df['soup'])

In [45]:
count_matrix = count_matrix.astype(np.float32)

In [46]:
# Pairwise cosine similarity of every row of count_matrix against every other
# row; entry [i, j] is the similarity between soup i and soup j.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [53]:
# Replace the index left over from the earlier sort with a fresh 0..n-1 index,
# so that index labels equal row positions. drop is not set, so the previous
# index is kept as a new 'index' column.
# NOTE(review): re-running this cell adds another index column each time.
df = df.reset_index()

In [78]:
# Lookup table: movie title -> index label of that row in df.
# NOTE(review): titles are used as the Series index and nothing enforces
# uniqueness, so a lookup can return several positions — confirm how
# duplicate titles should be handled.
indices = pd.Series(df.index, index=df['title'])

In [79]:
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices, top_n=10):
    """Return the titles most similar to `title` by cosine similarity.

    Note that the default values of cosine_sim, df and indices are bound when
    this function is defined, so the cell has to be re-run after any of those
    objects is rebuilt.

    Args:
        title: Title of the movie to find recommendations for.
        cosine_sim: Square similarity matrix; row i holds the scores of
            movie i against every movie.
        df: DataFrame with a 'title' column whose row positions line up with
            the rows of cosine_sim.
        indices: Series mapping a title to its row position.
        top_n: Number of recommendations to return (default 10).

    Returns:
        A Series of up to top_n titles, most similar first; ties keep their
        row order. The queried movie itself is never included.

    Raises:
        KeyError: if title is not present in indices.
    """
    # Obtain the index of the movie that matches the title
    idx = indices[title]
    # Titles are not guaranteed to be unique: a duplicated title makes the
    # lookup return a Series of positions rather than a scalar, which would
    # turn cosine_sim[idx] into a 2-D block. Use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx])

    # Rank by descending score; the stable sort keeps the original row order
    # among equal scores.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried movie explicitly instead of assuming it ranks first:
    # another movie with an identical soup ties with it at the top.
    movie_indices = ranked[ranked != idx][:top_n]

    # Return the top_n most similar movies
    return df['title'].iloc[movie_indices]

In [80]:
content_recommender('The Lion King')

5882                           The Lion King 1½
8803             The Lion King 2: Simba's Pride
3023             The Secrets of Jonathan Sperry
12154                         Creature Comforts
7972              Thomas and the Magic Railroad
12691    Pinocchio and the Emperor of the Night
2                    Jails, Hospitals & Hip-Hop
3275                             The Employment
4217                           Peter & the Wolf
5837                          To The Other Side
Name: title, dtype: object

In [81]:
content_recommender('Star Wars: The Clone Wars')

5621                                   Yu-Gi-Oh! The Movie
6804              Spiderman: The Ultimate Villain Showdown
5359                         Left Behind III: World at War
7604                                              Cat Soup
10156                                     Gumby: The Movie
13341                       G.I. Joe: The Revenge of Cobra
1763                                       2012: Supernova
7038                                  Return to Never Land
6567     Inuyasha the Movie 2: The Castle Beyond the Lo...
7450                     Final Fantasy: The Spirits Within
Name: title, dtype: object

In [82]:
content_recommender('Taken')

79                              Taken 2
3230                               Vice
11364      Ranma ½: Nihao, My Concubine
11685                             Fever
6639                              I Spy
5971                       Out of Reach
15207                   Shaft in Africa
15091                        Foxy Brown
2536     The Heir Apparent: Largo Winch
2851        The Art of War II: Betrayal
Name: title, dtype: object

### Generate compressed pickle file

In [None]:
pickle_path_pkl = 'mlmodels/cosine_similarity_model.pkl'

In [None]:
# Write the similarity matrix to pickle_path_pkl, uncompressed, using pickle
# protocol 4.
with open(pickle_path_pkl, 'wb') as pickle_file:
    pickle.dump(cosine_sim, pickle_file, protocol=4)

In [None]:
pickle_path = 'mlmodels/cosine_similarity_model.dat'

In [None]:
# Serialise the similarity matrix to an in-memory bytes object (pickle
# protocol 4) so it can be compressed before being written to disk.
pickled_data = pickle.dumps(cosine_sim, protocol=4)

In [None]:
# Compress the pickled bytes with blosc, using its default settings.
# NOTE(review): blosc limits the size of a single input buffer — confirm the
# pickled matrix stays below that limit as the dataset grows.
compressed_pickle = blosc.compress(pickled_data)

In [None]:
# Write the blosc-compressed pickle to pickle_path.
with open(pickle_path, 'wb') as file:
    file.write(compressed_pickle)