## Connecting to the database

In [2]:
%run db_connection.ipynb

Connecting with connection string : postgresql://postgres:***@db:5432/recommenderdb
 * postgresql://postgres:***@db:5432/recommenderdb
1 rows affected.
 * postgresql://postgres:***@db:5432/recommenderdb
1 rows affected.
 * postgresql://postgres:***@db:5432/recommenderdb
3 rows affected.


### Prepare Movies Metadata

In [3]:
%%sql movies_metadata <<

SELECT * FROM recommender.movies_metadata;

 * postgresql://postgres:***@db:5432/recommenderdb
9989 rows affected.
Returning data to local variable movies_metadata


In [4]:
movies_metadata.sample()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
7498,False,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10769, 'n...",,45044.0,tt0295743,en,Vendredi Soir,"Paris, 1995. Laure (Valérie Lemercier) is abou...",...,2002-09-11,$0.00,90.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Friday Night,False,5.0,1.0


In [5]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [6]:
movies_metadata.shape

(9989, 24)

In [7]:
movies_metadata.dtypes

adult                       bool
belongs_to_collection     object
budget                   float64
genres                    object
homepage                  object
id                       float64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                   object
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

## Prepare Keywords

In [8]:
%%sql keywords <<

SELECT * FROM recommender.keywords;

 * postgresql://postgres:***@db:5432/recommenderdb
9979 rows affected.
Returning data to local variable keywords


In [9]:
keywords.dtypes

id          float64
keywords     object
dtype: object

In [10]:
keywords.sample()

Unnamed: 0,id,keywords
1112,10765.0,"[{'id': 90, 'name': 'paris'}, {'id': 2273, 'na..."


### Prepare Credits

In [11]:
%%sql credits <<

SELECT * FROM recommender.credits;

 * postgresql://postgres:***@db:5432/recommenderdb
9979 rows affected.
Returning data to local variable credits


In [12]:
credits.dtypes

cast     object
crew     object
id      float64
dtype: object

In [13]:
credits.sample()

Unnamed: 0,cast,crew,id
3523,[],"[{'credit_id': '52fe472ac3a36847f8126c3f', 'de...",47295.0


## Make soup out of metadata

In [14]:
# Attach the 'cast' and 'crew' columns to each movie by joining on 'id'.
# merge() defaults to an inner join, so movies without a credits row are dropped.
df = movies_metadata.merge(credits, on='id')

In [15]:
# Inner-join the keywords table on 'id' (merge's default), adding the 'keywords' column.
df = df.merge(keywords, on='id')

In [16]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords'],
      dtype='object')

In [17]:
# These columns hold string-encoded Python literals (lists of dicts); parse
# them into real objects. Only strings are parsed: literal_eval raises
# ValueError on anything else, which would make this cell fail when it is
# re-run on already-parsed values or when a cell holds a missing value.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [18]:
df.sample()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
2062,False,,0.0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 53, '...",,31930.0,tt0021165,en,Murder!,"When a woman is convicted of murder, one of th...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Murder!,False,6.3,19.0,"[{'cast_id': 6, 'character': 'Sir John Menier'...","[{'credit_id': '52fe44a99251416c910187a3', 'de...",[]


In [19]:
df.iloc[6567]['crew'][0]

{'credit_id': '52fe450c9251416c91025bdf',
 'department': 'Writing',
 'gender': 0,
 'id': 65441,
 'job': 'Writer',
 'name': 'Sadayuki Murai',
 'profile_path': None}

In [20]:
type(df.iloc[6567]['crew'][0])

dict

In [21]:
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : list or tuple of dict
        Crew entries; each entry is expected to carry 'job' and 'name' keys.
        Any other value (e.g. NaN for a movie without crew data) is treated
        as "no crew".

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when no director can be found.
    """
    # A missing crew value is not iterable; report "no director" instead of
    # raising, in line with how generate_list treats non-list input.
    if not isinstance(x, (list, tuple)):
        return np.nan

    for crew_member in x:
        # Skip malformed entries rather than failing the whole column.
        if not isinstance(crew_member, dict):
            continue
        # .get() tolerates entries that lack the 'job' or 'name' key.
        if crew_member.get('job') == 'Director':
            return crew_member.get('name', np.nan)
    return np.nan

In [22]:
# New 'director' column: one value per movie, derived from its crew list by get_director.
df['director'] = df['crew'].apply(get_director)

In [23]:
def generate_list(x):
    """Return the 'name' value of every entry in ``x``.

    ``x`` is expected to be a list of dicts that each have a 'name' key.
    Any non-list input yields an empty list.
    """
    # Guard clause: anything that is not a list has no names to extract.
    if not isinstance(x, list):
        return []
    return [entry['name'] for entry in x]

In [24]:
# Reduce each list-of-dicts column to a plain list of names.
for column in ('cast', 'keywords', 'genres'):
    df[column] = df[column].apply(generate_list)

In [25]:
df[['title', 'cast', 'director', 'keywords', 'genres']].sample()

Unnamed: 0,title,cast,director,keywords,genres
788,The Frighteners,"[Michael J. Fox, Jeffrey Combs, Jake Busey, Ch...",Peter Jackson,"[judge, architect, ghost world, hell, grim rea...","[Horror, Comedy]"


In [26]:
# Normalise values so that multi-word names become single tokens:
# spaces are removed and the text is lowercased.
def sanitize(x):
    """Strip spaces from and lowercase a string or every string in a list.

    Returns a list for list input, a string for string input, and an empty
    string for anything else (e.g. a missing director).
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: there is no value to sanitize.
    return ''

In [27]:
# Sanitize every text feature that later goes into the soup.
text_features = ['cast', 'director', 'genres', 'keywords']
for feature in text_features:
    df[feature] = df[feature].apply(sanitize)

In [28]:
# Newest releases first; rows with a missing release_date are placed at the top.
# Reassign rather than mutate with inplace=True so the cell produces its result
# explicitly and can be chained or re-run without hidden in-place state.
df = df.sort_values(by=['release_date'], ascending=False, na_position='first')

In [29]:
df[['id', 'title', 'release_date']]

Unnamed: 0,id,title,release_date
1090,141210.0,The Sleepover,2013-10-12
2788,171982.0,Romance,2012-10-09
2125,143750.0,The Farmer's Wife,2012-06-20
5765,78022.0,My Kingdom,2011-09-08
4137,136558.0,Kingdom Come,2011-01-01
...,...,...,...
7804,47653.0,The Immigrant,1917-06-17
7100,3059.0,Intolerance: Love's Struggle Throughout the Ages,1916-09-04
8295,70368.0,The Cheat,1915-12-13
6920,618.0,The Birth of a Nation,1915-02-08


In [30]:
df[['id', 'title', 'cast', 'director', 'keywords', 'genres']].sample()

Unnamed: 0,id,title,cast,director,keywords,genres
2914,476.0,Drugstore Cowboy,"[mattdillon, kellylynch, jameslegros, heatherg...",gusvansant,"[individual, robbery, drugabuse, sheriff, supp...","[crime, drama]"


In [31]:
# Build the "soup": a single space-separated string made of a movie's
# keywords, cast, director and genres (in that order).
def create_soup(x):
    """Concatenate the metadata of one row into a single string.

    ``x`` must provide 'keywords', 'cast' and 'genres' as lists of strings
    and 'director' as a string.
    """
    parts = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(parts)

In [32]:
# Create the new soup feature: create_soup is applied row-wise (axis=1),
# producing one string per movie.
df['soup'] = df.apply(create_soup, axis=1)

In [33]:
df.iloc[5842]['soup']

'bullying bully caseysiemaszko annieryan richardtyson staceyglick jonathanwise jeffreytambor philipbakerhall mitchpileggi paulfeig yeardleysmith philjoanou comedy'

## Prepare the model

In [34]:
# Token-count vectorizer (presumably scikit-learn's CountVectorizer - confirm against the import cell);
# stop_words='english' asks it to ignore common English words.
count = CountVectorizer(stop_words='english')

In [35]:
# Fit the vectorizer on the soup strings and turn them into a count matrix,
# one row per movie in df's current row order.
count_matrix = count.fit_transform(df['soup'])

In [36]:
# Cast the counts to float32 - presumably to keep the similarity matrix computed
# from it smaller in memory; TODO confirm the intent.
count_matrix = count_matrix.astype(np.float32)

In [37]:
# Pairwise cosine similarity of every movie's soup vector against every other;
# row/column i corresponds to row i of count_matrix.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [38]:
# Renumber rows 0..n-1 so index labels equal row positions (and thus cosine_sim rows).
# Without drop=True the previous index is kept as a new 'index' column.
df = df.reset_index()

In [39]:
# Reverse lookup: movie title -> row label in df.
# NOTE(review): titles are not checked for uniqueness, so a title shared by
# several movies maps to several rows.
indices = pd.Series(df.index, index=df['title'])

In [40]:
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    df : pandas.DataFrame
        Frame with a 'title' column whose row positions match ``cosine_sim``.
    indices : pandas.Series
        Maps a title to its row position.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.

    Notes
    -----
    The default values of ``cosine_sim``, ``df`` and ``indices`` are captured
    when this definition is executed, not when the function is called.
    """
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    # A title shared by several movies makes the lookup return a Series
    # instead of a scalar; use the first match so the row lookup below
    # still selects a single row of the similarity matrix.
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other movie's position with its similarity score. The queried
    # movie is excluded explicitly: dropping the top-ranked entry instead is
    # wrong whenever another movie ties with it and sorts ahead of it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies by similarity, highest first (stable for equal scores)
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top_n most similar movies
    movie_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]

    # Return their titles
    return df['title'].iloc[movie_indices]

In [41]:
content_recommender('The Lion King')

346                    The Lion King 1½
2644     The Lion King 2: Simba's Pride
5450                  Creature Comforts
1957      Thomas and the Magic Railroad
379                       Teacher's Pet
3013                   Ill Gotten Gains
2490                     The King and I
4767                          Sarafina!
9282                So Dear to My Heart
3609    Aladdin and the King of Thieves
Name: title, dtype: object

In [42]:
content_recommender('Star Wars')

7071                         The Empire Strikes Back
6563                              Return of the Jedi
1125    Star Wars: Episode II - Attack of the Clones
360                            Comic Book: The Movie
7518                                      Rollerball
2431       Star Wars: Episode I - The Phantom Menace
7341                 Sinbad and the Eye of the Tiger
8334                          Dr. Who and the Daleks
6112                                         Biggles
6401                                          Sheena
Name: title, dtype: object

## Generate compressed pickle file

In [43]:
# Destination of the uncompressed pickle of the similarity matrix (relative to the working directory).
pickle_path_pkl = 'mlmodels/cosine_similarity_model.pkl'

In [44]:
# Serialize the similarity matrix with pickle protocol 4.
# NOTE: open() does not create directories, so 'mlmodels/' must already exist.
with open(pickle_path_pkl, 'wb') as pickle_file:
    pickle.dump(cosine_sim, pickle_file, protocol=4)

In [45]:
# Destination of the blosc-compressed pickle (relative to the working directory).
pickle_path = 'mlmodels/cosine_similarity_model.dat'

In [46]:
# Pickle the matrix to an in-memory bytes object so it can be compressed before being written.
pickled_data = pickle.dumps(cosine_sim, protocol=4)

In [47]:
# Compress the pickled bytes with blosc, using the library's default settings.
compressed_pickle = blosc.compress(pickled_data)

In [48]:
# Write the compressed bytes to disk. The file holds blosc-compressed pickle data,
# so a reader presumably has to decompress with blosc before unpickling.
with open(pickle_path, 'wb') as file:
    file.write(compressed_pickle)