In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer

import pyarrow as pa
import pyarrow.parquet as pq

import warnings
warnings.simplefilter('ignore')

In [2]:
def get_director(x):
    """
    Extract the Name of the Director for a movie if it is present inside the job
    """
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [3]:
movies_dataset  = pd.read_csv('backend/assets/content/datamovies_metadata.csv')
credits         = pd.read_csv('backend/assets/content/datacredits.csv')
keywords        = pd.read_csv('backend/assets/content/datakeywords.csv')
links           = pd.read_csv('backend/assets/content/datalinks.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'backend/assets/content/datamovies_metadata.csv'

In [7]:
## Dropping these 3 rows because Date Column value for them is string date instead of Int with ID.
movies_dataset = movies_dataset.drop([19730, 29503, 35587])

In [8]:
## Extracting Genres of movies from the genres dictionary. If not present, append empty list
movies_dataset['genres'] = movies_dataset['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [9]:
## Convert to common data type for primary key in our dataset
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies_dataset['id'] = movies_dataset['id'].astype('int')

In [10]:
## Merging movies dataset with credits & keywords to form master dataset
movies_dataset = movies_dataset.merge(credits, on='id')
master_dataset = movies_dataset.merge(keywords, on='id')

In [11]:
master_dataset.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [12]:
print(master_dataset.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords'],
      dtype='object')


In [13]:
links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')
master_dataset = master_dataset[master_dataset['id'].isin(links)]
print(master_dataset.shape)

(46628, 27)


In [14]:
## Updating cast, crew and keyword columns by parsing them as their loaded data type is string but need to be converted to list
master_dataset['cast']      = master_dataset['cast'].apply(literal_eval)
master_dataset['crew']      = master_dataset['crew'].apply(literal_eval)
master_dataset['keywords']  = master_dataset['keywords'].apply(literal_eval)

In [15]:
## Updating cast to maintain proportion between different lengths (keeping top 3 cast members)
master_dataset['cast']      = master_dataset['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
master_dataset['cast']      = master_dataset['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

## Setting keywords to empty list if does not exists, otherwise taking into account for each word as keyword
master_dataset['keywords']  = master_dataset['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

## Extracting directory names from the crew
master_dataset['director']  = master_dataset['crew'].apply(get_director)

In [16]:
## for uniqueness, removing all the spaces in between the names
master_dataset['cast']          = master_dataset['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

## Maintaining the original director name as main director
master_dataset['main_director'] = master_dataset['director']

## Maintaining the number of director to maintain proportion (similar to cast column above)
master_dataset['director']      = master_dataset['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
master_dataset['director']      = master_dataset['director'].apply(lambda x: [x,x,x])

In [17]:
## Stacking the keywords and keeping the movies which containers X number of keywords as minimum
s = master_dataset.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
print(s[:5])

keyword
woman director      3128
independent film    1942
murder              1314
based on novel       841
musical              734
Name: count, dtype: int64


In [18]:
## Will try to map where more than 1 keyword is present for the movie
s = s[s > 1]

In [19]:
## creating an object for ENGLISH Stemmer - Snowball to trim down keywords to their stem words
stemmer                     = SnowballStemmer('english')

## Trim down keywords to their stem words and then remove the space between keywords which are having more than 1 length for uniqueness
master_dataset['keywords']  = master_dataset['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
master_dataset['keywords']  = master_dataset['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [20]:
stemmer                     = SnowballStemmer('english')
master_dataset['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])

0        [jealousi, toy, boy, friendship, friend, rival...
1        [boardgam, disappear, basedonchildren'sbook, n...
2              [fish, bestfriend, duringcreditsst, oldmen]
3        [basedonnovel, interracialrelationship, single...
4        [babi, midlifecrisi, confid, age, daughter, mo...
                               ...                        
46623                                          [tragiclov]
46624                                [artist, play, pinoy]
46625                                                   []
46626                                                   []
46627                                                   []
Name: keywords, Length: 46628, dtype: object

In [21]:
master_dataset['keywords'].head(3)

0    [jealousi, toy, boy, friendship, friend, rival...
1    [boardgam, disappear, basedonchildren'sbook, n...
2       [fish, bestfriend, duringcreditssting, oldmen]
Name: keywords, dtype: object

In [22]:
## Creating a soup feature - combination of (keywords, cast, director, genres)
master_dataset['soup'] = master_dataset['keywords'] + master_dataset['cast'] + master_dataset['director'] + master_dataset['genres']

## Modifying by placing single space between all the soup words
master_dataset['soup'] = master_dataset['soup'].apply(lambda x: ' '.join(x))

In [23]:
master_dataset['soup'].head(3)

0    jealousi toy boy friendship friend rivalri boy...
1    boardgam disappear basedonchildren'sbook newho...
2    fish bestfriend duringcreditssting oldmen walt...
Name: soup, dtype: object

In [24]:
print(master_dataset.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director',
       'main_director', 'soup'],
      dtype='object')


In [25]:
## Removing unwanted columns from the dataset - these features can be used if you wish to add more features to your recommender system.
## We are not going to use them, so we are removing them.
master_dataset.drop(['adult', 'belongs_to_collection', 'budget','homepage','original_language', 'production_companies','production_countries', 'revenue', 'runtime','spoken_languages','status','video'],axis=1,inplace=True)
master_dataset.drop(['overview', 'tagline','vote_average', 'vote_count', 'cast', 'crew','keywords', 'director'],axis=1,inplace=True)
master_dataset.drop(['id','imdb_id','original_title','poster_path','genres'],axis=1,inplace=True)

In [26]:
## Checking popularity column for being non-float data type and removing them
master_dataset['popularity']    = master_dataset.apply(lambda r: r['popularity'] if type(r['popularity'])==float else np.nan, axis=1)
master_dataset.dropna(inplace=True)

## Checking director column for being non-string data type and removing them
master_dataset['main_director'] = master_dataset.apply(lambda r: r['main_director'] if len(r['main_director'])>1 else np.nan, axis=1)
master_dataset.dropna(inplace=True)

In [27]:
## Sorting the whole dataset based on popularity. This will help us to take top X number of movies based on popularity.
master_dataset.sort_values(by=['popularity'],ascending=False,inplace=True)

## Dropping popularity column after sorting based on popularity
master_dataset.drop(['popularity'],axis=1,inplace=True)
master_dataset.dropna(inplace=True)

In [28]:
## Reset index because after sorting, the index values have changed.
master_dataset.reset_index(inplace=True,drop=True)

In [29]:
## Checking release date column for being non-string data type and removing them
master_dataset['release_date'] = master_dataset.apply(lambda r: r['release_date'] if len(r['release_date'])>1 else np.nan, axis=1)
master_dataset.dropna(inplace=True)

In [30]:
master_dataset = master_dataset[:100]

## For Demo, we will take top 2500 movies, which is hosted online already.
# master_dataset = master_dataset[:2500]

## For Tiny-Model, we will take top 1000 movies
# master_dataset = master_dataset[:1000]

## For Extra-Small-Model, we will take top 5000 movies
# master_dataset = master_dataset[:5000]

## For Small-Model, we will take top 10000 movies
# master_dataset = master_dataset[:10000]

## For Medium-Model, we will take top 20000 movies
# master_dataset = master_dataset[:20000]

## For Large-Model, we will take top 30000 movies
# master_dataset = master_dataset[:30000]

## LEAVE ALL THE LINES COMMENTED IF YOU WISH TO TRAIN FULL MOVIES DATASET.

---

In [31]:
## This is our final dataset which we will be using for training our word and cosine similarity matrix
master_dataset.head()

Unnamed: 0,release_date,title,main_director,soup
0,2015-06-17,Minions,Kyle Balda,assist aftercreditssting duringcreditssting ev...
1,2014-10-24,Big Hero 6,Chris Williams,brotherbrotherrelationship hero talent reveng ...
2,2016-02-09,Deadpool,Tim Miller,antihero mercenari marvelcom superhero basedon...
3,2017-04-19,Guardians of the Galaxy Vol. 2,James Gunn,sequel superhero basedoncom misfit space outer...
4,2009-12-10,Avatar,James Cameron,cultureclash futur spacewar spacecoloni societ...


In [32]:
print(master_dataset.shape)

(100, 4)


## Recommendation Matrix

>     Building the matrix which contains similarity scores between movies based on the features

#### 1: Training Word based count vectorizer model

In [34]:
## Creating a Count Vectorizer object which will be based on word analyzer, with ngram 1-2 and minimum number of occurances of words as 2
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=2, stop_words='english')

## Adjusting the count vectorizer object with respect to our dataset
count_matrix = count.fit_transform(master_dataset['soup'])

In [35]:
print(count_matrix.shape)

(100, 421)


#### 2: Building Cosine Similarity Matrix

In [36]:
## We build it as an pyarrow dataframe because it is the most efficient 
table = pa.Table.from_pandas(pd.DataFrame(cosine_similarity(count_matrix, count_matrix)))

## Model & Data Export

In [43]:
## save the Master Dataset
master_dataset.to_parquet('backend/assets/content/movie_database.parquet',engine='fastparquet',index=False)

In [45]:
## Writing the Matrix table
pq.write_table(table, 'backend/assets/content/model.parquet')

## Inference

>     Loading the trained model to execute Inference

In [1]:
import pandas as pd
import pyarrow as pa

In [2]:
master_dataset = pd.read_parquet('backend/assets/content/movie_database.parquet')

In [3]:
master_dataset.head(3)

Unnamed: 0,release_date,title,main_director,soup
0,2015-06-17,Minions,Kyle Balda,assist aftercreditssting duringcreditssting ev...
1,2014-10-24,Big Hero 6,Chris Williams,brotherbrotherrelationship hero talent reveng ...
2,2016-02-09,Deadpool,Tim Miller,antihero mercenari marvelcom superhero basedon...


In [4]:
table = pa.parquet.read_table('backend/assets/content/model.parquet').to_pandas()

In [5]:
master_dataset = master_dataset.reset_index()
titles = master_dataset['title']
indices = pd.Series(master_dataset.index, index=master_dataset['title'])

In [6]:
def get_recommendations(movie_id_from_db,movie_db):
    try:
        sim_scores = list(enumerate(movie_db[movie_id_from_db]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:15] ## get top 15 Recommendations
        
        movie_indices = [i[0] for i in sim_scores]
        output = master_dataset.iloc[movie_indices]
        output.reset_index(inplace=True, drop=True)

        response = []
        for i in range(len(output)):
            response.append({
                'movie_title':output['title'].iloc[i],
                'movie_release_date':output['release_date'].iloc[i],
                'movie_director':output['main_director'].iloc[i],
                'google_link':"https://www.google.com/search?q=" + '+'.join(output['title'].iloc[i].strip().split())
            })
        return response
    except Exception as e:
        print("error: ",e)
        return []

In [7]:
movie_name = input('Enter a movie Name: ')

In [16]:
print(titles.to_list().index('Minions'))

0


In [8]:
movie_index = titles.to_list().index(movie_name)
recommendations = get_recommendations(movie_index,table)

ValueError: 'big' is not in list

In [62]:
print(f"{'Movie Title':<45} | {'Director':<20} | {'Release Date':<15}")
print(f"-"*80)
for recommendation in recommendations:
    print(f"{recommendation['movie_title']:<45} | {recommendation['movie_director']:<20} | {recommendation['movie_release_date']:<15}")

Movie Title                                   | Director             | Release Date   
--------------------------------------------------------------------------------
Minions                                       | Kyle Balda           | 2015-06-17     
Monsters, Inc.                                | Pete Docter          | 2001-11-01     
Ted 2                                         | Seth MacFarlane      | 2015-06-25     
Finding Nemo                                  | Andrew Stanton       | 2003-05-30     
Fantastic Beasts and Where to Find Them       | David Yates          | 2016-11-16     
Deadpool                                      | Tim Miller           | 2016-02-09     
Spirited Away                                 | Hayao Miyazaki       | 2001-07-20     
Sex Tape                                      | Jake Kasdan          | 2014-07-17     
Furious 7                                     | James Wan            | 2015-04-01     
Pirates of the Caribbean: On Stranger Tides   | R