## Importing Libraries

In [None]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import implicit
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import hex_to_rgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm # A fancy library to show progress bar for long operations

  f"CUDA extension is built, but disabling GPU support because of '{e}'",


## Reading Data

About the dataset:
- __Title__: Movie Title.
- __Overview__: Synopsis of the Movie.
- __Popularity__: Movie popularity rating as per TMDB.
- __Vote_average__: Average user vote, out of 10.
- __Vote_count__: Number of votes from the users.
- __Release_date__: Date of release of the movie.
- __Keywords__: List of keywords assigned to the movie by TMDB.
- __Genres__: List of the movie's genres.
- __Cast__: List of the movie's cast members.
- __Crew__: List of the movie's crew members.

In [None]:
# Load the zipped TMDB export; the `id` column becomes the row index.
data = pd.read_csv(
    'tmdb.csv.zip',
    index_col='id',
    compression='zip',
)
data.head()

Unnamed: 0_level_0,Unnamed: 0,title,overview,popularity,vote_average,vote_count,release_date,keywords,genres,cast,crew
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
19404,0,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",31.222,8.7,3323,1995-10-20,[],"['Comedy', 'Drama', 'Romance']","['Shah Rukh Khan', 'Kajol', 'Amrish Puri', 'An...",['Aditya Chopra']
278,1,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,76.654,8.7,20434,1994-09-23,"['prison', 'corruption', 'police brutality', '...","['Drama', 'Crime']","['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...",['Frank Darabont']
238,2,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",75.306,8.7,15270,1972-03-14,"['italy', 'loss of loved one', 'love at first ...","['Drama', 'Crime']","['Marlon Brando', 'Al Pacino', 'James Caan', '...",['Francis Ford Coppola']
724089,3,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...,21.501,8.6,1369,2020-07-31,['based on novel or book'],['Romance'],"['Melanie Zanetti', 'Giulio Berruti', 'James A...",['Tosca Musk']
424,4,Schindler's List,The true story of how businessman Oskar Schind...,40.585,8.6,12202,1993-11-30,"['based on novel or book', 'factory', 'concent...","['Drama', 'History', 'War']","['Liam Neeson', 'Ben Kingsley', 'Ralph Fiennes...",['Steven Spielberg']


## Data Cleaning

### Removing Unnamed Column

In [None]:
# The CSV carries a leftover positional column that read_csv names 'Unnamed: 0'.
# Drop it by name rather than renaming every column positionally, so the other
# column names cannot be mislabelled; errors='ignore' keeps the cell safe to re-run.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

### Handling Null

In [None]:
# Per column: does it contain at least one missing value?
data.isnull().any()

title           False
overview         True
popularity      False
vote_average    False
vote_count      False
release_date    False
keywords        False
genres          False
cast            False
crew            False
dtype: bool

In [None]:
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# `data.overview` is chained assignment: newer pandas warns about it and, with
# Copy-on-Write enabled, it no longer updates `data` at all.
data['overview'] = data['overview'].fillna('Not Found')

### Changing Data Type

In [None]:
# Show column dtypes and non-null counts before converting types.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9480 entries, 19404 to 580
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         9480 non-null   object 
 1   overview      9480 non-null   object 
 2   popularity    9480 non-null   float64
 3   vote_average  9480 non-null   float64
 4   vote_count    9480 non-null   int64  
 5   release_date  9480 non-null   object 
 6   keywords      9480 non-null   object 
 7   genres        9480 non-null   object 
 8   cast          9480 non-null   object 
 9   crew          9480 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 814.7+ KB


In [None]:
# Target dtype for each column that needs converting. The datetime dtype
# carries an explicit unit because pandas >= 2.0 raises on the unit-less
# 'datetime64' string in astype().
new_types = {
    'title': str,
    'overview': str,
    'release_date': 'datetime64[ns]',
}
for col, dtype in new_types.items():
    data[col] = data[col].astype(dtype)

In [None]:
# The list-like columns are stored as the text of a Python list
# (e.g. "['Drama', 'Crime']"). Strip the brackets and quotes so each cell
# becomes a plain comma separated string.
for col in ['keywords', 'genres', 'cast', 'crew']:
    cleaned = data[col].astype(str)
    for val in ['[', ']', '\'']:
        # regex=False: these are literal characters ('[' alone is not even a
        # valid regular expression), and the default for `regex` differs
        # between pandas versions.
        cleaned = cleaned.str.replace(val, '', regex=False)
    # NOTE(review): this also removes apostrophes that are part of a name or
    # keyword — confirm that is acceptable.
    data[col] = cleaned

  This is separate from the ipykernel package so we can avoid doing imports until


## Exploring Data

In [None]:
def get_uniques(data,col):
    '''
    data: Dataframe object
    col: column name with comma separated values
    ---
    returns: a sorted list of the unique category values in that column,
             stripped of surrounding whitespace and lower-cased; empty
             entries are dropped
    '''
    out = {val.strip().lower() for val in ','.join(data[col].unique()).split(',')}
    # discard() is a no-op when '' is absent, so no try/except is needed.
    out.discard('')
    # Sorting gives the same order on every kernel run; plain set iteration
    # order for strings is not stable between Python processes.
    return sorted(out)

In [None]:
# Unique values of every comma separated column, one list per column.
genres, keywords, cast, crew = (
    get_uniques(data, column)
    for column in ('genres', 'keywords', 'cast', 'crew')
)

In [None]:
def get_counts(data, col, categories):
    '''
    data: dataframe object
    col: name of the column holding comma separated values
    categories: categories to count (expected stripped and lower-cased)
    ----
    return a dictionary mapping each category to the number of rows whose
    comma separated values contain exactly that category
    '''
    # Split every row into its individual values once. Matching against these
    # tokens avoids the false positives of a substring test (e.g. 'war' inside
    # 'warrior') and avoids re-reading every cell for every category.
    row_tokens = [
        {token.strip().lower() for token in str(value).split(',')}
        for value in data[col]
    ]
    categ = {}
    for category in tqdm(categories):
        categ[category] = sum(category in tokens for tokens in row_tokens)
    return categ

In [None]:
# Count the movies in each genre and order the genres from rarest to most common.
genre_totals = get_counts(data, 'genres', genres)
base_counts = pd.DataFrame(
    data=genre_totals.values(),
    index=genre_totals.keys(),
    columns=['Counts'],
).sort_values(by='Counts')

# One colour per bar: green for genres with at least 1000 movies, grey otherwise.
colors = []
for total in base_counts['Counts']:
    colors.append('#A0E045' if total >= 1000 else '#abaeab')

fig = px.bar(
    x=base_counts.index,
    y=base_counts['Counts'],
    color=base_counts.index,
    color_discrete_sequence=colors,
    title='Most Popular Genre',
)
fig.show()

100%|██████████| 18/18 [00:01<00:00, 16.60it/s]


### Movie Releases per Year

In [None]:
# Function to plot value counts plots
def plot_value_counts_bar(data, col):
    '''
    data: Dataframe
    col: Name of the column to be plotted
    ----
    returns a plotly bar figure with one bar per distinct value of `col`,
    whose height is the number of rows holding that value
    '''
    counts = data[col].value_counts()
    # Build the plotting frame with explicit column names. The name that
    # value_counts() gives its result differs between pandas versions, so
    # relying on it being `col` breaks on pandas >= 2.0.
    vc = pd.DataFrame({'cat': counts.index, col: counts.values})
    fig = px.bar(vc, x='cat', y=col, color='cat', title=col)

    return fig


# Derive the release year from the parsed release date, then chart movies per year.
data['year'] = data['release_date'].dt.year
plot_value_counts_bar(data, col='year')

In [None]:
def get_ratings(data, col,ratings_col, categories):
    '''
    data: dataframe object
    col: name of the column holding comma separated values
    ratings_col: name of the numeric column to average
    categories: categories present (expected stripped and lower-cased)
    ----
    return a dictionary with the mean of `ratings_col` (rounded to 2 decimals)
    over the rows that contain each category; a category matching no row
    maps to None
    '''
    # Split every row into its individual values once, so a category only
    # matches a whole value (not a substring such as 'war' in 'warrior').
    row_tokens = [
        {token.strip().lower() for token in str(value).split(',')}
        for value in data[col]
    ]
    ratings = data[ratings_col].tolist()
    categ = {category: None for category in categories}
    for category in tqdm(categories):
        matched = [
            rating for tokens, rating in zip(row_tokens, ratings)
            if category in tokens
        ]
        # Leave the None placeholder instead of dividing by zero when no row matches.
        if matched:
            categ[category] = round(sum(matched) / len(matched), 2)
    return categ
# Mean vote_count per genre, ordered from lowest to highest.
genre_votes = get_ratings(data, 'genres', 'vote_count', genres)
base_counts = pd.DataFrame(
    data=genre_votes.values(),
    index=genre_votes.keys(),
    columns=['Counts'],
).sort_values(by='Counts')

fig = px.pie(
    names=base_counts.index,
    values=base_counts['Counts'],
    color=base_counts.index,
    title='Most Popular Genre by Votes',
)
fig.show()

100%|██████████| 18/18 [00:01<00:00, 14.69it/s]


## Building Model

### CountVectorizer

In [None]:
# Preview the cleaned frame (including the derived `year` column) before building features.
data.head()

Unnamed: 0_level_0,title,overview,popularity,vote_average,vote_count,release_date,keywords,genres,cast,crew,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",31.222,8.7,3323,1995-10-20,,"Comedy, Drama, Romance","Shah Rukh Khan, Kajol, Amrish Puri, Anupam Khe...",Aditya Chopra,1995
278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,76.654,8.7,20434,1994-09-23,"prison, corruption, police brutality, prison c...","Drama, Crime","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Frank Darabont,1994
238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",75.306,8.7,15270,1972-03-14,"italy, loss of loved one, love at first sight,...","Drama, Crime","Marlon Brando, Al Pacino, James Caan, Robert D...",Francis Ford Coppola,1972
724089,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...,21.501,8.6,1369,2020-07-31,based on novel or book,Romance,"Melanie Zanetti, Giulio Berruti, James Andrew ...",Tosca Musk,2020
424,Schindler's List,The true story of how businessman Oskar Schind...,40.585,8.6,12202,1993-11-30,"based on novel or book, factory, concentration...","Drama, History, War","Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",Steven Spielberg,1993


In [None]:
def create_soup(data):
    '''
    data: one row of the model frame, with 'title' as its first field
    ---
    returns: a single string for CountVectorizer, made of the lower-cased
             title followed by every remaining field with commas replaced
             by spaces, all joined by single spaces
    '''
    pieces = [data['title'].lower()]
    pieces.extend(str(field.replace(',', ' ')) for field in data.iloc[1:])
    return ' '.join(pieces)

# Keep only the text fields that feed the soup; 'title' must stay first because
# create_soup treats every field after the first as an attribute.
# Selecting the columns first and copying afterwards yields an independent
# frame, so adding 'soup' does not write into a slice of another frame.
model_data = data[['title', 'keywords', 'genres', 'cast', 'crew']].copy()
model_data['soup'] = model_data.apply(create_soup, axis=1)

In [None]:
# Bag-of-words token counts over the soup text, with English stop words removed.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(model_data['soup'])
# Pairwise cosine similarity between all movies; row/column i corresponds to
# the i-th row of model_data.
# NOTE(review): presumably a dense n x n array (about 9480 x 9480 for this data) — confirm the memory cost is acceptable.
cosine_sim2 = cosine_similarity(count_matrix)

In [None]:
def get_recommendations_new(title, data, orig_data, cosine_sim=cosine_sim2):
    '''
    title: movie title
    data: model_data
    orig_data: original dataframe (rows in the same order as `data`)
    cosine_sim: cosine similarity matrix to use; row/column i must correspond
                to the i-th row of `data`
    ---
    returns: Table plot of plotly with the 10 movies most similar to `title`,
             sorted by popularity.
    raises: KeyError if `title` is not present in `data`.
    '''
    # Find the movie's row POSITION. The frames are indexed by TMDB id, while
    # cosine_sim is addressed by position, so the index label cannot be used
    # to look up the similarity row.
    positions = np.flatnonzero((data['title'] == title).to_numpy())
    if positions.size == 0:
        raise KeyError(f'Movie title not found: {title!r}')
    # When several movies share the title, use the first one.
    idx = positions[0]

    # Similarity of every movie to the requested one.
    scores = np.asarray(cosine_sim[idx]).ravel()
    # Most similar first; a stable sort keeps row order among equal scores.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the movie itself explicitly instead of assuming it is ranked first
    # (another movie with an identical soup would tie with it).
    movie_indices = ranked[ranked != idx][:10]

    # Copy the selection so the formatting below does not write into orig_data.
    out = orig_data[[
        'title', 'vote_average', 'genres', 'crew', 'popularity'
    ]].iloc[movie_indices].copy()
    # Render comma separated values on separate lines inside the table cells.
    out['genres'] = out['genres'].str.replace(',', '<br>', regex=False)
    out['crew'] = out['crew'].str.replace(',', '<br>', regex=False)
    final = out.sort_values(by='popularity', ascending=False)
    colorscale = [[0, '#477BA8'], [.5, '#ece4db'], [1, '#d8e2dc']]
    fig = ff.create_table(final, colorscale=colorscale, height_constant=70)
    return fig

In [None]:
get_recommendations_new(
    title="The Shawshank Redemption",
    data=model_data,
    orig_data=data,
)

In [None]:
get_recommendations_new(
    title="Spirited Away",
    data=model_data,
    orig_data=data,
)

### NearestNeighbors

In [None]:
# Preview the frame that the nearest-neighbour features are derived from.
data.head()

Unnamed: 0_level_0,title,overview,popularity,vote_average,vote_count,release_date,keywords,genres,cast,crew,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",31.222,8.7,3323,1995-10-20,,"Comedy, Drama, Romance","Shah Rukh Khan, Kajol, Amrish Puri, Anupam Khe...",Aditya Chopra,1995
278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,76.654,8.7,20434,1994-09-23,"prison, corruption, police brutality, prison c...","Drama, Crime","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Frank Darabont,1994
238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",75.306,8.7,15270,1972-03-14,"italy, loss of loved one, love at first sight,...","Drama, Crime","Marlon Brando, Al Pacino, James Caan, Robert D...",Francis Ford Coppola,1972
724089,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...,21.501,8.6,1369,2020-07-31,based on novel or book,Romance,"Melanie Zanetti, Giulio Berruti, James Andrew ...",Tosca Musk,2020
424,Schindler's List,The true story of how businessman Oskar Schind...,40.585,8.6,12202,1993-11-30,"based on novel or book, factory, concentration...","Drama, History, War","Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",Steven Spielberg,1993


In [None]:
nn_data=data.copy()
def fill_genre(value,col,categories=genres):
    '''
    value: comma separated genre string of one movie
    col: genre name to look for (lower-cased)
    categories: unused; kept so existing calls keep working
    ---
    returns: 1 if `col` is one of the comma separated values, otherwise 0
    '''
    tokens = [token.strip() for token in value.lower().split(',')]
    return int(col in tokens)

# Create one 0/1 indicator column per genre. A single vectorised pass per genre
# is enough; filling the same columns cell by cell beforehand only produced
# values that this pass overwrote.
for col in genres:
    nn_data[col] = nn_data['genres'].apply(fill_genre, args=(col,))
nn_data = nn_data.drop(['overview', 'release_date', 'genres', 'title'], axis=1)
# Replace the remaining text columns by integer codes.
# NOTE(review): these codes are arbitrary and the numeric columns are unscaled,
# so they presumably dominate the cosine distance used later — confirm this
# feature encoding is intended.
for col in ['keywords','cast','crew']:
    nn_data[col]=LabelEncoder().fit_transform(nn_data[col])

100%|██████████| 9480/9480 [00:02<00:00, 4019.04it/s]


In [None]:
# Cosine-distance nearest-neighbour index over the numeric feature frame.
knn_settings = dict(
    n_neighbors=20,
    metric='cosine',
    algorithm='auto',
    n_jobs=-1,
)
model_knn = NearestNeighbors(**knn_settings)
model_knn.fit(nn_data)

NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=20)

In [None]:
# Create a function to recommend top 10 movies
def recommend_movies(movie,nn_data,orig_data):
    '''
    movie: movie title
    nn_data: feature dataframe, with the same columns the global model_knn
             was fitted on and rows in the same order as `orig_data`
    orig_data: original dataframe
    ---
    returns: Table plot of plotly with up to 10 nearest neighbours of `movie`
             (the movie itself excluded), sorted by popularity.
    raises: ValueError if `movie` is not present in `orig_data`.
    '''
    n_recommendations = 10
    # Locate the movie by row position without resetting the index, so the
    # caller's frames are left untouched.
    matches = np.flatnonzero((orig_data['title'] == movie).to_numpy())
    if matches.size == 0:
        raise ValueError(f'Movie title not found: {movie!r}')
    # When several movies share the title, use the first one.
    position = int(matches[0])

    # Query with a one-row DataFrame so the feature names seen at fit time are
    # kept (a bare array makes scikit-learn warn about missing feature names).
    query = nn_data.iloc[[position]]
    # Ask for one extra neighbour: the movie itself is normally returned as
    # its own nearest neighbour and is filtered out below.
    distances, indices = model_knn.kneighbors(
        query, n_neighbors=n_recommendations + 1)
    neighbours = [i for i in indices[0] if i != position][:n_recommendations]

    # Copy the selection so the formatting below does not write into orig_data.
    out = orig_data[[
        'title', 'vote_average', 'genres', 'crew', 'popularity'
    ]].iloc[neighbours].copy()
    # Render comma separated values on separate lines inside the table cells.
    out['genres'] = out['genres'].str.replace(',', '<br>', regex=False)
    out['crew'] = out['crew'].str.replace(',', '<br>', regex=False)
    final = out.sort_values(by='popularity', ascending=False)
    colorscale = [[0, '#fad2e1'], [.5, '#fde2e4'], [1, '#fff1e6']]
    fig = ff.create_table(final, colorscale=colorscale, height_constant=70)
    return fig

In [None]:
recommend_movies(
    movie="Thor",
    nn_data=nn_data.copy(),
    orig_data=data.copy(),
)


X does not have valid feature names, but NearestNeighbors was fitted with feature names



In [None]:
recommend_movies(
    movie="Eternals",
    nn_data=nn_data.copy(),
    orig_data=data.copy(),
)


X does not have valid feature names, but NearestNeighbors was fitted with feature names

