# MOVIE SIMILARTY VISUALIZATION

## Import Library

In [1]:
import re
import nltk
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from ast import literal_eval
import matplotlib.pyplot as plt
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%matplotlib inline
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', 50)

## Import Data

In [2]:
credits = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')
movies = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')

## Prepare Data

In [3]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
from ast import literal_eval

#Converting the string into list of dictionaries
credits.cast = credits.cast.apply(literal_eval)
credits.crew = credits.crew.apply(literal_eval)

# Extracting the Casts into a list from Dictionaries
credits['cast'] = credits['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Extracting the Director from the Crew
def extract_director(x):
    for crew_mem in x:
        if crew_mem['job'] == 'Director':
            return crew_mem['name']
        else:
            return np.nan

credits['director'] = credits['crew'].apply(extract_director)
credits['director'].fillna('',inplace = True)
credits.drop(['crew'],axis = 1,inplace = True)
credits.drop(['title'],axis = 1,inplace = True)

In [5]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [6]:
# Extracting the Genres into a list from Dictionaris
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
# Extracting the Keywords into a list from Dictionaris
movies['keywords'] = movies['keywords'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [7]:
movies = movies.merge(credits, left_on='id', right_on='movie_id', how = 'left')

In [8]:
# Selecting required columns from the master dataframe
movies = movies[['id','original_title','title','cast', 'director', 'keywords', 'genres', 'release_date', 'overview', 
                 'original_language', 'runtime', 'tagline', 'vote_average', 'vote_count','popularity']]
movies.head()

Unnamed: 0,id,original_title,title,cast,director,keywords,genres,release_date,overview,original_language,runtime,tagline,vote_average,vote_count,popularity
0,19995,Avatar,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",,"[culture clash, future, space war, space colon...","[Action, Adventure, Fantasy, Science Fiction]",2009-12-10,"In the 22nd century, a paraplegic Marine is di...",en,162.0,Enter the World of Pandora.,7.2,11800,150.437577
1,285,Pirates of the Caribbean: At World's End,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",,"[ocean, drug abuse, exotic island, east india ...","[Adventure, Fantasy, Action]",2007-05-19,"Captain Barbossa, long believed to be dead, ha...",en,169.0,"At the end of the world, the adventure begins.",6.9,4500,139.082615
2,206647,Spectre,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",,"[spy, based on novel, secret agent, sequel, mi...","[Action, Adventure, Crime]",2015-10-26,A cryptic message from Bond’s past sends him o...,en,148.0,A Plan No One Escapes,6.3,4466,107.376788
3,49026,The Dark Knight Rises,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman, A...",,"[dc comics, crime fighter, terrorist, secret i...","[Action, Crime, Drama, Thriller]",2012-07-16,Following the death of District Attorney Harve...,en,165.0,The Legend Ends,7.6,9106,112.31295
4,49529,John Carter,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",,"[based on novel, mars, medallion, space travel...","[Action, Adventure, Science Fiction]",2012-03-07,"John Carter is a war-weary, former military ca...",en,132.0,"Lost in our world, found in another.",6.1,2124,43.926995


## Handling Missing Values

In [9]:
movies.isna().sum()

id                     0
original_title         0
title                  0
cast                   0
director               0
keywords               0
genres                 0
release_date           1
overview               3
original_language      0
runtime                2
tagline              844
vote_average           0
vote_count             0
popularity             0
dtype: int64

In [10]:
movies.tagline.fillna('',inplace = True)
movies = movies.dropna().reset_index()

In [11]:
movies.release_date = pd.to_datetime(movies.release_date,format = '%Y-%m-%d')
movies['release_year'] = movies.release_date.apply(lambda x: x.year)

## Text Processing

In [12]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer=WordNetLemmatizer()

def clean_plot(txt):
    regex = re.compile(r"[!@%&;?'',.""-]")
    txt_clean = re.sub(regex,'',txt)
    txt_clean = txt_clean.lower()
    txt_clean = txt_clean.split(' ')
    txt_clean = [word for word in txt_clean if word not in stopwords.words('english')]
    txt_clean = ' '.join(txt_clean)
    word_list = nltk.word_tokenize(txt_clean)
    txt_clean = ' '.join([lemmatizer.lemmatize(w,get_wordnet_pos(w)) for w in word_list])
    return txt_clean


In [13]:
movies.head()

Unnamed: 0,index,id,original_title,title,cast,director,keywords,genres,release_date,overview,original_language,runtime,tagline,vote_average,vote_count,popularity,release_year
0,0,19995,Avatar,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",,"[culture clash, future, space war, space colon...","[Action, Adventure, Fantasy, Science Fiction]",2009-12-10,"In the 22nd century, a paraplegic Marine is di...",en,162.0,Enter the World of Pandora.,7.2,11800,150.437577,2009
1,1,285,Pirates of the Caribbean: At World's End,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",,"[ocean, drug abuse, exotic island, east india ...","[Adventure, Fantasy, Action]",2007-05-19,"Captain Barbossa, long believed to be dead, ha...",en,169.0,"At the end of the world, the adventure begins.",6.9,4500,139.082615,2007
2,2,206647,Spectre,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",,"[spy, based on novel, secret agent, sequel, mi...","[Action, Adventure, Crime]",2015-10-26,A cryptic message from Bond’s past sends him o...,en,148.0,A Plan No One Escapes,6.3,4466,107.376788,2015
3,3,49026,The Dark Knight Rises,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman, A...",,"[dc comics, crime fighter, terrorist, secret i...","[Action, Crime, Drama, Thriller]",2012-07-16,Following the death of District Attorney Harve...,en,165.0,The Legend Ends,7.6,9106,112.31295,2012
4,4,49529,John Carter,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",,"[based on novel, mars, medallion, space travel...","[Action, Adventure, Science Fiction]",2012-03-07,"John Carter is a war-weary, former military ca...",en,132.0,"Lost in our world, found in another.",6.1,2124,43.926995,2012


In [14]:
genres = movies['genres'].apply(lambda x : " ".join(x))
keywords = movies['keywords'].apply(lambda x : " ".join(x))

In [15]:
overview = movies.overview.apply(clean_plot)
genres = genres.apply(clean_plot)
keywords = keywords.apply(clean_plot)

In [16]:
release_year = movies.release_year

In [17]:
genre_keys = genres + ' ' + keywords

In [18]:
tfidf = TfidfVectorizer(analyzer = 'word', ngram_range = (1,1), min_df = 0, stop_words = 'english')
plot_vector = tfidf.fit_transform(overview)

In [19]:
cv = CountVectorizer(analyzer = 'word', ngram_range = (1,1), min_df = 0, stop_words = 'english')
genrekey_vector = cv.fit_transform(genre_keys)

## Calculating Distances

In [20]:
from scipy.spatial import distance
import time
score = []
start_time = time.time()

plot_arr = plot_vector.toarray()
genrekey_arr = genrekey_vector.toarray()

def get_pos(title):
    target_plot_arr = plot_vector[movies[movies.title==title].index.values[0]].toarray()
    target_genre_arr = genrekey_vector[movies[movies.title==title].index.values[0]].toarray()
    pos = {}
    for i in range(plot_arr.shape[0]):
        plot_pos = distance.euclidean(target_plot_arr,plot_arr[i])
        genre_pos = distance.euclidean(target_genre_arr,genrekey_arr[i])
        pos[movies.title[i]] = [release_year[i],plot_pos,genre_pos]
    return pos

print("--- %s seconds ---" % (time.time() - start_time))

--- 1.1199312210083008 seconds ---


## Plotting

In [21]:
import plotly.graph_objects as go
import plotly.express as px
import networkx as nx

In [22]:
def create_graph(title):
    G = nx.Graph()
    G.clear()
    G.add_nodes_from(movies.title.tolist())
    pos = get_pos(title)
    for node in G.nodes:
        G.nodes[node]['pos'] = pos[node]
    return G

In [23]:
def plot_graph(G):
    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        edge_x.append(x0)
        edge_x.append(x1)
        edge_x.append(None)
        edge_y.append(y0)
        edge_y.append(y1)
        edge_y.append(None)

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    node_x = []
    node_y = []
    node_z = []
    for node in G.nodes():
        x, y, z = G.nodes[node]['pos']
        node_x.append(x)
        node_y.append(y)
        node_z.append(z)

    node_trace = go.Scatter3d(
        x=node_x, y=node_y,z=node_z,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            # colorscale options
            #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
            #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
            #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
            colorscale='YlGnBu',
            reversescale=True,
            color=z,
            size=10,
            opacity=0.8,
            colorbar=dict(
                thickness=15,
                title='Similarty',
                xanchor='left',
                titleside='right'
            ),
            line_width=2))
    node_adjacencies = []
    node_text = []
    for index,row in movies[['title','genres']].iterrows():
        gen = " | ".join(row['genres'])
        text = 'Title: ' + row['title'] + '\nGenres:' + gen
        node_text.append(text)
#     for node, adjacencies in enumerate(G.adjacency()):
#         node_adjacencies.append(len(adjacencies[1]))
#         node_text.append('# of connections: '+str(len(adjacencies[1])))
#     zipped_nodes = zip(node_x,node_y,node_z)
#     color_node = [round(x + y + z) for (x, y,z) in zipped_nodes]
    node_trace.text = node_text
    fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='<br>Movie Similarity Graph with Python',
                titlefont_size=16,
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="Python code: <a href='https://www.kaggle.com/aadarsh168/movie-visualization/'> Kaggle Notebook</a>",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=dict(showgrid=False, zeroline=True, showticklabels=True),
                yaxis=dict(showgrid=False, zeroline=True, showticklabels=True))
                )
    fig.update_layout(
    scene = dict(
        xaxis = dict(nticks=4, range=[1915,2018],),
        yaxis = dict(nticks=4, range=[-1,10],),
        zaxis = dict(nticks=4, range=[-1,10],),),
        width=700,
        margin=dict(r=20, l=10, b=10, t=10))
    
    fig.update_layout(
        scene = dict(
            xaxis_title='Release Year',
            yaxis_title='Plot Distance',
            zaxis_title='Genres Distance'),
            width=700,
            margin=dict(r=20, b=10, l=10, t=10))
    

#     fig.update_layout(
#         xaxis = dict(
#             tickangle = 90,
#             title_text = "Genre Distance",
#             title_font = {"size": 20},
#             title_standoff = 25),
#         yaxis = dict(
#             title_text = "Plot Distance",
#             title_font = {"size": 20},
#             title_standoff = 25))
    return fig


In [24]:
fig = plot_graph(create_graph('Toy Story'))
fig.show()