In [150]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline

In [146]:
df = pd.read_csv("movie_metadata.csv")

df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [147]:
df = df.drop(columns=["color", "num_critic_for_reviews", "cast_total_facebook_likes", "facenumber_in_poster", 
    "movie_imdb_link", "aspect_ratio", "director_facebook_likes", "actor_3_facebook_likes", "actor_2_facebook_likes",
    "actor_1_facebook_likes", "budget", "num_voted_users", "gross", "duration", "num_user_for_reviews", "movie_facebook_likes", "imdb_score"])
# df = df.dropna()

df.head(3)

Unnamed: 0,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,language,country,content_rating,title_year
0,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,avatar|future|marine|native|paraplegic,English,USA,PG-13,2009.0
1,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,English,USA,PG-13,2007.0
2,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,English,UK,PG-13,2015.0


In [161]:
df.movie_title = df.movie_title.map(lambda el: el and el.replace("\xa0", ""), na_action="ignore")
df.title_year = df.title_year.astype("Int32")
df.director_name = df.director_name.map(lambda el: el and el.replace(" ", "-"), na_action="ignore")
df.actor_1_name = df.actor_1_name.map(lambda el: el and el.replace(" ", "-"), na_action="ignore")
df.actor_2_name = df.actor_2_name.map(lambda el: el and el.replace(" ", "-"), na_action="ignore")
df.actor_3_name = df.actor_3_name.map(lambda el: el and el.replace(" ", "-"), na_action="ignore")
df.content_rating = df.content_rating.map(lambda el: el and el.replace("-", ""), na_action="ignore")
df.genres = df.genres.map(lambda el: el and el.replace("|", " "), na_action="ignore")
df.plot_keywords = df.plot_keywords.map(lambda el: el and el.replace("|", " "), na_action="ignore")


df.head(2)

Unnamed: 0,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,language,country,content_rating,title_year
0,James-Cameron,Joel-David-Moore,Action Adventure Fantasy Sci-Fi,CCH-Pounder,Avatar,Wes-Studi,avatar future marine native paraplegic,English,USA,PG13,2009
1,Gore-Verbinski,Orlando-Bloom,Action Adventure Fantasy,Johnny-Depp,Pirates of the Caribbean: At World's End,Jack-Davenport,goddess marriage ceremony marriage proposal pi...,English,USA,PG13,2007


In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   director_name   4939 non-null   object
 1   actor_2_name    5030 non-null   object
 2   genres          5043 non-null   object
 3   actor_1_name    5036 non-null   object
 4   movie_title     5043 non-null   object
 5   actor_3_name    5020 non-null   object
 6   plot_keywords   4890 non-null   object
 7   language        5031 non-null   object
 8   country         5038 non-null   object
 9   content_rating  4740 non-null   object
 10  title_year      4935 non-null   Int32 
dtypes: Int32(1), object(10)
memory usage: 418.7+ KB


In [163]:
content = df.apply(lambda el: " ".join(map(str, el)).replace("|", " "), axis=1)
content.head(3)

0    James-Cameron Joel-David-Moore Action Adventur...
1    Gore-Verbinski Orlando-Bloom Action Adventure ...
2    Sam-Mendes Rory-Kinnear Action Adventure Thril...
dtype: object

In [164]:
# tf = TfidfTransformer()
cv = CountVectorizer()

count_matrix = cv.fit_transform(content)
count_matrix.shape

(5043, 16616)

In [165]:
sim_score_matrix = cosine_similarity(count_matrix)
sim_score_matrix.shape

(5043, 5043)

In [205]:
def recommend_movie(index, top=10):
    score = sim_score_matrix[index]
    top_index = np.argsort(-score)[0:top]
    top_df = df.iloc[top_index].assign(similarity=score[top_index])
    return top_df


# print(df.iloc[50].movie_title)

res = recommend_movie(10, 30)
res[["movie_title", "similarity"]]

Unnamed: 0,movie_title,similarity
10,Batman v Superman: Dawn of Justice,1.0
15,Man of Steel,0.500193
73,Suicide Squad,0.434828
898,Superman II,0.401189
248,Teenage Mutant Ninja Turtles: Out of the Shadows,0.389039
8,Avengers: Age of Ultron,0.382518
27,Captain America: Civil War,0.376851
812,Deadpool,0.339467
1367,The 5th Wave,0.333462
11,Superman Returns,0.3253
