<table align="left">
  <td>
    <a target="_blank" href="https://www.kaggle.com/mateuszszczepanek/recommendation-engine-movies"><img src="https://cdn.iconscout.com/icon/free/png-128/kaggle-3628869-3030009.png" />Open in Kaggle</a>
  </td>
</table>

### Importing **modules**

In [3]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

### Loading **data**

In [4]:
movies_dataset = pd.read_csv("../input/movie-dataset/movie_dataset.csv")
print(movies_dataset.shape)
movies_dataset.sample(3)

(4803, 24)


Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
3922,3922,0,Comedy Drama,,30309,independent film woman director,en,Real Women Have Curves,There's more to Ana Garcia than meets the eye....,1.026243,...,86.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Real Women Have Curves,5.8,19,America Ferrera Lupe Ontiveros Ingrid Oliu Geo...,"[{'name': 'Patricia Cardoso', 'gender': 0, 'de...",Patricia Cardoso
4249,4249,0,Horror Comedy Thriller,http://www.houseboundthemovie.com/,253306,haunted house father-in-law superstition house...,en,Housebound,When Kylie Bucknell is sentenced to home deten...,10.322858,...,109.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Terror Gets Domesticated,Housebound,6.5,358,Morgana O'Reilly Rima Te Wiata Glen-Paul Waru ...,"[{'name': 'Haley Williams', 'gender': 1, 'depa...",Gerard Johnstone
3914,3914,3000000,Drama History,,821,judge concentration camp world war ii nazis co...,en,Judgment at Nuremberg,"In 1947, four German judges who served on the ...",13.338539,...,186.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The event the world will never forget,Judgment at Nuremberg,7.6,155,Spencer Tracy Burt Lancaster Richard Widmark M...,"[{'name': 'Stanley Kramer', 'gender': 2, 'depa...",Stanley Kramer


### Taking only **needed features**

In [5]:
movies_dataset.sample(1).T

Unnamed: 0,640
index,640
budget,95000000
genres,Action Adventure
homepage,
id,9342
keywords,california spy hero horseback riding sword fight
original_language,en
original_title,The Mask of Zorro
overview,It has been twenty years since Don Diego de la...
popularity,31.086791


In [6]:
movies_dataset = movies_dataset[["genres", "keywords", "production_companies",  "title", "cast", "crew", "director"]]
movies_dataset.head(4)

Unnamed: 0,genres,keywords,production_companies,title,cast,crew,director
0,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Avatar,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",Pirates of the Caribbean: At World's End,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,Action Adventure Crime,spy based on novel secret agent sequel mi6,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",Spectre,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",The Dark Knight Rises,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan


In [7]:
df = movies_dataset.copy()
df.fillna("", inplace=True)
df.shape

(4803, 7)

### Preparing **final dataframe**

In [8]:
# Creating a list of indexes of untitled movies and removing them
no_title = [n for n, b in enumerate(df.title.isna()) if b]
df.drop(no_title, inplace=True)

# Removing movies with insufficient information
# ( with empty columns: "genres", "cast", "keyword")
index_to_drop = [n for n in range(df.shape[0]) if len(df.genres[n] + df.cast[n] + df.keywords[n]) == 6]
df.drop(index=index_to_drop, inplace=True)

### *Before changing* (selected movie - **"Tangled"**)

<table align="center">
  <td>
    <a href="https://www.google.com/search?q=tangled&rlz=1C1PNBB_enPL948PL948&sxsrf=ALeKk02ogGb6JvWLCYM93OTV1kmAIuscpQ%3A1622478148580&ei=RA21YPL5IuyhrgTf3rX4Cg&oq=tangled&gs_lcp=Cgdnd3Mtd2l6EAMyBAgjECcyBAgjECcyAgguMgQILhBDMgIIADIECAAQQzICCAAyBAgAEEMyAggAMgIILjoHCAAQRxCwAzoHCAAQsAMQQzoKCC4QsAMQyAMQQ0oFCDgSATFQ5RRY8hVg-xdoAXACeACAAVuIAYECkgEBM5gBAKABAaoBB2d3cy13aXrIAQvAAQE&sclient=gws-wiz&ved=0ahUKEwjyq86NqvTwAhXskIsKHV9vDa8Q4dUDCA4&uact=5"><img src="https://ae01.alicdn.com/kf/HTB1vUoCKpXXXXaUXXXXq6xXFXXXG/Free-shipping-Rapunzel-Flynn-in-Tangled-Movie-Poster-HD-HOME-WALL-Decor-Custom-ART-PRINT-Silk.jpg" /></a>
  </td>
</table>

In [9]:
TITLE = "Tangled"

pd.set_option('max_colwidth', 500)
df[df.title == TITLE].T

Unnamed: 0,6
genres,Animation Family
keywords,hostage magic horse fairy tale musical
production_companies,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Walt Disney Animation Studios"", ""id"": 6125}]"
title,Tangled
cast,Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey
crew,"[{'name': 'John Lasseter', 'gender': 2, 'department': 'Production', 'job': 'Executive Producer', 'credit_id': '52fe46db9251416c91062101', 'id': 7879}, {'name': 'Jacob Grimm', 'gender': 2, 'department': 'Writing', 'job': 'Novel', 'credit_id': '52fe46db9251416c910620e3', 'id': 38748}, {'name': 'Wilhelm Grimm', 'gender': 2, 'department': 'Writing', 'job': 'Novel', 'credit_id': '52fe46db9251416c910620dd', 'id': 5448}, {'name': 'John Kahrs', 'gender': 0, 'department': 'Visual Effects', 'job': 'An..."
director,Byron Howard


In [10]:
def change(row):    
    txt = ""
    for n in range(len(row)):
        txt += f"{row[n]['name']} "
    return txt  

def change_crew(row):
    txt = ""
    r = 3 if len(row) > 3 else len(row)
    for i in range(r):
        txt += f"{row[i]['name']} "  
    return txt


# Taking only "name" values from each column and row
df["production_companies"] = df["production_companies"].apply(lambda x: ast.literal_eval(x))
df["production_companies"] = df["production_companies"].apply(change)                                                       

df["crew"] = df["crew"].apply(lambda x: ast.literal_eval(x))
df["crew"] = df["crew"].apply(change_crew)

### _After changing_


In [11]:
df[df.title == TITLE].T

Unnamed: 0,6
genres,Animation Family
keywords,hostage magic horse fairy tale musical
production_companies,Walt Disney Pictures Walt Disney Animation Studios
title,Tangled
cast,Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey
crew,John Lasseter Jacob Grimm Wilhelm Grimm
director,Byron Howard


In [12]:
# Add "recommend" column
df["recommend"] = ""

# with values from all columns
for c in df.columns:
    df["recommend"] += df[c] + " "
df[df.title == TITLE]

Unnamed: 0,genres,keywords,production_companies,title,cast,crew,director,recommend
6,Animation Family,hostage magic horse fairy tale musical,Walt Disney Pictures Walt Disney Animation Studios,Tangled,Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey,John Lasseter Jacob Grimm Wilhelm Grimm,Byron Howard,Animation Family hostage magic horse fairy tale musical Walt Disney Pictures Walt Disney Animation Studios Tangled Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey John Lasseter Jacob Grimm Wilhelm Grimm Byron Howard Animation Family hostage magic horse fairy tale musical Walt Disney Pictures Walt Disney Animation Studios Tangled Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey John Lasseter Jacob Grimm Wilhelm Grimm Byron Howard


In [15]:
def get_index_from_title(title):
    return df[df.title.str.upper()==title.upper()].index[0]

def get_title_from_index(index):
    return df.iloc[index].title


movie_index = get_index_from_title(TITLE)

In [16]:
# Instantiating the vectorizer object
cv = CountVectorizer()

# Converting documents into a matrix
wm = cv.fit_transform(df.recommend)

# Computing cosine similarity
cosine_sim = cosine_similarity(wm)

# Creating a sorted list of tuples (index, cosine similarity)
result = sorted( list( enumerate(cosine_sim[movie_index]) ), reverse=True, key= lambda x : x[1])

# First 10 movies similar to choosen movie ( "Tangled" )
print(f'Movies similar to "{get_title_from_index(movie_index)}" movie:\n')
for i in range(1, 11):
    index, sim = result[i]
    print(f"{sim*100:.0f}% - {get_title_from_index(index)}")

Movies similar to "The Matrix" movie:

82% - The Matrix Revolutions
75% - The Matrix Reloaded
38% - Terminator 3: Rise of the Machines
35% - Speed Racer
33% - The Invasion
33% - Terminator Salvation
32% - The Terminator
31% - Cradle 2 the Grave
30% - Red Planet
30% - Sherlock Holmes: A Game of Shadows
