<table align="left">
  <td>
    <a target="_blank" href="https://www.kaggle.com/mateuszszczepanek/recommendation-engine-movies"><img src="https://cdn.iconscout.com/icon/free/png-128/kaggle-3628869-3030009.png" />Open in Kaggle</a>
  </td>
</table>

### Importing **modules**

In [1]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

### Loading **data**

In [2]:
movies_dataset = pd.read_csv("../input/movie-dataset/movie_dataset.csv")
print(movies_dataset.shape)
movies_dataset.sample(3)

(4803, 24)


Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
4454,4454,0,Comedy Drama,,11446,parents kids relationship sister sister relati...,en,Welcome to the Dollhouse,An unattractive 7th grader struggles to cope w...,5.870319,...,88.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Not all girls want to play with dolls.,Welcome to the Dollhouse,6.9,103,Heather Matarazzo Victoria Davis Christina Bru...,"[{'name': 'Alan Oxman', 'gender': 2, 'departme...",Todd Solondz
2313,2313,23000000,Romance Comedy Crime Fantasy,,854,dual identity bank mockery green balloon,en,The Mask,When timid bank clerk Stanley Ipkiss discovers...,85.30318,...,101.0,"[{""iso_639_1"": ""sv"", ""name"": ""svenska""}, {""iso...",Released,From zero to hero.,The Mask,6.6,2472,Jim Carrey Cameron Diaz Nancy Fish Tim Bagley ...,"[{'name': 'Arthur Coburn', 'gender': 2, 'depar...",Chuck Russell
2975,2975,10000000,Drama,http://www.miramax.com/movie/good-will-hunting,489,baseball boston professor m.i.t. harvard unive...,en,Good Will Hunting,Will Hunting has a genius-level IQ but chooses...,43.734886,...,126.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Some people can never believe in themselves, u...",Good Will Hunting,7.9,2811,Robin Williams Matt Damon Ben Affleck Stellan ...,"[{'name': 'Lawrence Bender', 'gender': 2, 'dep...",Gus Van Sant


### Taking only **needed features**

In [3]:
movies_dataset.sample(1).T

Unnamed: 0,419
index,419
budget,85000000
genres,Adventure Fantasy Science Fiction
homepage,http://www.fox.co.uk/jumper
id,8247
keywords,adolescence based on novel loss of child fight...
original_language,en
original_title,Jumper
overview,"David Rice is a man who knows no boundaries, a..."
popularity,21.218


In [4]:
movies_dataset = movies_dataset[["genres", "keywords", "production_companies",  "title", "cast", "crew", "director"]]
movies_dataset.head(4)

Unnamed: 0,genres,keywords,production_companies,title,cast,crew,director
0,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Avatar,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",Pirates of the Caribbean: At World's End,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,Action Adventure Crime,spy based on novel secret agent sequel mi6,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",Spectre,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",The Dark Knight Rises,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan


In [5]:
df = movies_dataset.copy()
df.fillna("", inplace=True)
df.shape

(4803, 7)

### Preparing **final dataframe**

In [6]:
# Creating a list of indexes of untitled movies and removing them
no_title = [n for n, b in enumerate(df.title.isna()) if b]
df.drop(no_title, inplace=True)

# Removing movies with insufficient information
# ( with empty columns: "genres", "cast", "keyword")
index_to_drop = [n for n in range(df.shape[0]) if len(df.genres[n] + df.cast[n] + df.keywords[n]) == 6]
df.drop(index=index_to_drop, inplace=True)

### *Before changing* (selected movie - **"Tangled"**)

<table align="center">
  <td>
    <a href="https://www.google.com/search?q=tangled&rlz=1C1PNBB_enPL948PL948&sxsrf=ALeKk02ogGb6JvWLCYM93OTV1kmAIuscpQ%3A1622478148580&ei=RA21YPL5IuyhrgTf3rX4Cg&oq=tangled&gs_lcp=Cgdnd3Mtd2l6EAMyBAgjECcyBAgjECcyAgguMgQILhBDMgIIADIECAAQQzICCAAyBAgAEEMyAggAMgIILjoHCAAQRxCwAzoHCAAQsAMQQzoKCC4QsAMQyAMQQ0oFCDgSATFQ5RRY8hVg-xdoAXACeACAAVuIAYECkgEBM5gBAKABAaoBB2d3cy13aXrIAQvAAQE&sclient=gws-wiz&ved=0ahUKEwjyq86NqvTwAhXskIsKHV9vDa8Q4dUDCA4&uact=5"><img src="https://ae01.alicdn.com/kf/HTB1vUoCKpXXXXaUXXXXq6xXFXXXG/Free-shipping-Rapunzel-Flynn-in-Tangled-Movie-Poster-HD-HOME-WALL-Decor-Custom-ART-PRINT-Silk.jpg" /></a>
  </td>
</table>

In [7]:
TITLE = "Tangled"

pd.set_option('max_colwidth', 500)
df[df.title == TITLE].T

Unnamed: 0,6
genres,Animation Family
keywords,hostage magic horse fairy tale musical
production_companies,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Walt Disney Animation Studios"", ""id"": 6125}]"
title,Tangled
cast,Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey
crew,"[{'name': 'John Lasseter', 'gender': 2, 'department': 'Production', 'job': 'Executive Producer', 'credit_id': '52fe46db9251416c91062101', 'id': 7879}, {'name': 'Jacob Grimm', 'gender': 2, 'department': 'Writing', 'job': 'Novel', 'credit_id': '52fe46db9251416c910620e3', 'id': 38748}, {'name': 'Wilhelm Grimm', 'gender': 2, 'department': 'Writing', 'job': 'Novel', 'credit_id': '52fe46db9251416c910620dd', 'id': 5448}, {'name': 'John Kahrs', 'gender': 0, 'department': 'Visual Effects', 'job': 'An..."
director,Byron Howard


In [8]:
def change(row):    
    txt = ""
    for n in range(len(row)):
        txt += f"{row[n]['name']} "
    return txt  

def change_crew(row):
    txt = ""
    r = 3 if len(row) > 3 else len(row)
    for i in range(r):
        txt += f"{row[i]['name']} "  
    return txt


# Taking only "name" values from each column and row
df["production_companies"] = df["production_companies"].apply(lambda x: ast.literal_eval(x))
df["production_companies"] = df["production_companies"].apply(change)                                                       

df["crew"] = df["crew"].apply(lambda x: ast.literal_eval(x))
df["crew"] = df["crew"].apply(change_crew)

### _After changing_


In [9]:
df[df.title == TITLE].T

Unnamed: 0,6
genres,Animation Family
keywords,hostage magic horse fairy tale musical
production_companies,Walt Disney Pictures Walt Disney Animation Studios
title,Tangled
cast,Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey
crew,John Lasseter Jacob Grimm Wilhelm Grimm
director,Byron Howard


In [10]:
# Add "recommend" column
df["recommend"] = ""

# with values from all columns
for c in df.columns:
    df["recommend"] += df[c] + " "
df[df.title == TITLE]

Unnamed: 0,genres,keywords,production_companies,title,cast,crew,director,recommend
6,Animation Family,hostage magic horse fairy tale musical,Walt Disney Pictures Walt Disney Animation Studios,Tangled,Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey,John Lasseter Jacob Grimm Wilhelm Grimm,Byron Howard,Animation Family hostage magic horse fairy tale musical Walt Disney Pictures Walt Disney Animation Studios Tangled Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey John Lasseter Jacob Grimm Wilhelm Grimm Byron Howard Animation Family hostage magic horse fairy tale musical Walt Disney Pictures Walt Disney Animation Studios Tangled Zachary Levi Mandy Moore Donna Murphy Ron Perlman M.C. Gainey John Lasseter Jacob Grimm Wilhelm Grimm Byron Howard


In [11]:
def get_index_from_title(title):
    return df[df.title.str.upper()==title.upper()].index[0]

def get_title_from_index(index):
    return df.iloc[index].title


movie_index = get_index_from_title(TITLE)

In [12]:
# Instantiating the vectorizer object
cv = CountVectorizer()

# Converting documents into a matrix
wm = cv.fit_transform(df.recommend)

# Computing cosine similarity
cosine_sim = cosine_similarity(wm)

# Creating a sorted list of tuples (index, cosine similarity)
result = sorted( list( enumerate(cosine_sim[movie_index]) ), reverse=True, key= lambda x : x[1])

# First 10 movies similar to choosen movie ( "Tangled" )
print(f'Movies similar to "{get_title_from_index(movie_index)}" movie:\n')
for i in range(1, 11):
    index, sim = result[i]
    print(f"{sim*100:.0f}% - {get_title_from_index(index)}")

Movies similar to "Tangled" movie:

47% - Return to Never Land
45% - The Jungle Book 2
42% - Frozen
41% - The Princess and the Frog
41% - Fantasia
40% - Dinosaur
39% - Mulan
39% - Roadside Romeo
39% - Monsters University
38% - Big Hero 6
