# Content based Filtering

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import matplotlib.pyplot as plt
import json

In [7]:
movies = pd.read_csv("tmdb-5000-movie-dataset/tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb-5000-movie-dataset/tmdb_5000_credits.csv")

In [8]:
movies = movies.merge(credits, on="title")

In [10]:
print(movies.shape)
movies.head(2)

(4809, 23)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


## *Feature Selection and Preprocessing*

### *Important Features*

1. id
2. title
3. genres
5. keywords
6. release_date
7. overview
9. cast
10. crew

In [11]:
movies = movies[["movie_id", "title", "release_date", "overview", "genres", "keywords", "cast", "crew"]]

In [12]:
movies

Unnamed: 0,movie_id,title,release_date,overview,genres,keywords,cast,crew
0,19995,Avatar,2009-12-10,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,2007-05-19,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,2015-10-26,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,2012-07-16,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,2012-03-07,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...,...,...,...,...
4804,9367,El Mariachi,1992-09-04,El Mariachi just wants to play his guitar and ...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 5616, ""name"": ""united states\u2013mexi...","[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de..."
4805,72766,Newlyweds,2011-12-26,A newlywed couple's honeymoon is upended by th...,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",[],"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de..."
4806,231617,"Signed, Sealed, Delivered",2013-10-13,"""Signed, Sealed, Delivered"" introduces a dedic...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."
4807,126186,Shanghai Calling,2012-05-03,When ambitious New York attorney Sam is sent t...,[],[],"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de..."


In [16]:
movies.isnull().sum()

movie_id        0
title           0
release_date    0
overview        0
genres          0
keywords        0
cast            0
crew            0
dtype: int64

In [17]:
movies.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies.dropna(inplace=True)


In [15]:
movies.duplicated().sum()

0

### *Convert genres and keywords*

In [18]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [19]:
def convert_genres(obj):
    L = []
    for i in json.loads(obj):
        L.append(i["name"])
    return L

In [20]:
# convert_to_list(movies.iloc[0].genres)

In [21]:
movies["genres"] = movies["genres"].apply(convert_genres)
movies["keywords"] = movies["keywords"].apply(convert_genres)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["genres"] = movies["genres"].apply(convert_genres)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["keywords"] = movies["keywords"].apply(convert_genres)


In [22]:
movies.head()

Unnamed: 0,movie_id,title,release_date,overview,genres,keywords,cast,crew
0,19995,Avatar,2009-12-10,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,2007-05-19,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,2015-10-26,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,2012-07-16,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,2012-03-07,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### *Convert cast*

In [23]:
def convert_cast(obj):
    L = []
    counter = 0
    for i in json.loads(obj):
        if counter == 3:
            break
        else:
            counter+=1
            L.append(i["name"])
    return L

In [24]:
movies["cast"] = movies["cast"].apply(convert_cast)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["cast"] = movies["cast"].apply(convert_cast)


In [25]:
movies.head()

Unnamed: 0,movie_id,title,release_date,overview,genres,keywords,cast,crew
0,19995,Avatar,2009-12-10,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,2007-05-19,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,2015-10-26,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,2012-07-16,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,2012-03-07,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### *Convert crew*

In [26]:
def convert_crew(obj):
    L = []
    for i in json.loads(obj):
        if i["job"] == "Director":
            L.append(i["name"])
    return L

In [27]:
movies["crew"] = movies["crew"].apply(convert_crew)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["crew"] = movies["crew"].apply(convert_crew)


In [28]:
movies.head()

Unnamed: 0,movie_id,title,release_date,overview,genres,keywords,cast,crew
0,19995,Avatar,2009-12-10,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,2007-05-19,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,2015-10-26,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,2012-07-16,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,2012-03-07,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


### *Convert Overview*

In [29]:
movies["overview"] = movies["overview"].apply(lambda x:x.split())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["overview"] = movies["overview"].apply(lambda x:x.split())


In [30]:
movies.head()

Unnamed: 0,movie_id,title,release_date,overview,genres,keywords,cast,crew
0,19995,Avatar,2009-12-10,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,2007-05-19,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,2015-10-26,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,2012-07-16,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,2012-03-07,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


## *Remove spaces*

In [31]:
movies["overview"] = movies["overview"].apply(lambda x:[i.replace(" ","") for i in x])
movies["genres"] = movies["genres"].apply(lambda x:[i.replace(" ","") for i in x])
movies["keywords"] = movies["keywords"].apply(lambda x:[i.replace(" ","") for i in x])
movies["cast"] = movies["cast"].apply(lambda x:[i.replace(" ","") for i in x])
movies["crew"] = movies["crew"].apply(lambda x:[i.replace(" ","") for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["overview"] = movies["overview"].apply(lambda x:[i.replace(" ","") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["genres"] = movies["genres"].apply(lambda x:[i.replace(" ","") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["keywords"] = movies["keywor

In [32]:
movies.head()

Unnamed: 0,movie_id,title,release_date,overview,genres,keywords,cast,crew
0,19995,Avatar,2009-12-10,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,2007-05-19,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,2015-10-26,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,2012-07-16,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,2012-03-07,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


## *Convert release_date*

In [45]:
movies["release_date"] = movies["release_date"].apply(lambda x: [str(x)[:4]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["release_date"] = movies["release_date"].apply(lambda x: [str(x)[:4]])


In [46]:
movies.head()

Unnamed: 0,movie_id,title,release_date,overview,genres,keywords,cast,crew
0,19995,Avatar,[2009],"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,[2007],"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,[2015],"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,[2012],"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,[2012],"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


## *Combine columns: release_date, overview, genres, keywords, cast & crew*

In [47]:
movies.head()

Unnamed: 0,movie_id,title,release_date,overview,genres,keywords,cast,crew
0,19995,Avatar,[2009],"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,[2007],"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,[2015],"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,[2012],"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,[2012],"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [50]:
movies["tags"] = movies["release_date"] + movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["tags"] = movies["release_date"] + movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]


In [51]:
movies = movies[["movie_id", "title", "tags"]]
movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[2009, In, the, 22nd, century,, a, paraplegic,..."
1,285,Pirates of the Caribbean: At World's End,"[2007, Captain, Barbossa,, long, believed, to,..."
2,206647,Spectre,"[2015, A, cryptic, message, from, Bond’s, past..."
3,49026,The Dark Knight Rises,"[2012, Following, the, death, of, District, At..."
4,49529,John Carter,"[2012, John, Carter, is, a, war-weary,, former..."


In [52]:
movies["tags"] = movies["tags"].apply(lambda x:" ".join(x))
movies["tags"] = movies["tags"].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["tags"] = movies["tags"].apply(lambda x:" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["tags"] = movies["tags"].apply(lambda x: x.lower())


In [54]:
movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"2009 in the 22nd century, a paraplegic marine ..."
1,285,Pirates of the Caribbean: At World's End,"2007 captain barbossa, long believed to be dea..."
2,206647,Spectre,2015 a cryptic message from bond’s past sends ...
3,49026,The Dark Knight Rises,2012 following the death of district attorney ...
4,49529,John Carter,"2012 john carter is a war-weary, former milita..."


In [55]:
movies["tags"][0]

'2009 in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

## *Stem contents of tags with NLTK*

In [56]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [57]:
def stem(text):
    ret = []
    words = text.split(" ")
    for word in words:
        ret.append(stemmer.stem(word))
    return " ".join(ret)

In [58]:
movies["tags"] = movies["tags"].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["tags"] = movies["tags"].apply(stem)


## *Vectorize movie tags data*

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words="english")

In [60]:
vectors = cv.fit_transform(movies["tags"]).toarray()

In [61]:
vectors.shape

(4805, 5000)

In [62]:
# for i in cv.get_feature_names_out():
    # print(i)

## *Measure Cosine Similarity*

In [63]:
from sklearn.metrics.pairwise import cosine_similarity

In [64]:
similarity_matrix = cosine_similarity(vectors)

In [65]:
similarity_matrix = pd.DataFrame(similarity_matrix, index=movies['title'], columns=movies['title'])

In [66]:
similarity_matrix

title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,Batman v Superman: Dawn of Justice,...,On The Downlow,Sanctuary: Quite a Conundrum,Bang,Primer,Cavite,El Mariachi,Newlyweds,"Signed, Sealed, Delivered",Shanghai Calling,My Date with Drew
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,1.000000,0.082261,0.084717,0.073005,0.187317,0.107434,0.039936,0.145095,0.087407,0.095673,...,0.000000,0.000000,0.041885,0.052632,0.000000,0.019118,0.044992,0.044566,0.000000,0.000000
Pirates of the Caribbean: At World's End,0.082261,1.000000,0.058849,0.038035,0.073193,0.134332,0.020806,0.125988,0.060718,0.099689,...,0.000000,0.000000,0.021822,0.027420,0.000000,0.039841,0.000000,0.023218,0.000000,0.025482
Spectre,0.084717,0.058849,1.000000,0.058756,0.075378,0.069171,0.021427,0.155700,0.062531,0.102665,...,0.082061,0.000000,0.000000,0.000000,0.017236,0.041030,0.000000,0.023911,0.000000,0.000000
The Dark Knight Rises,0.073005,0.038035,0.058756,1.000000,0.048718,0.059609,0.055395,0.067087,0.040414,0.199062,...,0.026519,0.053037,0.058099,0.054754,0.022280,0.066296,0.000000,0.030909,0.066354,0.084807
John Carter,0.187317,0.073193,0.075378,0.048718,1.000000,0.095590,0.053300,0.193649,0.077771,0.106407,...,0.034021,0.034021,0.074536,0.023415,0.142915,0.153093,0.000000,0.019826,0.106407,0.043519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
El Mariachi,0.019118,0.039841,0.041030,0.066296,0.153093,0.078049,0.029013,0.140546,0.127000,0.034752,...,0.000000,0.055556,0.136931,0.057354,0.256718,1.000000,0.000000,0.000000,0.156386,0.124367
Newlyweds,0.044992,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.065372,0.000000,0.000000,0.027462,0.000000,1.000000,0.114291,0.000000,0.000000
"Signed, Sealed, Delivered",0.044566,0.023218,0.023911,0.030909,0.019826,0.018194,0.000000,0.000000,0.049341,0.020253,...,0.064752,0.032376,0.017733,0.044566,0.013601,0.000000,0.114291,1.000000,0.040505,0.041416
Shanghai Calling,0.000000,0.000000,0.000000,0.066354,0.106407,0.058587,0.036298,0.043959,0.105925,0.021739,...,0.000000,0.069505,0.095173,0.000000,0.145989,0.156386,0.000000,0.040505,1.000000,0.088911


## *Make Recommendations*

In [70]:
def make_rocommendation(movie_name):
    recommended_movies = []
    similar_scores = similarity_matrix[movie_name]
    similar_scores = pd.DataFrame(similarity_matrix[movie_name])
    similar_scores = similar_scores.sort_values(movie_name, ascending=False)[1:11]
    # similar_scores = similar_scores.rename(columns={movie_name: "similarity_index"})
    similar_scores = similar_scores.merge(movies, on="title")
    for m in similar_scores["title"]:
        recommended_movies.append(m)
    return recommended_movies

In [71]:
make_rocommendation("The Godfather")

['The Godfather: Part II',
 'The Godfather: Part III',
 'City By The Sea',
 'Sexy Beast',
 'Road to Perdition',
 'Dom Hemingway',
 'Harsh Times',
 'Donnie Brasco',
 'Blood Ties',
 'The Usual Suspects']

In [74]:
# for mov in movies.title:
#     if "God" in mov:
#         print(mov)

In [73]:
# import pickle

# pickle.dump(similarity_matrix, open("similarity_matrix.pkl", "wb"))
# pickle.dump(new_df, open("movies.pkl", "wb"))