In [1]:
import pandas as pd
import numpy as np

# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Word2Vec Model
from gensim.models import Word2Vec

#  FastText Model
from gensim.models.fasttext import FastText

#  Doc2Vec Model
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Latent Dirichlet Allocation (LDA)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

# Bag of words
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.metrics.pairwise import cosine_similarity


### Loading the dataset

In [2]:
# Define the file paths for the two CSV files
file_path1 = r"D:\Project\Project-2023\Content Based Recommendation System-Movie Recommendation System\Dataset\TMDB 5000 Movie Dataset\tmdb_5000_credits.csv"
file_path2 = r"D:\Project\Project-2023\Content Based Recommendation System-Movie Recommendation System\Dataset\TMDB 5000 Movie Dataset\tmdb_5000_movies.csv" 

# Read the first CSV file into a pandas DataFrame
credits = pd.read_csv(file_path1)

# Read the second CSV file into another pandas DataFrame
movies = pd.read_csv(file_path2)

# Now you can work with the 'credits' and 'movies' DataFrames

In [3]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
credits.shape

(4803, 4)

In [5]:
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [6]:
credits.dtypes

movie_id     int64
title       object
cast        object
crew        object
dtype: object

In [7]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [8]:
movies.shape

(4803, 20)

In [9]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [10]:
movies.dtypes

budget                    int64
genres                   object
homepage                 object
id                        int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
dtype: object

In [11]:
# Concatenate the two DataFrames along rows (vertically)
# data = pd.concat([data1, data2], ignore_index=True)

In [12]:
# data.head() 
# here id referes to tmdb id

In [13]:
# data.dtypes

In [14]:
# data.shape

In [15]:
# data.describe()

In [16]:
credits.head(1).values

array([[19995, 'Avatar',
        '[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe

### Merging two data on the basis of title

In [17]:
movies = movies.merge(credits,on="title")

In [18]:
movies.shape

(4809, 23)

In [19]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [20]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [21]:
movies.dtypes

budget                    int64
genres                   object
homepage                 object
id                        int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
movie_id                  int64
cast                     object
crew                     object
dtype: object

### Data pre-processing
#### Droping the un-wanted columns

#### checking all the columns respectively and deciding for each and every column

In [22]:
# Budgets of the movie is not requred for any purpose so obviously remove this column
movies['budget'].head()


0    237000000
1    300000000
2    245000000
3    250000000
4    260000000
Name: budget, dtype: int64

In [23]:
# homepage is all the links of the movies websites whis is not needed at all so remove it
movies['homepage'].head()

0                     http://www.avatarmovie.com/
1    http://disney.go.com/disneypictures/pirates/
2     http://www.sonypictures.com/movies/spectre/
3              http://www.thedarkknightrises.com/
4            http://movies.disney.com/john-carter
Name: homepage, dtype: object

In [24]:
# id is important value. It is used to fetch the image data later 
movies['id'].value_counts().sum()

4809

In [25]:
# Keywords is important data 
movies['keywords'].head()

0    [{"id": 1463, "name": "culture clash"}, {"id":...
1    [{"id": 270, "name": "ocean"}, {"id": 726, "na...
2    [{"id": 470, "name": "spy"}, {"id": 818, "name...
3    [{"id": 849, "name": "dc comics"}, {"id": 853,...
4    [{"id": 818, "name": "based on novel"}, {"id":...
Name: keywords, dtype: object

In [26]:
# original_language
# Around 95% is in english language rest 5 percent is other foreign language so we can remove it
movies['original_language'].value_counts()

en    4510
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
ko      12
cn      12
ru      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
ta       2
cs       2
ro       2
id       2
ar       2
vi       1
sl       1
ps       1
no       1
ky       1
hu       1
pl       1
af       1
nb       1
tr       1
is       1
xx       1
te       1
el       1
Name: original_language, dtype: int64

In [27]:
# This is important data this tells us about the title of the data but i am removing this original_title column becuase this original_title can me in other languages 
# We also have the title column in which the titles are all in english language so i am using title rather than original_title 
movies['original_title'].head()

0                                      Avatar
1    Pirates of the Caribbean: At World's End
2                                     Spectre
3                       The Dark Knight Rises
4                                 John Carter
Name: original_title, dtype: object

In [28]:
# This is an important column this can be used to determine any similarities between the movies. So we will keep this column
movies['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [29]:
# Since it is a numeric approach we will not be using this column becuace we are trying to do tags but this is important column also
movies['popularity'].head()

0    150.437577
1    139.082615
2    107.376788
3    112.312950
4     43.926995
Name: popularity, dtype: float64

In [30]:
# We generally dont suggest movies on the basis of production companies so we are not using this column
movies['production_companies'].head()

0    [{"name": "Ingenious Film Partners", "id": 289...
1    [{"name": "Walt Disney Pictures", "id": 2}, {"...
2    [{"name": "Columbia Pictures", "id": 5}, {"nam...
3    [{"name": "Legendary Pictures", "id": 923}, {"...
4          [{"name": "Walt Disney Pictures", "id": 2}]
Name: production_companies, dtype: object

In [31]:
# We generally dont suggest movies on the basis of production countries so we are not using this column
movies['production_countries'].head()

0    [{"iso_3166_1": "US", "name": "United States o...
1    [{"iso_3166_1": "US", "name": "United States o...
2    [{"iso_3166_1": "GB", "name": "United Kingdom"...
3    [{"iso_3166_1": "US", "name": "United States o...
4    [{"iso_3166_1": "US", "name": "United States o...
Name: production_countries, dtype: object

In [32]:
# This is an important factor since we can categories movies on the basis of release date. But we are not using this column becuace again it is numerical value 
movies['release_date'].head()

0    2009-12-10
1    2007-05-19
2    2015-10-26
3    2012-07-16
4    2012-03-07
Name: release_date, dtype: object

In [33]:
# This is not important column so we are not using this column
movies['revenue'].head()

0    2787965087
1     961000000
2     880674609
3    1084939099
4     284139100
Name: revenue, dtype: int64

In [34]:
# This is not important column so we are not usnig this column
movies['runtime'].head()

0    162.0
1    169.0
2    148.0
3    165.0
4    132.0
Name: runtime, dtype: float64

In [35]:
# this is not important column so we are not usnig this column since story matters
movies['spoken_languages'].head()

0    [{"iso_639_1": "en", "name": "English"}, {"iso...
1             [{"iso_639_1": "en", "name": "English"}]
2    [{"iso_639_1": "fr", "name": "Fran\u00e7ais"},...
3             [{"iso_639_1": "en", "name": "English"}]
4             [{"iso_639_1": "en", "name": "English"}]
Name: spoken_languages, dtype: object

In [36]:
# This is not important column so we are not usnig this column since released or unreleased doesnt matter
movies['status'].head()

0    Released
1    Released
2    Released
3    Released
4    Released
Name: status, dtype: object

In [37]:
# This is not important column so we are not usnig this column becuase overview coulmn is already solving this problem
movies['tagline'].head()

0                       Enter the World of Pandora.
1    At the end of the world, the adventure begins.
2                             A Plan No One Escapes
3                                   The Legend Ends
4              Lost in our world, found in another.
Name: tagline, dtype: object

In [38]:
# This is important column we will be using this column
movies['title'].head()

0                                      Avatar
1    Pirates of the Caribbean: At World's End
2                                     Spectre
3                       The Dark Knight Rises
4                                 John Carter
Name: title, dtype: object

In [39]:
# this is important column we will be using this column
movies['vote_average'].head()

0    7.2
1    6.9
2    6.3
3    7.6
4    6.1
Name: vote_average, dtype: float64

In [40]:
# This is important column we will be using this column
movies['vote_count'].head()

0    11800
1     4500
2     4466
3     9106
4     2124
Name: vote_count, dtype: int64

In [41]:
# this is not important column we will not be using this column becuase we are already using the id column
movies['movie_id'].head()

0     19995
1       285
2    206647
3     49026
4     49529
Name: movie_id, dtype: int64

In [42]:
# We recommend movies based on actors so this column is important we will be using it
movies['cast'].head()

0    [{"cast_id": 242, "character": "Jake Sully", "...
1    [{"cast_id": 4, "character": "Captain Jack Spa...
2    [{"cast_id": 1, "character": "James Bond", "cr...
3    [{"cast_id": 2, "character": "Bruce Wayne / Ba...
4    [{"cast_id": 5, "character": "John Carter", "c...
Name: cast, dtype: object

In [43]:
# in this coumn w have director so we will use this column
movies['crew'].head()

0    [{"credit_id": "52fe48009251416c750aca23", "de...
1    [{"credit_id": "52fe4232c3a36847f800b579", "de...
2    [{"credit_id": "54805967c3a36829b5002c41", "de...
3    [{"credit_id": "52fe4781c3a36847f81398c3", "de...
4    [{"credit_id": "52fe479ac3a36847f813eaa3", "de...
Name: crew, dtype: object

### keeping only id, title, overview, genres, keywords, cast, crew columns and removing rest

In [44]:
movies=movies[['id','title','overview','genres','keywords','cast','crew']]

In [45]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


#### Checking and handelling Missing Values

In [46]:
movies.isnull().sum()

id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [47]:
## If there are any missing values in data so drop them if they are less than 5%
movies.dropna(inplace=True)

In [48]:
movies.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [49]:
# Checking wether there are any duplicate values
movies.duplicated().sum()

0

In [50]:
# now making all the columns in right required format
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

#### Task :
#### '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
#### Convert into this format as shown below
#### ['Action','Adventure','Fantasy','Science Fiction']

#### Problem while converting: 
#### '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
####  The given list is in the format of string so we will get the error

#### Solution:
#### import ast
#### ast.litral_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')
#### This will convert all sting list into list

In [51]:
# def convert(obj):
#     List=[]
#     for i in obj:
#         List.append(i["name"])
#     return List

In [52]:
convert('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'')

SyntaxError: unterminated string literal (detected at line 1) (1371293799.py, line 1)

In [53]:
import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [54]:
# Function to convert objects into List
def convert(obj):
    List=[]
    for i in ast.literal_eval(obj):
        List.append(i["name"])
    return List


In [55]:
movies['genres']=movies['genres'].apply(convert)

In [56]:
movies['genres'].head()

0    [Action, Adventure, Fantasy, Science Fiction]
1                     [Adventure, Fantasy, Action]
2                       [Action, Adventure, Crime]
3                 [Action, Crime, Drama, Thriller]
4             [Action, Adventure, Science Fiction]
Name: genres, dtype: object

In [57]:
movies['keywords'] = movies['keywords'].apply(convert)

In [58]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


#### For cast column I am considering only first 5 character only, rest all I am ignoring

In [59]:
movies['cast'][0]

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [60]:
## Converting Cast column
def convert_Cast(obj):
    List=[]
    count = 0
    for i in ast.literal_eval(obj):
        if count !=5:
            List.append(i["name"])
            count += 1
        else:
            break
    return List

In [61]:
movies['cast'] = movies['cast'].apply(convert_Cast)

In [62]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


#### For crew column i only need the Dictionary where job:Director 

In [63]:
movies["crew"][0]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [64]:
def fetch_director(obj):
    List=[]
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            List.append(i["name"])
    return List

In [65]:
# Fetching only the Director name form Crew column
movies['crew'] = movies['crew'].apply(fetch_director)

In [66]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


#### In overview column we data in whole string format so I am splittig it and storing it inside a list using lambda finction
#### lambda x : x.split()

In [67]:
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [68]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


#### Problem : Note there are white spaces in the genres,keywords,cast,crew columns we need to remove it becuase there will be ambiguity while creating tags
#### Science Fiction -----> ScienceFiction 
#### here it will consider as one entity

#### Solution use lambda function and inside use list comprehension
#### eg: lambda x : [i.replace(" ","") for i in x]

In [69]:
movies['genres'] = movies['genres'].apply(lambda x : [i.replace(" ","") for i in x])

In [70]:
movies['keywords'] = movies['keywords'].apply(lambda x : [i.replace(" ","") for i in x])

In [71]:
movies['cast'] = movies['cast'].apply(lambda x : [i.replace(" ","") for i in x])

In [72]:
movies['crew'] = movies['crew'].apply(lambda x : [i.replace(" ","") for i in x])

In [73]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]


### Creating tags column by concatenating overview, genres, keywords, cast, crew columns

In [74]:
movies['tags']= movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [75]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


### we dont require overview, genres, keywords, cast, crew columns anymore since are considered in tags columns so remove it

In [76]:
new_df = movies[['id', 'title','tags']]

In [77]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


#### now again convert the tags which are in list into string by using join() inside lambda 
#### lambda x:" ".join(x)

In [78]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [79]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [80]:
new_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver StephenLang MichelleRodriguez JamesCameron'

#### Recommended that stings should be in lower case

In [81]:
new_df['tags'] = new_df['tags'].apply(lambda x : x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x : x.lower())


In [82]:
new_df['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez jamescameron'

In [83]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


### TF-IDF vectorization and cosine similarity

In [84]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(new_df['tags'])

In [85]:
# Calculate Cosine Similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [86]:
# Get Movie Recommendations
indices = pd.Series(new_df.index, index=new_df['title'])

def get_recommendations(title, num_recommendations=5):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations + 1]
    movie_indices = [i[0] for i in sim_scores]
    return new_df['title'].iloc[movie_indices]

movie_title = 'Pirates of the Caribbean: At World\'s End'
recommendations = get_recommendations(movie_title)
print("Recommendations for", movie_title, ":")
print(recommendations)
recommendations

Recommendations for Pirates of the Caribbean: At World's End :
12            Pirates of the Caribbean: Dead Man's Chest
199    Pirates of the Caribbean: The Curse of the Bla...
17           Pirates of the Caribbean: On Stranger Tides
848        The Pirates! In an Adventure with Scientists!
536                                    Anna and the King
Name: title, dtype: object


12            Pirates of the Caribbean: Dead Man's Chest
199    Pirates of the Caribbean: The Curse of the Bla...
17           Pirates of the Caribbean: On Stranger Tides
848        The Pirates! In an Adventure with Scientists!
536                                    Anna and the King
Name: title, dtype: object

### Word2Vec model

In [87]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=new_df['tags'], vector_size=100, window=5, min_count=1, sg=1)

In [88]:
#  Using Word Embeddings for Movie Recommendations

# Function to get average word vector for a list of words
def average_word_vector(words, model, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for word in words:
        if word in model.wv:
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

# Create a column for average word vectors
num_features = 100
new_df['tag_vector'] = new_df['tags'].apply(lambda x: average_word_vector(x, word2vec_model, num_features))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tag_vector'] = new_df['tags'].apply(lambda x: average_word_vector(x, word2vec_model, num_features))


In [89]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between tag vectors
cosine_sim = cosine_similarity(np.stack(new_df['tag_vector']), np.stack(new_df['tag_vector']))

# Reset the index of the DataFrame
new_df = new_df.reset_index(drop=True)

# Get movie title and index mapping
movie_indices = pd.Series(new_df.index, index=new_df['title'])

# Function to get movie recommendations based on cosine similarity
def get_recommendations(title, cosine_sim_matrix, indices, num_recommendations=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    movie_indices = [x[0] for x in sim_scores]
    return new_df['title'].iloc[movie_indices]

# Test the recommendation system
movie_title = "Avatar"
recommendations = get_recommendations(movie_title, cosine_sim, movie_indices)
print(f"Recommendations for '{movie_title}':")
print(recommendations)


Recommendations for 'Avatar':
2774                    Paris, je t'aime
3166                         Thunderball
1624                        Side Effects
2241                    A Scanner Darkly
1805                        Original Sin
3609                   Sunshine Cleaning
165                                 Hulk
2439    Superman IV: The Quest for Peace
4083         Mad Max 2: The Road Warrior
4343                              Dr. No
Name: title, dtype: object


### FastText model

In [90]:
# Train FastText model
fasttext_model = FastText(sentences=new_df['tags'], vector_size=100, window=5, min_count=1, sg=1)


In [91]:

# Function to get average word vector for a list of words
def average_word_vector(words, model, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for word in words:
        if word in model.wv:
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

# Create a column for average word vectors
num_features = 100
new_df['tag_vector'] = new_df['tags'].apply(lambda x: average_word_vector(x, fasttext_model, num_features))

In [92]:
# Calculate cosine similarity between tag vectors
cosine_sim = cosine_similarity(np.stack(new_df['tag_vector']), np.stack(new_df['tag_vector']))

# Reset the index of the DataFrame
new_df = new_df.reset_index(drop=True)

# Get movie title and index mapping
movie_indices = pd.Series(new_df.index, index=new_df['title'])

# Function to get movie recommendations based on cosine similarity
def get_recommendations(title, cosine_sim_matrix, indices, num_recommendations=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    movie_indices = [x[0] for x in sim_scores]
    return new_df['title'].iloc[movie_indices]

# Test the recommendation system
movie_title = "Avatar"
recommendations = get_recommendations(movie_title, cosine_sim, movie_indices)
print(f"Recommendations for '{movie_title}':")
print(recommendations)

Recommendations for 'Avatar':
245             Tomorrow Never Dies
2774               Paris, je t'aime
506                 Despicable Me 2
2241               A Scanner Darkly
4343                         Dr. No
4042                      Antibirth
1193                Shall We Dance?
4083    Mad Max 2: The Road Warrior
839                           Evita
1156                           Lucy
Name: title, dtype: object


###  Doc2Vec Model

In [93]:
# Create tagged documents for training
tagged_data = [TaggedDocument(words=tags, tags=[str(i)]) for i, tags in enumerate(new_df['tags'])]

# Train Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=1, dm=1, epochs=30, workers=4)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [94]:
# Create a column for document vectors
new_df['doc_vector'] = [doc2vec_model.dv[str(i)] for i in range(len(tagged_data))]


In [111]:
# Calculate cosine similarity between document vectors
cosine_sim_doc = cosine_similarity(new_df['doc_vector'].tolist(), new_df['doc_vector'].tolist())

# Function to get movie recommendations based on similarity
def get_recommendations(title, num_recommendations=5):
    idx = new_df[new_df['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim_doc[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations + 1]
    movie_indices = [i[0] for i in sim_scores]
    return new_df['title'].iloc[movie_indices]

# Get recommendations for a movie title
movie_title = 'Pirates of the Caribbean: At World\'s End'
recommendations = get_recommendations(movie_title)
print("Recommendations for", movie_title, ":")
print(recommendations)


Recommendations for Pirates of the Caribbean: At World's End :
3691    Pat Garrett & Billy the Kid
4531                     Mean Creek
2854                       Nebraska
1485                    Exit Wounds
3536                    Ghost World
Name: title, dtype: object


###  BERT Model

In [98]:
# # Load the pre-trained BERT model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')


In [99]:
# # Create a function to encode movie descriptions using BERT embeddings
# def encode_description(description):
#     inputs = tokenizer(description, return_tensors='pt', truncation=True, padding=True)
#     outputs = model(**inputs)
#     embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
#     return embeddings

# # Apply the encoding function to create BERT embeddings for movie descriptions
# movies['description_embeddings'] = movies['overview'].apply(encode_description)

# # Calculate cosine similarity between description embeddings
# cosine_sim = cosine_similarity(np.stack(movies['description_embeddings']), np.stack(movies['description_embeddings']))

# # Reset the index of the DataFrame
# movies = movies.reset_index(drop=True)

# # Get movie title and index mapping
# movie_indices = pd.Series(movies.index, index=movies['title'])

In [100]:
# # Function to get movie recommendations based on cosine similarity
# def get_recommendations(title, cosine_sim_matrix, indices, num_recommendations=10):
#     idx = indices[title]
#     sim_scores = list(enumerate(cosine_sim_matrix[idx]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:num_recommendations+1]
#     movie_indices = [x[0] for x in sim_scores]
#     return movies['title'].iloc[movie_indices]

# # Test the recommendation system
# movie_title = "Avatar"
# recommendations = get_recommendations(movie_title, cosine_sim, movie_indices)
# print(f"Recommendations for '{movie_title}':")
# print(recommendations)

### Latent Dirichlet Allocation (LDA) 

In [104]:
# Combine relevant text data columns into a single column
new_df['text_data'] = new_df['tags']

# Create a CountVectorizer to convert text data into a bag-of-words representation
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(new_df['text_data'])

# Apply Latent Dirichlet Allocation (LDA) to discover topics in the text data
num_topics = 10  # Number of topics to discover
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_matrix = lda_model.fit_transform(X)

# Calculate cosine similarity between LDA topic distributions and reset the DataFrame index
cosine_sim = cosine_similarity(lda_matrix, lda_matrix)
new_df = new_df.reset_index(drop=True)

# Create a mapping between movie titles and indices
movie_indices = pd.Series(new_df.index, index=new_df['title'])

In [106]:
# Function to get movie recommendations based on cosine similarity
def get_recommendations(title, cosine_sim_matrix, indices, num_recommendations=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    movie_indices = [x[0] for x in sim_scores]
    return movies['title'].iloc[movie_indices]

# Get the movie title you want recommendations for
movie_title = "Avatar"

# Get recommendations using the function
recommendations = get_recommendations(movie_title, cosine_sim, movie_indices)

# Print the recommendations
print(f"Recommendations for '{movie_title}':")
for idx, recommendation in enumerate(recommendations, start=1):
    print(f"{idx}. {recommendation}")


Recommendations for 'Avatar':
1. Highlander: The Final Dimension
2. Kites
3. 5 Days of War
4. They Live
5. Galaxina
6. Air Force One
7. Steamboy
8. 12 Angry Men
9. The Fifth Element
10. Shin Godzilla


### Bag of words

In [109]:
# Create a CountVectorizer to convert text data into a bag-of-words representation
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(new_df['tags'])

# Calculate cosine similarity between bag-of-words representations
cosine_sim_bow = cosine_similarity(X, X)

# Reset the index of the DataFrame
new_df = new_df.reset_index(drop=True)

# Get movie title and index mapping
movie_indices_bow = pd.Series(new_df.index, index=new_df['title'])


In [110]:
# Function to get movie recommendations based on cosine similarity
def get_recommendations_bow(title, cosine_sim_matrix, indices, num_recommendations=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    movie_indices = [x[0] for x in sim_scores]
    return new_df['title'].iloc[movie_indices]

# Get the movie title you want recommendations for
movie_title = "Avatar"

# Get recommendations using the BoW model
recommendations_bow = get_recommendations_bow(movie_title, cosine_sim_bow, movie_indices_bow)

# Print the recommendations
print(f"Recommendations for '{movie_title}' using BoW:")
for idx, recommendation in enumerate(recommendations_bow, start=1):
    print(f"{idx}. {recommendation}")

Recommendations for 'Avatar' using BoW:
1. Titan A.E.
2. Aliens
3. Aliens vs Predator: Requiem
4. Ender's Game
5. Independence Day
6. Battle: Los Angeles
7. Small Soldiers
8. Lifeforce
9. Falcon Rising
10. Predators
