In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
credits = pd.read_csv('credits_dataset.csv')
movies = pd.read_csv('movies_dataset.csv')

In [5]:
movies = movies.merge(credits, left_on = 'title', right_on = 'title')

In [7]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'original_language', 'production_countries', 'tagline']]

In [8]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,original_language,production_countries,tagline
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",en,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",Enter the World of Pandora.


In [9]:
def convert(obj):
  L = []
  for i in ast.literal_eval(obj):
    L.append(i['name'])
  return L

In [10]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(lambda x: [(i['character'], i['name']) for i in ast.literal_eval(x)][:5])
movies['crew'] = movies['crew'].apply(lambda x: [i['name'] for i in ast.literal_eval(x) if i['job'] == 'Director'])
movies['production_countries'] = movies['production_countries'].apply(convert)

In [11]:
print(movies['genres'])
print(movies['keywords'])
print(movies['cast'])
print(movies['crew'])
print(movies['production_countries'])

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4809, dtype: object
0       [culture clash, future, space war, space colon...
1       [ocean, drug abuse, exotic island, east india ...
2       [spy, based on novel, secret agent, sequel, mi...
3       [dc comics, crime fighter, terrorist, secret i...
4       [based on novel, mars, medallion, space travel...
                              ...                        
4804    [united 

In [14]:
movies['tag'] = movies['title'] + ' ' + movies['genres'].apply(lambda x: " ".join(x)) + movies['keywords'].apply(lambda x: " ".join(x)) + movies['cast'].apply(lambda x: " ".join(map(str, x))) + movies['crew'].apply(lambda x: " ".join(x)) + movies['production_countries'].apply(lambda x: " ".join(x))

In [16]:
movies = movies[['movie_id', 'title', 'tagline', 'overview',  'original_language', 'tag']]

In [18]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Taha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Taha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Taha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
def clean_text(text):
  text = re.sub(r'[^\w\s]', '', text)
  tokens = nltk.word_tokenize(text.lower())
  return ' '.join(tokens)

In [20]:
movies['tag'] = movies['tag'].apply(clean_text)

In [21]:
movies['title'] = movies['title'].apply(clean_text)

In [24]:
movies.head()

Unnamed: 0,movie_id,title,tagline,overview,original_language,tag
0,19995,avatar,Enter the World of Pandora.,"In the 22nd century, a paraplegic Marine is di...",en,avatar action adventure fantasy science fictio...
1,285,pirate caribbean world end,"At the end of the world, the adventure begins.","Captain Barbossa, long believed to be dead, ha...",en,pirate caribbean world end adventure fantasy a...
2,206647,spectre,A Plan No One Escapes,A cryptic message from Bond’s past sends him o...,en,spectre action adventure crimespy based novel ...
3,49026,dark knight rise,The Legend Ends,Following the death of District Attorney Harve...,en,dark knight rise action crime drama thrillerdc...
4,49529,john carter,"Lost in our world, found in another.","John Carter is a war-weary, former military ca...",en,john carter action adventure science fictionba...


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies['tag'])

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [26]:
def get_recommendations(title, cosine_sim=cosine_sim):
  title = clean_text(title)
  idx = movies[movies['title'] == title].index
  if len(idx) == 0:
    print(f"Movie '{title}' not found in the database.")
    return []
  idx = idx[0]
  sim_scores = list(enumerate(cosine_sim[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[1:11]
  movie_indices = [i[0] for i in sim_scores]
  return movies['title'].iloc[movie_indices]

In [40]:
print(get_recommendations("LOVE LETTERS"))

2895                     darling companion
3852                        winter passing
4287                               rebecca
1640                      love other drugs
85      captain america the winter soldier
1099           love in the time of cholera
4250                             pontypool
1287                           snowpiercer
1301                        the ugly truth
3441                               control
Name: title, dtype: object


In [41]:
def get_recommendations_by_keywords(tag, cosine_sim=cosine_sim):

  tag = clean_text(tag) 
  keyword_vector = tfidf.transform([tag])  

  sim_scores = cosine_similarity(keyword_vector, tfidf_matrix).flatten() 

  sim_scores = list(enumerate(sim_scores))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[0:10]  

  movie_indices = [i[0] for i in sim_scores]
  return movies['title'].iloc[movie_indices]

In [42]:
recommendations = get_recommendations_by_keywords("action science fiction thor")
print(recommendations)

16                the avengers
7       avengers age of ultron
126        thor the dark world
3537                  galaxina
1717                   timecop
2967     the last days on mars
129                       thor
260                enders game
108         terminator genisys
4407          the helix loaded
Name: title, dtype: object


In [43]:
import pickle
with open('movies.pkl', 'wb') as file:
  pickle.dump((movies, cosine_sim), file)