In [3]:
# Loading data from csv file
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
movies=pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/Movie/tmdb_5000_movies.csv")
movies=movies[['title','genres','keywords','overview']]
movies.rename(columns={'title':"name"},inplace=True)
movies.head(2)

Unnamed: 0,name,genres,keywords,overview
0,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha..."


In [6]:
credit=pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/Movie/tmdb_5000_credits.csv")
credit.rename(columns={'title':'name','movie_id':'id'},inplace=True)
credit.head(2)

Unnamed: 0,id,name,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [7]:
print(movies.shape)
print(credit.shape)

(4803, 4)
(4803, 4)


#Data Cleaning:

In [8]:
# Merging on "name" columns
movies = movies.merge(credit,on='name')

In [9]:
movies.shape

(4809, 7)

In [10]:
#checking duplicate rows
movies.duplicated().sum()

0

In [11]:
# checking null values in each column 
movies.isnull().sum()

name        0
genres      0
keywords    0
overview    3
id          0
cast        0
crew        0
dtype: int64

In [12]:
# dropping rows with NA values
movies.dropna(inplace=True)
movies.isnull().sum()

name        0
genres      0
keywords    0
overview    0
id          0
cast        0
crew        0
dtype: int64

In [13]:
movies.sample(3)

Unnamed: 0,name,genres,keywords,overview,id,cast,crew
3255,Much Ado About Nothing,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 35, ""name...","[{""id"": 497, ""name"": ""shakespeare""}, {""id"": 13...","In this Shakespearean farce, Hero and her groo...",11971,"[{""cast_id"": 9, ""character"": ""Benedikt"", ""cred...","[{""credit_id"": ""52fe44ad9251416c7503d12b"", ""de..."
1917,Shark Night,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 53, ""nam...","[{""id"": 2988, ""name"": ""shark attack""}, {""id"": ...",A weekend at a lake house in the Louisiana Gul...,65055,"[{""cast_id"": 2, ""character"": ""Sara Palski"", ""c...","[{""credit_id"": ""52fe46fac3a368484e0ae759"", ""de..."
2550,Not Another Teen Movie,"[{""id"": 35, ""name"": ""Comedy""}]","[{""id"": 240, ""name"": ""underdog""}, {""id"": 2283,...","On a bet, a gridiron hero at John Hughes High ...",11397,"[{""cast_id"": 11, ""character"": ""Janey Briggs"", ...","[{""credit_id"": ""58c2e4269251417326000f9e"", ""de..."


# Extracting generes keywords cast and crew

In [14]:
import ast

In [15]:
def _help(lst,val):# val=genres,keyword,actor,director
  genres=[]
  keywords=[]
  director=[]
  actor=[]
  #-------------------------------------------------------
  if val=="actor":
    for dic in lst[0:3]:
      actor.append(dic["name"].lower().replace(" ",""))
    return actor
  else:
    for dic in lst:
      #--------------------genres-------------------------
      if val=="genres":
        genres.append(dic["name"].lower().replace(" ",""))
      #------------------Keywords---------------------------
      elif val=="keyword":
        keywords.append(dic["name"].lower().replace(" ",""))
      #-----------------crew----------------------------------
      elif val=="director":
        if dic["job"]=="Director":
          director.append(dic["name"].lower().replace(" ",""))
  #-----------------------------------------------------------
  if val=="genres":
    return genres
  elif val=="keyword":
    return keywords
  elif val=="director":
    return director

In [16]:
def tag(genres,keyword,cast,crew,overview):
  rows=[]
  # which runs on genres,keywords,cast and crew
  for i in range(len(genres)):
    temp=overview[i].lower()
    #-----------------genres-------------
    gen=ast.literal_eval(genres[i])
    A=_help(gen,"genres")
    temp=temp+" ".join(A)+" "
    #---------------Keyword---------------
    key=ast.literal_eval(keyword[i])
    B=_help(key,"keyword")
    temp=temp+" ".join(B)+" "
    #--------------cast---------------
    cst=ast.literal_eval(cast[i])
    C=_help(cst,"actor")
    temp=temp+" ".join(C)+" "
    #----------------Crew----------------
    cre=ast.literal_eval(crew[i])
    D=_help(cre,"director")
    temp=temp+" ".join(D)+""
    #---------------------------------------
    #removing all special characters
    rows.append(re.sub(r"\W+|_", " ", temp))
    del temp,A,B,C,D,gen,key,cst,cre
  return np.array(rows)

In [17]:
#calling Tag() function that returns the numpy array that contains "numpy_strings"
X=tag(movies.genres.values,movies.keywords.values,movies.cast.values,movies.crew.values,movies.overview.values)

# Adding "X" as column named "Tag", to movies DataFrame
movies["Tags"]=X

# Dropping columns [genres', 'keywords', 'overview','cast','crew']
movies.drop(columns=['genres', 'keywords', 'overview','cast','crew'],inplace=True)
movies.head(2)

Unnamed: 0,name,id,Tags
0,Avatar,19995,in the 22nd century a paraplegic marine is dis...
1,Pirates of the Caribbean: At World's End,285,captain barbossa long believed to be dead has ...


# converting "Tags",to Vecotors, using TFIDF


**Performing Stemming, before convering into vectors**

In [18]:
def stemmer(tags):
  PS=PorterStemmer()
  rows=[]
  for sen in tags:
    #---------------------
    lst=word_tokenize(sen)
    for i in range(len(lst)):
      lst[i]=PS.stem(lst[i])
    rows.append((" ").join(lst))
    del lst
  return np.array(rows)  

In [19]:
# now, X is stemmed
X=stemmer(movies.Tags)

**TF IDF**

In [20]:
X_vec=TfidfVectorizer(max_features=5000,stop_words='english',ngram_range=(1,1),min_df=10)
data=X_vec.fit_transform(X)

In [21]:
# Feature Names
#X_vec.get_feature_names_out()

In [22]:
X_vec.idf_

array([5.7642564 , 5.81426682, 6.14562396, ..., 6.99292182, 7.0799332 ,
       6.83877114])

In [23]:
len(X_vec.get_feature_names_out())

3292

**Calculating Cosine Similarity**

In [24]:
CS=cosine_similarity(data)

In [25]:
CS.shape

(4806, 4806)

In [26]:
CS[0]

array([1.        , 0.02588031, 0.03502353, ..., 0.05077461, 0.00629113,
       0.        ])

In [27]:
len(CS[0])

4806

**Movie Recommender**

In [28]:
import random

In [29]:
def recommend_movie(movie_name):
  idx=movies[movies.name==movie_name].index[0]
  temp=np.argsort(CS[idx])[::-1][1:6]
  #print(CS[0][temp])
  #print("-"*40)
  print("#. "+movie_name)
  print("5 recommended movies below:")
  for i in temp:
    print(movies["name"].iloc[i])
  print("-"*40)

# recommending Harry Potter movies to check

In [30]:
recommend_movie("Harry Potter and the Philosopher's Stone")

#. Harry Potter and the Philosopher's Stone
5 recommended movies below:
Harry Potter and the Goblet of Fire
Harry Potter and the Chamber of Secrets
Harry Potter and the Half-Blood Prince
Harry Potter and the Order of the Phoenix
Harry Potter and the Prisoner of Azkaban
----------------------------------------


# Recommending 5 randomly choosen movies

In [39]:
r=random.sample(range(0,len(list(movies.name))),5)
for idx in r:
  recommend_movie(list(movies.name)[idx])

#. Departure
5 recommended movies below:
This Is It
Cloudy with a Chance of Meatballs
Cloudy with a Chance of Meatballs 2
The Last Waltz
Foodfight!
----------------------------------------
#. Head of State
5 recommended movies below:
I Think I Love My Wife
Swing Vote
Top Five
Down to Earth
Undercover Brother
----------------------------------------
#. Walter
5 recommended movies below:
This Is It
Fame
Glee: The Concert Movie
Scott Walker: 30 Century Man
Masked and Anonymous
----------------------------------------
#. The Bridges of Madison County
5 recommended movies below:
A Walk on the Moon
Fur: An Imaginary Portrait of Diane Arbus
Closer
Far from Heaven
Flags of Our Fathers
----------------------------------------
#. Gangster's Paradise: Jerusalema
5 recommended movies below:
Frenzy
Drowning Mona
Gone Girl
Jack Reacher
Sea of Love
----------------------------------------
