In [3]:
#importing all the required dependencies

import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
# we have our dataset in 5 chunks, so we have to merge them.

df1 = pd.read_csv("data_100.csv", index_col =False)
df2 = pd.read_csv("data_200.csv", index_col =False)
df3 = pd.read_csv("data_300.csv", index_col =False)
df4 = pd.read_csv("data_400.csv", index_col =False)
df5 = pd.read_csv("data_500.csv", index_col =False)

df_merged = pd.concat([df1, df2, df3, df4, df5], ignore_index=False)
df_merged.head()

Unnamed: 0.1,Unnamed: 0,Title,Overview,Id,Genre,Tagline,Cast,Crew,Keywords
0,0.0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,278,"['Drama', 'Crime']",Fear can hold you prisoner. Hope can set you f...,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...","['Niki Marvin', 'Frank Darabont']","['prison', 'friendship', 'police brutality', '..."
1,1.0,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",238,"['Drama', 'Crime']",An offer you can't refuse.,"['Marlon Brando', 'Al Pacino', 'James Caan', '...","['Albert S. Ruddy', 'Francis Ford Coppola', 'F...","['based on novel or book', 'loss of loved one'..."
2,2.0,The Godfather Part II,In the continuing saga of the Corleone crime f...,240,"['Drama', 'Crime']",The rise and fall of the Corleone empire.,"['Al Pacino', 'Robert Duvall', 'Diane Keaton',...","['Francis Ford Coppola', 'Francis Ford Coppola']","['italian american', 'cuba', 'italy', 'gangste..."
3,3.0,Schindler's List,The true story of how businessman Oskar Schind...,424,"['Drama', 'History', 'War']","Whoever saves one life, saves the world entire.","['Liam Neeson', 'Ben Kingsley', 'Ralph Fiennes...","['Gerald R. Molen', 'Steven Spielberg', 'Steve...","['factory', 'hero', 'based on novel or book', ..."
4,4.0,12 Angry Men,The defense and the prosecution have rested an...,389,['Drama'],Life is in their hands — Death is on their minds!,"['Martin Balsam', 'John Fiedler', 'Lee J. Cobb...","['Reginald Rose', 'Henry Fonda', 'Sidney Lumet']","['death penalty', 'anonymity', 'court case', '..."


In [8]:
df_merged.to_csv('movies_data.csv', index=False)

# Download the file from Google Colab
from google.colab import files
files.download('movies_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import pickle

pickle.dump(df_merged.to_dict(), open('df_merged_dict.pkl', 'wb'))

In [7]:
df_merged.shape  # we can see that now we have 9567 movies in our dataset

(9567, 9)

In [9]:
df_merged = df_merged.drop('Unnamed: 0', axis=1)

In [10]:
df_merged.shape

(9567, 8)

In [11]:
movies = df_merged.copy()
movies.head()

Unnamed: 0,Title,Overview,Id,Genre,Tagline,Cast,Crew,Keywords
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,278,"['Drama', 'Crime']",Fear can hold you prisoner. Hope can set you f...,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...","['Niki Marvin', 'Frank Darabont']","['prison', 'friendship', 'police brutality', '..."
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",238,"['Drama', 'Crime']",An offer you can't refuse.,"['Marlon Brando', 'Al Pacino', 'James Caan', '...","['Albert S. Ruddy', 'Francis Ford Coppola', 'F...","['based on novel or book', 'loss of loved one'..."
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,240,"['Drama', 'Crime']",The rise and fall of the Corleone empire.,"['Al Pacino', 'Robert Duvall', 'Diane Keaton',...","['Francis Ford Coppola', 'Francis Ford Coppola']","['italian american', 'cuba', 'italy', 'gangste..."
3,Schindler's List,The true story of how businessman Oskar Schind...,424,"['Drama', 'History', 'War']","Whoever saves one life, saves the world entire.","['Liam Neeson', 'Ben Kingsley', 'Ralph Fiennes...","['Gerald R. Molen', 'Steven Spielberg', 'Steve...","['factory', 'hero', 'based on novel or book', ..."
4,12 Angry Men,The defense and the prosecution have rested an...,389,['Drama'],Life is in their hands — Death is on their minds!,"['Martin Balsam', 'John Fiedler', 'Lee J. Cobb...","['Reginald Rose', 'Henry Fonda', 'Sidney Lumet']","['death penalty', 'anonymity', 'court case', '..."


In [12]:
movies.isnull().sum()

Title          0
Overview       1
Id             0
Genre          0
Tagline     1506
Cast           0
Crew           0
Keywords       0
dtype: int64

we can see there are many movies for which there is no tagline so we are going to remove the tagline column

In [13]:
movies = movies.drop(['Tagline','Id'], axis=1)

In [14]:
movies.head()

Unnamed: 0,Title,Overview,Genre,Cast,Crew,Keywords
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"['Drama', 'Crime']","['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...","['Niki Marvin', 'Frank Darabont']","['prison', 'friendship', 'police brutality', '..."
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","['Drama', 'Crime']","['Marlon Brando', 'Al Pacino', 'James Caan', '...","['Albert S. Ruddy', 'Francis Ford Coppola', 'F...","['based on novel or book', 'loss of loved one'..."
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"['Drama', 'Crime']","['Al Pacino', 'Robert Duvall', 'Diane Keaton',...","['Francis Ford Coppola', 'Francis Ford Coppola']","['italian american', 'cuba', 'italy', 'gangste..."
3,Schindler's List,The true story of how businessman Oskar Schind...,"['Drama', 'History', 'War']","['Liam Neeson', 'Ben Kingsley', 'Ralph Fiennes...","['Gerald R. Molen', 'Steven Spielberg', 'Steve...","['factory', 'hero', 'based on novel or book', ..."
4,12 Angry Men,The defense and the prosecution have rested an...,['Drama'],"['Martin Balsam', 'John Fiedler', 'Lee J. Cobb...","['Reginald Rose', 'Henry Fonda', 'Sidney Lumet']","['death penalty', 'anonymity', 'court case', '..."


In [15]:
movies.isnull().sum()

Title       0
Overview    1
Genre       0
Cast        0
Crew        0
Keywords    0
dtype: int64

In [16]:
movies.dropna(inplace=True)
movies.isnull().sum()

Title       0
Overview    0
Genre       0
Cast        0
Crew        0
Keywords    0
dtype: int64

So, we have removed all the null values from our dataset

In [17]:
type(movies['Cast'].iloc[0])

str

In [18]:
movies['Cast'].iloc[0]

"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton', 'William Sadler', 'Clancy Brown', 'Gil Bellows', 'James Whitmore', 'Mark Rolston', 'Jeffrey DeMunn', 'Larry Brandenburg']"

In [19]:
movies['Overview'] = movies['Overview'].apply(lambda x: x.split())

In [20]:
movies.head()

Unnamed: 0,Title,Overview,Genre,Cast,Crew,Keywords
0,The Shawshank Redemption,"[Imprisoned, in, the, 1940s, for, the, double,...","['Drama', 'Crime']","['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...","['Niki Marvin', 'Frank Darabont']","['prison', 'friendship', 'police brutality', '..."
1,The Godfather,"[Spanning, the, years, 1945, to, 1955,, a, chr...","['Drama', 'Crime']","['Marlon Brando', 'Al Pacino', 'James Caan', '...","['Albert S. Ruddy', 'Francis Ford Coppola', 'F...","['based on novel or book', 'loss of loved one'..."
2,The Godfather Part II,"[In, the, continuing, saga, of, the, Corleone,...","['Drama', 'Crime']","['Al Pacino', 'Robert Duvall', 'Diane Keaton',...","['Francis Ford Coppola', 'Francis Ford Coppola']","['italian american', 'cuba', 'italy', 'gangste..."
3,Schindler's List,"[The, true, story, of, how, businessman, Oskar...","['Drama', 'History', 'War']","['Liam Neeson', 'Ben Kingsley', 'Ralph Fiennes...","['Gerald R. Molen', 'Steven Spielberg', 'Steve...","['factory', 'hero', 'based on novel or book', ..."
4,12 Angry Men,"[The, defense, and, the, prosecution, have, re...",['Drama'],"['Martin Balsam', 'John Fiedler', 'Lee J. Cobb...","['Reginald Rose', 'Henry Fonda', 'Sidney Lumet']","['death penalty', 'anonymity', 'court case', '..."


we can remove all the stop words from the overview column

In [21]:
# prompt: Remove all the stop words from the overview column
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

movies['Overview'] = movies['Overview'].apply(lambda x: [word for word in x if word not in stop_words])

movies.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Title,Overview,Genre,Cast,Crew,Keywords
0,The Shawshank Redemption,"[Imprisoned, 1940s, double, murder, wife, love...","['Drama', 'Crime']","['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...","['Niki Marvin', 'Frank Darabont']","['prison', 'friendship', 'police brutality', '..."
1,The Godfather,"[Spanning, years, 1945, 1955,, chronicle, fict...","['Drama', 'Crime']","['Marlon Brando', 'Al Pacino', 'James Caan', '...","['Albert S. Ruddy', 'Francis Ford Coppola', 'F...","['based on novel or book', 'loss of loved one'..."
2,The Godfather Part II,"[In, continuing, saga, Corleone, crime, family...","['Drama', 'Crime']","['Al Pacino', 'Robert Duvall', 'Diane Keaton',...","['Francis Ford Coppola', 'Francis Ford Coppola']","['italian american', 'cuba', 'italy', 'gangste..."
3,Schindler's List,"[The, true, story, businessman, Oskar, Schindl...","['Drama', 'History', 'War']","['Liam Neeson', 'Ben Kingsley', 'Ralph Fiennes...","['Gerald R. Molen', 'Steven Spielberg', 'Steve...","['factory', 'hero', 'based on novel or book', ..."
4,12 Angry Men,"[The, defense, prosecution, rested, jury, fili...",['Drama'],"['Martin Balsam', 'John Fiedler', 'Lee J. Cobb...","['Reginald Rose', 'Henry Fonda', 'Sidney Lumet']","['death penalty', 'anonymity', 'court case', '..."


In [22]:
type(movies['Overview'].iloc[0])

list

In [None]:
movies['Crew'].iloc[0]

"['Niki Marvin', 'Frank Darabont']"

In case of Genre, Keywords, cast and crew columns. We have to remove space between the each element.

For Example, 'James Cameron' will be converted to 'JamesCameron', 'Science fiction' will be converted to 'ScienceFiction'. We are doing this inorder to make the elements unique.

In [23]:
import ast

# Function to convert string representation of list to an actual list
def convert_to_list(crew_string):
    try:
        return ast.literal_eval(crew_string)
    except (ValueError, SyntaxError):
        return []

# Apply the function to the 'Crew' column
movies['Crew'] = movies['Crew'].apply(convert_to_list)
movies['Genre'] = movies['Genre'].apply(convert_to_list)
movies['Cast'] = movies['Cast'].apply(convert_to_list)
movies['Keywords'] = movies['Keywords'].apply(convert_to_list)

In [24]:
movies['Genre'] = movies['Genre'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['Crew'] = movies['Crew'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['Cast'] = movies['Cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['Keywords'] = movies['Keywords'].apply(lambda x: [i.replace(" ", "") for i in x])

In [25]:
movies.head()

Unnamed: 0,Title,Overview,Genre,Cast,Crew,Keywords
0,The Shawshank Redemption,"[Imprisoned, 1940s, double, murder, wife, love...","[Drama, Crime]","[TimRobbins, MorganFreeman, BobGunton, William...","[NikiMarvin, FrankDarabont]","[prison, friendship, policebrutality, corrupti..."
1,The Godfather,"[Spanning, years, 1945, 1955,, chronicle, fict...","[Drama, Crime]","[MarlonBrando, AlPacino, JamesCaan, RobertDuva...","[AlbertS.Ruddy, FrancisFordCoppola, FrancisFor...","[basedonnovelorbook, lossoflovedone, loveatfir..."
2,The Godfather Part II,"[In, continuing, saga, Corleone, crime, family...","[Drama, Crime]","[AlPacino, RobertDuvall, DianeKeaton, RobertDe...","[FrancisFordCoppola, FrancisFordCoppola]","[italianamerican, cuba, italy, gangster, prais..."
3,Schindler's List,"[The, true, story, businessman, Oskar, Schindl...","[Drama, History, War]","[LiamNeeson, BenKingsley, RalphFiennes, Caroli...","[GeraldR.Molen, StevenSpielberg, StevenSpielbe...","[factory, hero, basedonnovelorbook, nazi, conc..."
4,12 Angry Men,"[The, defense, prosecution, rested, jury, fili...",[Drama],"[MartinBalsam, JohnFiedler, LeeJ.Cobb, E.G.Mar...","[ReginaldRose, HenryFonda, SidneyLumet]","[deathpenalty, anonymity, courtcase, court, ju..."


In [26]:
movies.shape

(9566, 6)

In [27]:
# These are all the unique genres present in our movies['Genre'] column

unique_genres = pd.Series([genre for sublist in movies['Genre'] for genre in sublist]).unique()
print(unique_genres)

['Drama' 'Crime' 'History' 'War' 'Comedy' 'Romance' 'Animation' 'Family'
 'Fantasy' 'Action' 'Thriller' 'Adventure' 'Western' 'ScienceFiction'
 'Horror' 'Mystery' 'Music' 'TVMovie']


In [28]:
movies['Cast'] = movies['Cast'].apply(lambda x: x[:3])
movies['Crew'] = movies['Crew'].apply(lambda x: x[:1])

Now that we have removed the space between the names in all the columns required. We can create a tag column which will be the concatenation of all the values in that row

In [29]:
movies['tags'] = movies['Genre'] + movies['Cast'] + movies['Crew'] + movies['Overview'] + movies['Keywords']
movies.head()

Unnamed: 0,Title,Overview,Genre,Cast,Crew,Keywords,tags
0,The Shawshank Redemption,"[Imprisoned, 1940s, double, murder, wife, love...","[Drama, Crime]","[TimRobbins, MorganFreeman, BobGunton]",[NikiMarvin],"[prison, friendship, policebrutality, corrupti...","[Drama, Crime, TimRobbins, MorganFreeman, BobG..."
1,The Godfather,"[Spanning, years, 1945, 1955,, chronicle, fict...","[Drama, Crime]","[MarlonBrando, AlPacino, JamesCaan]",[AlbertS.Ruddy],"[basedonnovelorbook, lossoflovedone, loveatfir...","[Drama, Crime, MarlonBrando, AlPacino, JamesCa..."
2,The Godfather Part II,"[In, continuing, saga, Corleone, crime, family...","[Drama, Crime]","[AlPacino, RobertDuvall, DianeKeaton]",[FrancisFordCoppola],"[italianamerican, cuba, italy, gangster, prais...","[Drama, Crime, AlPacino, RobertDuvall, DianeKe..."
3,Schindler's List,"[The, true, story, businessman, Oskar, Schindl...","[Drama, History, War]","[LiamNeeson, BenKingsley, RalphFiennes]",[GeraldR.Molen],"[factory, hero, basedonnovelorbook, nazi, conc...","[Drama, History, War, LiamNeeson, BenKingsley,..."
4,12 Angry Men,"[The, defense, prosecution, rested, jury, fili...",[Drama],"[MartinBalsam, JohnFiedler, LeeJ.Cobb]",[ReginaldRose],"[deathpenalty, anonymity, courtcase, court, ju...","[Drama, MartinBalsam, JohnFiedler, LeeJ.Cobb, ..."


In [30]:
movie_final = movies[['Title','tags']]
movie_final.head()

Unnamed: 0,Title,tags
0,The Shawshank Redemption,"[Drama, Crime, TimRobbins, MorganFreeman, BobG..."
1,The Godfather,"[Drama, Crime, MarlonBrando, AlPacino, JamesCa..."
2,The Godfather Part II,"[Drama, Crime, AlPacino, RobertDuvall, DianeKe..."
3,Schindler's List,"[Drama, History, War, LiamNeeson, BenKingsley,..."
4,12 Angry Men,"[Drama, MartinBalsam, JohnFiedler, LeeJ.Cobb, ..."


In [31]:
movie_final.to_csv('movies_tags.csv', index=False)

# Download the file from Google Colab
from google.colab import files
files.download('movies_tags.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
movie_final['tags'].iloc[0]

['Drama',
 'Crime',
 'TimRobbins',
 'MorganFreeman',
 'BobGunton',
 'NikiMarvin',
 'Imprisoned',
 '1940s',
 'double',
 'murder',
 'wife',
 'lover,',
 'upstanding',
 'banker',
 'Andy',
 'Dufresne',
 'begins',
 'new',
 'life',
 'Shawshank',
 'prison,',
 'puts',
 'accounting',
 'skills',
 'work',
 'amoral',
 'warden.',
 'During',
 'long',
 'stretch',
 'prison,',
 'Dufresne',
 'comes',
 'admired',
 'inmates',
 '--',
 'including',
 'older',
 'prisoner',
 'named',
 'Red',
 '--',
 'integrity',
 'unquenchable',
 'sense',
 'hope.',
 'prison',
 'friendship',
 'policebrutality',
 'corruption',
 'basedonnovelorbook',
 'hope',
 'prisoncell',
 'delinquent',
 'redemption',
 'paroleboard',
 'prisonescape',
 'wrongfulimprisonment',
 'interracialfriendship',
 'framedformurder',
 '1940s',
 'voiceover',
 'hopeful']

Now we need to convert this list in the tags column to string

In [33]:
movie_final['tags'] = movie_final['tags'].apply(lambda x: ' '.join(x))
movie_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['tags'] = movie_final['tags'].apply(lambda x: ' '.join(x))


Unnamed: 0,Title,tags
0,The Shawshank Redemption,Drama Crime TimRobbins MorganFreeman BobGunton...
1,The Godfather,Drama Crime MarlonBrando AlPacino JamesCaan Al...
2,The Godfather Part II,Drama Crime AlPacino RobertDuvall DianeKeaton ...
3,Schindler's List,Drama History War LiamNeeson BenKingsley Ralph...
4,12 Angry Men,Drama MartinBalsam JohnFiedler LeeJ.Cobb Regin...


In [34]:
movie_final['tags'].iloc[1]

'Drama Crime MarlonBrando AlPacino JamesCaan AlbertS.Ruddy Spanning years 1945 1955, chronicle fictional Italian-American Corleone crime family. When organized crime family patriarch, Vito Corleone barely survives attempt life, youngest son, Michael steps take care would-be killers, launching campaign bloody revenge. basedonnovelorbook lossoflovedone loveatfirstsight italy symbolism patriarch europe organizedcrime mafia religion lawyer revengemotive crimefamily sicilianmafia religioushypocrisy gunviolence risetopower deadhorse gangviolence 1940s 1950s mafiawar'

Now let's convert the string to lowerCase and remove all the punctuation

In [35]:
movie_final['tags'] = movie_final['tags'].apply(lambda x: x.lower())

import string
exclude = string.punctuation
movie_final['tags'] = movie_final['tags'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['tags'] = movie_final['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['tags'] = movie_final['tags'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))


In [36]:
movie_final['tags'].iloc[1]

'drama crime marlonbrando alpacino jamescaan albertsruddy spanning years 1945 1955 chronicle fictional italianamerican corleone crime family when organized crime family patriarch vito corleone barely survives attempt life youngest son michael steps take care wouldbe killers launching campaign bloody revenge basedonnovelorbook lossoflovedone loveatfirstsight italy symbolism patriarch europe organizedcrime mafia religion lawyer revengemotive crimefamily sicilianmafia religioushypocrisy gunviolence risetopower deadhorse gangviolence 1940s 1950s mafiawar'

we are going to perfrom stemming on the tags column

In [37]:
# prompt: perform stemming on the tags column

nltk.download('punkt')
stemmer = PorterStemmer()
movie_final['tags'] = movie_final['tags'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
movie_final.head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['tags'] = movie_final['tags'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))


Unnamed: 0,Title,tags
0,The Shawshank Redemption,drama crime timrobbin morganfreeman bobgunton ...
1,The Godfather,drama crime marlonbrando alpacino jamescaan al...
2,The Godfather Part II,drama crime alpacino robertduval dianekeaton f...
3,Schindler's List,drama histori war liamneeson benkingsley ralph...
4,12 Angry Men,drama martinbalsam johnfiedl leejcobb reginald...


In [38]:
movie_final.to_csv('movies_tags.csv', index=False)

# Download the file from Google Colab
from google.colab import files
files.download('movies_tags.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Text Vectorization

In [None]:

import random
import torch
import transformers
from transformers import BertTokenizer, BertModel

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to find word embedding for each tag
def find_word_embedding(tag):
  # Tokenize the tag
  tokens = tokenizer.tokenize(tag)
  # Convert tokens to IDs
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  # Create a tensor from input IDs
  input_ids = torch.tensor([input_ids])
  # Get the word embeddings from the BERT model
  outputs = model(input_ids)
  # Get the last hidden state of the BERT model
  last_hidden_state = outputs[0]
  # Get the word embedding for the first token (usually the most important word)
  word_embedding = last_hidden_state[0][0].detach().numpy()
  return word_embedding

# Apply the function to the 'tags' column
movie_final['word_embedding'] = movie_final['tags'].apply(find_word_embedding)

# Store the word embeddings in the dataframe
movie_final.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['word_embedding'] = movie_final['tags'].apply(find_word_embedding)


Unnamed: 0,Title,tags,word_embedding
0,The Shawshank Redemption,drama crime timrobbin morganfreeman bobgunton ...,"[0.03449695, -0.0807922, 0.2330443, -0.0458649..."
1,The Godfather,drama crime marlonbrando alpacino jamescaan ro...,"[0.25267026, -0.083612114, 0.43519393, 0.04981..."
2,The Godfather Part II,drama crime alpacino robertduval dianekeaton r...,"[0.1090253, -0.11107748, 0.41904408, -0.020341..."
3,Schindler's List,drama histori war liamneeson benkingsley ralph...,"[0.18110614, -0.04659139, 0.33005926, 0.101305..."
4,12 Angry Men,drama martinbalsam johnfiedl leejcobb egmarsha...,"[0.11268974, 0.030628169, 0.1710939, -0.043651..."


In [None]:
# prompt: in the above code, generate only 100 word embeddings

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to find word embedding for each tag
def find_word_embedding(tag):
  # Tokenize the tag
  tokens = tokenizer.tokenize(tag)
  # Convert tokens to IDs
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  # Create a tensor from input IDs
  input_ids = torch.tensor([input_ids])
  # Get the word embeddings from the BERT model
  outputs = model(input_ids)
  # Get the last hidden state of the BERT model
  last_hidden_state = outputs[0]
  # Get the word embedding for the first token (usually the most important word)
  word_embedding = last_hidden_state[0][0].detach().numpy()
  return word_embedding

# Apply the function to the first 100 rows of the 'tags' column
movie_final['word_embedding'] = movie_final['tags'].iloc[:100].apply(find_word_embedding)

# Store the word embeddings in the dataframe
movie_final.head()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Reshape the word embeddings to 2D arrays
movie_final['word_embedding'] = movie_final['word_embedding'].apply(lambda x: x.reshape(1, -1))

# Now calculate the cosine similarity
relation = cosine_similarity(movie_final['word_embedding'].iloc[0], movie_final['word_embedding'].iloc[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['word_embedding'] = movie_final['word_embedding'].apply(lambda x: x.reshape(1, -1))


In [None]:
relation[0][0]

array([0.9305384], dtype=float32)

In [None]:
sim_list = []

for i in range(len(movie_final)):
    sim_list.append(cosine_similarity(movie_final['word_embedding'].iloc[i], movie_final['word_embedding'].iloc[7])[0][0])


In [None]:
# prompt: download the above movie_final dataframe as a csv file

movie_final.to_csv('movie_final.csv', index=False)

# Download the file from Google Colab
from google.colab import files
files.download('movie_final.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
similar = sorted(list(enumerate(sim_list)), reverse=True, key=lambda x:x[1])[1:10]

In [None]:
similar

[(5763, 0.9796535),
 (1312, 0.9770107),
 (1838, 0.976688),
 (144, 0.97584903),
 (2751, 0.9756415),
 (4712, 0.9756273),
 (3323, 0.97546935),
 (1665, 0.975354),
 (1127, 0.975327)]

In [None]:
for i in similar:
    print(movie_final.iloc[i[0]].Title)

Rendition
I Killed My Mother
Cherry
The Departed
Paterson
Pig
Frailty
Law Abiding Citizen
Manchester by the Sea


In [None]:
movie_final['Title'].head(20)

0                          The Shawshank Redemption
1                                     The Godfather
2                             The Godfather Part II
3                                  Schindler's List
4                                      12 Angry Men
5                       Dilwale Dulhania Le Jayenge
6                                     Spirited Away
7                                   The Dark Knight
8                                          Parasite
9                                    The Green Mile
10                                       Your Name.
11                                     Pulp Fiction
12    The Lord of the Rings: The Return of the King
13                                     Forrest Gump
14                   The Good, the Bad and the Ugly
15                                       GoodFellas
16                           Grave of the Fireflies
17                                    Seven Samurai
18                                  Cinema Paradiso
19          

In [None]:
cosine_similarity(movie_final['word_embedding'].iloc[0], movie_final['word_embedding'].iloc[0])[0][0]

KeyError: 'word_embedding'

### Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)

In [None]:
vectors = cv.fit_transform(movie_final['tags']).toarray()

In [None]:
cv.get_feature_names_out()

array(['10', '10yearold', '11yearold', ..., 'zombieapocalyps', 'zone',
       'zoo'], dtype=object)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
similarity_matrix = cosine_similarity(vectors)

In [2]:
similarity_matrix

NameError: name 'similarity_matrix' is not defined

In [None]:
def recommend_movie(movie):
  movie_idx = movie_final[movie_final['Title'] == movie].index[0]
  distances = similarity_matrix[movie_idx]
  movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:10]
  for i in movies_list:
    print(movie_final.iloc[i[0]].Title)


In [None]:
recommend_movie('The Godfather')

The Godfather Part II
The Godfather Part III
Extremely Wicked, Shockingly Evil and Vile
Road to Perdition
7 Women and a Murder
Gotti
House of Gucci
Shoplifters
Shottas


In [None]:
recommend_movie('Batman Begins')

The Dark Knight
The Dark Knight Rises
Batman: Bad Blood
Batman
Batman: The Dark Knight Returns, Part 1
Batman: Under the Red Hood
Batman: Mask of the Phantasm
The Batman
Batman: Hush


In [None]:
recommend_movie('(500) Days of Summer')

My First Time
Man Up
Kuch Kuch Hota Hai
Twice Born
The Wandering Earth
Ash Is Purest White
22 Bullets
Sorry if I Love You
Sorry If I Call You Love


In [None]:
recommend_movie('The Conjuring')

Insidious: Chapter 2
The Conjuring: The Devil Made Me Do It
Demonic
Belzebuth
Insidious: The Red Door
The Conjuring 2
Exorcist II: The Heretic
Dark Skies
The Darkness


In [None]:
recommend_movie('Cast Away')

The Land Before Time V: The Mysterious Island
Lord of the Flies
The Martian
Lord of the Flies
Alpha
Blue Lagoon: The Awakening
Through My Window 3: Looking at You
Alice
The Chronicles of Riddick: Dark Fury


In [None]:
recommend_movie('Inception')

Mission: Impossible - Rogue Nation
The Matrix Revolutions
The Matrix Resurrections
Vanguard
Infinite
The Magnificent One
No Time to Die
The Guns of Navarone
The City of Lost Children


In [None]:
recommend_movie('Interstellar')

Mission to Mars
Lightyear
Silent Running
Space Chimps
Stowaway
Automata
Lost in Space
The Right Stuff
A Million Miles Away


### Tf-Idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(movie_final['tags']).toarray()
tfidf_similarity_matrix = cosine_similarity(tfidf_matrix)

tfidf_similarity_matrix

array([[1.        , 0.0626997 , 0.01918027, ..., 0.        , 0.        ,
        0.        ],
       [0.0626997 , 1.        , 0.33933512, ..., 0.        , 0.01652212,
        0.01717352],
       [0.01918027, 0.33933512, 1.        , ..., 0.        , 0.01190133,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.00782363,
        0.06748429],
       [0.        , 0.01652212, 0.01190133, ..., 0.00782363, 1.        ,
        0.00501719],
       [0.        , 0.01717352, 0.        , ..., 0.06748429, 0.00501719,
        1.        ]])

In [None]:
def recommend_movie_2(movie):
  movie_idx = movie_final[movie_final['Title'] == movie].index[0]
  distances = tfidf_similarity_matrix[movie_idx]
  movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:10]
  for i in movies_list:
    print(movie_final.iloc[i[0]].Title)


In [None]:
recommend_movie_2('(500) Days of Summer')

Kuch Kuch Hota Hai
Man Up
My First Time
22 Bullets
Sorry if I Love You
Head Full of Honey
Warm Bodies
Return
Paradise


In [None]:
recommend_movie_2('Inception')

Duplicity
Vanguard
Criminal
Inception: The Cobol Job
The Guns of Navarone
Cypher
Mission: Impossible - Rogue Nation
Tenet
Infinite


In [None]:
recommend_movie_2('Fight Club')

The Guernsey Literary & Potato Peel Pie Society
Never Back Down
My Mom Is a Character 3
UHF
Tyrannosaur
Motherless Brooklyn
Fantastic Four
Promised Land
Undisputed II: Last Man Standing


In [None]:
recommend_movie_2('Casino Royale')

Maverick
Mississippi Grind
Molly's Game
Rounders
Casino Royale
Spectre
Runner Runner
No Time to Die
Skyfall


## Model Download

In [None]:
pickle.dump(tfidf_similarity_matrix, open('similarity.pkl', 'wb'))