# Scraping top 250 movies title and plot summeries

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from colorama import Fore,  Back, Style

In [3]:
def get_titles_ids():
    url = 'https://www.imdb.com/chart/top/'
    driver = webdriver.Chrome()  # You can specify the path to your chromedriver if needed
    driver.get(url)
    
    time.sleep(5)  # Wait for the page to fully load

    a_elements = driver.find_elements(By.CLASS_NAME, "ipc-title-link-wrapper")

    id_results = [elem.get_attribute('href').split("/title/")[1].split("/?ref")[0] for elem in a_elements[:250]]
    names_results = [elem.find_element(By.CLASS_NAME, "ipc-title__text").text.split(". ")[1] for elem in a_elements[:250]]
    
    driver.quit()

    final_data = pd.DataFrame()
    final_data['Id'] = id_results
    final_data['Title'] = names_results
    return final_data

def get_plot_summaries():
    final_data = get_titles_ids()
    list_ids = final_data['Id'].values.tolist()
    summaries = []
    
    driver = webdriver.Chrome()  # You can specify the path to your chromedriver if needed
    
    for id in list_ids:
        print(id)
        url = 'https://www.imdb.com/title/' + id + '/plotsummary/?ref_=tt_stry_pl'
        driver.get(url)
        
        time.sleep(3)  # Wait for the page to fully load
        
        plot_summary_elements = driver.find_elements(By.CLASS_NAME, "ipc-html-content-inner-div")
        if len(plot_summary_elements) > 3:
            result = plot_summary_elements[3].text
        else:
            result = ""
        summaries.append(result)
    
    driver.quit()

    final_data['Plot_Summary'] = summaries
    return final_data

# Example usage:

In [4]:
# df_top_250_movies=get_plot_summaries()
df_top_250_movies=pd.read_csv("./top_250_movies.csv")

In [5]:
print(Style.BRIGHT, Fore.MAGENTA, Back.BLACK, df_top_250_movies.shape)

[1m [35m [40m (250, 4)


In [5]:
# df_top_250_movies.to_csv("./top_250_movies.csv")

## with BeautifulSoup(Depricated):

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [7]:
def get_titles_ids():
    url = 'https://www.imdb.com/chart/top/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    a=soup.find_all(class_="ipc-title-link-wrapper")
    
    id_results=[res['href'].split("/title/")[1].split("/?ref")[0] for res in a[:250]]
    names_results=[str(res.find(class_="ipc-title__text")).split(". ")[1].split("</h")[0] for res in a[:250]]
    
    final_data=pd.DataFrame()
    final_data['Id']=id_results
    final_data['Title']=names_results
    return final_data


def get_plot_summaries():
    final_data=get_titles_ids()
    list_ids=final_data['Id'].values.tolist()
    summaries=[]
    for id in list_ids:
        print(id)
        url='https://www.imdb.com/title/'+id+'/plotsummary/?ref_=tt_stry_pl'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        plot_summary2=soup.find_all(class_="ipc-html-content-inner-div")
        result=str(plot_summary2[3]).split("<span ")[0].split('div">')[1]
        summaries.append(result)
    final_data['Plot_Summary']=summaries
    return final_data

# Preprocessing (delete stopwords, punctuations and etc.)

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

tqdm.pandas()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arshia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
def clean_text(text):
    text = re.sub(r'[^\w\s]', "",text)
    text=text.lower()
    words_ = text.split()
    #remove stop words
    new_filtered_words = [word for word in words_ if word not in stopwords.words('english')]

    cleaned_text = ' '.join(new_filtered_words)

    return cleaned_text

In [8]:
df_top_250_movies['cleaned_PS']=df_top_250_movies['Plot_Summary'].apply(lambda x: clean_text(x))

df_top_250_movies.head()

Unnamed: 0.1,Unnamed: 0,Id,Title,Plot_Summary,cleaned_PS
0,0,tt0111161,The Shawshank Redemption,Chronicles the experiences of a formerly succe...,chronicles experiences formerly successful ban...
1,1,tt0068646,The Godfather,"The Godfather ""Don"" Vito Corleone is the head ...",godfather vito corleone head corleone mafia fa...
2,2,tt0468569,The Dark Knight,Set within a year after the events of Batman B...,set within year events batman begins 2005 batm...
3,3,tt0071562,The Godfather Part II,The continuing saga of the Corleone crime fami...,continuing saga corleone crime family tells st...
4,4,tt0050083,12 Angry Men,"The defense and the prosecution have rested, a...",defense prosecution rested jury filing jury ro...


# Create Vocabulary:

In [9]:
df_top_250_movies['split_PS']=df_top_250_movies['cleaned_PS'].apply(lambda x: x.split())
vocabulary=[]
for i in range(len(df_top_250_movies)):
    for word in df_top_250_movies.loc[i, 'split_PS']:
        vocabulary.append(word)
vocabulary=list(set(vocabulary))
print(Style.BRIGHT, Fore.RED, Back.GREEN, len(vocabulary))

[1m [31m [42m 6580


# TF-IDF:

In [10]:
import math
import numpy as np

In [11]:
def TF(clean_summary , movie_number , word):
    return clean_summary[movie_number - 1].count(word)/len(clean_summary[movie_number - 1])

def IDF(clean_summary , word):
    count_word_in_plot = 0
    for plot in clean_summary:
        if word in plot:
            count_word_in_plot += 1
    return math.log10(len(clean_summary) / count_word_in_plot) + 1

def TF_IDF(clean_summary , movie_number , word):
    return TF(clean_summary , movie_number , word) * IDF(clean_summary , word)

In [None]:
new_list = []

for i , plot in enumerate(df_top_250_movies['split_PS'].values.tolist()):
    word_TF_IDF = {}
    for word in vocabulary:
        word_TF_IDF[word] = TF_IDF(df_top_250_movies['split_PS'].values.tolist(), i+1 , word)
    new_list.append(word_TF_IDF) # [{hgfdh : '2' , b : '0' , b = '0' ...}, {} , {} , ... ]

matrix = pd.DataFrame(new_list)
matrix.to_csv('matrix.csv')


In [None]:
print(Style.NORMAL, Fore.BLACK, Back.YELLOW, matrix.shape)

In [14]:
# matrix.to_csv("./matrix.csv")

In [20]:
matrix

Unnamed: 0,guido,promoted,case,sobinski,jerrydaphne,stephan,promising,rebel,still,recruits,...,toy,kidnap,flees,initiated,sympathizing,example,woo,ratings,spending,old
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032629
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.053925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
246,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
247,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
248,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


# Top similar movies (KNN):

In [31]:
import numpy as np
import csv

In [None]:

def cos_of_vector(A, B):
    dot_product = np.dot(A, B)
    magnitude_A = np.linalg.norm(A)
    magnitude_B = np.linalg.norm(B)
    return dot_product / (magnitude_A * magnitude_B)

def compute_knn(n:int, input_plot_summary:str, input_title:str, matrix=matrix, df_input=df_top_250_movies, vocabulary=vocabulary):
    # cleaned and split plot summeries
    list_top = df_input['split_PS'].values.tolist()
    split_plot_summeryies = clean_text(input_plot_summary).split()

    # Delete OOV(Out Of Vocabulary) words
    delete_oov_words=[word if word in vocabulary else "" for word in split_plot_summeryies]
    list_top.append(delete_oov_words)
    
    # Compute TF-IDF
    word_TF_IDF = {}
    for word in vocabulary:
        word_TF_IDF[word] = TF_IDF(list_top, 251 , word)
        tf_idf_new_input=list(word_TF_IDF.values())

    # Compute Similarities
    similarities=[]
    for vector in matrix.values.tolist():
        similarities.append(cos_of_vector(np.array(vector), np.array(tf_idf_new_input)))
    
    df_final=pd.DataFrame()
    df_final['Title']=df_input['Title']
    df_final['similarities']=similarities
    
    return df_final.sort_values(by=['similarities'], ascending = False ).reset_index(drop=True).iloc[:n]

In [24]:
input_title=str(input("Please Enter the title of movie: "))
input_plot_summery=str(input("Please Enter the plot summery: "))
input_int=int(input("How many similar films do you need?"))
# compute_knn(n=input_int, input_plot_summary=input_plot_summery, input_title=input_title)
print(Style.NORMAL, Fore.BLACK, Back.RED , compute_knn(n=input_int, input_plot_summary=input_plot_summery, input_title=input_title))

Scikitlearn for TF_IDF and KNN

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

dataframe = pd.read_csv('top_250_movies.csv')
column4_data = dataframe.iloc[:, 3]

lst = []
for i in  column4_data:
    lst.append(i)
lst.append(input_plot_summery)

text = [i for i in lst]
vectorizer = TfidfVectorizer ( max_df=1.0, min_df=1, stop_words='english',norm = None)
X = vectorizer.fit_transform(text)

input_matrix = vectorizer.transform ([input_plot_summery])

knn = KNeighborsClassifier(n_neighbors=input_int, metric = 'cosine') 
knn.fit(X, range(len(text))) #calculating KNN
cosine, number_movie = knn.kneighbors(input_matrix, n_neighbors = input_int+1 )

a = []
for i in number_movie:
    for j in i:
        a.append(j)
# print(Style.NORMAL, Fore.MAGENTA,Back.BLACK, a )


file = open('top_250_movies.csv')
data = csv.reader(file)

for i in a:
    for j in range(1 , 252):
        header = next(data)
        if j-2  == i:
            print(Style.BRIGHT, Fore.BLACK, Back.RED, header[2])
    file.seek(0)
    

file.close()

[1m [30m [41m Unforgiven
[1m [30m [41m Pirates of the Caribbean: The Curse of the Black Pearl
[1m [30m [41m Princess Mononoke
[1m [30m [41m Fargo
