In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


# PART 1: THIS SECTION OF THE CODE AIMS AT FETCHING THE MOVIE METADATA FROM OMDB API FROM THE GIVEN LIST OF MOVIE TITLES.

In [5]:
import pandas as pd
import requests
import time

# THIS FUCTION IS TO FETCH MOVIE INFORMATION BY TITLE AND IF AVAILABLE BY YEAR
def get_omdb_movie_data(movie_title, movie_year, api_key):
    if movie_year and not pd.isna(movie_year):  # Check if year is provided
        movie_year = str(int(movie_year))
        url = f'http://www.omdbapi.com/?apikey={api_key}&t={movie_title}&y={movie_year}'
    else:
        url = f'http://www.omdbapi.com/?apikey={api_key}&t={movie_title}'
    
    try:
        # SENDING REQUEST TO API
        response = requests.get(url, timeout=10)
        
        # CHECK TO SEE IF RESPONSE WAS SUCCESSFULL
        if response.status_code == 200:
            movie_data = response.json()
            
            if movie_data.get("Response") == "True":
                return {
                    "Title": movie_data.get("Title", "N/A"),
                    "Year": movie_data.get("Year", "N/A"),
                    "Rated": movie_data.get("Rated", "N/A"),
                    "Released": movie_data.get("Released", "N/A"),
                    "Runtime": movie_data.get("Runtime", "N/A"),
                    "Genre": movie_data.get("Genre", "N/A"),
                    "Director": movie_data.get("Director", "N/A"),
                    "Actors": movie_data.get("Actors", "N/A"),
                    "Plot": movie_data.get("Plot", "N/A"),
                    "IMDb Rating": movie_data.get("imdbRating", "N/A")
                }
            else:
                return None
        else:
            return None
    except Exception as e:
        return None
        
# READING THE CSV FILE WITH MOVIE TITLES AND YEARS
try:
    my_movie_list_df = pd.read_csv("C:/Users/Nutan/Downloads/movies.csv", encoding="ISO-8859-1")
except FileNotFoundError:
    print("CSV file not found. Please check the file path and try again.")
    exit()

# INITIALIZING API KEY
API_KEY = "6f5fdbed"

# INITIALIZING EMPTY LIST TO STORE MOVIE METADATA     
my_movie_dataset = []

# LOOPING THROUGH THE MOVIES FROM CSV FILE AND FETCHING THE DETAILS FOR EACH MOVIE
for index, row in my_movie_list_df.iterrows():
    movie_title = row['Title'] 
    movie_year = row.get('Year', None) 
    
    # FETCHING MOVIE DATA
    movie_data = get_omdb_movie_data(movie_title, movie_year, API_KEY)
    
    if movie_data:  # ONLY ADDING MOVIE IF THE DATA WAS FETCHED SUCCESSFULLY 
        my_movie_dataset.append(movie_data)
    
    # SINCE I AM FETCHING DATA FOR 1500 MOVIE TITLES, I AM SHOWING THE PROGESS AFTER EVERY 100 MOVIES HAVE BEEN FETCHED SUCCESSFULLY 
    if (index + 1) % 100 == 0:
        print(f"Processed {index + 1} movies...")

    time.sleep(1)  # PAUSING HERE TO AVOID HITTING RATE LIMIT OF THE API

# CONVERTING THE MOVIE METADATA OBTAINED INTO A PANDAS DATAFRAME
df = pd.DataFrame(my_movie_dataset)

# SAVING THE DATAFRAME TO A CSV FILE 
df.to_csv('MY_omdb_movie_dataset.csv', index=False) 

print("Movie dataset has been created and saved to 'MY_omdb_movie_dataset.csv'")

df.head(10)

Processed 100 movies...
Processed 200 movies...
Processed 300 movies...
Processed 400 movies...
Processed 500 movies...
Processed 600 movies...
Processed 700 movies...
Processed 800 movies...
Processed 900 movies...
Processed 1000 movies...
Processed 1100 movies...
Processed 1200 movies...
Processed 1300 movies...
Processed 1400 movies...
Processed 1500 movies...
Movie dataset has been created and saved to 'MY_omdb_movie_dataset.csv'


Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Actors,Plot,IMDb Rating
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,Drama,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton",A banker convicted of uxoricide forms a friend...,9.3
1,Forrest Gump,1994,PG-13,06 Jul 1994,142 min,"Drama, Romance",Robert Zemeckis,"Tom Hanks, Robin Wright, Gary Sinise",The history of the United States from the 1950...,8.8
2,Schindler's List,1993,R,04 Feb 1994,195 min,"Biography, Drama, History",Steven Spielberg,"Liam Neeson, Ralph Fiennes, Ben Kingsley","In German-occupied Poland during World War II,...",9.0
3,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan",The aging patriarch of an organized crime dyna...,9.2
4,The Green Mile,1999,R,10 Dec 1999,189 min,"Crime, Drama, Fantasy",Frank Darabont,"Tom Hanks, Michael Clarke Duncan, David Morse","A tale set on death row, where gentle giant Jo...",8.6
5,Good Will Hunting,1997,R,09 Jan 1998,126 min,"Drama, Romance",Gus Van Sant,"Robin Williams, Matt Damon, Ben Affleck","Will Hunting, a janitor at MIT, has a gift for...",8.3
6,A Beautiful Mind,2001,PG-13,04 Jan 2002,135 min,"Biography, Drama, Mystery",Ron Howard,"Russell Crowe, Ed Harris, Jennifer Connelly","A mathematical genius, John Nash made an aston...",8.2
7,Saving Private Ryan,1998,R,24 Jul 1998,169 min,"Drama, War",Steven Spielberg,"Tom Hanks, Matt Damon, Tom Sizemore","Following the Normandy Landings, a group of U....",8.6
8,The Pursuit of Happyness,2006,PG-13,15 Dec 2006,117 min,"Biography, Drama",Gabriele Muccino,"Will Smith, Thandiwe Newton, Jaden Smith",A struggling salesman takes custody of his son...,8.0
9,The Pianist,2002,R,28 Mar 2003,150 min,"Biography, Drama, Music",Roman Polanski,"Adrien Brody, Thomas Kretschmann, Frank Finlay","During WWII, acclaimed Polish musician Wladysl...",8.5


## PREPROCESSING THE PLOT FOR NLP COLUMNS

In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# DOWNLOADING NLTK RESOURCES 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# LOADING MOVIES DATASET
movies_df = pd.read_csv("MY_omdb_movie_dataset.csv")

# INITIALIZING NLTK TOOLS
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# FUNCTION TO PREPROCESS MOVIE PLOT
def preprocess_movie_plot(plot):
    #CHECK IF THE PLOT IS NOT A STRING
    if not isinstance(plot, str):
        return "" 
        
    # STEP1: LOWERCASING
    plot = plot.lower()

    # STEP 2: REMOVING SPECIAL CHARACTERS( RETAINS ONLY WORDS AND SPACES)
    plot = re.sub(r'[^a-z\s]', '', plot)

    # STEP 3: TOKENIZING
    tokens = word_tokenize(plot)

    # STEP 4: REMOVING STOP WORDS
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # STEP 5 : LEMMATIZING
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    # JOINGING TOKENS BACK TO A SINGLE STRING
    return ' '.join(lemmatized_tokens)

# APPLYING PREPROCESSING TO THE 'Plot' COLUMN
movies_df['Processed_Plot'] = movies_df['Plot'].apply(preprocess_movie_plot)

# SAVING THE PREPROCESSED PLOT COLUMN TO CSV FILE
movies_df.to_csv("MY_omdb_movie_dataset.csv", index=False)

print("Preprocessing complete. First 5 processed plots:")
movies_df[['Title', 'Plot', 'Processed_Plot']].head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nutan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nutan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nutan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing complete. First 5 processed plots:


Unnamed: 0,Title,Plot,Processed_Plot
0,The Shawshank Redemption,A banker convicted of uxoricide forms a friend...,banker convicted uxoricide form friendship qua...
1,Forrest Gump,The history of the United States from the 1950...,history united state unfolds perspective alaba...
2,Schindler's List,"In German-occupied Poland during World War II,...",germanoccupied poland world war ii industriali...
3,The Godfather,The aging patriarch of an organized crime dyna...,aging patriarch organized crime dynasty transf...
4,The Green Mile,"A tale set on death row, where gentle giant Jo...",tale set death row gentle giant john coffey po...


In [7]:
pip install textblob




# PART 2: SENTIMENT ANALYSIS OF THE MOVIE PLOTS

In [13]:
from textblob import TextBlob

movies_df = pd.read_csv("MY_omdb_movie_dataset.csv")

# FUNCTION TO ANALYSE SENTIMENT AND SENTIMENT SCORE OF MOVIE PLOTS 
def analyze_plot_sentiment(plot):
    blob = TextBlob(plot)
    polarity = blob.sentiment.polarity
    sentiment = "Positive" if polarity > 0 else "Negative" if polarity < 0 else "Neutral"
    return sentiment, polarity

# APPLYING SENTIMENT ANALYSIS TO THE PLOT COLUMN 
movies_df['Sentiment'], movies_df['Sentiment Score'] = zip(*movies_df['Processed_Plot'].apply(lambda x: analyze_plot_sentiment(x) if isinstance(x, str) else ("Neutral", 0)))

movies_df.to_csv('MY_omdb_movie_dataset.csv', index=False)

# DISPLAYING THE UPDATED DATAFRAME WITH SENTIMENT AND SENTIMENT SCORE COLUMNS
movies_df[['Title', 'Processed_Plot', 'Sentiment', 'Sentiment Score']].head(10)

Unnamed: 0,Title,Processed_Plot,Sentiment,Sentiment Score
0,The Shawshank Redemption,banker convicted uxoricide form friendship qua...,Neutral,0.0
1,Forrest Gump,history united state unfolds perspective alaba...,Neutral,0.0
2,Schindler's List,germanoccupied poland world war ii industriali...,Neutral,0.0
3,The Godfather,aging patriarch organized crime dynasty transf...,Neutral,0.0
4,The Green Mile,tale set death row gentle giant john coffey po...,Positive,0.066667
5,Good Will Hunting,hunting janitor mit gift mathematics need help...,Neutral,0.0
6,A Beautiful Mind,mathematical genius john nash made astonishing...,Positive,0.22
7,Saving Private Ryan,following normandy landing group u soldier go ...,Negative,-0.125
8,The Pursuit of Happyness,struggling salesman take custody son he poised...,Positive,0.1
9,The Pianist,wwii acclaimed polish musician wladyslaw face ...,Negative,-0.15


# PART 3: KEYWORD EXTRACTION USING KeyBERT

In [5]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import pandas as pd

movies_df = pd.read_csv("MY_omdb_movie_dataset.csv")

# INITIALIZING THE KeyBERT MODEL (THIS MODEL USES BERT EMBEDDINGS INTERNALLY)
model = KeyBERT('all-MiniLM-L6-v2')

# FUCNTION TO EXTRACT KEYWORDS USING KeyBERT
def extracting_keywords_with_keybert(text, top_n=5):
    keywords = model.extract_keywords(text, top_n=top_n)
    return ", ".join([kw[0] for kw in keywords])

# APPLYING THE KeyBERT KEYWORD EXTRACTION TO MOVIE PLOT COLUMN 
movies_df['Extracted_Keywords'] = movies_df['Processed_Plot'].fillna("").apply(extracting_keywords_with_keybert)

# SAVING THE COLUMN TO THE CSV FILE
movies_df.to_csv("MY_omdb_movie_dataset.csv", index=False)

# DISPLAYING THE EXTRACTED KEYWORDS COLUMN
movies_df[['Title', 'Processed_Plot', 'Extracted_Keywords']].head(10)




Unnamed: 0,Title,Processed_Plot,Extracted_Keywords
0,The Shawshank Redemption,banker convicted uxoricide form friendship qua...,"convict, convicted, innocence, uxoricide, banker"
1,Forrest Gump,history united state unfolds perspective alaba...,"alabama, reunited, iq, state, history"
2,Schindler's List,germanoccupied poland world war ii industriali...,"schindler, nazi, oskar, jewish, germanoccupied"
3,The Godfather,aging patriarch organized crime dynasty transf...,"dynasty, clandestine, empire, patriarch, crime"
4,The Green Mile,tale set death row gentle giant john coffey po...,"coffey, tale, john, giant, death"
5,Good Will Hunting,hunting janitor mit gift mathematics need help...,"mathematics, janitor, psychologist, gift, hunting"
6,A Beautiful Mind,mathematical genius john nash made astonishing...,"nash, mathematical, discovery, john, journey"
7,Saving Private Ryan,following normandy landing group u soldier go ...,"paratrooper, normandy, soldier, landing, enemy"
8,The Pursuit of Happyness,struggling salesman take custody son he poised...,"salesman, custody, son, career, lifechanging"
9,The Pianist,wwii acclaimed polish musician wladyslaw face ...,"wladyslaw, warsaw, polish, wwii, musician"
