In [6]:
import pandas as pd

# load excel file
df = pd.read_excel("movies.xlsx")
df.head()

Unnamed: 0,title
0,The Age of Adaline
1,Black Christmas
2,London
3,Twisted
4,Friends with Kids


In [22]:
print('Number of rows:', df.size)

Number of rows: 150


In [7]:
# initialize omdb
import requests
import time

OMDB_API_KEY = "c0c5b16c"

In [17]:
# function to get movie data from omdp
def get_omdb_data(title):
    url = f"http://www.omdbapi.com/?t={title}&apikey={OMDB_API_KEY}"
    response = requests.get(url)
    data = response.json()
    
    if data.get("Response") == "True":
        return {
            "Title": data.get("Title"),
            "Year": data.get("Year"),
            "Rated": data.get("Rated"),
            "Runtime": data.get("Runtime"),
            "IMDB_Rating": data.get("imdbRating"),
            "IMDB_Votes": data.get("imdbVotes"),
            "Box_Office": data.get("BoxOffice"),
            "Released": data.get("Released"),
            "Genre": data.get("Genre"),
            "Director": data.get("Director"),
            "Writer": data.get("Writer"),
            "Actors": data.get("Actors"),
            "Plot": data.get("Plot"),
            "Language": data.get("Language"),
            "Country": data.get("Country"),
            "Awards": data.get("Awards"),
            "Poster": data.get("Poster"),
            "Metascore": data.get("Metascore"),
            "Ratings": data.get("Ratings")
        }
    else:
        return None

In [21]:
# get_omdp_data test
get_omdb_data("inception")

{'Title': 'Inception',
 'Year': '2010',
 'Rated': 'PG-13',
 'Runtime': '148 min',
 'IMDB_Rating': '8.8',
 'IMDB_Votes': '2,645,456',
 'Box_Office': '$292,587,330',
 'Released': '16 Jul 2010',
 'Genre': 'Action, Adventure, Sci-Fi',
 'Director': 'Christopher Nolan',
 'Writer': 'Christopher Nolan',
 'Actors': 'Leonardo DiCaprio, Joseph Gordon-Levitt, Elliot Page',
 'Plot': 'A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O., but his tragic past may doom the project and his team to disaster.',
 'Language': 'English, Japanese, French',
 'Country': 'United States, United Kingdom',
 'Awards': 'Won 4 Oscars. 159 wins & 220 nominations total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMjAxMzY3NjcxNF5BMl5BanBnXkFtZTcwNTI5OTM0Mw@@._V1_SX300.jpg',
 'Metascore': '74',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.8/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '87%'},
  

In [24]:
# fetch omdb data for all movies
omdb_data = []
for i, title in enumerate(df["title"]):
    print(f"Getting data of movie {i+1}/{len(df)}: {title}")
    data = get_omdb_data(title)
    if data:
        omdb_data.append(data)
    # set delays to avoid API rate limits
    time.sleep(1)

# convert list to dataframe
omdb_df = pd.DataFrame(omdb_data)

# to avoid case sensitivity in merge
df["Title"] = df["title"].str.title()
df = df.drop(columns=["title"])
omdb_df["Title"] = omdb_df["Title"].str.title()

# merge with original df
movies_df = df.merge(omdb_df, on="Title", how="left")

Getting data of movie 1/150: The Age of Adaline
Getting data of movie 2/150: Black Christmas
Getting data of movie 3/150: London
Getting data of movie 4/150: Twisted
Getting data of movie 5/150: Friends with Kids
Getting data of movie 6/150: Ken Park
Getting data of movie 7/150: Girls Trip
Getting data of movie 8/150: The Art of the Steal
Getting data of movie 9/150: Macbeth
Getting data of movie 10/150: Fatherhood
Getting data of movie 11/150: London Has Fallen
Getting data of movie 12/150: Anthropoid
Getting data of movie 13/150: Me, Myself & Irene
Getting data of movie 14/150: The Last Face
Getting data of movie 15/150: The Fly
Getting data of movie 16/150: A Quiet Place
Getting data of movie 17/150: God's Not Dead 2
Getting data of movie 18/150: Marvel One-Shot: Agent Carter
Getting data of movie 19/150: Hamlet 2
Getting data of movie 20/150: Where the Truth Lies
Getting data of movie 21/150: Ashby
Getting data of movie 22/150: Jaws
Getting data of movie 23/150: Ricochet
Getting da

In [25]:
movies_df.head()

Unnamed: 0,title,Title,Year,Rated,Runtime,IMDB_Rating,IMDB_Votes,Box_Office,Released,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Metascore,Ratings
0,The Age of Adaline,The Age Of Adaline,2015,PG-13,112 min,7.2,212877,"$42,629,776",24 Apr 2015,"Drama, Fantasy, Romance",Lee Toland Krieger,"J. Mills Goodloe, Salvador Paskowitz","Blake Lively, Michiel Huisman, Harrison Ford","A young woman, born at the turn of the 20th ce...","English, Portuguese, Italian","United States, Canada",1 win & 10 nominations,https://m.media-amazon.com/images/M/MV5BMTAzMT...,51,"[{'Source': 'Internet Movie Database', 'Value'..."
1,Black Christmas,Black Christmas,1974,R,98 min,7.1,50352,,20 Dec 1974,"Horror, Mystery, Thriller",Bob Clark,Roy Moore,"Olivia Hussey, Keir Dullea, Margot Kidder","During their Christmas break, a group of soror...","English, Latin",Canada,3 wins & 2 nominations,https://m.media-amazon.com/images/M/MV5BZjdiMz...,65,"[{'Source': 'Internet Movie Database', 'Value'..."
2,London,London,2005,R,92 min,6.3,25119,"$20,361",03 Jul 2021,"Drama, Romance",Hunter Richards,Hunter Richards,"Jessica Biel, Chris Evans, Jason Statham","At a drug laden party in a New York loft, a yo...",English,"United Kingdom, United States",1 win & 1 nomination total,https://m.media-amazon.com/images/M/MV5BMTc5Mz...,24,"[{'Source': 'Internet Movie Database', 'Value'..."
3,Twisted,Twisted,2004,R,97 min,5.3,23599,"$25,198,598",27 Feb 2004,"Crime, Drama, Mystery",Philip Kaufman,Sarah Thorp,"Ashley Judd, Samuel L. Jackson, Andy Garcia","Jessica, whose father killed her mother and co...","English, Italian, Spanish","United States, Germany",1 win,https://m.media-amazon.com/images/M/MV5BMGJhNj...,26,"[{'Source': 'Internet Movie Database', 'Value'..."
4,Friends with Kids,Friends With Kids,2011,R,107 min,6.1,43369,"$7,251,073",16 Mar 2012,"Comedy, Drama, Romance",Jennifer Westfeldt,Jennifer Westfeldt,"Jennifer Westfeldt, Adam Scott, Maya Rudolph",Two best friends decide to have a child togeth...,English,United States,2 nominations,https://m.media-amazon.com/images/M/MV5BMjIyMD...,55,"[{'Source': 'Internet Movie Database', 'Value'..."


In [40]:
movies_df.info()

0      False
1      False
2      False
3      False
4      False
       ...  
145    False
146    False
147    False
148    False
149    False
Name: Box_Office, Length: 150, dtype: bool

In [36]:
# null values
movies_df.isnull().sum()

title          0
Title          0
Year           1
Rated          1
Runtime        1
IMDB_Rating    1
IMDB_Votes     1
Box_Office     1
Released       1
Genre          1
Director       1
Writer         1
Actors         1
Plot           1
Language       1
Country        1
Awards         1
Poster         1
Metascore      1
Ratings        1
dtype: int64

In [58]:
movies_df = movies_df.dropna()

In [59]:
# find movies where box office earnings are not available
boxoffice_na = movies_df[movies_df['Box_Office'] == 'N/A']
box_office_na_count = boxoffice_na.shape[0]
print(f'movies where box office earnings are not available: {box_office_na_count}')

movies where box office earnings are not available: 35


In [61]:
# find movies where released before 2006
movies_df['Year_numeric'] = pd.to_numeric(movies_df['Year'], errors='coerce')
before06 = movies_df[movies_df['Year_numeric'] < 2006]
before06_count = before06.shape[0]
print(f'movies released before 2006: {before06_count}')

movies released before 2006: 55


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['Year_numeric'] = pd.to_numeric(movies_df['Year'], errors='coerce')


In [62]:
movies_df_cleaned = movies_df[movies_df['Box_Office'] != 'N/A']
movies_df_cleaned = movies_df_cleaned[movies_df_cleaned['Year_numeric'] < 2006]

In [63]:
movies_df_cleaned = movies_df_cleaned.drop(columns=["Year_numeric"])

In [64]:
movies_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52 entries, 2 to 143
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        52 non-null     object
 1   Title        52 non-null     object
 2   Year         52 non-null     object
 3   Rated        52 non-null     object
 4   Runtime      52 non-null     object
 5   IMDB_Rating  52 non-null     object
 6   IMDB_Votes   52 non-null     object
 7   Box_Office   52 non-null     object
 8   Released     52 non-null     object
 9   Genre        52 non-null     object
 10  Director     52 non-null     object
 11  Writer       52 non-null     object
 12  Actors       52 non-null     object
 13  Plot         52 non-null     object
 14  Language     52 non-null     object
 15  Country      52 non-null     object
 16  Awards       52 non-null     object
 17  Poster       52 non-null     object
 18  Metascore    52 non-null     object
 19  Ratings      52 non-null     object

In [37]:
# initialize pytrends

from pytrends.request import TrendReq

pytrends = TrendReq(hl='en-US', tz=360)