In [1]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### TMDB API key

In [2]:
# TMDb API key. Prefer the TMDB_API_KEY environment variable so the credential
# does not have to live in the notebook.
# SECURITY: the literal fallback is a key committed to this file; it is kept only
# so existing runs keep working. Rotate it and delete the fallback.
api_key = os.environ.get('TMDB_API_KEY', '4f3ea99b4d5dc24f9a1c63a0371e8c09')

# TMDb API endpoints (joined onto base_url by the request helpers below)
base_url = 'https://api.themoviedb.org/3/'
genre_endpoint = 'genre/movie/list'
search_endpoint = 'search/movie'

In [3]:
genres = dict()


def get_movie_genres():
    """Fetch TMDb's movie genre list and store it in the module-level ``genres``.

    Returns:
        dict: ``genres`` (genre id -> genre name) once it has been filled from
        the response, or an empty dict when the response status is not 200.
        The failure value used to be a list; it is now a dict so both paths
        return the same type (both are still falsy for ``if not result``).
    """
    response = requests.get(
        f'{base_url}{genre_endpoint}',
        params={'api_key': api_key},
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    if response.status_code != 200:
        print(f"Failed to retrieve genres. Status code: {response.status_code}")
        return {}

    # A 200 body without a 'genres' key leaves the mapping untouched instead of
    # raising KeyError.
    for genre in response.json().get('genres', []):
        genres[genre['id']] = genre['name']
    return genres
# get_movie_genres() returns an empty container (and prints the status code)
# when the request fails, so only announce success when genres were loaded.
if get_movie_genres():
    print("Genres stored in dictionary 'genres'!!")

Genres stored in dictionary 'genres'!!


In [4]:
# Display the genre id -> name mapping that get_movie_genres() stored in `genres`.
genres

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [5]:
def search_movie(title):
    """Search TMDb for a movie title and return details of the first result.

    Args:
        title: Movie title to search for; sent as the ``query`` parameter.

    Returns:
        tuple: ``(genre_ids, overview, movie_id)`` read from the first search
        result, or ``(None, None, None)`` when the search has no results, the
        response status is not 200, or the request itself fails.
    """
    try:
        # Pass the title through ``params`` so requests percent-encodes it. When
        # it was pasted into the URL, a '&' or '#' inside the title cut the
        # query short and the lookup matched the wrong film or nothing at all.
        response = requests.get(
            f'{base_url}{search_endpoint}',
            params={'api_key': api_key, 'query': title},
            timeout=30,
        )
    except requests.RequestException as exc:
        # One failed lookup should not abort a scrape of a whole year's titles.
        print(f"Failed to retrieve movie data. Error: {exc}")
        return None, None, None

    if response.status_code != 200:
        print(f"Failed to retrieve movie data. Status code: {response.status_code}")
        return None, None, None

    results = response.json().get('results', [])
    if not results:
        print("Movie not found.")
        return None, None, None

    movie = results[0]
    return movie.get('genre_ids'), movie.get('overview'), movie.get('id')

In [6]:
# Smoke test of search_movie() on a single title.
movie_title = 'The 6th Day'
genre_ids, overview, movie_id = search_movie(movie_title)
# search_movie() returns None for genre_ids when nothing was found, and TMDb may
# return an id that is missing from `genres`; neither should crash this cell.
st = "".join(f"{genres[i]} " for i in (genre_ids or []) if i in genres)
print(st.replace("Science Fiction", "Sci-Fi"))
print(overview)
print(movie_id)

Sci-Fi Action Mystery 
A world of the very near future in which cattle, fish, and even the family pet can be cloned. But cloning humans is illegal - that is until family man Adam Gibson (Arnold Schwarzenegger) comes home from work one day to find a clone has replaced him. Taken from his family and plunged into a sinister world he doesn't understand, Gibson must not only save himself from the assassins who must destroy him to protect their secret, but uncover who and what is behind the horrible things happening to him.
8452


### Year: 2000 - 2001

In [7]:
def building_dataframe(link, year, table_index=2):
    """Scrape one film table from a Wikipedia list page into ``movies_{year}.csv``.

    The first three cells of every data row supply title, director and cast;
    genres, overview and id are looked up per title with ``search_movie``.

    Args:
        link: URL of the page to download.
        year: Interpolated into the output file name ``movies_{year}.csv``.
        table_index: Position of the film table among the page's ``<table>``
            elements. Defaults to 2, the index this cell originally hard-coded.

    Returns:
        None. Prints "File saved." on success, or "Failed to save file." plus
        the error when the CSV cannot be written.
    """
    columns = ['Movie_id', 'Title', 'Director', 'Actor1', 'Actor2', 'Actor3',
               'Genre', 'Overview', 'Language']
    page = requests.get(link, timeout=30).text
    soup = BeautifulSoup(page, "html.parser")
    table = soup.find_all('table')[table_index]
    # Fall back to the table itself when the markup carries no <tbody>.
    body = table.find('tbody') or table

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
    records = []
    for row in body.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 3:
            # Header rows have no <td>; shorter rows lack title/director/cast.
            continue

        title = cells[0].text.strip()
        # Strip each name: the raw split kept the space after every comma and
        # the cell's trailing newline.
        cast = [name.strip() for name in cells[2].text.split(",")]

        genre_ids, overview, movie_id = search_movie(title)
        # Skip ids missing from `genres` instead of raising KeyError.
        genre = " ".join(genres[i] for i in (genre_ids or []) if i in genres)

        records.append({
            'Movie_id': movie_id,
            'Title': title,
            'Director': cells[1].text.strip(),
            'Actor1': cast[0],
            'Actor2': cast[1] if len(cast) > 1 else np.nan,
            'Actor3': cast[2] if len(cast) > 2 else np.nan,
            'Genre': genre.replace("Science Fiction", "Sci-Fi").strip(),
            'Overview': overview,
            'Language': 'English',
        })

    # dtype=object keeps found ids as integers in the CSV when other lookups
    # returned None.
    movies = pd.DataFrame(records, columns=columns, dtype=object)
    try:
        movies.to_csv(f"movies_{year}.csv", index=False)
        print("File saved.")
    except Exception as exc:
        # Still best-effort, but report why the write failed.
        print("Failed to save file.", exc)

In [8]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2000", "2000")

File saved.


In [9]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2001", "2001")

Movie not found.
File saved.


### Year: 2002 - 2003

In [10]:
def building_dataframe(link, year, table_index=4):
    """Scrape one film table from a Wikipedia list page into ``movies_{year}.csv``.

    The first three cells of every data row supply title, director and cast;
    genres, overview and id are looked up per title with ``search_movie``.

    Args:
        link: URL of the page to download.
        year: Interpolated into the output file name ``movies_{year}.csv``.
        table_index: Position of the film table among the page's ``<table>``
            elements. Defaults to 4, the index this cell originally hard-coded.

    Returns:
        None. Prints "File saved." on success, or "Failed to save file." plus
        the error when the CSV cannot be written.
    """
    columns = ['Movie_id', 'Title', 'Director', 'Actor1', 'Actor2', 'Actor3',
               'Genre', 'Overview', 'Language']
    page = requests.get(link, timeout=30).text
    soup = BeautifulSoup(page, "html.parser")
    table = soup.find_all('table')[table_index]
    # Fall back to the table itself when the markup carries no <tbody>.
    body = table.find('tbody') or table

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
    records = []
    for row in body.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 3:
            # Header rows have no <td>; shorter rows lack title/director/cast.
            continue

        title = cells[0].text.strip()
        # Strip each name: the raw split kept the space after every comma and
        # the cell's trailing newline.
        cast = [name.strip() for name in cells[2].text.split(",")]

        genre_ids, overview, movie_id = search_movie(title)
        # Skip ids missing from `genres` instead of raising KeyError.
        genre = " ".join(genres[i] for i in (genre_ids or []) if i in genres)

        records.append({
            'Movie_id': movie_id,
            'Title': title,
            'Director': cells[1].text.strip(),
            'Actor1': cast[0],
            'Actor2': cast[1] if len(cast) > 1 else np.nan,
            'Actor3': cast[2] if len(cast) > 2 else np.nan,
            'Genre': genre.replace("Science Fiction", "Sci-Fi").strip(),
            'Overview': overview,
            'Language': 'English',
        })

    # dtype=object keeps found ids as integers in the CSV when other lookups
    # returned None.
    movies = pd.DataFrame(records, columns=columns, dtype=object)
    try:
        movies.to_csv(f"movies_{year}.csv", index=False)
        print("File saved.")
    except Exception as exc:
        # Still best-effort, but report why the write failed.
        print("Failed to save file.", exc)

In [11]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2002", "2002")

File saved.


In [12]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2003", "2003")

Movie not found.
File saved.


### Year: 2004 - 2005

In [13]:
def building_dataframe(link, year, table_indexes=(2, 3, 4, 5)):
    """Build ``movies_{year}.csv`` from the film tables of a Wikipedia list page.

    The tables at ``table_indexes`` are read with ``pd.read_html`` and stacked.
    Director and up to three actors are parsed from the 'Cast and crew' column;
    genres, overview and id are looked up per title with ``search_movie``.

    Args:
        link: Page URL handed to ``pd.read_html``.
        year: Interpolated into the output file name ``movies_{year}.csv``.
        table_indexes: Positions of the tables to stack, in order. Defaults to
            the four positions this cell originally hard-coded.

    Returns:
        None. Prints "File saved." on success, or "Failed to save file." plus
        the error when the CSV cannot be written.
    """
    output_columns = ['Movie_id', 'Title', 'Director', 'Actor1', 'Actor2',
                      'Actor3', 'Genre', 'Overview', 'Language']

    # Parse the page once; every pd.read_html(link) call downloads it again.
    tables = pd.read_html(link, header=0)
    # DataFrame.append was removed in pandas 2.0; concat stacks in the same order.
    movies = pd.concat([tables[i] for i in table_indexes], ignore_index=True)
    # No columns are dropped by name here: the projection at the end keeps only
    # output_columns, and the drops ('Opening', 'Production company', 'Genre',
    # 'Ref.') raised KeyError on pages whose headers differ.

    movies['Director'] = [_extract_director(cell) for cell in movies['Cast and crew']]

    actors = [_first_three_actors(cell) for cell in movies['Cast and crew']]
    movies['Actor1'] = [names[0] for names in actors]
    movies['Actor2'] = [names[1] for names in actors]
    movies['Actor3'] = [names[2] for names in actors]

    movie_ids = []
    genre_names = []
    overviews = []
    for title in movies['Title']:
        genre_ids, overview, movie_id = search_movie(title)
        # genre_ids is None when nothing was found; unknown ids are skipped.
        joined = " ".join(genres[i] for i in (genre_ids or []) if i in genres)
        genre_names.append(joined.replace("Science Fiction", "Sci-Fi").strip())
        overviews.append(overview)
        movie_ids.append(movie_id)
    movies['Genre'] = genre_names
    movies['Overview'] = overviews
    movies['Movie_id'] = movie_ids
    movies['Language'] = 'English'

    movies = movies[output_columns]

    try:
        movies.to_csv(f"movies_{year}.csv", index=False)
        print("File saved.")
    except Exception as exc:
        # Still best-effort, but report why the write failed.
        print("Failed to save file.", exc)


def _extract_director(cell):
    """Return the text before the director marker of one 'Cast and crew' cell."""
    text = str(cell)
    for marker in (" (director)", " (directors)", " (director/screenplay)"):
        if marker in text:
            return text.split(marker)[0].strip()
    # No recognised marker: fall back to everything before the first ';'.
    return text.split(";")[0].strip()


def _first_three_actors(cell):
    """Return ``[Actor1, Actor2, Actor3]`` for one 'Cast and crew' cell.

    The cast is taken to be the comma-separated names after the last ';'.
    Missing positions are None.
    """
    segments = str(cell).split(";")
    names = [name.strip() for name in segments[-1].split(",")]
    names = [name for name in names if name]
    if len(segments) == 1 and len(names) < 2:
        # A single name with no ';' is a director-only cell (or NaN), not a cast.
        return [None, None, None]
    # Take up to three names. The old thresholds (> 3, > 2, > 1) dropped the
    # last actor whenever three or fewer were listed, and two of the branches
    # counted commas in the whole cell rather than in the cast segment.
    top = names[:3]
    return top + [None] * (3 - len(top))

In [14]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2004", "2004")

Movie not found.
File saved.


In [15]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2005", "2005")

Movie not found.
File saved.


### Year: 2006 - 2009, 2011, 2013 - 2015, 2017 - 2022

In [16]:
def building_dataframe(link, year, table_indexes=(2, 3, 4, 5)):
    """Build ``movies_{year}.csv`` from the film tables of a Wikipedia list page.

    The tables at ``table_indexes`` are read with ``pd.read_html`` and stacked.
    Director and up to three actors are parsed from the 'Cast and crew' column;
    genres, overview and id are looked up per title with ``search_movie``.

    Args:
        link: Page URL handed to ``pd.read_html``.
        year: Interpolated into the output file name ``movies_{year}.csv``.
        table_indexes: Positions of the tables to stack, in order. Defaults to
            the four positions this cell originally hard-coded.

    Returns:
        None. Prints "File saved." on success, or "Failed to save file." plus
        the error when the CSV cannot be written.
    """
    output_columns = ['Movie_id', 'Title', 'Director', 'Actor1', 'Actor2',
                      'Actor3', 'Genre', 'Overview', 'Language']

    # Parse the page once; every pd.read_html(link) call downloads it again.
    tables = pd.read_html(link, header=0)
    # DataFrame.append was removed in pandas 2.0; concat stacks in the same order.
    movies = pd.concat([tables[i] for i in table_indexes], ignore_index=True)
    # No columns are dropped by name here: the projection at the end keeps only
    # output_columns, and dropping 'Production company' raised KeyError on
    # pages that spell the header differently.

    movies['Director'] = [_extract_director(cell) for cell in movies['Cast and crew']]

    actors = [_first_three_actors(cell) for cell in movies['Cast and crew']]
    movies['Actor1'] = [names[0] for names in actors]
    movies['Actor2'] = [names[1] for names in actors]
    movies['Actor3'] = [names[2] for names in actors]

    movie_ids = []
    genre_names = []
    overviews = []
    for title in movies['Title']:
        genre_ids, overview, movie_id = search_movie(title)
        # genre_ids is None when nothing was found; unknown ids are skipped.
        joined = " ".join(genres[i] for i in (genre_ids or []) if i in genres)
        genre_names.append(joined.replace("Science Fiction", "Sci-Fi").strip())
        overviews.append(overview)
        movie_ids.append(movie_id)
    movies['Genre'] = genre_names
    movies['Overview'] = overviews
    movies['Movie_id'] = movie_ids
    movies['Language'] = 'English'

    movies = movies[output_columns]

    try:
        movies.to_csv(f"movies_{year}.csv", index=False)
        print("File saved.")
    except Exception as exc:
        # Still best-effort, but report why the write failed.
        print("Failed to save file.", exc)


def _extract_director(cell):
    """Return the text before the director marker of one 'Cast and crew' cell."""
    text = str(cell)
    for marker in (" (director)", " (directors)", " (director/screenplay)"):
        if marker in text:
            return text.split(marker)[0].strip()
    # No recognised marker: fall back to everything before the first ';'.
    return text.split(";")[0].strip()


def _first_three_actors(cell):
    """Return ``[Actor1, Actor2, Actor3]`` for one 'Cast and crew' cell.

    The cast is taken to be the comma-separated names after the last ';'.
    Missing positions are None.
    """
    segments = str(cell).split(";")
    names = [name.strip() for name in segments[-1].split(",")]
    names = [name for name in names if name]
    if len(segments) == 1 and len(names) < 2:
        # A single name with no ';' is a director-only cell (or NaN), not a cast.
        return [None, None, None]
    # Take up to three names. The old thresholds (> 3, > 2, > 1) dropped the
    # last actor whenever three or fewer were listed.
    top = names[:3]
    return top + [None] * (3 - len(top))

In [17]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2006", "2006")

File saved.


In [18]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2007", "2007")

Movie not found.
Movie not found.
Movie not found.
Movie not found.
File saved.


In [19]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2008", "2008")

Movie not found.
File saved.


In [20]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2009", "2009")

Movie not found.
Movie not found.
Movie not found.
Movie not found.
Movie not found.
Movie not found.
Movie not found.
File saved.


In [21]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2011", "2011")

Movie not found.
File saved.


In [22]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2013", "2013")

Movie not found.
Movie not found.
File saved.


In [23]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2014", "2014")

File saved.


In [24]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2015", "2015")

File saved.


In [25]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2017", "2017")

File saved.


In [26]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2018", "2018")

File saved.


In [27]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2019", "2019")

File saved.


In [28]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2020", "2020")

Movie not found.
Movie not found.
File saved.


In [29]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2021", "2021")

Movie not found.
File saved.


In [30]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2022", "2022")

Movie not found.
Movie not found.
File saved.


In [31]:
def building_dataframe(link, year, table_indexes=(2, 3, 4, 5)):
    """Build ``movies_{year}.csv`` from the film tables of a Wikipedia list page.

    The tables at ``table_indexes`` are read with ``pd.read_html`` and stacked.
    Director and up to three actors are parsed from the 'Cast and crew' column;
    genres, overview and id are looked up per title with ``search_movie``.

    Args:
        link: Page URL handed to ``pd.read_html``.
        year: Interpolated into the output file name ``movies_{year}.csv``.
        table_indexes: Positions of the tables to stack, in order. Defaults to
            the four positions this cell originally hard-coded.

    Returns:
        None. Prints "File saved." on success, or "Failed to save file." plus
        the error when the CSV cannot be written.
    """
    output_columns = ['Movie_id', 'Title', 'Director', 'Actor1', 'Actor2',
                      'Actor3', 'Genre', 'Overview', 'Language']

    # Parse the page once; every pd.read_html(link) call downloads it again.
    tables = pd.read_html(link, header=0)
    # DataFrame.append was removed in pandas 2.0; concat stacks in the same order.
    movies = pd.concat([tables[i] for i in table_indexes], ignore_index=True)
    # No columns are dropped by name here: the projection at the end keeps only
    # output_columns, and dropping 'Production Company' raised KeyError on
    # pages that spell the header 'Production company'.

    movies['Director'] = [_extract_director(cell) for cell in movies['Cast and crew']]

    actors = [_first_three_actors(cell) for cell in movies['Cast and crew']]
    movies['Actor1'] = [names[0] for names in actors]
    movies['Actor2'] = [names[1] for names in actors]
    movies['Actor3'] = [names[2] for names in actors]

    movie_ids = []
    genre_names = []
    overviews = []
    for title in movies['Title']:
        genre_ids, overview, movie_id = search_movie(title)
        # genre_ids is None when nothing was found; unknown ids are skipped.
        joined = " ".join(genres[i] for i in (genre_ids or []) if i in genres)
        genre_names.append(joined.replace("Science Fiction", "Sci-Fi").strip())
        overviews.append(overview)
        movie_ids.append(movie_id)
    movies['Genre'] = genre_names
    movies['Overview'] = overviews
    movies['Movie_id'] = movie_ids
    movies['Language'] = 'English'

    movies = movies[output_columns]

    try:
        movies.to_csv(f"movies_{year}.csv", index=False)
        print("File saved.")
    except Exception as exc:
        # Still best-effort, but report why the write failed.
        print("Failed to save file.", exc)


def _extract_director(cell):
    """Return the text before the director marker of one 'Cast and crew' cell."""
    text = str(cell)
    for marker in (" (director)", " (directors)", " (director/screenplay)"):
        if marker in text:
            return text.split(marker)[0].strip()
    # No recognised marker: fall back to everything before the first ';'.
    return text.split(";")[0].strip()


def _first_three_actors(cell):
    """Return ``[Actor1, Actor2, Actor3]`` for one 'Cast and crew' cell.

    The cast is taken to be the comma-separated names after the last ';'.
    Missing positions are None.
    """
    segments = str(cell).split(";")
    names = [name.strip() for name in segments[-1].split(",")]
    names = [name for name in names if name]
    if len(segments) == 1 and len(names) < 2:
        # A single name with no ';' is a director-only cell (or NaN), not a cast.
        return [None, None, None]
    # Take up to three names. The old thresholds (> 3, > 2, > 1) dropped the
    # last actor whenever three or fewer were listed.
    top = names[:3]
    return top + [None] * (3 - len(top))

In [32]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2010", "2010")

Movie not found.
Movie not found.
Movie not found.
Movie not found.
Movie not found.
Movie not found.
File saved.


In [33]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2012", "2012")

Movie not found.
Movie not found.
Movie not found.
Movie not found.
Movie not found.
Movie not found.
File saved.


In [34]:
# building_dataframe("https://en.wikipedia.org/wiki/List_of_American_films_of_2016", "2016")

File saved.
