In [27]:
import pandas as pd
import numpy as np
import os

from tmdbv3api import Movie
from tmdbv3api import TMDb
import requests
import json



In [2]:
def get_movies(year):
    link = f"https://en.wikipedia.org/wiki/List_of_American_films_of_{year}"
    df1 = pd.read_html(link, header=0)[2]
    df2 = pd.read_html(link, header=0)[3]
    df3 = pd.read_html(link, header=0)[4]
    df4 = pd.read_html(link, header=0)[5]
    df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)

    return df


In [3]:
df_2017 = get_movies(2017)
df_2018 = get_movies(2018)
df_2019 = get_movies(2019)
df_2020 = get_movies(2020)
df_2021 = get_movies(2021)
df_2022 = get_movies(2022)
df_2023 = get_movies(2023)



In [5]:
df_2017 = df_2017[["Title", "Cast and crew"]]
df_2018 = df_2018[["Title", "Cast and crew"]]
df_2019 = df_2019[["Title", "Cast and crew"]]
df_2020 = df_2020[["Title", "Cast and crew"]]
df_2021 = df_2021[["Title", "Cast and crew"]]
df_2022 = df_2022[["Title", "Cast and crew"]]
df_2023 = df_2023[["Title", "Cast and crew"]]


In [38]:
def get_genres(movie_name):
    tmdb = TMDb()
    movie = Movie()
    tmdb.api_key = os.environ.get('api_key')

    genres = ""
    result = movie.search(movie_name)

    if len(result) > 0 :
        id = result[0]["id"]
    else:
        id = np.NaN

    url = f"http://api.themoviedb.org/3/movie/{id}?api_key={tmdb.api_key}"
    response = requests.get(url)
    data = response.json()
    
    if len(data) > 3:
        for i in range(0,len(data.get("genres"))):
            if i < len(data.get("genres"))-1: # This is to remove the space at the end of the string
                genres +=((data["genres"][i]["name"])) + " "
            else:
                genres +=((data["genres"][i]["name"]))
                
        return genres
    else:
        return np.NaN
    
    


In [39]:
df_2021["Title"] = df_2021["Title"].astype("str") 
df_2022["Title"] = df_2022["Title"].astype("str") 
df_2023["Title"] = df_2023["Title"].astype("str") 

In [42]:

df_2017["genres"] = df_2017["Title"].apply(get_genres)
df_2018["genres"] = df_2018["Title"].apply(get_genres)
df_2019["genres"] = df_2019["Title"].apply(get_genres)
df_2020["genres"] = df_2020["Title"].apply(get_genres)
df_2021["genres"] = df_2021["Title"].apply(get_genres)
df_2022["genres"] = df_2022["Title"].apply(get_genres)
df_2023["genres"] = df_2023["Title"].apply(get_genres)

In [49]:
df_total = pd.concat([df_2017, df_2018, df_2019, df_2020, df_2021, df_2022, df_2023], ignore_index=True)

In [50]:
df_total

Unnamed: 0,Title,Cast and crew,genres
0,Underworld: Blood Wars,Anna Foerster (director); Cory Goodman (screen...,Fantasy Action Thriller Horror
1,Arsenal,Steven C. Miller (director); Jason Mosberg (sc...,Thriller Crime
2,Between Us,Rafael Palacio Illingworth (director/screenpla...,Drama
3,Monster Trucks,Chris Wedge (director); Derek Connolly (screen...,Action Comedy Science Fiction
4,The Bye Bye Man,Stacy Title (director); Jonathan Penner (scree...,Horror Thriller
...,...,...,...
1941,It Lives Inside,Bishal Dutta (director/screenplay); Megan Suri...,Horror
1942,The Kill Room,Nicol Paone (director) Jonathan Jacobson (scre...,Comedy Thriller
1943,PAW Patrol: The Mighty Movie,"Cal Brunker (director); Mckenna Grace, Taraji ...",Animation Family Comedy Action
1944,The Creator,Gareth Edwards (director/screenplay); Chris We...,Science Fiction Action Thriller


In [51]:
def get_director(x):
    return str(x).split("(")[0]

df_total["director_name"] = df_total["Cast and crew"].apply(get_director)


In [52]:
def get_actor1(x):
    if len(str(x).split("screenplay);")[-1].split(",")) >= 1:
        return str(x).split("screenplay);")[-1].split(",")[0]
    else:
        return np.NaN

def get_actor2(x):
    if len(str(x).split("screenplay);")[-1].split(",")) >= 2:
        return str(x).split("screenplay);")[-1].split(",")[1]
    else:
        return np.NaN

def get_actor3(x):
    if len(str(x).split("screenplay);")[-1].split(",")) >= 3:
        return str(x).split("screenplay);")[-1].split(",")[2]
    else:
        return np.NaN

In [53]:
df_total["actor_1_name"] = df_total["Cast and crew"].apply(get_actor1)
df_total["actor_2_name"] = df_total["Cast and crew"].apply(get_actor2)
df_total["actor_3_name"] = df_total["Cast and crew"].apply(get_actor3)

In [54]:
df_total_new = df_total[["Title", "genres", "director_name", "actor_1_name", "actor_2_name", "actor_3_name"]]

df_total_new.head(10)

Unnamed: 0,Title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Underworld: Blood Wars,Fantasy Action Thriller Horror,Anna Foerster,Kate Beckinsale,Theo James,Lara Pulver
1,Arsenal,Thriller Crime,Steven C. Miller,Adrian Grenier,Johnathon Schaech,Nicolas Cage
2,Between Us,Drama,Rafael Palacio Illingworth,Olivia Thirlby,Ben Feldman,Adam Goldberg
3,Monster Trucks,Action Comedy Science Fiction,Chris Wedge,Lucas Till,Jane Levy,Amy Ryan
4,The Bye Bye Man,Horror Thriller,Stacy Title,Douglas Smith,Doug Jones,Michael Trucco
5,Sleepless,Action Crime Thriller,Baran bo Odar,Jamie Foxx,Dermot Mulroney,"Tip ""T.I."" Harris"
6,100 Streets,Drama,Jim O'Hanlon,Idris Elba,Gemma Arterton,Charlie Creed-Miles
7,The Book of Love,Drama,Bill Purple,Jason Sudeikis,Jessica Biel,Maisie Williams
8,Split,Horror Thriller,M. Night Shyamalan,James McAvoy,Anya Taylor-Joy,Betty Buckley
9,XXX: Return of Xander Cage,Action Adventure Crime,D. J. Caruso,Vin Diesel,Samuel L. Jackson,Donnie Yen


In [55]:
len(df_total_new)

1946

In [62]:
def get_overview(movie_name):
    tmdb = TMDb()
    movie = Movie()
    tmdb.api_key = os.environ.get('api_key')

    overview = ""
    result = movie.search(movie_name)

    if len(result) > 0 :
        id = result[0]["id"]
    else:
        id = np.NaN

    url = f"http://api.themoviedb.org/3/movie/{id}?api_key={tmdb.api_key}"
    response = requests.get(url)
    data = response.json()
    
    if len(data) > 3:
        overview = data.get("overview")        
        return overview
    else:
        return np.NaN
    

In [64]:
df_total_new["overview"] = df_total["Title"].apply(get_overview)

In [65]:
df_total_new.head()

Unnamed: 0,Title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,overview
0,Underworld: Blood Wars,Fantasy Action Thriller Horror,Anna Foerster,Kate Beckinsale,Theo James,Lara Pulver,Vampire death dealer Selene fends off brutal a...
1,Arsenal,Thriller Crime,Steven C. Miller,Adrian Grenier,Johnathon Schaech,Nicolas Cage,After the deadbeat brother of a businessman is...
2,Between Us,Drama,Rafael Palacio Illingworth,Olivia Thirlby,Ben Feldman,Adam Goldberg,A New York couple and their Midwestern friends...
3,Monster Trucks,Action Comedy Science Fiction,Chris Wedge,Lucas Till,Jane Levy,Amy Ryan,Tripp is a high school senior with a knack for...
4,The Bye Bye Man,Horror Thriller,Stacy Title,Douglas Smith,Doug Jones,Michael Trucco,When three college students move into an old h...


In [87]:
df_total_new.rename(columns = {"Title":"movie_title"}, inplace = True)

In [89]:
df_total_new.to_csv("MovieWeb.csv", index = False)

In [68]:
movie_test = pd.read_csv("movie")

In [70]:
movie_test.head()

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,Avatar,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi
1,Pirates of the Caribbean: At World's End,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy
2,Spectre,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller
3,The Dark Knight Rises,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker,Doug Walker,Rob Walker,,Documentary


In [82]:
movie_test["movie_title"] = movie_test["movie_title"].astype("str") 

In [83]:
movie_test["overview"] =  movie_test["movie_title"].apply(get_overview)

In [90]:
movie_test.to_csv("movie.csv", index = False)

In [129]:
df_total_new.rename(columns = {"Title":"movie_title"}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [130]:
df_total_new.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Underworld: Blood Wars,Fantasy Action Thriller Horror,Anna Foerster,Kate Beckinsale,Theo James,Lara Pulver
1,Arsenal,Thriller Crime,Steven C. Miller,Adrian Grenier,Johnathon Schaech,Nicolas Cage
2,Between Us,Drama,Rafael Palacio Illingworth,Olivia Thirlby,Ben Feldman,Adam Goldberg
3,Monster Trucks,Action Comedy Science Fiction,Chris Wedge,Lucas Till,Jane Levy,Amy Ryan
4,The Bye Bye Man,Horror Thriller,Stacy Title,Douglas Smith,Doug Jones,Michael Trucco


In [91]:
df_final = pd.concat([df_total_new, movie_test], ignore_index=True)
df_final.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,overview
0,Underworld: Blood Wars,Fantasy Action Thriller Horror,Anna Foerster,Kate Beckinsale,Theo James,Lara Pulver,Vampire death dealer Selene fends off brutal a...
1,Arsenal,Thriller Crime,Steven C. Miller,Adrian Grenier,Johnathon Schaech,Nicolas Cage,After the deadbeat brother of a businessman is...
2,Between Us,Drama,Rafael Palacio Illingworth,Olivia Thirlby,Ben Feldman,Adam Goldberg,A New York couple and their Midwestern friends...
3,Monster Trucks,Action Comedy Science Fiction,Chris Wedge,Lucas Till,Jane Levy,Amy Ryan,Tripp is a high school senior with a knack for...
4,The Bye Bye Man,Horror Thriller,Stacy Title,Douglas Smith,Doug Jones,Michael Trucco,When three college students move into an old h...


In [105]:
df_final["movie_title"] = df_final["movie_title"].astype("str") 
df_final["genres"] = df_final["genres"].astype("str") 
df_final["director_name"] = df_final["director_name"].astype("str") 
df_final["actor_1_name"] = df_final["actor_1_name"].astype("str") 
df_final["actor_2_name"] = df_final["actor_2_name"].astype("str") 
df_final["actor_3_name"] = df_final["actor_3_name"].astype("str") 
df_final["overview"] = df_final["overview"].astype("str") 

In [136]:
df_final["combine"] = df_final["movie_title"] + " " + df_final["genres"] + " " + df_final["director_name"]+ "" + df_final["actor_1_name"]+ df_final["actor_2_name"] + df_final["actor_3_name"] + " " + df_final["overview"]   

In [139]:
df_final.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,overview,combine
0,Underworld: Blood Wars,Fantasy Action Thriller Horror,Anna Foerster,Kate Beckinsale,Theo James,Lara Pulver,Vampire death dealer Selene fends off brutal a...,Underworld: Blood Wars Fantasy Action Thriller...
1,Arsenal,Thriller Crime,Steven C. Miller,Adrian Grenier,Johnathon Schaech,Nicolas Cage,After the deadbeat brother of a businessman is...,Arsenal Thriller Crime Steven C. Miller Adria...
2,Between Us,Drama,Rafael Palacio Illingworth,Olivia Thirlby,Ben Feldman,Adam Goldberg,A New York couple and their Midwestern friends...,Between Us Drama Rafael Palacio Illingworth O...
3,Monster Trucks,Action Comedy Science Fiction,Chris Wedge,Lucas Till,Jane Levy,Amy Ryan,Tripp is a high school senior with a knack for...,Monster Trucks Action Comedy Science Fiction C...
4,The Bye Bye Man,Horror Thriller,Stacy Title,Douglas Smith,Doug Jones,Michael Trucco,When three college students move into an old h...,The Bye Bye Man Horror Thriller Stacy Title D...


In [140]:
df_final.to_csv("MovieFinal.csv", index = False)

In [138]:
len(df_final)

6678

In [93]:
df_final.isna().sum()

movie_title        0
genres             4
director_name    104
actor_1_name       7
actor_2_name      62
actor_3_name     194
overview          29
dtype: int64

In [94]:
df_final = df_final.dropna().reset_index(drop=True)

In [95]:
df_final.isna().sum()

movie_title      0
genres           0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
overview         0
dtype: int64