In [1]:
import pandas as pd
import numpy as np
import ast
import json
import requests
import urllib.request as request
import bs4 as bs

from tmdbv3api import TMDb, Movie

# Preprocessing 1

In [2]:
# Import movie_metadata.csv
metadata = pd.read_csv('../dataset/movie_metadata.csv')
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      5024 non-null   object 
 1   director_name              4939 non-null   object 
 2   num_critic_for_reviews     4993 non-null   float64
 3   duration                   5028 non-null   float64
 4   director_facebook_likes    4939 non-null   float64
 5   actor_3_facebook_likes     5020 non-null   float64
 6   actor_2_name               5030 non-null   object 
 7   actor_1_facebook_likes     5036 non-null   float64
 8   gross                      4159 non-null   float64
 9   genres                     5043 non-null   object 
 10  actor_1_name               5036 non-null   object 
 11  movie_title                5043 non-null   object 
 12  num_voted_users            5043 non-null   int64  
 13  cast_total_facebook_likes  5043 non-null   int64

In [3]:
# Mengambil atribut yang dibutuhkan
metadata = metadata[['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'movie_title']]
metadata.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action|Adventure|Fantasy|Sci-Fi,Avatar
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action|Adventure|Fantasy,Pirates of the Caribbean: At World's End
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action|Adventure|Thriller,Spectre
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action|Thriller,The Dark Knight Rises
4,Doug Walker,Doug Walker,Rob Walker,,Documentary,Star Wars: Episode VII - The Force Awakens ...


In [4]:
# Mengganti nilai NaN dengan 'unknown' dan menggantikan | dengan spasi
for col in metadata.columns:
    metadata[col] = metadata[col].replace(np.nan, 'unknown')

metadata['genres'] = metadata['genres'].apply(lambda x: x.replace('|', ' '))
metadata.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,Avatar
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,Pirates of the Caribbean: At World's End
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,Spectre
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,The Dark Knight Rises
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,Star Wars: Episode VII - The Force Awakens ...


In [5]:
# Menghilangkan spasi dan mengecilkan huruf pada awal huruf
metadata['movie_title'] = metadata['movie_title'].apply(lambda x: x[:-1])
metadata['movie_title'] = metadata['movie_title'].apply(lambda x: x.lower())
metadata.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...


# Preprocessing 2

## Prepared All Data

In [6]:
# Import credits.csv
credits = pd.read_csv('../dataset/credits.csv')
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [7]:
# Import movies_metadata.csv
metadatas = pd.read_csv('../dataset/movies_metadata.csv')
metadatas.head()

  metadatas = pd.read_csv('../dataset/movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [8]:
# Pengubahan tipe data
metadatas['release_date'] = pd.to_datetime(metadatas['release_date'], errors='coerce')
metadatas['year'] = metadatas['release_date'].dt.year
metadatas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   adult                  45466 non-null  object        
 1   belongs_to_collection  4494 non-null   object        
 2   budget                 45466 non-null  object        
 3   genres                 45466 non-null  object        
 4   homepage               7782 non-null   object        
 5   id                     45466 non-null  object        
 6   imdb_id                45449 non-null  object        
 7   original_language      45455 non-null  object        
 8   original_title         45466 non-null  object        
 9   overview               44512 non-null  object        
 10  popularity             45461 non-null  object        
 11  poster_path            45080 non-null  object        
 12  production_companies   45463 non-null  object        
 13  p

In [9]:
# Pengambilan data pada tahun 2017
metadatas = metadatas.loc[metadatas['year'] == 2017, ['genres','id','title','year']]
metadatas.head()

Unnamed: 0,genres,id,title,year
26560,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",166426,Pirates of the Caribbean: Dead Men Tell No Tales,2017.0
26561,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",141052,Justice League,2017.0
26565,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",284053,Thor: Ragnarok,2017.0
26566,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",283995,Guardians of the Galaxy Vol. 2,2017.0
30536,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",245842,The King's Daughter,2017.0


In [10]:
metadatas['id'] = metadatas['id'].astype(int)

In [12]:
# Penggabungan data
movie1 = metadatas.merge(credits, on='id')
movie1.head()

Unnamed: 0,genres,id,title,year,cast,crew
0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",166426,Pirates of the Caribbean: Dead Men Tell No Tales,2017.0,"[{'cast_id': 1, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4c9cc3a36847f8236a65', 'de..."
1,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",141052,Justice League,2017.0,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '55ef66dbc3a3686f1700a52d', 'de..."
2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",284053,Thor: Ragnarok,2017.0,"[{'cast_id': 0, 'character': 'Thor Odinson', '...","[{'credit_id': '56a93fa4c3a36872db001e7a', 'de..."
3,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",283995,Guardians of the Galaxy Vol. 2,2017.0,"[{'cast_id': 3, 'character': 'Peter Quill / St...","[{'credit_id': '59171547925141583c0315a6', 'de..."
4,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",245842,The King's Daughter,2017.0,"[{'cast_id': 0, 'character': 'King Louis XIV',...","[{'credit_id': '5431de49c3a36825d300007e', 'de..."


In [13]:
# Pengembalian struktur data
movie1['genres'] = movie1['genres'].map(lambda x: ast.literal_eval(x))
movie1['cast'] = movie1['cast'].map(lambda x: ast.literal_eval(x))
movie1['crew'] = movie1['crew'].map(lambda x: ast.literal_eval(x))
movie1.head()

Unnamed: 0,genres,id,title,year,cast,crew
0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",166426,Pirates of the Caribbean: Dead Men Tell No Tales,2017.0,"[{'cast_id': 1, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4c9cc3a36847f8236a65', 'de..."
1,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",141052,Justice League,2017.0,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '55ef66dbc3a3686f1700a52d', 'de..."
2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",284053,Thor: Ragnarok,2017.0,"[{'cast_id': 0, 'character': 'Thor Odinson', '...","[{'credit_id': '56a93fa4c3a36872db001e7a', 'de..."
3,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",283995,Guardians of the Galaxy Vol. 2,2017.0,"[{'cast_id': 3, 'character': 'Peter Quill / St...","[{'credit_id': '59171547925141583c0315a6', 'de..."
4,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",245842,The King's Daughter,2017.0,"[{'cast_id': 0, 'character': 'King Louis XIV',...","[{'credit_id': '5431de49c3a36825d300007e', 'de..."


### Preprocessing Genres Columns

In [14]:
def makeGenreList(x):
    gen = []
    st = " "
    for i in x:
        if i.get('name') == 'Science Fiction':
            scifi = 'Sci-Fi'
            gen.append(scifi)
        else:
            gen.append(i.get('name'))
    if gen == []:
        return np.NaN
    else:
        return (st.join(gen))
    
movie1['genre_list'] = movie1['genres'].map(lambda x: makeGenreList(x))
movie1['genre_list']

0      Adventure Action Fantasy Comedy
1      Action Adventure Fantasy Sci-Fi
2      Action Adventure Fantasy Sci-Fi
3       Action Adventure Comedy Sci-Fi
4             Fantasy Action Adventure
                    ...               
526                     Romance Comedy
527         Crime Comedy Action Family
528    Family Animation Romance Comedy
529               Crime Drama Thriller
530                                NaN
Name: genre_list, Length: 531, dtype: object

### Preprcessing Casts Columns

In [16]:
def getActor1(x):
    casts = []
    for i in x:
        casts.append(i.get('name'))
    if casts == []:
        return np.NaN
    else:
        return casts[0]
    
movie1['name_actor1'] = movie1['cast'].map(lambda x: getActor1(x))
movie1['name_actor1']

0               Johnny Depp
1               Ben Affleck
2           Chris Hemsworth
3               Chris Pratt
4            Pierce Brosnan
               ...         
526          Inka Haapamäki
527    Lou Diamond Phillips
528                     NaN
529          Sridevi Kapoor
530                     NaN
Name: name_actor1, Length: 531, dtype: object

In [17]:
def getActor2(x):
    casts = []
    for i in x:
        casts.append(i.get('name'))
    if casts == [] or len(casts) <= 1:
        return np.NaN
    else:
        return casts[1]
    
movie1['name_actor2'] = movie1['cast'].map(lambda x: getActor2(x))   
movie1['name_actor2']

0       Javier Bardem
1        Henry Cavill
2      Tom Hiddleston
3         Zoe Saldana
4        William Hurt
            ...      
526     Rosa Honkonen
527     Wallace Shawn
528               NaN
529         Sajal Ali
530               NaN
Name: name_actor2, Length: 531, dtype: object

In [18]:
def getActor3(x):
    casts = []
    for i in x:
        casts.append(i.get('name'))
    if casts == [] or len(casts) <= 2:
        return np.NaN
    else:
        return casts[2]
    
movie1['name_actor3'] = movie1['cast'].map(lambda x: getActor3(x))
movie1['name_actor3']

0        Geoffrey Rush
1            Gal Gadot
2       Cate Blanchett
3        Dave Bautista
4      Benjamin Walker
            ...       
526     Tiitus Rantala
527        Gina Holden
528                NaN
529     Akshaye Khanna
530                NaN
Name: name_actor3, Length: 531, dtype: object

### Preprocessing Crew Columns

In [19]:
def getDirector(x):
    dt = []
    st = ' '
    for i in x:
        if i.get('job') == 'Director':
            dt.append(i.get('name'))
    if dt == []:
        return np.NaN
    else:
        return st.join(dt)
    
movie1['name_director'] = movie1['crew'].map(lambda x: getDirector(x))
movie1['name_director']

0      Joachim Rønning Espen Sandberg
1                         Zack Snyder
2                       Taika Waititi
3                          James Gunn
4                       Sean McNamara
                    ...              
526                  Hannaleena Hauru
527             Jonathan A. Rosenbaum
528          Beth David Esteban Bravo
529                      Ravi Udyawar
530                     Daisy Asquith
Name: name_director, Length: 531, dtype: object

## Combining All Preprocessed Data

In [20]:
movie1.head()

Unnamed: 0,genres,id,title,year,cast,crew,genre_list,name_actor1,name_actor2,name_actor3,name_director
0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",166426,Pirates of the Caribbean: Dead Men Tell No Tales,2017.0,"[{'cast_id': 1, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4c9cc3a36847f8236a65', 'de...",Adventure Action Fantasy Comedy,Johnny Depp,Javier Bardem,Geoffrey Rush,Joachim Rønning Espen Sandberg
1,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",141052,Justice League,2017.0,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '55ef66dbc3a3686f1700a52d', 'de...",Action Adventure Fantasy Sci-Fi,Ben Affleck,Henry Cavill,Gal Gadot,Zack Snyder
2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",284053,Thor: Ragnarok,2017.0,"[{'cast_id': 0, 'character': 'Thor Odinson', '...","[{'credit_id': '56a93fa4c3a36872db001e7a', 'de...",Action Adventure Fantasy Sci-Fi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Taika Waititi
3,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",283995,Guardians of the Galaxy Vol. 2,2017.0,"[{'cast_id': 3, 'character': 'Peter Quill / St...","[{'credit_id': '59171547925141583c0315a6', 'de...",Action Adventure Comedy Sci-Fi,Chris Pratt,Zoe Saldana,Dave Bautista,James Gunn
4,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",245842,The King's Daughter,2017.0,"[{'cast_id': 0, 'character': 'King Louis XIV',...","[{'credit_id': '5431de49c3a36825d300007e', 'de...",Fantasy Action Adventure,Pierce Brosnan,William Hurt,Benjamin Walker,Sean McNamara


In [21]:
movie1 = movie1[['title', 'name_actor1', 'name_actor2', 'name_actor3', 'name_director', 'genre_list']]
movie1.head()

Unnamed: 0,title,name_actor1,name_actor2,name_actor3,name_director,genre_list
0,Pirates of the Caribbean: Dead Men Tell No Tales,Johnny Depp,Javier Bardem,Geoffrey Rush,Joachim Rønning Espen Sandberg,Adventure Action Fantasy Comedy
1,Justice League,Ben Affleck,Henry Cavill,Gal Gadot,Zack Snyder,Action Adventure Fantasy Sci-Fi
2,Thor: Ragnarok,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Taika Waititi,Action Adventure Fantasy Sci-Fi
3,Guardians of the Galaxy Vol. 2,Chris Pratt,Zoe Saldana,Dave Bautista,James Gunn,Action Adventure Comedy Sci-Fi
4,The King's Daughter,Pierce Brosnan,William Hurt,Benjamin Walker,Sean McNamara,Fantasy Action Adventure


In [22]:
movie1.isnull().sum()

title             0
name_actor1      22
name_actor2      55
name_actor3      70
name_director     4
genre_list        7
dtype: int64

In [23]:
movie1 = movie1.dropna(how='any')
movie1.isnull().sum()

title            0
name_actor1      0
name_actor2      0
name_actor3      0
name_director    0
genre_list       0
dtype: int64

In [24]:
movie1 = movie1.rename(columns={'genre_list':'genres', 'title':'movie_title'})

In [25]:
movie1['movie_title'] = movie1['movie_title'].str.lower()

In [26]:
movie1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458 entries, 0 to 529
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   movie_title    458 non-null    object
 1   name_actor1    458 non-null    object
 2   name_actor2    458 non-null    object
 3   name_actor3    458 non-null    object
 4   name_director  458 non-null    object
 5   genres         458 non-null    object
dtypes: object(6)
memory usage: 25.0+ KB


In [27]:
movie1.head()

Unnamed: 0,movie_title,name_actor1,name_actor2,name_actor3,name_director,genres
0,pirates of the caribbean: dead men tell no tales,Johnny Depp,Javier Bardem,Geoffrey Rush,Joachim Rønning Espen Sandberg,Adventure Action Fantasy Comedy
1,justice league,Ben Affleck,Henry Cavill,Gal Gadot,Zack Snyder,Action Adventure Fantasy Sci-Fi
2,thor: ragnarok,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Taika Waititi,Action Adventure Fantasy Sci-Fi
3,guardians of the galaxy vol. 2,Chris Pratt,Zoe Saldana,Dave Bautista,James Gunn,Action Adventure Comedy Sci-Fi
4,the king's daughter,Pierce Brosnan,William Hurt,Benjamin Walker,Sean McNamara,Fantasy Action Adventure


In [29]:
movie1['combination'] = movie1['name_actor1'] + ' ' + movie1['name_actor2'] + ' ' + movie1['name_actor3'] + ' ' + movie1['movie_title'] + ' ' +  movie1['name_director'] + ' ' + movie1['genres'] 
movie1.head()

Unnamed: 0,movie_title,name_actor1,name_actor2,name_actor3,name_director,genres,combination
0,pirates of the caribbean: dead men tell no tales,Johnny Depp,Javier Bardem,Geoffrey Rush,Joachim Rønning Espen Sandberg,Adventure Action Fantasy Comedy,Johnny Depp Javier Bardem Geoffrey Rush pirate...
1,justice league,Ben Affleck,Henry Cavill,Gal Gadot,Zack Snyder,Action Adventure Fantasy Sci-Fi,Ben Affleck Henry Cavill Gal Gadot justice lea...
2,thor: ragnarok,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Taika Waititi,Action Adventure Fantasy Sci-Fi,Chris Hemsworth Tom Hiddleston Cate Blanchett ...
3,guardians of the galaxy vol. 2,Chris Pratt,Zoe Saldana,Dave Bautista,James Gunn,Action Adventure Comedy Sci-Fi,Chris Pratt Zoe Saldana Dave Bautista guardian...
4,the king's daughter,Pierce Brosnan,William Hurt,Benjamin Walker,Sean McNamara,Fantasy Action Adventure,Pierce Brosnan William Hurt Benjamin Walker th...


## Combine All Data

In [30]:
metadata.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...


In [31]:
# Rename actor columns
metadata = metadata.rename(columns={'actor_1_name':'name_actor1', 'actor_2_name':'name_actor2', 'actor_3_name':'name_actor3', 'director_name':'name_director'})
metadata.head()

Unnamed: 0,name_director,name_actor1,name_actor2,name_actor3,genres,movie_title
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...


In [32]:
metadata['combination'] = metadata['name_actor1'] + ' ' + metadata['name_actor2'] + ' ' + metadata['name_actor3'] + ' ' + metadata['name_director'] + ' ' + metadata['genres']
metadata.head()

Unnamed: 0,name_director,name_actor1,name_actor2,name_actor3,genres,movie_title,combination
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...


In [33]:
combine = metadata.append(movie1)
combine.drop_duplicates(subset ="movie_title", keep='last', inplace=True)
combine.head()

  combine = metadata.append(movie1)


Unnamed: 0,name_director,name_actor1,name_actor2,name_actor3,genres,movie_title,combination
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...


# Preprocessing 3

In [34]:
tmdb = TMDb()
tmdb_movie = Movie()
tmdb.api_key = 'eef7bdda09ab793e9d40acae8f514be4' # Your API key

## Define Function

In [35]:
def get_genre(x):
    genres = []
    result = tmdb_movie.search(x)
    if not result:
        return np.NaN
    else:
        movie_id = result[0].id
        response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id, tmdb.api_key))
        data_json = response.json()
        if data_json['genres']:
            genre_str = " "
            for i in range(0, len(data_json['genres'])):
                genres.append(data_json['genres'][i]['name'])
            return genre_str.join(genres)
        else:
            np.NaN

In [36]:
def get_director(x):
    if " (director)" in x:
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]

In [37]:
def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])

In [38]:
def get_actor2(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[1])

In [39]:
def get_actor3(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[2])

## Extracting Features

### Extracting Feature of 2018 Movies From Wikipedia

In [40]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2018"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [41]:
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 7 columns):
 #   Column                                                                       Non-Null Count  Dtype 
---  ------                                                                       --------------  ----- 
 0   Opening                                                                      275 non-null    object
 1   Opening.1                                                                    275 non-null    int64 
 2   Title                                                                        275 non-null    object
 3   Production company                                                           275 non-null    object
 4   Cast and crew                                                                275 non-null    object
 5   .mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.  74 non-null     object
 6   Ref.                                               

  df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)


In [42]:
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))
df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.,Ref.,genres
0,JANUARY,5,Insidious: The Last Key,Universal Pictures / Blumhouse Productions / S...,Adam Robitel (director); Leigh Whannell (scree...,[2],,Horror Mystery Thriller
1,JANUARY,5,The Strange Ones,Vertical Entertainment,Lauren Wolkstein (director); Christopher Radcl...,[3],,Thriller Drama
2,JANUARY,5,Stratton,Momentum Pictures / GFM Films,"Simon West (director); Duncan Falconer, Warren...",[4],,Action Thriller
3,JANUARY,10,Sweet Country,Samuel Goldwyn Films,"Warwick Thornton (director); David Tranter, St...",[5],,Western Crime Thriller Drama
4,JANUARY,12,The Commuter,Lionsgate / StudioCanal / The Picture Company,Jaume Collet-Serra (director); Byron Willinger...,[6],,Action Thriller Mystery


In [43]:
df_2018 = df[['Title', 'Cast and crew', 'genres']]
df_2018.head()

Unnamed: 0,Title,Cast and crew,genres
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...,Horror Mystery Thriller
1,The Strange Ones,Lauren Wolkstein (director); Christopher Radcl...,Thriller Drama
2,Stratton,"Simon West (director); Duncan Falconer, Warren...",Action Thriller
3,Sweet Country,"Warwick Thornton (director); David Tranter, St...",Western Crime Thriller Drama
4,The Commuter,Jaume Collet-Serra (director); Byron Willinger...,Action Thriller Mystery


In [46]:
df_2018['name_director'] = df_2018['Cast and crew'].map(lambda x: get_director(x))
df_2018['name_actor1'] = df_2018['Cast and crew'].map(lambda x: get_actor1(x))
df_2018['name_actor2'] = df_2018['Cast and crew'].map(lambda x: get_actor2(x))
df_2018['name_actor3'] = df_2018['Cast and crew'].map(lambda x: get_actor3(x))
df_2018.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018['name_director'] = df_2018['Cast and crew'].map(lambda x: get_director(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018['name_actor1'] = df_2018['Cast and crew'].map(lambda x: get_actor1(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018['name_actor2'] = df_2018['Cast and 

Unnamed: 0,Title,Cast and crew,genres,name_director,name_actor1,name_actor2,name_actor3
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...,Horror Mystery Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell
1,The Strange Ones,Lauren Wolkstein (director); Christopher Radcl...,Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus
2,Stratton,"Simon West (director); Duncan Falconer, Warren...",Action Thriller,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan
3,Sweet Country,"Warwick Thornton (director); David Tranter, St...",Western Crime Thriller Drama,Warwick Thornton,Bryan Brown,Sam Neill,
4,The Commuter,Jaume Collet-Serra (director); Byron Willinger...,Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson


In [47]:
df_2018 = df_2018.rename(columns={'Title' : 'movie_title', 'director_name' : 'name_director', 'genres' : 'genres'})
df_2018.head()

Unnamed: 0,movie_title,Cast and crew,genres,name_director,name_actor1,name_actor2,name_actor3
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...,Horror Mystery Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell
1,The Strange Ones,Lauren Wolkstein (director); Christopher Radcl...,Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus
2,Stratton,"Simon West (director); Duncan Falconer, Warren...",Action Thriller,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan
3,Sweet Country,"Warwick Thornton (director); David Tranter, St...",Western Crime Thriller Drama,Warwick Thornton,Bryan Brown,Sam Neill,
4,The Commuter,Jaume Collet-Serra (director); Byron Willinger...,Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson


In [48]:
df_2018 = df_2018.drop('Cast and crew', axis=1)
df_2018.head()

Unnamed: 0,movie_title,genres,name_director,name_actor1,name_actor2,name_actor3
0,Insidious: The Last Key,Horror Mystery Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell
1,The Strange Ones,Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus
2,Stratton,Action Thriller,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan
3,Sweet Country,Western Crime Thriller Drama,Warwick Thornton,Bryan Brown,Sam Neill,
4,The Commuter,Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson


In [49]:
for i in df_2018.columns:
    df_2018[i] = df_2018[i].replace(np.nan, 'unknown')
    
df_2018['movie_title'] = df_2018['movie_title'].str.lower()
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   movie_title    275 non-null    object
 1   genres         275 non-null    object
 2   name_director  275 non-null    object
 3   name_actor1    275 non-null    object
 4   name_actor2    275 non-null    object
 5   name_actor3    275 non-null    object
dtypes: object(6)
memory usage: 13.0+ KB


In [50]:
# Combination of all the columns
df_2018['combination'] = df_2018['name_actor1'] + ' ' + df_2018['name_actor2'] + ' '+ df_2018['name_actor3'] + ' '+ df_2018['name_director'] +' ' + df_2018['genres']
df_2018.head()

Unnamed: 0,movie_title,genres,name_director,name_actor1,name_actor2,name_actor3,combination
0,insidious: the last key,Horror Mystery Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell,Lin Shaye Angus Sampson Leigh Whannell Adam Ro...
1,the strange ones,Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus,Alex Pettyfer James Freedson-Jackson Emily Alt...
2,stratton,Action Thriller,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan,Dominic Cooper Austin Stowell Gemma Chan Simon...
3,sweet country,Western Crime Thriller Drama,Warwick Thornton,Bryan Brown,Sam Neill,unknown,Bryan Brown Sam Neill unknown Warwick Thornton...
4,the commuter,Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson,Liam Neeson Vera Farmiga Patrick Wilson Jaume ...


### Extracting Feature of 2019 Movies From Wikipedia

In [51]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2019"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [52]:
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)
df.head()

  df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)


Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,JANUARY,4,Escape Room,Columbia Pictures / Original Film,"Adam Robitel (director); Bragi F. Schut, Maria...",[2]
1,JANUARY,4,Rust Creek,IFC Films / Lunacy Productions,Jen McGowan (director); Julie Lipson (screenpl...,[3]
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4]
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5]
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6]


In [53]:
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))

In [54]:
df_2019 = df[['Title', 'Cast and crew', 'genres']]
df_2019.head()

Unnamed: 0,Title,Cast and crew,genres
0,Escape Room,"Adam Robitel (director); Bragi F. Schut, Maria...",Horror Thriller Mystery
1,Rust Creek,Jen McGowan (director); Julie Lipson (screenpl...,Thriller Drama Action Crime
2,American Hangman,Wilson Coneybeare (director/screenplay); Donal...,Thriller
3,A Dog's Way Home,Charles Martin Smith (director); W. Bruce Came...,Drama Adventure Family
4,The Upside,Neil Burger (director); Jon Hartmere (screenpl...,Comedy Drama


In [55]:
df_2019['name_director'] = df_2019['Cast and crew'].map(lambda x: get_director(x))
df_2019['name_actor1'] = df_2019['Cast and crew'].map(lambda x: get_actor1(x))
df_2019['name_actor2'] = df_2019['Cast and crew'].map(lambda x: get_actor2(x))
df_2019['name_actor3'] = df_2019['Cast and crew'].map(lambda x: get_actor3(x))
df_2019.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['name_director'] = df_2019['Cast and crew'].map(lambda x: get_director(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['name_actor1'] = df_2019['Cast and crew'].map(lambda x: get_actor1(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['name_actor2'] = df_2019['Cast and 

Unnamed: 0,Title,Cast and crew,genres,name_director,name_actor1,name_actor2,name_actor3
0,Escape Room,"Adam Robitel (director); Bragi F. Schut, Maria...",Horror Thriller Mystery,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll
1,Rust Creek,Jen McGowan (director); Julie Lipson (screenpl...,Thriller Drama Action Crime,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan
2,American Hangman,Wilson Coneybeare (director/screenplay); Donal...,Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis
3,A Dog's Way Home,Charles Martin Smith (director); W. Bruce Came...,Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp
4,The Upside,Neil Burger (director); Jon Hartmere (screenpl...,Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman


In [56]:
df_2019 = df_2019.rename(columns={'Title' : 'movie_title', 'director_name' : 'name_director', 'genres' : 'genres'})

In [57]:
for i in df_2019.columns:
    df_2019[i] = df_2019[i].replace(np.nan, 'unknown')
    
df_2019['movie_title'] = df_2019['movie_title'].str.lower()
df_2019.head()

Unnamed: 0,movie_title,Cast and crew,genres,name_director,name_actor1,name_actor2,name_actor3
0,escape room,"Adam Robitel (director); Bragi F. Schut, Maria...",Horror Thriller Mystery,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll
1,rust creek,Jen McGowan (director); Julie Lipson (screenpl...,Thriller Drama Action Crime,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan
2,american hangman,Wilson Coneybeare (director/screenplay); Donal...,Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis
3,a dog's way home,Charles Martin Smith (director); W. Bruce Came...,Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp
4,the upside,Neil Burger (director); Jon Hartmere (screenpl...,Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman


In [58]:
df_2019 = df_2019.drop('Cast and crew', axis=1)
df_2019.head()

Unnamed: 0,movie_title,genres,name_director,name_actor1,name_actor2,name_actor3
0,escape room,Horror Thriller Mystery,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll
1,rust creek,Thriller Drama Action Crime,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan
2,american hangman,Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis
3,a dog's way home,Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp
4,the upside,Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman


In [59]:
df_2019['combination'] = df_2019['name_actor1'] + ' ' + df_2019['name_actor2'] + ' '+ df_2019['name_actor3'] + ' '+ df_2019['name_director'] +' ' + df_2019['genres']
df_2019.head()

Unnamed: 0,movie_title,genres,name_director,name_actor1,name_actor2,name_actor3,combination
0,escape room,Horror Thriller Mystery,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll,Taylor Russell Logan Miller Deborah Ann Woll A...
1,rust creek,Thriller Drama Action Crime,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan,Hermione Corfield Jay Paulson Sean O'Bryan Jen...
2,american hangman,Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis,Donald Sutherland Vincent Kartheiser Oliver De...
3,a dog's way home,Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp,Bryce Dallas Howard Edward James Olmos Alexand...
4,the upside,Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman,Bryan Cranston Kevin Hart Nicole Kidman Neil B...


### Extracting Feature of 2020 Movies From Wikipedia

In [60]:
# Fetch the 2020 data
link = 'https://en.wikipedia.org/wiki/List_of_American_films_of_2020'

source = request.urlopen(link).read()
soup = bs.BeautifulSoup(source, 'lxml')
tables = soup.find_all('table', class_='wikitable sortable')

In [61]:
df1 = pd.read_html(str(tables[0]), header=0)[0]
df2 = pd.read_html(str(tables[1]), header=0)[0]
df3 = pd.read_html(str(tables[2]), header=0)[0]
df4 = pd.read_html(str(tables[3]).replace("'1\"\'",'"1"'), header=0)[0]

In [62]:
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 7 columns):
 #   Column                                                                       Non-Null Count  Dtype 
---  ------                                                                       --------------  ----- 
 0   Opening                                                                      275 non-null    object
 1   Opening.1                                                                    275 non-null    int64 
 2   Title                                                                        275 non-null    object
 3   Production company                                                           275 non-null    object
 4   Cast and crew                                                                275 non-null    object
 5   .mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.  56 non-null     object
 6   Ref.                                               

  df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)


In [63]:
df_2020 = df[['Title', 'Cast and crew']]

In [64]:
df_2020['genres'] = df_2020['Title'].map(lambda x: get_genre(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['genres'] = df_2020['Title'].map(lambda x: get_genre(str(x)))


In [65]:
df_2020['name_director'] = df_2020['Cast and crew'].map(lambda x: get_director(x))
df_2020['name_actor1'] = df_2020['Cast and crew'].map(lambda x: get_actor1(x))
df_2020['name_actor2'] = df_2020['Cast and crew'].map(lambda x: get_actor2(x))
df_2020['name_actor3'] = df_2020['Cast and crew'].map(lambda x: get_actor3(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['name_director'] = df_2020['Cast and crew'].map(lambda x: get_director(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['name_actor1'] = df_2020['Cast and crew'].map(lambda x: get_actor1(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['name_actor2'] = df_2020['Cast and 

In [66]:
df_2020 = df_2020.rename(columns={'Title' : 'movie_title', 'director_name' : 'name_director', 'genres' : 'genres'})

In [67]:
for i in df_2020.columns:
    df_2020[i] = df_2020[i].replace(np.nan, 'unknown')
    
df_2020['movie_title'] = df_2020['movie_title'].str.lower()
df_2020.head()

Unnamed: 0,movie_title,Cast and crew,genres,name_director,name_actor1,name_actor2,name_actor3
0,the grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,Horror Mystery,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho
1,underwater,"William Eubank (director); Brian Duffield, Ada...",Horror Science Fiction Action Adventure,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick
2,like a boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek
3,three christs,Jon Avnet (director/screenplay); Eric Nazarian...,Drama,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins
4,inherit the viper,Anthony Jerjen (director); Andrew Crabtree (sc...,Crime Thriller Drama,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs


In [68]:
df_2020 = df_2020.drop('Cast and crew', axis=1)
df_2020.head()

Unnamed: 0,movie_title,genres,name_director,name_actor1,name_actor2,name_actor3
0,the grudge,Horror Mystery,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho
1,underwater,Horror Science Fiction Action Adventure,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick
2,like a boss,Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek
3,three christs,Drama,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins
4,inherit the viper,Crime Thriller Drama,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs


In [69]:
df_2020['combination'] = df_2020['name_actor1'] + ' ' + df_2020['name_actor2'] + ' ' + df_2020['name_actor3'] + ' ' + df_2020['name_director'] + ' ' + df_2020['genres']
df_2020.head()

Unnamed: 0,movie_title,genres,name_director,name_actor1,name_actor2,name_actor3,combination
0,the grudge,Horror Mystery,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho,Andrea Riseborough Demián Bichir John Cho Nico...
1,underwater,Horror Science Fiction Action Adventure,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick,Kristen Stewart Vincent Cassel Jessica Henwick...
2,like a boss,Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek,Tiffany Haddish Rose Byrne Salma Hayek Miguel ...
3,three christs,Drama,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins,Richard Gere Peter Dinklage Walton Goggins Jon...
4,inherit the viper,Crime Thriller Drama,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs,Josh Hartnett Margarita Levieva Chandler Riggs...


## Combining All Preprocessed Data

In [70]:
data = df_2018.append(df_2019.append(df_2020, ignore_index=True), ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   movie_title    795 non-null    object
 1   genres         795 non-null    object
 2   name_director  795 non-null    object
 3   name_actor1    795 non-null    object
 4   name_actor2    795 non-null    object
 5   name_actor3    795 non-null    object
 6   combination    795 non-null    object
dtypes: object(7)
memory usage: 43.6+ KB


  data = df_2018.append(df_2019.append(df_2020, ignore_index=True), ignore_index=True)


In [71]:
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5364 entries, 0 to 529
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name_director  5364 non-null   object
 1   name_actor1    5364 non-null   object
 2   name_actor2    5364 non-null   object
 3   name_actor3    5364 non-null   object
 4   genres         5364 non-null   object
 5   movie_title    5364 non-null   object
 6   combination    5364 non-null   object
dtypes: object(7)
memory usage: 335.2+ KB


In [72]:
main = combine.append(data, ignore_index=True)
main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6159 entries, 0 to 6158
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name_director  6159 non-null   object
 1   name_actor1    6159 non-null   object
 2   name_actor2    6159 non-null   object
 3   name_actor3    6159 non-null   object
 4   genres         6159 non-null   object
 5   movie_title    6159 non-null   object
 6   combination    6159 non-null   object
dtypes: object(7)
memory usage: 336.9+ KB


  main = combine.append(data, ignore_index=True)


In [73]:
main.drop_duplicates(subset="movie_title", keep='last', inplace=True)
main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6128 entries, 0 to 6158
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name_director  6128 non-null   object
 1   name_actor1    6128 non-null   object
 2   name_actor2    6128 non-null   object
 3   name_actor3    6128 non-null   object
 4   genres         6128 non-null   object
 5   movie_title    6128 non-null   object
 6   combination    6128 non-null   object
dtypes: object(7)
memory usage: 383.0+ KB


In [74]:
main.to_csv('../dataset/main.csv', index=False)