### *Required Libraries*

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import ast
# ast module, which is the Abstract Syntax Trees module in Python. 
# The ast module provides functionality to parse Python source code into an abstract syntax tree, 
# which can then be analyzed or modified programmatically.

### *Loading Our Dataset*

In [2]:
credits=pd.read_csv("credits.csv")
credits

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


In [3]:
meta=pd.read_csv("movies_metadata.csv")
meta.head()

  meta=pd.read_csv("movies_metadata.csv")


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


*Formatting Release Date Column in datetime format*

In [4]:
meta['release_date']

0        1995-10-30
1        1995-12-15
2        1995-12-22
3        1995-12-22
4        1995-02-10
            ...    
45461           NaN
45462    2011-11-17
45463    2003-08-01
45464    1917-10-21
45465    2017-06-09
Name: release_date, Length: 45466, dtype: object

In [5]:
meta['release_date']=pd.to_datetime(meta['release_date'],errors='coerce')

# 'raise' (default): This is the default behavior. If any errors occur during the conversion, such as encountering an unparseable date, the function will raise an error (typically a ValueError), and the conversion will be aborted.
# 'coerce': If set to 'coerce', errors will be set as NaT (Not a Time) values. This means that if a particular element in the input cannot be converted to a datetime, Pandas will replace that element with a NaT.
# 'ignore': If set to 'ignore', the function will simply return the original input for any elements that cannot be converted, and no error will be raised.

In [6]:
meta['release_date']

0       1995-10-30
1       1995-12-15
2       1995-12-22
3       1995-12-22
4       1995-02-10
           ...    
45461          NaT
45462   2011-11-17
45463   2003-08-01
45464   1917-10-21
45465   2017-06-09
Name: release_date, Length: 45466, dtype: datetime64[ns]

In [7]:
meta['year']=meta['release_date'].dt.year

In [8]:
meta['year'].value_counts().sort_index()

year
1874.0       1
1878.0       1
1883.0       1
1887.0       1
1888.0       2
          ... 
2015.0    1905
2016.0    1604
2017.0     532
2018.0       5
2020.0       1
Name: count, Length: 135, dtype: int64

+ *Getting Only 2017 movies data as we already have movies data upto 2016 Data 1 Processed File*
+ *We don't have enough movies data from 2018,2019,2020, we will deal with it in the upcoming preprocessing file*

In [9]:
new_meta=meta.loc[meta['year']==2017,['genres','id','title','year']]

In [10]:
new_meta

Unnamed: 0,genres,id,title,year
26560,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",166426,Pirates of the Caribbean: Dead Men Tell No Tales,2017.0
26561,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",141052,Justice League,2017.0
26565,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",284053,Thor: Ragnarok,2017.0
26566,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",283995,Guardians of the Galaxy Vol. 2,2017.0
30536,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",245842,The King's Daughter,2017.0
...,...,...,...,...
45398,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",468707,Thick Lashes of Lauri Mäntyvaara,2017.0
45417,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",461297,Cop and a Half: New Recruit,2017.0
45437,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",455661,In a Heartbeat,2017.0
45453,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",404604,Mom,2017.0


In [11]:
new_meta.dtypes

genres     object
id         object
title      object
year      float64
dtype: object

*converting 'id' into integer type*

In [12]:
new_meta['id']=new_meta['id'].astype(int)

*Adding the 'new_meta' data with 'credits' data.*

In [13]:
data=pd.merge(new_meta,credits,on='id')

In [14]:
data

Unnamed: 0,genres,id,title,year,cast,crew
0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",166426,Pirates of the Caribbean: Dead Men Tell No Tales,2017.0,"[{'cast_id': 1, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4c9cc3a36847f8236a65', 'de..."
1,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",141052,Justice League,2017.0,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '55ef66dbc3a3686f1700a52d', 'de..."
2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",284053,Thor: Ragnarok,2017.0,"[{'cast_id': 0, 'character': 'Thor Odinson', '...","[{'credit_id': '56a93fa4c3a36872db001e7a', 'de..."
3,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",283995,Guardians of the Galaxy Vol. 2,2017.0,"[{'cast_id': 3, 'character': 'Peter Quill / St...","[{'credit_id': '59171547925141583c0315a6', 'de..."
4,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",245842,The King's Daughter,2017.0,"[{'cast_id': 0, 'character': 'King Louis XIV',...","[{'credit_id': '5431de49c3a36825d300007e', 'de..."
...,...,...,...,...,...,...
526,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",468707,Thick Lashes of Lauri Mäntyvaara,2017.0,"[{'cast_id': 0, 'character': 'Satu', 'credit_i...","[{'credit_id': '597e22f69251415d7801c74a', 'de..."
527,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",461297,Cop and a Half: New Recruit,2017.0,"[{'cast_id': 0, 'character': 'Detective Simmon...","[{'credit_id': '593ba0c29251410593009be3', 'de..."
528,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",455661,In a Heartbeat,2017.0,[],"[{'credit_id': '5981a15c92514151e0011b51', 'de..."
529,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",404604,Mom,2017.0,"[{'cast_id': 1, 'character': 'Devki Sabarwal',...","[{'credit_id': '58ee55bbc3a3683df500bd0f', 'de..."


*We will convert the "genre", "cast" and "crew" column into a list. if we observe it carefully, we will realised that it's a list containing dictionary.*

In [15]:
#the "literal_eval" helps to convert the string into a list
data['genres']=data['genres'].map(lambda x : ast.literal_eval(x)) 
data['cast']=data['cast'].map(lambda x : ast.literal_eval(x)) 
data['crew']=data['crew'].map(lambda x : ast.literal_eval(x)) 

*Now we will have to take out the individual genres like adventure, action, sci-fi, etc. using the following function*

In [16]:
def make_genre_list(x):
    genre=[]
    st=" "
    for i in x:
        if i.get('name')=='Science Fiction':
            genre.append('Sci-Fi')
        else:
            genre.append(i.get('name'))
    
    if genre==[]:
        return np.nan
    else:
        return (st.join(genre))

In [17]:
data['genres']=data['genres'].map(lambda x: make_genre_list(x))

In [18]:
data['genres']

0      Adventure Action Fantasy Comedy
1      Action Adventure Fantasy Sci-Fi
2      Action Adventure Fantasy Sci-Fi
3       Action Adventure Comedy Sci-Fi
4             Fantasy Action Adventure
                    ...               
526                     Romance Comedy
527         Crime Comedy Action Family
528    Family Animation Romance Comedy
529               Crime Drama Thriller
530                                NaN
Name: genres, Length: 531, dtype: object

+ *Actor 1*

In [19]:
def actor_1_name(x):
    cast=[]
    for i in x:
        cast.append(i.get('name'))
    
    if cast==[]:
        return np.nan
    else:
        return cast[0]

+ *Actor 2*

In [20]:
def actor_2_name(x):
    cast=[]
    for i in x:
        cast.append(i.get('name'))
    
    if cast==[] or len(cast)<=1:
        return np.nan
    else:
        return cast[1]

+ *Actor 3*

In [21]:
def actor_3_name(x):
    cast=[]
    for i in x:
        cast.append(i.get('name'))
    
    if cast==[] or len(cast)<=2:
        return np.nan
    else:
        return cast[2]

In [22]:
data['actor_1_name']=data['cast'].map(lambda x : actor_1_name(x))
data['actor_2_name']=data['cast'].map(lambda x : actor_2_name(x))
data['actor_3_name']=data['cast'].map(lambda x : actor_3_name(x))

In [23]:
data['actor_1_name']

0               Johnny Depp
1               Ben Affleck
2           Chris Hemsworth
3               Chris Pratt
4            Pierce Brosnan
               ...         
526          Inka Haapamäki
527    Lou Diamond Phillips
528                     NaN
529          Sridevi Kapoor
530                     NaN
Name: actor_1_name, Length: 531, dtype: object

In [24]:
data['actor_2_name']

0       Javier Bardem
1        Henry Cavill
2      Tom Hiddleston
3         Zoe Saldana
4        William Hurt
            ...      
526     Rosa Honkonen
527     Wallace Shawn
528               NaN
529         Sajal Ali
530               NaN
Name: actor_2_name, Length: 531, dtype: object

In [25]:
data['actor_3_name']

0        Geoffrey Rush
1            Gal Gadot
2       Cate Blanchett
3        Dave Bautista
4      Benjamin Walker
            ...       
526     Tiitus Rantala
527        Gina Holden
528                NaN
529     Akshaye Khanna
530                NaN
Name: actor_3_name, Length: 531, dtype: object

+ *Directors*

In [26]:
def make_director_list(x):
    director=[]
    st=" "
    for i in x:
        if i.get('job')=='Director':
            director.append(i.get('name'))
    
    if director==[]:
        return np.nan
    else:
        return (st.join(director))

In [27]:
data['director_name']=data['crew'].map(lambda x : make_director_list(x))

In [28]:
data['director_name']

0      Joachim Rønning Espen Sandberg
1                         Zack Snyder
2                       Taika Waititi
3                          James Gunn
4                       Sean McNamara
                    ...              
526                  Hannaleena Hauru
527             Jonathan A. Rosenbaum
528          Beth David Esteban Bravo
529                      Ravi Udyawar
530                     Daisy Asquith
Name: director_name, Length: 531, dtype: object

*Selecting Only Prepared Data*

In [29]:
movie=data.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','title']]

In [30]:
movie

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,title
0,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Adventure Action Fantasy Comedy,Pirates of the Caribbean: Dead Men Tell No Tales
1,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Action Adventure Fantasy Sci-Fi,Justice League
2,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Action Adventure Fantasy Sci-Fi,Thor: Ragnarok
3,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista,Action Adventure Comedy Sci-Fi,Guardians of the Galaxy Vol. 2
4,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker,Fantasy Action Adventure,The King's Daughter
...,...,...,...,...,...,...
526,Hannaleena Hauru,Inka Haapamäki,Rosa Honkonen,Tiitus Rantala,Romance Comedy,Thick Lashes of Lauri Mäntyvaara
527,Jonathan A. Rosenbaum,Lou Diamond Phillips,Wallace Shawn,Gina Holden,Crime Comedy Action Family,Cop and a Half: New Recruit
528,Beth David Esteban Bravo,,,,Family Animation Romance Comedy,In a Heartbeat
529,Ravi Udyawar,Sridevi Kapoor,Sajal Ali,Akshaye Khanna,Crime Drama Thriller,Mom


*Removing The Null Values*

In [31]:
movie.isna().sum()

director_name     4
actor_1_name     22
actor_2_name     55
actor_3_name     70
genres            7
title             0
dtype: int64

In [32]:
movie.dropna(how='any')

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,title
0,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Adventure Action Fantasy Comedy,Pirates of the Caribbean: Dead Men Tell No Tales
1,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Action Adventure Fantasy Sci-Fi,Justice League
2,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Action Adventure Fantasy Sci-Fi,Thor: Ragnarok
3,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista,Action Adventure Comedy Sci-Fi,Guardians of the Galaxy Vol. 2
4,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker,Fantasy Action Adventure,The King's Daughter
...,...,...,...,...,...,...
524,Jim Strouse,Jessica Williams,Chris O'Dowd,Keith Stanfield,Romance Comedy,The Incredible Jessica James
525,Farhad Mann,Adelaide Kane,Benjamin Hollingsworth,Jean Louisa Kelly,Romance,Can't Buy My Love
526,Hannaleena Hauru,Inka Haapamäki,Rosa Honkonen,Tiitus Rantala,Romance Comedy,Thick Lashes of Lauri Mäntyvaara
527,Jonathan A. Rosenbaum,Lou Diamond Phillips,Wallace Shawn,Gina Holden,Crime Comedy Action Family,Cop and a Half: New Recruit


In [33]:
movie.isna().sum()

director_name     4
actor_1_name     22
actor_2_name     55
actor_3_name     70
genres            7
title             0
dtype: int64

*Renaming Our Column*

In [34]:
movie=movie.rename(columns={'title':'movie_title'})

In [35]:
movie.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Adventure Action Fantasy Comedy,Pirates of the Caribbean: Dead Men Tell No Tales
1,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Action Adventure Fantasy Sci-Fi,Justice League
2,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Action Adventure Fantasy Sci-Fi,Thor: Ragnarok
3,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista,Action Adventure Comedy Sci-Fi,Guardians of the Galaxy Vol. 2
4,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker,Fantasy Action Adventure,The King's Daughter


*Converting Data Values in Lower Case*

In [36]:
movie=movie.applymap(lambda x : x.lower() if isinstance(x,str) else x)

In [37]:
movie.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,joachim rønning espen sandberg,johnny depp,javier bardem,geoffrey rush,adventure action fantasy comedy,pirates of the caribbean: dead men tell no tales
1,zack snyder,ben affleck,henry cavill,gal gadot,action adventure fantasy sci-fi,justice league
2,taika waititi,chris hemsworth,tom hiddleston,cate blanchett,action adventure fantasy sci-fi,thor: ragnarok
3,james gunn,chris pratt,zoe saldana,dave bautista,action adventure comedy sci-fi,guardians of the galaxy vol. 2
4,sean mcnamara,pierce brosnan,william hurt,benjamin walker,fantasy action adventure,the king's daughter


*We will use this imformation later on*

In [38]:
movie['comb']=movie['actor_1_name']+ ' ' +movie['actor_2_name']+ ' ' +movie['actor_3_name']+ ' ' +movie['director_name']+ ' ' +movie['genres']

In [39]:
movie

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,joachim rønning espen sandberg,johnny depp,javier bardem,geoffrey rush,adventure action fantasy comedy,pirates of the caribbean: dead men tell no tales,johnny depp javier bardem geoffrey rush joachi...
1,zack snyder,ben affleck,henry cavill,gal gadot,action adventure fantasy sci-fi,justice league,ben affleck henry cavill gal gadot zack snyder...
2,taika waititi,chris hemsworth,tom hiddleston,cate blanchett,action adventure fantasy sci-fi,thor: ragnarok,chris hemsworth tom hiddleston cate blanchett ...
3,james gunn,chris pratt,zoe saldana,dave bautista,action adventure comedy sci-fi,guardians of the galaxy vol. 2,chris pratt zoe saldana dave bautista james gu...
4,sean mcnamara,pierce brosnan,william hurt,benjamin walker,fantasy action adventure,the king's daughter,pierce brosnan william hurt benjamin walker se...
...,...,...,...,...,...,...,...
526,hannaleena hauru,inka haapamäki,rosa honkonen,tiitus rantala,romance comedy,thick lashes of lauri mäntyvaara,inka haapamäki rosa honkonen tiitus rantala ha...
527,jonathan a. rosenbaum,lou diamond phillips,wallace shawn,gina holden,crime comedy action family,cop and a half: new recruit,lou diamond phillips wallace shawn gina holden...
528,beth david esteban bravo,,,,family animation romance comedy,in a heartbeat,
529,ravi udyawar,sridevi kapoor,sajal ali,akshaye khanna,crime drama thriller,mom,sridevi kapoor sajal ali akshaye khanna ravi u...


In [40]:
old_data=pd.read_csv('data_1.csv')
old_data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,color,james cameron,723.0,178.0,0.0,855.0,joel david moore,1000.0,760505847.0,action adventure fantasy sci-fi,...,3054.0,english,usa,pg-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,color,gore verbinski,302.0,169.0,563.0,1000.0,orlando bloom,40000.0,309404152.0,action adventure fantasy,...,1238.0,english,usa,pg-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,color,sam mendes,602.0,148.0,0.0,161.0,rory kinnear,11000.0,200074175.0,action adventure thriller,...,994.0,english,uk,pg-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,color,christopher nolan,813.0,164.0,22000.0,23000.0,christian bale,27000.0,448130642.0,action thriller,...,2701.0,english,usa,pg-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,unknown,doug walker,unknown,unknown,131.0,unknown,rob walker,131.0,unknown,documentary,...,unknown,unknown,unknown,unknown,unknown,unknown,12.0,7.1,unknown,0


In [41]:
old_data.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [42]:
old_data=old_data.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]
old_data.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,james cameron,cch pounder,joel david moore,wes studi,action adventure fantasy sci-fi,avatar
1,gore verbinski,johnny depp,orlando bloom,jack davenport,action adventure fantasy,pirates of the caribbean: at world's end
2,sam mendes,christoph waltz,rory kinnear,stephanie sigman,action adventure thriller,spectre
3,christopher nolan,tom hardy,christian bale,joseph gordon-levitt,action thriller,the dark knight rises
4,doug walker,doug walker,rob walker,unknown,documentary,star wars: episode vii - the force awakens ...


In [43]:
old_data['comb']=old_data['comb']=old_data['actor_1_name']+ ' ' +old_data['actor_2_name']+ ' ' +old_data['actor_3_name']+ ' ' +old_data['director_name']+ ' ' +old_data['genres']

In [44]:
old_data.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,james cameron,cch pounder,joel david moore,wes studi,action adventure fantasy sci-fi,avatar,cch pounder joel david moore wes studi james c...
1,gore verbinski,johnny depp,orlando bloom,jack davenport,action adventure fantasy,pirates of the caribbean: at world's end,johnny depp orlando bloom jack davenport gore ...
2,sam mendes,christoph waltz,rory kinnear,stephanie sigman,action adventure thriller,spectre,christoph waltz rory kinnear stephanie sigman ...
3,christopher nolan,tom hardy,christian bale,joseph gordon-levitt,action thriller,the dark knight rises,tom hardy christian bale joseph gordon-levitt ...
4,doug walker,doug walker,rob walker,unknown,documentary,star wars: episode vii - the force awakens ...,doug walker rob walker unknown doug walker doc...


*Putting All Dataset Together*

In [45]:
new_data=pd.concat([old_data,movie])
new_data

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,james cameron,cch pounder,joel david moore,wes studi,action adventure fantasy sci-fi,avatar,cch pounder joel david moore wes studi james c...
1,gore verbinski,johnny depp,orlando bloom,jack davenport,action adventure fantasy,pirates of the caribbean: at world's end,johnny depp orlando bloom jack davenport gore ...
2,sam mendes,christoph waltz,rory kinnear,stephanie sigman,action adventure thriller,spectre,christoph waltz rory kinnear stephanie sigman ...
3,christopher nolan,tom hardy,christian bale,joseph gordon-levitt,action thriller,the dark knight rises,tom hardy christian bale joseph gordon-levitt ...
4,doug walker,doug walker,rob walker,unknown,documentary,star wars: episode vii - the force awakens ...,doug walker rob walker unknown doug walker doc...
...,...,...,...,...,...,...,...
526,hannaleena hauru,inka haapamäki,rosa honkonen,tiitus rantala,romance comedy,thick lashes of lauri mäntyvaara,inka haapamäki rosa honkonen tiitus rantala ha...
527,jonathan a. rosenbaum,lou diamond phillips,wallace shawn,gina holden,crime comedy action family,cop and a half: new recruit,lou diamond phillips wallace shawn gina holden...
528,beth david esteban bravo,,,,family animation romance comedy,in a heartbeat,
529,ravi udyawar,sridevi kapoor,sajal ali,akshaye khanna,crime drama thriller,mom,sridevi kapoor sajal ali akshaye khanna ravi u...


*Removing Duplicate Data*

In [46]:
new_data.drop_duplicates(subset='movie_title',keep='last',inplace=True)

In [47]:
new_data

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,james cameron,cch pounder,joel david moore,wes studi,action adventure fantasy sci-fi,avatar,cch pounder joel david moore wes studi james c...
1,gore verbinski,johnny depp,orlando bloom,jack davenport,action adventure fantasy,pirates of the caribbean: at world's end,johnny depp orlando bloom jack davenport gore ...
2,sam mendes,christoph waltz,rory kinnear,stephanie sigman,action adventure thriller,spectre,christoph waltz rory kinnear stephanie sigman ...
3,christopher nolan,tom hardy,christian bale,joseph gordon-levitt,action thriller,the dark knight rises,tom hardy christian bale joseph gordon-levitt ...
4,doug walker,doug walker,rob walker,unknown,documentary,star wars: episode vii - the force awakens ...,doug walker rob walker unknown doug walker doc...
...,...,...,...,...,...,...,...
526,hannaleena hauru,inka haapamäki,rosa honkonen,tiitus rantala,romance comedy,thick lashes of lauri mäntyvaara,inka haapamäki rosa honkonen tiitus rantala ha...
527,jonathan a. rosenbaum,lou diamond phillips,wallace shawn,gina holden,crime comedy action family,cop and a half: new recruit,lou diamond phillips wallace shawn gina holden...
528,beth david esteban bravo,,,,family animation romance comedy,in a heartbeat,
529,ravi udyawar,sridevi kapoor,sajal ali,akshaye khanna,crime drama thriller,mom,sridevi kapoor sajal ali akshaye khanna ravi u...


*Saving Our Dataset*

In [48]:
new_data.to_csv("new_data.csv",index=False)