In [1]:
import pandas as pd
import ast
import json
import numpy as np

import re

# Read dataframes

In [2]:
df_upcoming = pd.read_csv('output/df_upcoming.csv', index_col=0)
df_upcoming.tail(5)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
714,False,,[16],1303677,pt,Mytikah Explora – Maria Sibylla Merian,Manga and Leco learn about the story Maria Sib...,1.4,/6CfCoS3O1pZUKXip6QdRwbXlbJt.jpg,2024-06-19,Mytikah Explora – Maria Sibylla Merian,False,0.0,0
715,False,,[99],1303655,en,Breakin' on the One,"In 1981, a breaking battle between the Rockste...",1.4,,2024-06-24,Breakin' on the One,False,0.0,0
716,False,,[],1303649,uk,Clubbing,The film is about the constraints faced by two...,1.4,/wFyirnWB4NaPctJqhY3PFVVH4RV.jpg,2024-06-26,Clubbing,False,0.0,0
717,False,,[],1303578,de,Die Drei ??? -Das Dorf der Teufel,Radio play in 3D sound = cinema for your ears!...,1.4,/bhHs1BDN6zrAkODWxokyl2hbqXI.jpg,2024-06-27,Die Drei ??? -Das Dorf der Teufel,False,0.0,0
718,False,,[],1303578,de,Die Drei ??? -Das Dorf der Teufel,Radio play in 3D sound = cinema for your ears!...,1.4,/bhHs1BDN6zrAkODWxokyl2hbqXI.jpg,2024-06-27,Die Drei ??? -Das Dorf der Teufel,False,0.0,0


In [3]:
# Read the CSV file with the converter function
df_credits = pd.read_csv('output/df_credits.csv')

df_credits.head(2)

Unnamed: 0,id,budget,production_companies,production_countries,revenue,spoken_languages,genres,runtime,tagline,crew,cast-name
0,573435,80000000,"['Westbrook Studios', 'Columbia Pictures', 'Do...",['United States of America'],104600000,['English'],"['Action', 'Crime', 'Thriller', 'Comedy']",115,Miami's finest are now its most wanted.,"{'Producer': 'Chad Oman', 'Characters': 'Georg...","['Will Smith', 'Martin Lawrence', 'Vanessa Hud..."
1,1022789,200000000,"['Pixar', 'Walt Disney Pictures']",['United States of America'],0,['English'],"['Animation', 'Family', 'Drama', 'Adventure', ...",97,Make room for new emotions.,"{'Director': 'Kelsey Mann', 'Producer': 'Mark ...","['Amy Poehler', 'Maya Hawke', 'Kensington Tall..."


# Create extra columns

## Evaluate dtypes

In [4]:
# Function that safely evaluates a string representation of a literal Python expression and returns the corresponding Python object

list_items = ['spoken_languages','genres', 'cast-name', 'production_companies', 'crew', 'production_countries']

for item in list_items:

    df_credits[item] = df_credits[item].apply(ast.literal_eval)


## Define functions

### From dictionaries

In [5]:
# Define the function
def add_col_dict(df, new_col_name, searched_item, searched_col_name):
    add_col = [key.get(searched_item, np.nan) for key in df[searched_col_name]]
    df[new_col_name] = add_col

### From list - 2 items

In [6]:
# Define a function to extract the first and second items from a list
def extract_2_items(actor_list):
    return actor_list[:2] if actor_list else [None, None]


def add_2_items_list(df, new_col_name1, new_col_name2, source_col_name):
    # Apply the function using a list comprehension to create two new columns
    df[[new_col_name1, new_col_name2]] = pd.DataFrame(df[source_col_name].apply(extract_2_items).tolist(), index=df.index)

### From list - 1 item

In [7]:
# Define a function to extract the first item from a list
def extract_1_item(actor_list):
    return actor_list[:1] if actor_list else [None]


def add_1_item_list(df, new_col_name1, source_col_name):
    # Apply the function using a list comprehension to create two new columns
    df[[new_col_name1]] = pd.DataFrame(df[source_col_name].apply(extract_1_item).tolist(), index=df.index)

### Call functions

In [8]:
# Call the function 

add_col_dict(df=df_credits, 
        new_col_name='director', 
        searched_item='Director', 
        searched_col_name='crew'
        )


add_col_dict(df=df_credits, 
        new_col_name='producer', 
        searched_item='Producer', 
        searched_col_name='crew'
        )

add_col_dict(df=df_credits, 
        new_col_name='sound', 
        searched_item='Original Music Composer', 
        searched_col_name='crew'
        )

add_col_dict(df=df_credits, 
        new_col_name='camera', 
        searched_item='Director of Photography', 
        searched_col_name='crew'
        )


add_col_dict(df=df_credits, 
        new_col_name='screenplay', 
        searched_item='Screenplay', 
        searched_col_name='crew'
        )


df_credits.head(2)

Unnamed: 0,id,budget,production_companies,production_countries,revenue,spoken_languages,genres,runtime,tagline,crew,cast-name,director,producer,sound,camera,screenplay
0,573435,80000000,"[Westbrook Studios, Columbia Pictures, Don Sim...",[United States of America],104600000,[English],"[Action, Crime, Thriller, Comedy]",115,Miami's finest are now its most wanted.,"{'Producer': 'Chad Oman', 'Characters': 'Georg...","[Will Smith, Martin Lawrence, Vanessa Hudgens,...",Bilall Fallah,Chad Oman,Lorne Balfe,Robrecht Heyvaert,
1,1022789,200000000,"[Pixar, Walt Disney Pictures]",[United States of America],0,[English],"[Animation, Family, Drama, Adventure, Comedy]",97,Make room for new emotions.,"{'Director': 'Kelsey Mann', 'Producer': 'Mark ...","[Amy Poehler, Maya Hawke, Kensington Tallman, ...",Kelsey Mann,Mark Nielsen,Andrea Datzman,Jonathan Pytko,Dave Holstein


In [9]:
add_2_items_list(df=df_credits, 
            new_col_name1='actor1', 
            new_col_name2='actor2', 
            source_col_name='cast-name'
            )

In [10]:
df_credits.head(2)

Unnamed: 0,id,budget,production_companies,production_countries,revenue,spoken_languages,genres,runtime,tagline,crew,cast-name,director,producer,sound,camera,screenplay,actor1,actor2
0,573435,80000000,"[Westbrook Studios, Columbia Pictures, Don Sim...",[United States of America],104600000,[English],"[Action, Crime, Thriller, Comedy]",115,Miami's finest are now its most wanted.,"{'Producer': 'Chad Oman', 'Characters': 'Georg...","[Will Smith, Martin Lawrence, Vanessa Hudgens,...",Bilall Fallah,Chad Oman,Lorne Balfe,Robrecht Heyvaert,,Will Smith,Martin Lawrence
1,1022789,200000000,"[Pixar, Walt Disney Pictures]",[United States of America],0,[English],"[Animation, Family, Drama, Adventure, Comedy]",97,Make room for new emotions.,"{'Director': 'Kelsey Mann', 'Producer': 'Mark ...","[Amy Poehler, Maya Hawke, Kensington Tallman, ...",Kelsey Mann,Mark Nielsen,Andrea Datzman,Jonathan Pytko,Dave Holstein,Amy Poehler,Maya Hawke


In [11]:
add_1_item_list(df=df_credits, 
            new_col_name1='production_company', 
            source_col_name='production_companies'
            )


add_1_item_list(df=df_credits, 
            new_col_name1='production_country', 
            source_col_name='production_countries'
            )


add_1_item_list(df=df_credits, 
            new_col_name1='genre', 
            source_col_name='genres'
            )


""" add_1_item_list(df=df_credits, 
            new_col_name1='spoken_language', 
            source_col_name='spoken_languages'
            ) """

" add_1_item_list(df=df_credits, \n            new_col_name1='spoken_language', \n            source_col_name='spoken_languages'\n            ) "

In [12]:
df_credits.head(5)

Unnamed: 0,id,budget,production_companies,production_countries,revenue,spoken_languages,genres,runtime,tagline,crew,...,director,producer,sound,camera,screenplay,actor1,actor2,production_company,production_country,genre
0,573435,80000000,"[Westbrook Studios, Columbia Pictures, Don Sim...",[United States of America],104600000,[English],"[Action, Crime, Thriller, Comedy]",115,Miami's finest are now its most wanted.,"{'Producer': 'Chad Oman', 'Characters': 'Georg...",...,Bilall Fallah,Chad Oman,Lorne Balfe,Robrecht Heyvaert,,Will Smith,Martin Lawrence,Westbrook Studios,United States of America,Action
1,1022789,200000000,"[Pixar, Walt Disney Pictures]",[United States of America],0,[English],"[Animation, Family, Drama, Adventure, Comedy]",97,Make room for new emotions.,"{'Director': 'Kelsey Mann', 'Producer': 'Mark ...",...,Kelsey Mann,Mark Nielsen,Andrea Datzman,Jonathan Pytko,Dave Holstein,Amy Poehler,Maya Hawke,Pixar,United States of America,Animation
2,974635,8800000,"[Aggregate Films, BarnStorm Productions, Detou...",[United States of America],1139025,[English],"[Romance, Comedy, Crime]",116,"He's not a killer, but he can pretend.","{'Director': 'Richard Linklater', 'Producer': ...",...,Richard Linklater,Mike Blizzard,Graham Reynolds,Shane F. Kelly,Glen Powell,Glen Powell,Adria Arjona,Aggregate Films,United States of America,Romance
3,882059,18000000,"[Vertigo Entertainment, Hammerstone Studios, N...",[United States of America],3139717,[English],"[Action, Thriller, Crime, Science Fiction]",110,,"{'Director': 'Moritz Mohr', 'Producer': 'Simon...",...,Moritz Mohr,Simon Swart,Ludvig Forssell,Peter Matjasko,,Bill Skarsgård,Jessica Rothe,Vertigo Entertainment,United States of America,Action
4,748783,60000000,"[Alcon Entertainment, Prime Focus, DNEG Animat...","[Hong Kong, India, United Kingdom, United Stat...",192713000,[English],"[Animation, Comedy, Family, Adventure]",101,Indoor cat. Outdoor adventure.,"{'Storyboard Artist': 'Bob Scott', 'Production...",...,Mark Dindal,Craig Sost,John Debney,,Paul A. Kaplan,Chris Pratt,Samuel L. Jackson,Alcon Entertainment,Hong Kong,Animation


### Drop cols used for parsing

In [13]:
df_credits = df_credits.drop(columns=list_items)
df_credits.head(5)

Unnamed: 0,id,budget,revenue,runtime,tagline,director,producer,sound,camera,screenplay,actor1,actor2,production_company,production_country,genre
0,573435,80000000,104600000,115,Miami's finest are now its most wanted.,Bilall Fallah,Chad Oman,Lorne Balfe,Robrecht Heyvaert,,Will Smith,Martin Lawrence,Westbrook Studios,United States of America,Action
1,1022789,200000000,0,97,Make room for new emotions.,Kelsey Mann,Mark Nielsen,Andrea Datzman,Jonathan Pytko,Dave Holstein,Amy Poehler,Maya Hawke,Pixar,United States of America,Animation
2,974635,8800000,1139025,116,"He's not a killer, but he can pretend.",Richard Linklater,Mike Blizzard,Graham Reynolds,Shane F. Kelly,Glen Powell,Glen Powell,Adria Arjona,Aggregate Films,United States of America,Romance
3,882059,18000000,3139717,110,,Moritz Mohr,Simon Swart,Ludvig Forssell,Peter Matjasko,,Bill Skarsgård,Jessica Rothe,Vertigo Entertainment,United States of America,Action
4,748783,60000000,192713000,101,Indoor cat. Outdoor adventure.,Mark Dindal,Craig Sost,John Debney,,Paul A. Kaplan,Chris Pratt,Samuel L. Jackson,Alcon Entertainment,Hong Kong,Animation


# Merge

In [14]:
df_merged = pd.merge(df_upcoming,df_credits, on='id')
df_merged.head(2)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,...,director,producer,sound,camera,screenplay,actor1,actor2,production_company,production_country,genre
0,False,/ga4OLm4qLxPqKLMzjJlqHxVjst3.jpg,"[28, 80, 53, 35]",573435,en,Bad Boys: Ride or Die,"After their late former Captain is framed, Low...",2585.897,/nP6RliHjxsz4irTKsxe8FRhKZYl.jpg,2024-06-05,...,Bilall Fallah,Chad Oman,Lorne Balfe,Robrecht Heyvaert,,Will Smith,Martin Lawrence,Westbrook Studios,United States of America,Action
1,False,/qjoX7hl721FOiyeHsDkeQ6rFVLl.jpg,"[16, 10751, 18, 12, 35]",1022789,en,Inside Out 2,Teenager Riley's mind headquarters is undergoi...,1813.01,/vpnVM9B6NMmQpWeZvzLvDESb2QY.jpg,2024-06-11,...,Kelsey Mann,Mark Nielsen,Andrea Datzman,Jonathan Pytko,Dave Holstein,Amy Poehler,Maya Hawke,Pixar,United States of America,Animation


# Drop Columns with many NaN

In [15]:
df_merged.isna().sum()

adult                   0
backdrop_path         362
genre_ids               0
id                      0
original_language       0
original_title          0
overview               66
popularity              0
poster_path           129
release_date            2
title                   0
video                   0
vote_average            0
vote_count              0
budget                  0
revenue                 0
runtime                 0
tagline               538
director               41
producer              289
sound                 614
camera                416
screenplay            599
actor1                136
actor2                180
production_company    246
production_country    216
genre                 126
dtype: int64

In [16]:
df_merged = df_merged.drop(columns=['backdrop_path', 'poster_path', 'tagline', 'overview', 'sound', 'screenplay', 'camera', 'producer'])
df_merged.head(5)

Unnamed: 0,adult,genre_ids,id,original_language,original_title,popularity,release_date,title,video,vote_average,vote_count,budget,revenue,runtime,director,actor1,actor2,production_company,production_country,genre
0,False,"[28, 80, 53, 35]",573435,en,Bad Boys: Ride or Die,2585.897,2024-06-05,Bad Boys: Ride or Die,False,7.2,172,80000000,104600000,115,Bilall Fallah,Will Smith,Martin Lawrence,Westbrook Studios,United States of America,Action
1,False,"[16, 10751, 18, 12, 35]",1022789,en,Inside Out 2,1813.01,2024-06-11,Inside Out 2,False,7.581,31,200000000,0,97,Kelsey Mann,Amy Poehler,Maya Hawke,Pixar,United States of America,Animation
2,False,"[10749, 35, 80]",974635,en,Hit Man,888.899,2024-05-16,Hit Man,False,6.9,199,8800000,1139025,116,Richard Linklater,Glen Powell,Adria Arjona,Aggregate Films,United States of America,Romance
3,False,"[28, 53, 80, 878]",882059,en,Boy Kills World,433.026,2024-04-24,Boy Kills World,False,6.906,272,18000000,3139717,110,Moritz Mohr,Bill Skarsgård,Jessica Rothe,Vertigo Entertainment,United States of America,Action
4,False,"[16, 35, 10751, 12]",748783,en,The Garfield Movie,502.943,2024-04-30,The Garfield Movie,False,6.42,157,60000000,192713000,101,Mark Dindal,Chris Pratt,Samuel L. Jackson,Alcon Entertainment,Hong Kong,Animation


In [17]:
df_merged.isna().sum()

adult                   0
genre_ids               0
id                      0
original_language       0
original_title          0
popularity              0
release_date            2
title                   0
video                   0
vote_average            0
vote_count              0
budget                  0
revenue                 0
runtime                 0
director               41
actor1                136
actor2                180
production_company    246
production_country    216
genre                 126
dtype: int64

# Drop useless Columns 

In [18]:
df_merged = df_merged.drop(columns=['genre_ids', 'video', 'adult','original_title'])

# Drop Rows with NaN

In [19]:
# Drop all rows which contain any NaN values
df_merged= df_merged.dropna()

In [20]:
df_merged.isna().sum()

id                    0
original_language     0
popularity            0
release_date          0
title                 0
vote_average          0
vote_count            0
budget                0
revenue               0
runtime               0
director              0
actor1                0
actor2                0
production_company    0
production_country    0
genre                 0
dtype: int64

In [21]:
df_merged.shape

(363, 16)

# Final adjustment

In [22]:
cut = df_merged[df_merged['runtime']==0]
cut

Unnamed: 0,id,original_language,popularity,release_date,title,vote_average,vote_count,budget,revenue,runtime,director,actor1,actor2,production_company,production_country,genre
8,1093995,en,303.245,2024-05-02,Chief of Station,5.272,46,0,0,0,Jesse V. Johnson,Aaron Eckhart,Alex Pettyfer,Bee Holder Productions,United States of America,Action
55,1215918,en,24.725,2024-06-21,Blackwater Lane,0.000,0,0,0,0,Jeff Celentano,Dermot Mulroney,Minka Kelly,Clear Pictures Entertainment,United States of America,Thriller
59,1291436,tl,32.556,2024-06-28,Huwad,0.000,0,0,0,0,Reynold Giba,Azi Acosta,Aerol Carmelo,Vivamax,Philippines,Drama
60,1291436,tl,32.556,2024-06-28,Huwad,0.000,0,0,0,0,Reynold Giba,Azi Acosta,Aerol Carmelo,Vivamax,Philippines,Drama
61,1291436,tl,32.556,2024-06-28,Huwad,0.000,0,0,0,0,Reynold Giba,Azi Acosta,Aerol Carmelo,Vivamax,Philippines,Drama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,1246372,es,2.118,2023-09-29,In Another Life,10.000,1,600,0,0,Kaory Rios,Andrea Rivera,Daniel Villalpando,UPAEP,Mexico,Drama
612,1299288,tr,2.047,2024-06-28,La Hayde Maske,0.000,0,0,0,0,Sefa Özçelik,Ayhan Taş,Burak Satıbol,SineLine Film,Turkey,Comedy
640,1195360,zh,1.859,2024-07-05,Welcome to My Side,0.000,0,0,0,0,Song Haolin,Yu Shi,Wang Yinglu,海南可能制造影业有限公司,China,Comedy
673,1302332,it,1.058,2024-06-19,Camilla,0.000,0,3000,0,0,Tommaso Barba,Beatrice Gatta,Antonio Greco,Accademia Griffith,Italy,Drama


In [23]:
df_merged = df_merged[df_merged['runtime']>=60]
df_merged.shape

(257, 16)

# Remove duplicates

In [24]:
df_merged = df_merged.drop_duplicates()
df_merged.shape

(257, 16)

# Save df

In [25]:
df_merged.to_csv('output/cleaned-upcoming-test.csv')