### Imports

In [1]:
import pandas as pd
import re

### Reading csv files

    Original Pixar DataSet

In [2]:
original_df = pd.read_csv('../data/pixar__studio12.csv')
original_df.sample(3)

Unnamed: 0,Moviename,Releaseyear,Budget(million),US and canada(million),Other territories(million),Worldwide(million),Rotten Tomatoes
17,Cars 3,2017,1750,1529,2310,3839,69%
19,Incredibles 2,2018,2000,6086,6342,12428,93%
11,Cars 2,2011,2000,1915,3684,5599,40%


Renaming columns in order to have it standarized in all the dataframes.

In [3]:
original_df.rename(columns={'Moviename': 'film'}, inplace=True)
original_df.rename(columns={'Releaseyear': 'year'}, inplace=True)
original_df.rename(columns={'Rotten Tomatoes': 'rotten_tomatoes_score'}, inplace=True)
original_df = original_df.drop(columns=['Budget(million)', 'US and canada(million)', 'Other territories(million)', 'Worldwide(million)'])
original_df.head(3)

Unnamed: 0,film,year,rotten_tomatoes_score
0,Toy Story,1995,100%
1,A Bug's Life,1998,92%
2,Toy Story 2,1999,100%


As the year column is going to be repeated, it has to be removed from this dataframe because as it is not up to date, 4 movies are missing.

In [4]:
original_df = original_df.drop(columns=['year'])

    Characters and assets DataSet

In [5]:
char_assets_df = pd.read_csv('../data/characters_assets.csv')
char_assets_df.head(3)

Unnamed: 0,film,characters,assets
0,Elemental,"Ember Lumen, Wade Ripple, Bernie Lumen, Cinder...","Building The World, Firetown, The Water Distri..."
1,Lightyear,"The Human Side of Buzz, Buzz Lightyear, Izzy H...","“Warm CGI”, T'kani Prime, Turnip, Star Command..."
2,Turning Red,"The Inner 13-Year Old, Meilin Lee, Red Panda M...","“Chunky Cute”, Toronto, Chinatown & the Lee Fa..."


    Academy Awards DataSet

In [6]:
academy_awards_df = pd.read_csv('../data/pixar_academy_awards.csv')
academy_awards_df.tail(7)

Unnamed: 0,film,best_picture,animated_feature,original_screenplay,adapted_screenplay,original_score,original_song,sound_editing,sound_mixing,other,year,budget,u.s._and_canada,other_territories,worldwide
20,Toy Story 4,,Won,Ineligible,,,Nominated,,,,2019,200.0,434.0,639,1073
21,Onward,,Nominated,,Ineligible,,,,,,2020,175.0,62.0,80,142
22,Soul,,Won,,Ineligible,Won,,Nominated,Nominated,,2020,150.0,,121,121
23,Luca,,Nominated,,Ineligible,,,,,,2021,,,50,50
24,Turning Red,,Nominated,,Ineligible,,,,,,2022,175.0,,20,20
25,Lightyear,,,Ineligible,,,,,,,2022,200.0,118.0,108,226
26,Elemental,,,,,,,,,,2023,200.0,154.0,338,492


Cleaning Awards DataSet

In [7]:
def clean_budget(str_):
    rgx = r'\d+'
    res = re.search(rgx, str_)
    if res:
        return int(res.group())
    return ''

def clean_money_cell(str_):
    regex_other = r'\$\d{1,3}(?:,\d{3})*(?:\.\d+)?'
    res = re.search(regex_other, str_)
    if res:
        res = res.group()
        res = res.replace(',', '').replace('$','')
        return round(int(res) / 1000000)
    return ''

academy_awards_df['budget'] = academy_awards_df['budget'].apply(clean_budget)
academy_awards_df['u.s._and_canada'] = academy_awards_df['u.s._and_canada'].apply(clean_money_cell)
academy_awards_df['other_territories'] = academy_awards_df['other_territories'].apply(clean_money_cell)
academy_awards_df['worldwide'] = academy_awards_df['worldwide'].apply(clean_money_cell)

academy_awards_df.tail(7)

TypeError: expected string or bytes-like object, got 'float'

### Merging dataframes

In [None]:
final_df = pd.merge(original_df, char_assets_df, on="film", how="outer")
final_df = pd.merge(final_df, academy_awards_df, on="film", how="outer")
final_df.sample(3)

Unnamed: 0,film,rotten_tomatoes_score,characters,assets,best_picture,animated_feature,original_screenplay,adapted_screenplay,original_score,original_song,sound_editing,sound_mixing,other,year,budget,u.s._and_canada,other_territories,worldwide
7,Ratatouille,96%,"The Artist Rat, Remy, Auguste Gusteau, Linguin...","A World of Extremes, Old School, Gusteau’s, Li...",,Won,Nominated,Ineligible,Nominated,,Nominated,Nominated,,2007,150,206,417,624
11,Cars 2,40%,"What Would Mater Do?, Mater, Lightning McQueen...","The International World of Cars 2, (Not) Lost ...",,,Ineligible,,,,,,,2011,200,191,368,560
22,Elemental,,"Ember Lumen, Wade Ripple, Bernie Lumen, Cinder...","Building The World, Firetown, The Water Distri...",,,,,,,,,,2023,200,154,338,492


In [None]:
def count_items(str_):
    list_ = str_.split(',')
    return len(list_)

final_df['num_characters'] = final_df['characters'].apply(count_items)
final_df['num_assets'] = final_df['assets'].apply(count_items)
final_df['assets_+_characters'] = final_df['num_characters'] + final_df['num_assets']
final_df.sample(3)

Unnamed: 0,film,rotten_tomatoes_score,characters,assets,best_picture,animated_feature,original_screenplay,adapted_screenplay,original_score,original_song,...,sound_mixing,other,year,budget,u.s._and_canada,other_territories,worldwide,num_characters,num_assets,assets_+_characters
21,Onward,88%,"Ian Lightfoot, Barley Lightfoot, Laurel Lightf...","Familiar Fantasy, Trust Bridge, Homes, Suburbia",,Nominated,,Ineligible,,,...,,,2020,175,62,80,142,9,4,13
16,Finding Dory,94%,"The Master Mimic, Dory, Nemo & Marlin, Hank, B...","Water and Light, Great Barrier Reef, The Marin...",,,Ineligible,,,,...,,,2016,200,486,542,1029,10,4,14
12,Brave,78%,"The Story of Brave, Merida , Queen Elinor, Kin...","The Stories of Scotland, Castle DunBroch, The ...",,Won,,Ineligible,,,...,,,2012,185,237,302,539,9,6,15


In [None]:
final_df.to_csv('../data/joined.csv', index=False)