### Imports

In [1]:
import pandas as pd

### Reading csv files

    Original Pixar DataSet

In [2]:
# Load the original Pixar dataset and display it.
pixar_csv_path = '../data/pixar__studio12.csv'
original_df = pd.read_csv(filepath_or_buffer=pixar_csv_path)
original_df

Unnamed: 0,Moviename,Releaseyear,Budget(million),US and canada(million),Other territories(million),Worldwide(million),Rotten Tomatoes
0,Toy Story,1995,300,1917,1818,3736,100%
1,A Bug's Life,1998,1200,1628,2005,3633,92%
2,Toy Story 2,1999,900,2459,2515,4974,100%
3,"Monsters, Inc.",2001,1150,2899,3424,6323,96%
4,Finding Nemo,2003,940,3397,5313,8710,99%
5,The Incredibles,2004,920,2614,3702,6316,97%
6,Cars,2006,1200,2441,2179,4620,74%
7,Ratatouille,2007,1500,2064,4173,6237,96%
8,WALL-E,2008,1800,2238,2975,5213,95%
9,Up,2009,1750,2930,4421,7351,98%


Renaming the columns so that they are standardized across all the dataframes.

In [3]:
# Map the raw CSV headers onto short snake_case names so that the key column
# ('film') matches the one used by the other dataframes.
column_names = {
    'Moviename': 'film',
    'Releaseyear': 'year',
    'Budget(million)': 'budget',
    'US and canada(million)': 'us_canada',
    'Other territories(million)': 'other_territories',
    'Worldwide(million)': 'worldwide',
    'Rotten Tomatoes': 'rotten_tomatoes_score',
}
# One rename call with the full mapping; the result is rebound instead of
# mutating the frame with inplace=True.
original_df = original_df.rename(columns=column_names)
original_df.head(3)

Unnamed: 0,film,year,budget,us_canada,other_territories,worldwide,rotten_tomatoes_score
0,Toy Story,1995,300,1917,1818,3736,100%
1,A Bug's Life,1998,1200,1628,2005,3633,92%
2,Toy Story 2,1999,900,2459,2515,4974,100%


According to Wikipedia, the monetary values in this dataframe (the budget and the box-office figures) are 10 times the actual amounts, so these columns are divided by 10.

In [4]:
# Scale every monetary column down by a factor of 10.
# NOTE: each column is overwritten with its own rescaled values, so running
# this cell a second time divides the figures by 10 again.
for money_col in ('budget', 'us_canada', 'other_territories', 'worldwide'):
    original_df[money_col] = original_df[money_col] / 10
original_df.head(3)

Unnamed: 0,film,year,budget,us_canada,other_territories,worldwide,rotten_tomatoes_score
0,Toy Story,1995,30.0,191.7,181.8,373.6,100%
1,A Bug's Life,1998,120.0,162.8,200.5,363.3,92%
2,Toy Story 2,1999,90.0,245.9,251.5,497.4,100%


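The year column also appears in the Academy Awards dataframe, so it would be duplicated after merging. It is removed from this dataframe because this dataset is not up to date (4 movies are missing), so its year column would be incomplete.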

In [5]:
# Remove the 'year' column from the original Pixar frame.
original_df = original_df.drop(labels='year', axis='columns')

    Characters and assets DataSet

In [6]:
# Load the characters/assets dataset and preview its first three rows.
char_assets_csv_path = '../data/characters_assets.csv'
char_assets_df = pd.read_csv(filepath_or_buffer=char_assets_csv_path)
char_assets_df.head(n=3)

Unnamed: 0,film,characters,assets
0,Elemental,"Ember Lumen, Wade Ripple, Bernie Lumen, Cinder...","Building The World, Firetown, The Water Distri..."
1,Lightyear,"The Human Side of Buzz, Buzz Lightyear, Izzy H...","“Warm CGI”, T'kani Prime, Turnip, Star Command..."
2,Turning Red,"The Inner 13-Year Old, Meilin Lee, Red Panda M...","“Chunky Cute”, Toronto, Chinatown & the Lee Fa..."


    Academy Awards DataSet

In [7]:
# Load the Academy Awards dataset and preview its last three rows.
academy_awards_csv_path = '../data/pixar_academy_awards.csv'
academy_awards_df = pd.read_csv(filepath_or_buffer=academy_awards_csv_path)
academy_awards_df.tail(n=3)

Unnamed: 0,film,best_picture,animated_feature,original_screenplay,adapted_screenplay,original_score,original_song,sound_editing,sound_mixing,other,year
24,Turning Red,,Nominated,,Ineligible,,,,,,2022
25,Lightyear,,,Ineligible,,,,,,,2022
26,Elemental,,,,,,,,,,2023


### Merging dataframes

In [8]:
# Outer-join the three tables on the shared 'film' column. An outer join keeps
# films that are present in only some of the tables; their missing columns are
# filled with NaN.
final_df = (
    original_df
    .merge(char_assets_df, on="film", how="outer")
    .merge(academy_awards_df, on="film", how="outer")
)
final_df.sample(n=3)

Unnamed: 0,film,budget,us_canada,other_territories,worldwide,rotten_tomatoes_score,characters,assets,best_picture,animated_feature,original_screenplay,adapted_screenplay,original_score,original_song,sound_editing,sound_mixing,other,year
26,Soul,,,,,,"Joe Gardner, 22, Dez, Dorothea Williams , Libb...","Building The Performances, New York City, Barb...",,Won,,Ineligible,Won,,Nominated,Nominated,,2020
2,Toy Story 2,90.0,245.9,251.5,497.4,100%,"Woody, Buzz Lightyear, Jessie, Bullseye, Mrs. ...","Al's Apartment, Al's Toy Barn, Woody's Roundup...",,Award not yet introduced,Ineligible,,,Nominated,,,,1999
21,Onward,200.0,61.6,80.4,142.0,88%,"Ian Lightfoot, Barley Lightfoot, Laurel Lightf...","Familiar Fantasy, Trust Bridge, Homes, Suburbia",,Nominated,,Ineligible,,,,,,2020


In [None]:
def count_items(str_):
    """Count the comma-separated items in a string.

    Parameters
    ----------
    str_ : str or missing value
        Text such as ``"Woody, Buzz Lightyear, Jessie"``. Any non-string
        value (for example the NaN an outer merge leaves for a film with
        no matching row, or ``None``) is treated as "no items".

    Returns
    -------
    int
        Number of pieces obtained by splitting ``str_`` on ``','``;
        ``0`` when ``str_`` is not a string.
    """
    # Missing cells arrive as float NaN, which has no .split(); without this
    # guard, Series.apply(count_items) raises AttributeError on those rows.
    if not isinstance(str_, str):
        return 0
    return len(str_.split(','))

# Add one count column per list-like text column, using count_items on each cell.
for source_col, count_col in (('characters', 'num_characters'), ('assets', 'num_assets')):
    final_df[count_col] = final_df[source_col].apply(count_items)

In [9]:
# Write the merged frame to disk, leaving out the row index.
joined_csv_path = '../data/joined.csv'
final_df.to_csv(path_or_buf=joined_csv_path, index=False)