In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Introduction / Getting the Datasets

__Load__ and __inspect__ the datasets "movies_clean.csv" and "credits.csv". 

In [2]:
# Load the cleaned movies table; the relative path is resolved against the
# notebook's working directory.
df_movies = pd.read_csv("movies_clean.csv")

In [3]:
# Load the credits table (stringified cast/crew lists keyed by movie id).
df_credits = pd.read_csv("credits.csv")

In [4]:
# Show each column's dtype and non-null count for the movies table.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44691 entries, 0 to 44690
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     44691 non-null  int64  
 1   title                  44691 non-null  object 
 2   tagline                20284 non-null  object 
 3   release_date           44657 non-null  object 
 4   genres                 42586 non-null  object 
 5   belongs_to_collection  4463 non-null   object 
 6   original_language      44681 non-null  object 
 7   budget_musd            8854 non-null   float64
 8   revenue_musd           7385 non-null   float64
 9   production_companies   33356 non-null  object 
 10  production_countries   38835 non-null  object 
 11  vote_count             44691 non-null  float64
 12  vote_average           42077 non-null  float64
 13  popularity             44691 non-null  float64
 14  runtime                43179 non-null  float64
 15  ov

In [5]:
# Show each column's dtype and non-null count for the credits table.
df_credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


__Identify__ stringified/nested __json columns__ in the __credits__ dataset.

Answer: the `cast` and `crew` columns.

In [6]:
# Preview two rows to see the raw, stringified form of the cast/crew cells.
df_credits.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


## Preparing the Data for Merge

__Drop duplicates__ in the credits dataset (similar to Project 3).

In [7]:
# Row count before duplicate ids are removed.
len(df_credits)

45476

In [8]:
# Keep the first row for every movie id; rebinding the name gives the same
# frame as the in-place form without relying on `inplace=True`.
df_credits = df_credits.drop_duplicates(subset='id')

In [9]:
# Row count after duplicate ids were removed.
len(df_credits)

45432

## Merging the Data

__Merge/Join__ the datasets movies_clean and credits. -> Add the features __cast__ and __crew__ to the movies_clean dataset

In [10]:
# Left join: every movie is kept even when it has no credits row (its
# cast/crew become NaN), which matches "add cast and crew to movies_clean".
# `validate` raises if an id occurs more than once in the credits table, so
# duplicate credits can never silently multiply movie rows.
df = pd.merge(df_movies, df_credits, how="left", on="id", validate="many_to_one")
df.head(2)

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,production_countries,vote_count,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,crew
0,862,Toy Story,,1995-10-30,Animation|Comedy|Family,Toy Story Collection,en,30.0,373.554033,Pixar Animation Studios,United States of America,5415.0,7.7,21.946943,81.0,"Led by Woody, Andy's toys live happily in his ...",English,<img src='http://image.tmdb.org/t/p/w185//rhIR...,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,Roll the dice and unleash the excitement!,1995-12-15,Adventure|Fantasy|Family,,en,65.0,262.797249,TriStar Pictures|Teitler Film|Interscope Commu...,United States of America,2413.0,6.9,17.015539,104.0,When siblings Judy and Peter discover an encha...,English|Français,<img src='http://image.tmdb.org/t/p/w185//vzmL...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."


In [11]:
# Raw crew string of the first merged row (column first, then position).
df['crew'].iloc[0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

## Cleaning and Transforming the new "Cast" Column

__Evaluate__ Python Expressions in the stringified column "cast" and __remove quotes__ ("") where possible.

In [12]:
import json
import ast

def filter_json_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : object
        One cell of the raw ``crew`` column: a string containing a Python
        literal, either a list of crew dicts or a single crew dict.

    Returns
    -------
    str or float
        The ``'name'`` of the first entry with ``'job' == 'Director'``.
        ``np.nan`` when ``x`` is not a string, is blank, does not evaluate to
        a list or dict, or holds no director entry that carries a name.

    Raises
    ------
    Exceptions from ``ast.literal_eval`` (e.g. ``ValueError``,
    ``SyntaxError``) propagate for non-blank strings that are not valid
    Python literals. This is deliberate: applying the function to an already
    transformed column fails loudly instead of silently producing NaN.
    """
    if not isinstance(x, str) or not x.strip():
        return np.nan

    evaluated = ast.literal_eval(x)

    # Treat a lone dict as a one-element crew list so both shapes share one path.
    if isinstance(evaluated, dict):
        evaluated = [evaluated]
    elif not isinstance(evaluated, list):
        return np.nan

    for item in evaluated:
        # .get() tolerates entries that lack a 'job' key.
        if isinstance(item, dict) and item.get('job') == 'Director' and 'name' in item:
            return item['name']
    return np.nan

def filter_json(x):
    """Collapse a stringified list of dicts into a pipe-separated name string.

    Parameters
    ----------
    x : object
        One cell of the raw ``cast`` or ``crew`` column: a string containing
        a Python literal, either a list of dicts or a single dict, where each
        dict is expected to have a ``'name'`` key.

    Returns
    -------
    str or float
        For a list: the ``'name'`` values joined with ``"|"`` in their
        original order (duplicates kept); an empty list yields ``""``.
        Entries without a string ``'name'`` are skipped.
        For a single dict: its ``'name'`` (``np.nan`` if absent).
        ``np.nan`` when ``x`` is not a string, is blank, or evaluates to
        anything other than a list or dict.

    Raises
    ------
    Exceptions from ``ast.literal_eval`` (e.g. ``ValueError``,
    ``SyntaxError``) propagate for non-blank strings that are not valid
    Python literals, so re-applying the function to already transformed
    values fails loudly instead of silently producing NaN.
    """
    if not isinstance(x, str) or not x.strip():
        return np.nan

    evaluated = ast.literal_eval(x)

    if isinstance(evaluated, list):
        names = [
            item['name']
            for item in evaluated
            if isinstance(item, dict) and isinstance(item.get('name'), str)
        ]
        return "|".join(names)

    if isinstance(evaluated, dict):
        return evaluated.get('name', np.nan)

    return np.nan
    
# All three values are computed from the raw stringified columns before the
# frame is rebuilt, so `director` is still derived from the unparsed `crew`.
df = df.assign(
    director=df['crew'].apply(filter_json_director),
    crew=df['crew'].apply(filter_json),
    cast=df['cast'].apply(filter_json),
)

In [13]:
# Check the transformed crew column (pipe-separated names).
df['crew']

0        John Lasseter|Joss Whedon|Andrew Stanton|Joel ...
1        Larry J. Franco|Jonathan Hensleigh|James Horne...
2        Howard Deutch|Mark Steven Johnson|Mark Steven ...
3        Forest Whitaker|Ronald Bass|Ronald Bass|Ezra S...
4        Alan Silvestri|Elliot Davis|Nancy Meyers|Nancy...
                               ...                        
44686    Hamid Nematollah|Hamid Nematollah|Farshad Moha...
44687    Lav Diaz|Lav Diaz|Dante Perez|Lav Diaz|Lav Dia...
44688    Mark L. Lester|C. Courtney Joyner|Jeffrey Gold...
44689                 Yakov Protazanov|Joseph N. Ermolieff
44690                                        Daisy Asquith
Name: crew, Length: 44691, dtype: object

In [14]:
# Check the transformed cast column (pipe-separated names).
df['cast']

0        Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...
1        Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...
2        Walter Matthau|Jack Lemmon|Ann-Margret|Sophia ...
3        Whitney Houston|Angela Bassett|Loretta Devine|...
4        Steve Martin|Diane Keaton|Martin Short|Kimberl...
                               ...                        
44686              Leila Hatami|Kourosh Tahami|Elham Korda
44687    Angel Aquino|Perry Dizon|Hazel Orencio|Joel To...
44688    Erika Eleniak|Adam Baldwin|Julie du Page|James...
44689    Iwan Mosschuchin|Nathalie Lissenko|Pavel Pavlo...
44690                                                     
Name: cast, Length: 44691, dtype: object

__Determine__ the __cast size__ for all movies (number of actors) and add the additional column "cast_size".

In [15]:
def determine_cast_size(cast: str):
    """Count the actors in a pipe-separated cast string.

    Parameters
    ----------
    cast : str
        Actor names joined by ``"|"``. Non-string values (e.g. NaN or
        ``None`` for a missing cast) are accepted and treated as missing.

    Returns
    -------
    int or float
        ``0`` for an empty or whitespace-only string, otherwise the number of
        ``"|"`` separators plus one; ``np.nan`` when ``cast`` is not a string.
    """
    # Any non-string means "missing" - not only the builtin float NaN.
    if not isinstance(cast, str):
        return np.nan

    if not cast.strip():
        return 0

    return cast.count("|") + 1

# Element-wise actor count for every cast string.
df['cast_size'] = df['cast'].map(determine_cast_size)

__Extract__ all __actor names__ from the column "cast" and __overwrite__ "cast". If a movie has more than one actor, __separate names by a pipe__ "|".<br>
For example: The value in the first row (Toy Story) should be 'Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wallace Shawn|John Ratzenberger|Annie Potts|John Morris|Erik von Detten|Laurie Metcalf|R. Lee Ermey|Sarah Freeman|Penn Jillette'.
  
Answer: Already done — `filter_json` was applied to the "cast" column in cell 12 above.

__Inspect__ cast with value_counts(). Do you see anything strange? __Take reasonable measures__!

In [16]:
# Frequency of each distinct cast string, used to spot suspicious values
# such as the empty string.
df['cast'].value_counts()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            2189
Georges Méliès                                                                                                                                                                                                                                                                         

In [17]:
# Distribution of cast sizes before any rows are filtered out.
df['cast_size'].value_counts()

10     2770
8      2729
7      2710
6      2649
5      2637
       ... 
183       1
165       1
151       1
130       1
109       1
Name: cast_size, Length: 151, dtype: int64

In [18]:
# Keep only movies with a cast_size of at least 2; rows with an empty cast
# (size 0), a single actor (size 1) or a missing size are removed.
# NOTE(review): this also drops single-actor films - confirm that is intended.
# The explicit copy makes the result an independent frame, so later in-place
# edits cannot raise SettingWithCopyWarning.
df = df.loc[df['cast_size'] > 1].copy()

In [19]:
# Twenty most common cast sizes after filtering.
df['cast_size'].value_counts().head(20)

10    2770
8     2729
7     2710
6     2649
5     2637
9     2557
15    2444
4     2406
11    2283
12    2043
13    1799
14    1578
16    1350
17    1103
3     1087
2      882
18     811
19     785
20     735
21     591
Name: cast_size, dtype: int64

__Extract__ the __director name__ from the column "crew" and create the new column "director". <br> For example: The value in the first row (Toy Story) should be 'John Lasseter'.

Answer: Already computed in cell 12, where `filter_json_director` was applied to the raw "crew" column.

In [20]:
# Check the director column that was derived from the raw crew data.
df['director']

0           John Lasseter
1            Joe Johnston
2           Howard Deutch
3         Forest Whitaker
4           Charles Shyer
               ...       
44685          John Irvin
44686    Hamid Nematollah
44687            Lav Diaz
44688      Mark L. Lester
44689    Yakov Protazanov
Name: director, Length: 41200, dtype: object

__Drop__ the column "crew" and __save__ the dataset in a csv-file.

No need to save.

In [22]:
# Reassign instead of `inplace=True`: `df` is a filtered slice at this point,
# and mutating such a slice in place can raise SettingWithCopyWarning.
df = df.drop(columns='crew')