# In-depth DP
Look at missing or incorrect values in each dataset.

Isolate problems column by column.

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Function to safely parse JSON-like strings
def parse_json_column(df, column_name):
    """Parse a column of stringified Python literals into Python objects.

    Each cell is parsed with ``ast.literal_eval``, which only evaluates
    literals and never executes arbitrary code. The column is overwritten
    on ``df`` itself and ``df`` is returned for convenience.

    Args:
        df: DataFrame holding the column to parse (mutated in place).
        column_name: Name of the column containing the literal strings.

    Returns:
        The same DataFrame object with ``column_name`` replaced by the
        parsed values. Missing or unparseable cells become an empty list.
        Cells that already hold a list, tuple, dict or set are kept as-is,
        so re-running the function on a parsed column is a no-op instead
        of wiping the data.
    """
    def parse_json(x):
        # Already-parsed containers pass through untouched; this keeps the
        # function idempotent when a notebook cell is executed twice.
        if isinstance(x, (list, tuple, dict, set)):
            return x
        if pd.isna(x):
            return []
        try:
            return ast.literal_eval(x)
        # Only the errors literal_eval is documented to raise on malformed
        # input; KeyboardInterrupt and friends are no longer swallowed.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []

    df[column_name] = df[column_name].apply(parse_json)
    return df

In [3]:
# Function to flatten list of dictionaries into a DataFrame
def flatten_column(df, column_name):
    """Flatten a column of lists of dicts into one DataFrame row per dict.

    Args:
        df: DataFrame whose ``column_name`` cells are lists of dictionaries.
        column_name: Name of the column to flatten.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, one row per dictionary
        found across all lists and one column per dictionary key. Empty
        lists and missing cells contribute no rows; if nothing is left the
        result is an empty DataFrame.
    """
    # explode() yields one list element per row without first materialising
    # a frame as wide as the longest list (which apply(pd.Series) would do).
    # Empty lists turn into NaN and are dropped here.
    records = df[column_name].explode().dropna()
    return pd.DataFrame(list(records))

In [4]:
# Function to check data quality of a DataFrame
def data_quality_report(df, df_name='DataFrame'):
    """Print a plain-text quality overview of ``df``.

    The report consists of a title line followed by three sections:
    column dtypes, per-column null counts, and ``describe(include='all')``.

    Args:
        df: DataFrame to inspect.
        df_name: Label used in the report title.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"\nData Quality Report for {df_name}:")

    # Each section is computed lazily, right before it is printed, so the
    # earlier sections still appear if a later computation raises.
    report_sections = (
        ("\nData Types:", lambda: df.dtypes),
        ("\nMissing Values:", lambda: df.isnull().sum()),
        ("\nSummary Statistics:", lambda: df.describe(include='all')),
    )
    for heading, compute in report_sections:
        print(heading)
        print(compute())

In [5]:
# Read the raw credits table from disk and preview its first five rows
credits = pd.read_csv('./raw-data/credits.csv')
display(credits.head(n=5))

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [6]:
# Print dtypes, per-column null counts and summary statistics for the credits table
data_quality_report(credits, df_name='Credits Dataset')


Data Quality Report for Credits Dataset:

Data Types:
cast    object
crew    object
id       int64
dtype: object

Missing Values:
cast    0
crew    0
id      0
dtype: int64

Summary Statistics:
         cast   crew             id
count   45476  45476   45476.000000
unique  43019  44669            NaN
top        []     []            NaN
freq     2418    771            NaN
mean      NaN    NaN  108345.997537
std       NaN    NaN  112443.796536
min       NaN    NaN       2.000000
25%       NaN    NaN   26443.250000
50%       NaN    NaN   60002.500000
75%       NaN    NaN  157302.000000
max       NaN    NaN  469172.000000


In [7]:
# Parse the stringified 'cast' column, then the 'crew' column, into Python objects
credits = (
    credits
    .pipe(parse_json_column, 'cast')
    .pipe(parse_json_column, 'crew')
)

In [8]:
# One row per cast entry across all movies, renumbered from 0, then previewed
cast_df = flatten_column(credits, 'cast').reset_index(drop=True)
display(cast_df.head(n=5))

Unnamed: 0,cast_id,character,credit_id,gender,id,name,order,profile_path
0,14,Woody (voice),52fe4284c3a36847f8024f95,2,31,Tom Hanks,0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg
1,15,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2,12898,Tim Allen,1,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg
2,16,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2,7167,Don Rickles,2,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg
3,17,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2,12899,Jim Varney,3,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg
4,18,Rex (voice),52fe4284c3a36847f8024fa5,2,12900,Wallace Shawn,4,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg


In [9]:
# Print dtypes, per-column null counts and summary statistics for the flattened cast table
data_quality_report(cast_df, df_name='Cast DataFrame')


Data Quality Report for Cast DataFrame:

Data Types:
cast_id          int64
character       object
credit_id       object
gender           int64
id               int64
name            object
order            int64
profile_path    object
dtype: object

Missing Values:
cast_id              0
character            0
credit_id            0
gender               0
id                   0
name                 0
order                0
profile_path    173856
dtype: int64

Summary Statistics:
              cast_id character                 credit_id         gender  \
count   562474.000000    562474                    562474  562474.000000   
unique            NaN    319328                    562044            NaN   
top               NaN            592ef81792514130de01097e            NaN   
freq              NaN     25782                         3            NaN   
mean        51.563372       NaN                       NaN       1.004887   
std        172.998891       NaN                       NaN

In [10]:
# One row per crew entry across all movies, renumbered from 0, then previewed
crew_df = flatten_column(credits, 'crew').reset_index(drop=True)
display(crew_df.head(n=5))

Unnamed: 0,credit_id,department,gender,id,job,name,profile_path
0,52fe4284c3a36847f8024f49,Directing,2,7879,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg
1,52fe4284c3a36847f8024f4f,Writing,2,12891,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg
2,52fe4284c3a36847f8024f55,Writing,2,7,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg
3,52fe4284c3a36847f8024f5b,Writing,2,12892,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg
4,52fe4284c3a36847f8024f61,Writing,0,12893,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg


In [11]:
# Print dtypes, per-column null counts and summary statistics for the flattened crew table
data_quality_report(crew_df, df_name='Crew DataFrame')


Data Quality Report for Crew DataFrame:

Data Types:
credit_id       object
department      object
gender           int64
id               int64
job             object
name            object
profile_path    object
dtype: object

Missing Values:
credit_id            0
department           0
gender               0
id                   0
job                  0
name                 0
profile_path    369216
dtype: int64

Summary Statistics:
                       credit_id  department         gender            id  \
count                     464314      464314  464314.000000  4.643140e+05   
unique                    463836          12            NaN           NaN   
top     52fe4ac89251416c750edd47  Production            NaN           NaN   
freq                           3       94498            NaN           NaN   
mean                         NaN         NaN       0.759975  6.456037e+05   
std                          NaN         NaN       0.935607  6.791434e+05   
min                 

In [None]:
# Read the remaining raw tables; keywords is previewed right after it is loaded
keywords = pd.read_csv('./raw-data/keywords.csv')
display(keywords.head(n=5))

# Link table
links = pd.read_csv('./raw-data/links.csv')

# Movie metadata
# NOTE(review): low_memory=False is presumably here to avoid mixed-dtype
# inference on this file; confirm against the pandas read_csv docs.
movies_metadata = pd.read_csv('./raw-data/movies_metadata.csv', low_memory=False)

# Ratings
ratings = pd.read_csv('./raw-data/ratings.csv')