In [None]:
# Setup environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import display
from ast import literal_eval

Ingest and combine the source files into a single movies dataset

In [None]:
# Load the three raw CSV sources (metadata, keywords, ratings) into dataframes
metadata, keywords, ratings = map(
    pd.read_csv,
    (
        "data/movies_metadata.csv",
        "data/keywords.csv",
        "data/ratings_small.csv",
    ),
)

In [None]:
# Report the shape and column names of every raw dataframe.
# A single loop keeps the output format identical for all three datasets
# (shape as a list, columns as a list, every label prefixed with the dataset name).
for label, frame in (("metadata", metadata), ("ratings", ratings), ("keywords", keywords)):
    print(label + " shape:", list(frame.shape))
    print(label + " columns name:", list(frame.columns))

In [None]:
# Inspect the dtype pandas inferred for each column of the metadata dataframe
metadata.dtypes

In [None]:
# Inspect the dtype pandas inferred for each column of the keywords dataframe
keywords.dtypes

In [None]:
# Inspect the dtype pandas inferred for each column of the ratings dataframe
ratings.dtypes

In [None]:
# Keep only English-language movies that are not flagged as videos
# (the original author describes the video rows as YouTube short clips).
# The shape is printed before and after so the effect of the filter is visible.
print("metadata shape:", list(metadata.shape))
is_english = metadata["original_language"] == "en"
# Element-wise comparison on purpose: rows where `video` is missing compare as False and are dropped.
is_not_video = metadata["video"] == False
# .copy() makes the result an independent frame, so later column assignments
# on `metadata` do not trigger pandas' SettingWithCopyWarning.
metadata = metadata[is_english & is_not_video].copy()
print("metadata shape:", list(metadata.shape))


In [None]:
# Cast the metadata "id" column to int so it can be used as the merge key.
# `assign` returns a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
metadata = metadata.assign(id=metadata["id"].astype("int"))

In [None]:
# Join metadata with keywords on the shared "id" column to build the movies dataframe.
# NOTE(review): this is an inner join (the pandas default), so ids missing from either
# side are dropped and duplicated ids would multiply rows - confirm that is intended.
movies = metadata.merge(keywords, how="inner", on="id")
print("movies shape:", list(movies.shape))
print("movies columns:", list(movies.columns))

In [None]:
# Discard the columns that are not needed for the rest of the analysis
drop_cols = [
    'budget',
    'homepage',
    'imdb_id',
    'original_language',
    'revenue',
    'spoken_languages',
    'video',
    'production_countries',
    'belongs_to_collection',
    'tagline',
]
movies = movies.drop(columns=drop_cols)
print("movies shape:", list(movies.shape))
print("movies columns:", list(movies.columns))


In [None]:
# Render the current movies dataframe to eyeball the result of the previous steps
display(movies)

In [None]:
# Convert the "stringified" columns back into their original Python objects (lists).
# Only string values are parsed: literal_eval raises ValueError on anything that is
# not a string, so this guard lets the cell be re-run (values already parsed) and
# leaves any non-string cell, such as a missing value, untouched.
features = ['genres', 'production_companies', 'keywords']
for feature in features:
    movies[feature] = movies[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

# Sanity check: the first genres value should now be a list
print(type(movies.genres[0]))

In [None]:
# Helper that pulls the useful field out of list-valued cells
def get_list_data(value):
    """Return the ``'name'`` entry of every item in ``value``.

    Parameters
    ----------
    value : object
        A single cell value. When it is a list, every item must support
        ``item['name']``.

    Returns
    -------
    list
        The ``'name'`` of each item in order, or an empty list when
        ``value`` is not a list.
    """
    # Anything that is not a list yields no names
    if not isinstance(value, list):
        return []

    names = []
    for item in value:
        names.append(item['name'])
    return names

In [None]:
# Replace every list-valued column with the names extracted by get_list_data
movies = movies.assign(
    **{feature: movies[feature].apply(get_list_data) for feature in features}
)

In [None]:
# Render the movies dataframe again to inspect the list-valued columns after extraction
display(movies)

Exploratory Data Analysis (EDA) of the Entire Movie Dataset

In [None]:
# Summarize the movies dataframe: non-null count and dtype of every column
movies.info()

In [None]:
# Compare the row count before and after dropping incomplete rows.
# NOTE: a bare dropna() removes rows with a missing value in ANY column, not only
# "overview", so the overview-specific row count is reported separately below.
print(movies.shape)
print(movies.dropna().shape)
print("rows with an overview:", movies.dropna(subset=["overview"]).shape[0])

In [None]:
# Descriptive statistics, transposed so each described column becomes a row
movies.describe().T