# Exploratory data analysis of the Disney Datasets

## Methods and Results

First, I will import the required libraries and configure Altair; the CSV files are then read in one go in the Data Wrangling section.

In [1]:
import altair as alt
import numpy as np
import pandas as pd
import glob
import os
import sys
from textwrap import wrap
from datetime import datetime
# Configure Altair for this notebook session.
# Select the 'data_server' data transformer.
alt.data_transformers.enable('data_server')
# Select the 'mimetype' renderer for chart output.
alt.renderers.enable('mimetype')
# Select the 'default' theme. This is the last expression in the cell, so its
# return value is what the notebook echoes as the cell output.
alt.themes.enable('default')

ThemeRegistry.enable('default')

In [2]:
import vl_convert as vlc

## Data Wrangling

In [3]:
# Load the five processed Disney CSV files into DataFrames.
# Paths are relative to the current working directory. Date-like columns are
# parsed at load time so later cells can use the `.dt` accessor and temporal
# chart encodings on them.
disney_characters = pd.read_csv(
    "data/processed/disney-characters.csv",
    parse_dates=["release_date"],
)
disney_director = pd.read_csv(
    "data/processed/disney-director.csv",
)
disney_movies_total_gross = pd.read_csv(
    "data/processed/disney-movies-total-gross.csv",
    parse_dates=["release_date"],
)
disney_revenue_1991_2016 = pd.read_csv(
    "data/processed/disney-revenue-1991-2016.csv",
    parse_dates=["Year"],
)
disney_voice_actors = pd.read_csv(
    "data/processed/disney-voice-actors.csv",
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/disney-characters.csv'

Next, I will merge the datasets step by step on the columns they share (`movie_title` and, for the first merge, `release_date`).

In [None]:
# Outer-join the gross figures with the character table on title and release
# date, so rows that exist in only one of the two tables are kept.
merge_1 = pd.merge(
    disney_movies_total_gross,
    disney_characters,
    how="outer",
    on=["movie_title", "release_date"],
)
merge_1

In [None]:
# Add the director table by movie title; the outer join keeps titles that
# appear on only one side.
merge_2 = pd.merge(
    merge_1,
    disney_director,
    how="outer",
    on=['movie_title'],
)
merge_2

In [None]:
def _to_list(values):
    """Return the grouped values as a plain Python list."""
    return values.tolist()


# Collapse the voice-actor table to one row per movie title, gathering that
# movie's characters and voice actors into lists.
listed_characters = disney_voice_actors.groupby(['movie_title']).agg(
    {column: _to_list for column in ('character', 'voice-actor')}
)

# Outer-join the per-movie lists onto the previously merged table by title.
merge_3 = pd.merge(
    merge_2,
    listed_characters,
    how="outer",
    on=['movie_title'],
)
merge_3

## EDA

In [None]:
# Keep only the rows whose genre is present.
non_null_genres = merge_3[merge_3['genre'].notna()]

In [None]:
# Bar chart of the number of records per genre; '-y' orders the bars from the
# highest count to the lowest.
genre_count = (
    alt.Chart(non_null_genres)
    .mark_bar()
    .encode(
        alt.X('genre:N', sort='-y'),
        alt.Y('count()'),
        alt.Color('genre'),
    )
)
genre_count

In [None]:
# Keep only the rows whose MPAA rating is present.
non_null_rating = merge_3[merge_3['MPAA_rating'].notna()]

In [None]:
# Bar chart of the number of records per MPAA rating; '-y' orders the bars
# from the highest count to the lowest.
rating_count = (
    alt.Chart(non_null_rating)
    .mark_bar()
    .encode(
        alt.X('MPAA_rating:N', sort='-y'),
        alt.Y('count()'),
        alt.Color('MPAA_rating'),
    )
)
rating_count

In [None]:
# Keep only the rows whose director is present.
non_null_director = merge_3[merge_3['director'].notna()]

In [None]:
# Bar chart of the directors with the most records in the merged data.
# The chart itself counts records per director, ranks the directors by that
# count in descending order, and keeps only ranks 1-10.
# NOTE(review): `rank` presumably gives tied counts the same rank, so more than
# ten bars can appear when directors tie -- confirm against the Vega-Lite docs.
director_count = alt.Chart(non_null_director).mark_bar().encode(
    x=alt.X('director:N', sort='-y'),
    y=alt.Y('count:Q'),
    color=alt.Color('director:N')
).transform_aggregate(
    count='count()',
    groupby=['director']
).transform_window(
    rank='rank(count)',
    # A window transform sorts by SortField (field + order);
    # EncodingSortField is the sort type for encoding channels.
    sort=[alt.SortField(field='count', order='descending')]
).transform_filter('datum.rank <= 10')

director_count

In [None]:
# Keep only the rows that have both a genre and a director.
x = merge_3[merge_3['genre'].notna() & merge_3['director'].notna()]

In [None]:
# Bubble chart of director against genre: one circle per director/genre pair,
# sized by the number of records in that pair and coloured by genre.
director_genres = (
    alt.Chart(x)
    .mark_circle()
    .encode(
        alt.X('director'),
        alt.Y('genre'),
        alt.Color('genre'),
        alt.Size('count()'),
    )
)
director_genres

In [None]:
# Sum the raw and the inflation-adjusted gross per genre, then reshape to long
# format: one row per (genre, gross type) pair with the amount in 'value'.
gross_columns = ['total_gross', 'inflation_adjusted_gross']
genre_totals = (
    merge_3
    .groupby("genre")
    .agg({column: "sum" for column in gross_columns})
    .reset_index()
)
genre_grouped = pd.melt(
    genre_totals,
    id_vars=['genre'],
    value_vars=gross_columns,
    var_name='gross',
    value_name='value',
)
# textwrap.wrap turns each genre name into a list of lines of at most 11
# characters; presumably so the chart can draw long names as multi-line labels.
genre_grouped['genre'] = genre_grouped['genre'].apply(wrap, args=(11,))
genre_grouped

In [None]:
# Grouped bar chart: one column facet per genre, with a bar for each gross type.
# The facets are ordered by mean value, largest first.
hidden_x_axis = alt.Axis(title=None, labels=False, ticks=False)
genre_facet = alt.Column(
    'genre:N',
    sort=alt.EncodingSortField(field='value', op='mean', order='descending'),
    # Put the genre labels under the bars, unrotated and without a facet title.
    header=alt.Header(title=None, labelOrient='bottom', labelAngle=0),
)
genre_chart = (
    alt.Chart(genre_grouped)
    .mark_bar()
    .encode(
        x=alt.X('gross:N', axis=hidden_x_axis),
        y=alt.Y('value:Q', axis=alt.Axis(format='$~s')),
        color='gross:N',
        column=genre_facet,
    )
    .configure_view(stroke='transparent')
    .configure_axis(labelPadding=38, labelAlign='left')
)
genre_chart

In [None]:
# Sum the raw and the inflation-adjusted gross per release date, reshape to long
# format, and keep only the release dates from 1991 onwards.
gross_measures = ['total_gross', 'inflation_adjusted_gross']
gross_by_date = (
    merge_3
    .groupby("release_date")
    .agg({measure: "sum" for measure in gross_measures})
    .reset_index()
)
gross_by_date_long = pd.melt(
    gross_by_date,
    id_vars=['release_date'],
    value_vars=gross_measures,
    var_name='gross',
    value_name='value',
)
released_from_1991 = gross_by_date_long['release_date'].dt.year >= 1991
grouping2 = gross_by_date_long[released_from_1991]
grouping2

In [None]:
# Line chart of summed gross per release year, one line per gross type.
year_chart = (
    alt.Chart(grouping2)
    .mark_line()
    .encode(
        alt.X('year(release_date):T'),
        alt.Y('sum(value):Q', axis=alt.Axis(format='$~s')),
        alt.Color('gross:N'),
    )
)

year_chart

In [None]:
# Line chart of the 'Total' revenue column over 'Year', with the x domain
# pinned to 1991-2016.
year_axis = alt.X('Year', scale=alt.Scale(domain=('1991', '2016')))
total_axis = alt.Y('Total', axis=alt.Axis(format='$~s'))
revenue_chart = (
    alt.Chart(disney_revenue_1991_2016)
    .mark_line()
    .encode(x=year_axis, y=total_axis)
)
revenue_chart

In [None]:
# Drop the rows that have no usable voice actor.
# The data marks a missing actor with the text 'None'. Depending on the pandas
# version, read_csv may already have turned that text into NaN, and NaN passes
# a plain `!= 'None'` comparison, so both forms are excluded here.
voice_actor_names = disney_voice_actors['voice-actor']
none_null_voice_actors = disney_voice_actors[
    voice_actor_names.notna() & (voice_actor_names != 'None')
]

In [None]:
# Bar chart of the voice actors with the most records in the voice-actor table.
# The chart itself counts records per voice actor, ranks the actors by that
# count in descending order, and keeps only ranks 1-10.
# NOTE(review): `rank` presumably gives tied counts the same rank, so more than
# ten bars can appear when actors tie -- confirm against the Vega-Lite docs.
voice_actor_count = alt.Chart(none_null_voice_actors).mark_bar().encode(
    x=alt.X('voice-actor:N', sort='-y'),
    y=alt.Y('count:Q'),
    color=alt.Color('voice-actor:N')
).transform_aggregate(
    count='count()',
    groupby=['voice-actor']
).transform_window(
    rank='rank(count)',
    # A window transform sorts by SortField (field + order);
    # EncodingSortField is the sort type for encoding channels.
    sort=[alt.SortField(field='count', order='descending')]
).transform_filter('datum.rank <= 10')

voice_actor_count

In [None]:
# Reshape the revenue table to long format: one row per (Year, revenue stream)
# pair, with the stream name in 'revenue' and its amount in 'value'.
revenue_streams = [
    'Studio Entertainment',
    'Disney Consumer Products',
    'Disney Interactive',
    'Walt Disney Parks and Resorts',
    'Disney Media Networks',
    'Total',
]
disney_revenue_melted = pd.melt(
    disney_revenue_1991_2016,
    id_vars=['Year'],
    value_vars=revenue_streams,
    var_name='revenue',
    value_name='value',
)
disney_revenue_melted

In [None]:
# Overlaid density curves of the revenue values, one semi-transparent area per
# revenue stream, evaluated at 200 steps over the range 0 to 100000.
revenue_density = (
    alt.Chart(disney_revenue_melted)
    .transform_density(
        density='value',
        groupby=['revenue'],
        as_=['value', 'density'],
        steps=200,
        extent=[0, 100000],
    )
    .mark_area(opacity=0.45)
    .encode(
        x=alt.X('value:Q'),
        y=alt.Y('density:Q'),
        color=alt.Color('revenue:N'),
    )
    .properties(width=400, height=400)
)

revenue_density