In [1]:
import pandas as pd
import altair as alt
from vega_datasets import data

In [3]:
# Use Altair's default renderer for chart output. Only the most recent
# enable() call is in effect, so a single call is all that is needed.
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [4]:
# Load the movies dataset from vega_datasets and preview five random rows.
# A fixed random_state keeps the preview identical across re-runs.
movies_data = data.movies()
movies_data.sample(5, random_state=42)

Unnamed: 0,Title,US_Gross,Worldwide_Gross,US_DVD_Sales,Production_Budget,Release_Date,MPAA_Rating,Running_Time_min,Distributor,Source,Major_Genre,Creative_Type,Director,Rotten_Tomatoes_Rating,IMDB_Rating,IMDB_Votes
2797,Snow Day,60008303.0,62452927.0,,13000000.0,Feb 11 2000,PG,89.0,Paramount Pictures,Original Screenplay,Comedy,Contemporary Fiction,,26.0,4.4,4611.0
710,The Pirate,2956000.0,2956000.0,,3700000.0,Dec 31 1947,,,,,,,Vincente Minnelli,71.0,7.1,1635.0
450,Howard the Duck,16295774.0,16295774.0,,30000000.0,Aug 01 1986,,,Universal,Based on Comic/Graphic Novel,Action,Science Fiction,,16.0,4.1,16051.0
887,Superman II,108185706.0,108185706.0,,50000000.0,Jun 19 1981,,,Warner Bros.,Based on Comic/Graphic Novel,Adventure,Super Hero,Richard Donner,,6.7,29512.0
1643,Death at a Funeral,8580428.0,34743644.0,,20000000.0,Aug 17 2007,R,90.0,MGM,Original Screenplay,Comedy,Contemporary Fiction,Frank Oz,61.0,5.1,6628.0


In [5]:
# (rows, columns) of the dataset as loaded, before any cleaning
movies_data.shape

(3201, 16)

In [6]:
# Count the missing values in each column; many fields are incomplete.
movies_data.isna().sum()

Title                        1
US_Gross                     7
Worldwide_Gross              7
US_DVD_Sales              2637
Production_Budget            1
Release_Date                 0
MPAA_Rating                605
Running_Time_min          1992
Distributor                232
Source                     365
Major_Genre                275
Creative_Type              446
Director                  1331
Rotten_Tomatoes_Rating     880
IMDB_Rating                213
IMDB_Votes                 213
dtype: int64

In [7]:
# Keep only the rows that are complete in every column (the frame is modified
# in place), then check how many records survive.
# NOTE(review): this keeps 174 of the 3201 rows, largely because of sparse
# columns such as US_DVD_Sales that the charts visible here never use --
# consider dropna(subset=[...]) if more data is wanted.
movies_data.dropna(axis=0, how='any', inplace=True)
movies_data.shape

(174, 16)

In [8]:
# Box plot of the worldwide gross distribution across the remaining movies.
(
    alt.Chart(movies_data, height=400, width=600)
    .mark_boxplot(color='blue')
    .encode(y='Worldwide_Gross:Q')
    .properties(title='Movies')
)

In [9]:
# Scatter plot relating each movie's production budget (x) to its worldwide
# gross (y).
(
    alt.Chart(movies_data, height=400, width=600)
    .mark_point(color='darkcyan')
    .encode(
        x='Production_Budget',
        y='Worldwide_Gross:Q',
    )
    .properties(title='Production_Budget vs Worldwide_Gross')
)

In [10]:
# Bar chart with the movie genre on the x axis and the worldwide gross on the
# y axis, coloured by genre. No aggregate is specified, so every movie is
# drawn as its own bar mark within its genre.
(
    alt.Chart(movies_data, height=400, width=600)
    .mark_bar(size=20)
    .encode(
        x='Major_Genre:O',
        y='Worldwide_Gross:Q',
        color='Major_Genre',
    )
    .properties(title='Worldwide Gross for different Genres')
)

In [11]:
# Median Rotten Tomatoes rating of the cleaned dataset; used below as the
# threshold for the above/below flag.
med_rating = movies_data.loc[:, 'Rotten_Tomatoes_Rating'].median()
med_rating

61.0

In [12]:
# Flag each movie whose Rotten Tomatoes rating is strictly above the median:
# True means above the median, False means at or below it. (The column is
# named 'above_average' even though the threshold is the median.)
movies_data['above_average'] = movies_data['Rotten_Tomatoes_Rating'].gt(med_rating)
movies_data.head()

Unnamed: 0,Title,US_Gross,Worldwide_Gross,US_DVD_Sales,Production_Budget,Release_Date,MPAA_Rating,Running_Time_min,Distributor,Source,Major_Genre,Creative_Type,Director,Rotten_Tomatoes_Rating,IMDB_Rating,IMDB_Votes,above_average
1064,12 Rounds,12234694.0,18184083.0,8283859.0,20000000.0,Mar 27 2009,PG-13,108.0,20th Century Fox,Original Screenplay,Action,Contemporary Fiction,Renny Harlin,28.0,5.4,8914.0,False
1074,2012,166112167.0,766812167.0,50736023.0,200000000.0,Nov 13 2009,PG-13,158.0,Sony Pictures,Original Screenplay,Action,Science Fiction,Roland Emmerich,39.0,6.2,396.0,False
1090,300,210614939.0,456068181.0,261252400.0,60000000.0,Mar 09 2007,R,117.0,Warner Bros.,Based on Comic/Graphic Novel,Action,Historical Fiction,Zack Snyder,60.0,7.8,235508.0,False
1095,3:10 to Yuma,53606916.0,69791889.0,51359371.0,48000000.0,Sep 02 2007,R,117.0,Lionsgate,Remake,Western,Historical Fiction,James Mangold,89.0,7.9,98355.0,True
1107,88 Minutes,16930884.0,32955399.0,11385055.0,30000000.0,Apr 18 2008,R,106.0,Sony Pictures,Original Screenplay,Thriller/Suspense,Contemporary Fiction,Jon Avnet,5.0,5.9,31205.0,False


In [13]:
# Budget-vs-gross scatter plot again, now coloured by the above_average flag
# (rating above the median or not). The point colour comes from the colour
# encoding channel, so no fixed mark colour is set on the mark itself.
(
    alt.Chart(movies_data, height=400, width=600)
    .mark_point()
    .encode(
        x='Production_Budget',
        y='Worldwide_Gross',
        color='above_average',
    )
    .properties(title='Production_Budget vs Worldwide_Gross')
)

In [15]:
# Worldwide gross split by the above-median rating flag, drawn as one panel
# per MPAA rating (the column facet).
(
    alt.Chart(movies_data, height=400, width=140)
    .mark_bar()
    .encode(
        x='above_average:O',
        y='Worldwide_Gross:Q',
        color='above_average:N',
        column='MPAA_Rating:N',
    )
)

In [16]:
# US gross broken down by MPAA rating: one horizontal bar row per rating,
# coloured by rating.
(
    alt.Chart(movies_data, height=400, width=600)
    .mark_bar()
    .encode(
        x='US_Gross',
        y='MPAA_Rating',
        color='MPAA_Rating',
        # Order the marks by MPAA rating; sort='ascending' requests ascending
        # (lexicographic) order of the rating labels.
        # NOTE(review): the order channel presumably governs mark/stack order
        # rather than the axis order -- confirm this matches the intent.
        order=alt.Order('MPAA_Rating', sort='ascending'),
    )
    .properties(title='US Gross vs MPAA_Rating')
)