# Movies

### Imports

In [None]:
import pandas as pd
import numpy as np

from plotly.graph_objs import *
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf

# Initialise plotly's offline notebook mode so figures render inside the notebook
init_notebook_mode()
# Put cufflinks in offline mode so the .iplot() calls below plot locally
cf.go_offline()

# Shared figure size, passed as `dimensions=` to every iplot() call in this notebook
# (presumably (width, height) in pixels - confirm against the cufflinks docs)
dims = (980,700)

In [None]:
# Load the raw titles export, decoded as UTF-16.
# The first 9 lines of the file are skipped (presumably a report preamble
# that precedes the header row - TODO confirm against the file).
df = pd.read_csv(
    'Titles.csv',
    encoding='utf-16',
    skiprows=9,
)

### Data Cleaning

In [None]:
# Peek at five randomly chosen rows of the raw frame
df.sample(n=5)

In [4]:
# Summarise column names, dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102389 entries, 0 to 102388
Data columns (total 12 columns):
Target               102389 non-null object
Channel Group ...    102389 non-null object
Multi Period         102389 non-null int64
Year                 102389 non-null int64
Quarter              102389 non-null object
Month                102389 non-null object
Date                 102389 non-null object
Time                 102389 non-null object
Main Title           102389 non-null object
All C&S              102389 non-null float64
Unnamed: 10          0 non-null float64
Unnamed: 11          0 non-null float64
dtypes: float64(3), int64(2), object(7)
memory usage: 9.4+ MB


In [5]:
# Drop columns that hold no data at all (the raw file yields two trailing,
# entirely-null "Unnamed" columns). Selecting by emptiness instead of by
# position keeps this cell idempotent: re-running it cannot remove real columns.
df = df.dropna(axis='columns', how='all')

# Clean headers: lowercase, remove spaces, and strip the literal "..." suffix.
# regex=False treats the dots as plain characters, so the result is the same
# on every pandas version (the default for `regex` changed in pandas 2.0) and
# no invalid "\." escape sequence is needed.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '', regex=False)
    .str.replace('...', '', regex=False)
)

In [6]:
# Look at five random rows again to check the cleaned column headers
df.sample(n=5)

Unnamed: 0,target,channelgroup,multiperiod,year,quarter,month,date,time,maintitle,allc&s
72739,A4+,Fox Movies Premium,15,2015,2015q4,2015m11,2015-11-06,08:20:00,3 ninjas kick back,0.0
1379,A4+,HBO,14,2014,2014q2,2014m4,2014-04-09,16:30:00,I am sam,0.0011
68736,A4+,Fox Movies Premium,15,2015,2015q1,2015m1,2015-01-18,19:10:00,The grand budapest hotel,0.0028
72513,A4+,Fox Movies Premium,15,2015,2015q4,2015m10,2015-10-20,23:10:00,Whiplash,0.0041
69900,A4+,Fox Movies Premium,15,2015,2015q2,2015m4,2015-04-13,03:45:00,The end of the affair,0.0


In [7]:
# Datatypes
df.date = pd.to_datetime(df.date)
# Cleaning up the period columns
df.quarter = df.date.dt.quarter
df.month = df.date.dt.month

In [8]:
# Confirm the conversions: `date` should now be datetime64, `quarter` and `month` integers
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102389 entries, 0 to 102388
Data columns (total 10 columns):
target          102389 non-null object
channelgroup    102389 non-null object
multiperiod     102389 non-null int64
year            102389 non-null int64
quarter         102389 non-null int64
month           102389 non-null int64
date            102389 non-null datetime64[ns]
time            102389 non-null object
maintitle       102389 non-null object
allc&s          102389 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(4), object(4)
memory usage: 7.8+ MB


### Exploratory Data Analysis

In [9]:
# Every row carries the same target value, so this column adds no information.
# It could be dropped unless there is a separate use for it.
df['target'].value_counts()

A4+    102389
Name: target, dtype: int64

In [10]:
# A histogram of a single-valued column is just one bar - even the graph looks silly :)
df['target'].iplot(kind='histogram', dimensions=dims)

### Let's see how each `channelgroup` performed in each `year`.

In [11]:
# Group the rows by channel group and year; the cells below aggregate this grouping
channels = df.groupby(by=['channelgroup', 'year'])

In [12]:
# Sum `allc&s` per (channelgroup, year). Selecting with a list (double brackets)
# keeps the result a DataFrame - a clean little table that is useful for plotting.
channels[['allc&s']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,allc&s
channelgroup,year,Unnamed: 2_level_1
Fox Action Movies,2014,50.3938
Fox Action Movies,2015,55.2117
Fox Action Movies,2016,18.62
Fox Family Movies,2014,72.7969
Fox Family Movies,2015,61.5668
Fox Family Movies,2016,16.4405
Fox Movies Premium,2014,232.1927
Fox Movies Premium,2015,97.4499
Fox Movies Premium,2016,39.0427
HBO,2014,232.2866


In [13]:
# For plotting we want channel groups as rows and years as columns,
# so unstack() pivots the inner index level (year) into columns.
channels['allc&s'].sum().unstack()

year,2014,2015,2016
channelgroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox Action Movies,50.3938,55.2117,18.62
Fox Family Movies,72.7969,61.5668,16.4405
Fox Movies Premium,232.1927,97.4499,39.0427
HBO,232.2866,149.9917,44.6853
HBO Family,100.4489,80.9435,21.9236
HBO HITS,117.9658,92.0839,22.4105
HBO Signature,48.0736,46.2915,12.4199


In [14]:
# The pivoted table could be stored in a variable first; here it is chained
# straight into iplot() to draw one stacked bar per channel group.
(
    channels['allc&s']
    .sum()
    .unstack()
    .iplot(kind='bar', barmode='stack', dimensions=dims)
)

In [15]:
# Transposing the pivoted table swaps the axes: one stacked bar per year instead
(
    channels['allc&s']
    .sum()
    .unstack()
    .transpose()
    .iplot(kind='bar', barmode='stack', dimensions=dims)
)

This is just the beginning! Good luck :)