# Movies

### Imports

In [None]:
import pandas as pd
import numpy as np

from plotly.graph_objs import *
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf

init_notebook_mode()
cf.go_offline()

dims = (980,700)

In [None]:
df = pd.read_csv('Titles.csv', skiprows=9, encoding='utf-16')

### Data Cleaning

In [3]:
df.sample(5)

AttributeError: 'DataFrame' object has no attribute 'sample'

In [None]:
df.info()

In [None]:
# drop empty columns
df = df.iloc[:,:-2]
# clean headers
df.columns = df.columns.str.lower().str.replace(' ','').str.replace('\.\.\.','')

In [None]:
df.sample(5)

In [None]:
# Datatypes
df.date = pd.to_datetime(df.date)
# Cleaning up the period columns
df.quarter = df.date.dt.quarter
df.month = df.date.dt.month

In [None]:
df.info()

### Exploratory Data Analysis

In [None]:
# Gives you no information cause it's all the same - so could drop it, unless you have a use for it
df.target.value_counts()

In [None]:
# even the graph looks silly :)
df['target'].iplot(kind='histogram', dimensions=dims)

### Let's try and see how the `channelgroup` has done each `year`.

In [None]:
channels = df.groupby(['channelgroup', 'year'])

In [None]:
# We've created a clean little dataframe that's going to be useful for plotting
channels[['allc&s']].sum()

In [None]:
# but really we want it to have channelgroups as rows and year as columns, so we use unstack()
channels['allc&s'].sum().unstack()

In [None]:
# You could save the above into a variable, or just chain it up like this when calling iplot()
channels['allc&s'].sum().unstack().iplot(kind='bar', dimensions=dims, barmode='stack')

In [None]:
# switching the axis is as simple as adding a .T in the mix!
channels['allc&s'].sum().unstack().T.iplot(kind='bar', dimensions=dims, barmode='stack')

This is just the beginning! Good luck :)