In [25]:
# importing and formatting the data for usage
import pandas 
dataframe = pandas.read_csv('ViewingActivity-sample.csv') # read our Netflix data into a dataframe
dataframe.shape # tells the number of rows and columns in the dataframe 

dataframe.drop( # make it only print out the columns we want using the drop function
    [ # pass in a list of the argumnets we'd like to exclude
    'Attributes',
    'Supplemental Video Type',
    "Device Type",
    'Bookmark', 
    'Latest Bookmark',
    'Country'
    ], axis=1) # axis removes the columns

dataframe.head(10) # prints out first ten columns of the csv
# dataframe.dtypes ; get a list of the data types for each column in dataframe

dataframe['Start Time'] = pandas.to_datetime(dataframe['Start Time'], utc=True)
# convert Start Time to a datetime format so that pandas can understand and perform calculations
# convert Start Time from UTC to local timezone ( the "utc=True" arg)
# convert duration to timedelta (time duration format for pandas)

# change the Start Time column into the dataframe's index as DatetimeIndex
dataframe = dataframe.set_index('Start Time')
# convert from UTC to western time
dataframe.index = dataframe.index.tz_convert('US/Pacific')
# reset back to a column
dataframe = dataframe.reset_index()

dataframe.head(1)

# convert Duration to a timedelta, a measure of the time duration in a format that panda can use
dataframe['Duration'] = pandas.to_timedelta(dataframe['Duration']) 

# create a new dataframe that takes the data from dataframe
# only takes the rows where the "Title" column contains our specified string that holds the exact title of the show(s) we want; can add new arguments for each additional show
# uses the str.contains function to do so; regex tells the function that the prev arg is a stiring
specific_show = dataframe[dataframe['Title'].str.contains('The Office (U.S.)', regex=False)]

specific_show.sample(20)
# specific_show.shape  checking to see if all rows contained only episodes from specific_show

# now we filter out short durations to exclude trailer playing from counting as a view
# we do this by  setting a condition with a 1 minute required watch duration
specific_show = specific_show[(specific_show['Duration'] > '0 days 00:01:00')]
specific_show.shape


(130, 10)

In [33]:
# Analyzing the data

# sum() adds up the total of the list, Duration
specific_show['Duration'].sum()

# finding watch times for the show
specific_show['weekday'] = specific_show['Start Time'].dt.weekday
specific_show['hour'] = specific_show['Start Time'].dt.hour

specific_show.head(20) # test

#make charts show in jpynb
%matplotlib inline
import matplotlib

specific_show['weekday'] = pandas.Categorical(specific_show['weekday'], 
categories=[
    0,1,2,3,4,5,6 # tell pandas to order by weekdays
], ordered=True) 

# sort by day
specific_show_by_day = specific_show['weekday'].value_counts() # assigns the rows per day to a variable
specific_show_by_day = specific_show_by_day.sort_index() # sorts in order using the categorical func

# display data in a bar chart
specific_show_by_day.plot(kind='bar', figsize=(20,10), title='Your Show Episodes Watched by Day') 

# sort by hour
specific_show_by_hour = specific_show['hour'].value_counts() # assigns the rows per hour to a var
specific_show_by_hour = specific_show_by_hour.sort_index() # sorts in order again

# display in another chart
specific_show_by_hour.plot(kind='bar', figsize=(20,10), title='Your Show Episodes Watched by Hour')

ModuleNotFoundError: No module named 'matplotlib'