In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Netflix viewing-activity export (CSV) into a dataframe.
csv_path = 'viewing-activity-data.csv'
df = pd.read_csv(csv_path)

In [3]:
# Dimensions of the loaded dataframe as a (rows, columns) tuple.
df.shape

(39389, 10)

In [4]:
# Preview the first two rows to see which columns the export contains.
df.head(2)

Unnamed: 0,Profile Name,Start Time,Duration,Attributes,Title,Supplemental Video Type,Device Type,Bookmark,Latest Bookmark,Country
0,Brother Two,2/18/2022 8:36,0:19:03,,Komi Can't Communicate: Season 1: It's just th...,,DefaultWidevineAndroidPhone,0:24:01,0:24:01,US (United States)
1,Brother Two,2/18/2022 7:10,0:03:43,,Komi Can't Communicate: Season 1: It's just th...,,DefaultWidevineAndroidPhone,0:03:53,Not latest view,US (United States)


In [5]:
# Discard the columns that are not needed for this analysis; edit this list
# to change which columns are removed.
# QUESTION: Do you recommend cleaning the data in the CSV or in Python?
columns_to_drop = [
    'Attributes',
    'Supplemental Video Type',
    'Device Type',
    'Bookmark',
    'Latest Bookmark',
    'Country',
]
df = df.drop(columns=columns_to_drop)
df.head(2)

Unnamed: 0,Profile Name,Start Time,Duration,Title
0,Brother Two,2/18/2022 8:36,0:19:03,Komi Can't Communicate: Season 1: It's just th...
1,Brother Two,2/18/2022 7:10,0:03:43,Komi Can't Communicate: Season 1: It's just th...


In [6]:
# List the dtype of each remaining column to see which ones still need
# converting before they can be used for date or time arithmetic.
df.dtypes

Profile Name    object
Start Time      object
Duration        object
Title           object
dtype: object

In [7]:
# Parse the Start Time strings into real timestamps.
# utc=True makes the result timezone-aware, expressed in UTC.
start_time_utc = pd.to_datetime(df['Start Time'], utc=True)
df['Start Time'] = start_time_utc
df.dtypes

Profile Name                 object
Start Time      datetime64[ns, UTC]
Duration                     object
Title                        object
dtype: object

In [8]:
# Shift the timestamps from UTC to US Eastern time.
# DataFrame.tz_convert operates on the index, so Start Time is moved into the
# index for the conversion and then restored as an ordinary column (which
# leaves it as the first column of the frame).
df = (
    df
    .set_index('Start Time')
    .tz_convert('US/Eastern')
    .reset_index()
)

df.head(1)


Unnamed: 0,Start Time,Profile Name,Duration,Title
0,2022-02-18 03:36:00-05:00,Brother Two,0:19:03,Komi Can't Communicate: Season 1: It's just th...


In [9]:
# Parse the Duration strings into timedeltas so they can be added together.
df = df.assign(Duration=pd.to_timedelta(df['Duration']))
df.dtypes

Start Time      datetime64[ns, US/Eastern]
Profile Name                        object
Duration                   timedelta64[ns]
Title                               object
dtype: object

In [10]:
# Build a separate dataframe, ahs, holding only the American Horror Story
# rows, i.e. the rows of df whose Title contains that phrase.
# - regex=False does a plain substring match.
# - na=False treats a missing Title as "no match", so the mask stays strictly
#   boolean; a NaN in the mask would make the row selection raise.
# - .copy() makes ahs independent of df, so later edits to ahs cannot trigger
#   SettingWithCopyWarning or be confused with edits to df.
is_ahs = df['Title'].str.contains('American Horror Story', regex=False, na=False)
ahs = df[is_ahs].copy()

# Spot-check up to 20 matching rows. The fixed random_state keeps the sample
# identical across re-runs, and min() avoids an error when fewer than 20 rows
# match.
ahs.sample(min(20, len(ahs)), random_state=0)

Unnamed: 0,Start Time,Profile Name,Duration,Title
3446,2018-10-22 00:49:00-04:00,Brother Two,0 days 00:22:18,American Horror Story: Cult: Drink the Kool-Ai...
3435,2018-10-24 00:12:00-04:00,Brother Two,0 days 00:37:48,American Horror Story: Cult: Great Again (Epis...
34564,2017-12-01 21:59:00-05:00,Me,0 days 00:42:29,American Horror Story: Coven: Boy Parts (Episo...
6143,2016-10-08 09:27:00-04:00,Brother Two,0 days 00:43:04,American Horror Story: Hotel: Flicker (Episode 7)
3440,2018-10-23 00:03:00-04:00,Brother Two,0 days 00:05:37,American Horror Story: Cult: Charles (Manson) ...
3439,2018-10-23 00:13:00-04:00,Brother Two,0 days 00:19:30,American Horror Story: Cult: Charles (Manson) ...
3451,2018-10-20 03:09:00-04:00,Brother Two,0 days 00:02:56,American Horror Story: Cult: Winter of Our Dis...
6154,2016-10-05 15:47:00-04:00,Brother Two,0 days 00:06:08,American Horror Story: Hotel: Chutes and Ladde...
6151,2016-10-06 20:35:00-04:00,Brother Two,0 days 00:52:34,American Horror Story: Hotel: Mommy (Episode 3)
3461,2018-10-17 02:29:00-04:00,Brother Two,0 days 00:08:16,American Horror Story: Cult: Don't Be Afraid o...


In [11]:
# Add up the Duration values of every row in ahs to get one total Timedelta.
ahs['Duration'].sum()

Timedelta('0 days 22:41:53')