### Loading Python Libraries

In [38]:
# loading necessary libraries
import pandas as pd
from googleapiclient.discovery import build
import datetime

### Accessing the YouTube API

##### Accessing the Channel

In [39]:
# key to access YouTube API (keep private)
api_key = "APIKEY"

# interacting with the API
api_service_name = "youtube"
api_version = "v3"

youtube = build(
    api_service_name, api_version, developerKey = api_key)

request = youtube.channels().list(
    part="snippet,contentDetails,statistics",

    # unique channel id that corresponds to the channel I'm interested in
    id="UCIPPMRA040LQr5QPyJEbmXA"
)
channel_response = request.execute()


##### Channel Statistics

In [40]:
number_of_subscribers = int(channel_response['items'][0]['statistics']['subscriberCount'])
number_of_views = int(channel_response['items'][0]['statistics']['viewCount'])
number_of_videos = int(channel_response['items'][0]['statistics']['videoCount'])
uploads_id = channel_response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

print('Here are some statistics about the channel, MrBeast Gaming:')
print("Number of subscribers:", "{:,}".format(number_of_subscribers))
print("Number of views:", "{:,}".format(number_of_views))
print("Number of videos:", "{:,}".format(number_of_videos))
print("Upload ID:", uploads_id)

Here are some statistics about the channel, MrBeast Gaming:
Number of subscribers: 30,800,000
Number of views: 5,419,152,626
Number of videos: 138
Upload ID: UUIPPMRA040LQr5QPyJEbmXA


##### Accessing the Uploaded Videos

In [41]:
request = youtube.playlistItems().list(
        part="snippet,contentDetails",
        playlistId="UUIPPMRA040LQr5QPyJEbmXA"
    )
videos_response = request.execute()

videos = []
for item in videos_response['items']:
        videos.append(item['contentDetails']['videoId'])

next_page_token = videos_response.get('nextPageToken')
while next_page_token is not None:
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = "UUIPPMRA040LQr5QPyJEbmXA",
                maxResults = 50,
                pageToken = next_page_token)
    videos_response = request.execute()

    for item in videos_response['items']:
        videos.append(item['contentDetails']['videoId'])

    next_page_token = videos_response.get('nextPageToken')
print('We have successfully accessed', len(videos), 'videos from the channel.')
print("There are actually", number_of_videos, "videos on the channel.")
print('This is a difference of', number_of_videos - len(videos), 'videos.')

We have successfully accessed 138 videos from the channel.
There are actually 138 videos on the channel.
This is a difference of 0 videos.


##### Turning Video Information from a .JSON into a DataFrame

In [42]:
temp = []
for i in range(len(videos)):
    # getting the information about the ith video
    video_stats_request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id = videos[i]
        )
    video_stats_request = video_stats_request.execute()
    # getting the video type
    video_type = video_stats_request['items'][0]['kind'].split('#')[1]
    # getting the title
    title = video_stats_request['items'][0]['snippet']['title']
    # getting the publish date
    publish_date = video_stats_request['items'][0]['snippet']['publishedAt']
    # getting the number of views
    views = int(video_stats_request['items'][0]['statistics']['viewCount'])
    # getting the number of likes
    likes = int(video_stats_request['items'][0]['statistics']['likeCount'])
    # getting the number of comments
    comments = int(video_stats_request['items'][0]['statistics']['commentCount'])
    # getting the duration
    duration = video_stats_request['items'][0]['contentDetails']['duration']

    temp.append([title, publish_date, views, likes, comments, duration, video_type])
video_statistics = pd.DataFrame(temp, columns = ['Title', 'Publish Date', 'Views', 'Likes', 'Comments', 'Duration', 'Video Type'])

In [43]:
video_statistics.head()


Unnamed: 0,Title,Publish Date,Views,Likes,Comments,Duration,Video Type
0,"If You Build It, I'll Pay For It!",2022-12-31T20:00:04Z,16945756,610733,20081,PT11M42S,video
1,World's Hardest Challenge!,2022-12-16T22:18:00Z,17730508,540855,22964,PT14M30S,video
2,100 Youtuber Minecraft Battle Royale!,2022-10-28T21:00:09Z,17674328,992919,45801,PT16M3S,video
3,"Red vs Blue For $1,000,000",2022-10-12T20:00:12Z,10169817,395452,11497,PT10M43S,video
4,Minecraft with Ultra Realistic Graphics!,2022-09-16T19:00:37Z,15332613,496234,13029,PT8M47S,video


### Data Cleaning

In [44]:
video_statistics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         138 non-null    object
 1   Publish Date  138 non-null    object
 2   Views         138 non-null    int64 
 3   Likes         138 non-null    int64 
 4   Comments      138 non-null    int64 
 5   Duration      138 non-null    object
 6   Video Type    138 non-null    object
dtypes: int64(3), object(4)
memory usage: 7.7+ KB


The above code...
- Gets the data types of each of the variables
- Shows us that there are no missing values, which makes our lives much easier.
- Show that 'Publish Date' is not in a date time format

In [45]:
video_statistics['Video Type'].value_counts()

video    138
Name: Video Type, dtype: int64

In [46]:
del video_statistics['Video Type']

All video are a video, none appear to be labeled specifically as shorts. Since all of the values are the same I decided to delete is since it doesn't give us much information.

In [47]:
# duration includes H
video_statistics[video_statistics['Duration'].str.contains('H')]

Unnamed: 0,Title,Publish Date,Views,Likes,Comments,Duration


No videos are an hour long or greater.

The function below converts the data from the format it is in directly from the YouTube API (which is not usable unfortunately) into seconds (very usable).

In [48]:
# converting duration to seconds
def convert_to_seconds(duration):
    # sum of the total duration of the video in seconds
    duration_seconds = 0

    # remove the string 'PT' (which is present in every observation)
    duration = duration[2:]

    # If the H is present, which indicates the video is equal to or longer than an hour, add the amount of seconds to the duration_seconds variable
    duration = duration.split('H')
    if len(duration) == 1:
        duration = duration[0]

    elif len(duration) == 2:
        duration_seconds += int(duration[0]) * 3600
        duration = duration[1]

    # If the M is present, which indicates the video is equal to or longer than an minute, add the amount of seconds to the duration_seconds variable 
    duration = duration.split('M')
    if len(duration) == 1:
        duration = duration[0]

    elif len(duration) == 2:
        duration_seconds += int(duration[0]) * 60
        duration = duration[1]

    # add the number of seconds to the video (if present)
    if len(duration) > 0:
        duration_seconds += int(duration.split('S')[0])

    return duration_seconds

In [49]:
# apply the function to the duration column
video_statistics['Duration in Seconds'] = video_statistics['Duration'].apply(convert_to_seconds)

In [50]:
# getting duration in minutes
video_statistics['Duration in Minutes'] = round(video_statistics['Duration in Seconds'] / 60,3)

In [51]:
# deleting the original duration column
del video_statistics['Duration']

##### Converting the Publish Date to a Date Format and Determining the Days Since Published

Converting to a DateTime format is important so we can actually work with the date.

In [52]:
video_statistics['Publish Date'].head()

0    2022-12-31T20:00:04Z
1    2022-12-16T22:18:00Z
2    2022-10-28T21:00:09Z
3    2022-10-12T20:00:12Z
4    2022-09-16T19:00:37Z
Name: Publish Date, dtype: object

Above we can see that the code is not in DateTime format (dtype: object).

In [53]:
# converting the publish date to datetime
video_statistics['Publish Date'] = pd.to_datetime(video_statistics['Publish Date'])

In [54]:
video_statistics['Publish Date'].head()

0   2022-12-31 20:00:04+00:00
1   2022-12-16 22:18:00+00:00
2   2022-10-28 21:00:09+00:00
3   2022-10-12 20:00:12+00:00
4   2022-09-16 19:00:37+00:00
Name: Publish Date, dtype: datetime64[ns, UTC]

Now we have dtype: datetime64.

In [55]:
# converting the time zone from UTC to EST
video_statistics['Publish Date'] = video_statistics['Publish Date'].dt.tz_convert('EST')

In [56]:
video_statistics['Publish Date'].head()

0   2022-12-31 15:00:04-05:00
1   2022-12-16 17:18:00-05:00
2   2022-10-28 16:00:09-05:00
3   2022-10-12 15:00:12-05:00
4   2022-09-16 14:00:37-05:00
Name: Publish Date, dtype: datetime64[ns, EST]

The code below removes the time zone from the datetime observation.

In [57]:
video_statistics['Publish Date'] = video_statistics['Publish Date'].dt.tz_localize(None)

In [58]:
video_statistics['Publish Date'].tail()

133   2020-05-22 15:01:29
134   2020-05-20 15:01:40
135   2020-05-16 14:35:31
136   2020-05-14 15:16:51
137   2020-05-12 15:00:11
Name: Publish Date, dtype: datetime64[ns]

In [59]:
# getting the difference (in days) between the current time and the publish date from the column Publish Date
video_statistics['Days Since Published'] = (datetime.datetime.now() - video_statistics['Publish Date']).dt.days

In [60]:
video_statistics['Days Since Published']

0       16
1       31
2       80
3       96
4      122
      ... 
133    969
134    971
135    975
136    977
137    979
Name: Days Since Published, Length: 138, dtype: int64

The below code shows us the comment to view ratio and the like to view ratio.

In [61]:
# comment to views ratio
video_statistics['Comment to View Ratio'] = round(video_statistics['Comments'] / video_statistics['Views'],5)
# like to view ratio
video_statistics['Like to View Ratio'] = round(video_statistics['Likes'] / video_statistics['Views'],5)

In [62]:
video_statistics.head()

Unnamed: 0,Title,Publish Date,Views,Likes,Comments,Duration in Seconds,Duration in Minutes,Days Since Published,Comment to View Ratio,Like to View Ratio
0,"If You Build It, I'll Pay For It!",2022-12-31 15:00:04,16945756,610733,20081,702,11.7,16,0.00119,0.03604
1,World's Hardest Challenge!,2022-12-16 17:18:00,17730508,540855,22964,870,14.5,31,0.0013,0.0305
2,100 Youtuber Minecraft Battle Royale!,2022-10-28 16:00:09,17674328,992919,45801,963,16.05,80,0.00259,0.05618
3,"Red vs Blue For $1,000,000",2022-10-12 15:00:12,10169817,395452,11497,643,10.717,96,0.00113,0.03888
4,Minecraft with Ultra Realistic Graphics!,2022-09-16 14:00:37,15332613,496234,13029,527,8.783,122,0.00085,0.03236


In [63]:
# new column which is the number of views per day
video_statistics['Views per Day'] = round(video_statistics['Views'] / video_statistics['Days Since Published'],1)

In [64]:
# renaming publish date to publish time (EST)
video_statistics.rename(columns = {'Publish Date':'Publish Time (EST)'}, inplace = True)


In [65]:
# taking pubslish time (EST), taking only the date, and storing it in a new column Publish Date
video_statistics['Publish Date'] = video_statistics['Publish Time (EST)'].dt.date

In [66]:
# adding a new column Title and Day Published which is the title and the publish date
video_statistics['Title and Day Published'] = video_statistics['Title'] + ' - (' + video_statistics['Publish Date'].astype(str) + ')'

In [67]:
# switching the order of the columns
video_statistics = video_statistics[['Title and Day Published', 'Title', 'Publish Date', 'Publish Time (EST)', 'Days Since Published', 'Views', 'Views per Day', 'Likes', 'Like to View Ratio', 'Comments', 'Comment to View Ratio', 'Duration in Seconds', 'Duration in Minutes']]

In [68]:
video_statistics.head()

Unnamed: 0,Title and Day Published,Title,Publish Date,Publish Time (EST),Days Since Published,Views,Views per Day,Likes,Like to View Ratio,Comments,Comment to View Ratio,Duration in Seconds,Duration in Minutes
0,"If You Build It, I'll Pay For It! - (2022-12-31)","If You Build It, I'll Pay For It!",2022-12-31,2022-12-31 15:00:04,16,16945756,1059109.8,610733,0.03604,20081,0.00119,702,11.7
1,World's Hardest Challenge! - (2022-12-16),World's Hardest Challenge!,2022-12-16,2022-12-16 17:18:00,31,17730508,571951.9,540855,0.0305,22964,0.0013,870,14.5
2,100 Youtuber Minecraft Battle Royale! - (2022-...,100 Youtuber Minecraft Battle Royale!,2022-10-28,2022-10-28 16:00:09,80,17674328,220929.1,992919,0.05618,45801,0.00259,963,16.05
3,"Red vs Blue For $1,000,000 - (2022-10-12)","Red vs Blue For $1,000,000",2022-10-12,2022-10-12 15:00:12,96,10169817,105935.6,395452,0.03888,11497,0.00113,643,10.717
4,Minecraft with Ultra Realistic Graphics! - (20...,Minecraft with Ultra Realistic Graphics!,2022-09-16,2022-09-16 14:00:37,122,15332613,125677.2,496234,0.03236,13029,0.00085,527,8.783


### Exporting the Data to a Daily CSV File & Calculating Daily Differences

In [69]:
from datetime import datetime, timedelta
# Get current date
now = datetime.now()

# Get date of 24 hours ago
yesterday = now - timedelta(days=1)

# Format dates as strings
now_str = now.strftime("%Y-%m-%d")
yesterday_str = yesterday.strftime("%Y-%m-%d")
print(now_str)
print(yesterday_str)

2023-01-16
2023-01-15


In [70]:
# write the current data to a csv file
video_statistics.to_csv("data/MrBeastGaming_"+f"{now_str}.csv", index=False)

# pull in yesterday's data (24 hours ago)
yesterday_video_statistics = pd.read_csv("data/MrBeastGaming_"+f"{yesterday_str}.csv")

In [71]:
# reusable function that does what is described above
def get_difference_csv(yesterday_video_statistics, video_statistics, difference_csv_name = 'data/DailyDifference.csv'):
    # calculating the difference
    # Title and Day Published (Constant)
    # Days Since Published, Views, Likes Comments (Different)
    difference = video_statistics.copy()
    difference['Daily Views'] = video_statistics['Views'] - yesterday_video_statistics['Views']
    difference['Daily Likes'] = video_statistics['Likes'] - yesterday_video_statistics['Likes']
    difference['Daily Comments'] = video_statistics['Comments'] - yesterday_video_statistics['Comments']
    difference['Date'] = now_str

     # read in the difference CSV
    difference_csv = pd.read_csv(difference_csv_name)

    # subset with only these columns: Days Since Published, Views, Likes Comments
    difference = difference[['Date','Title and Day Published','Days Since Published', 'Daily Views', 'Daily Likes', 'Daily Comments']]
    #difference_csv = difference_csv[['Date','Title and Day Published','Days Since Published', 'Views', 'Likes', 'Comments']]
    
    # rename the columns to Daily Views, Daily Likes, Daily Comments
    difference_csv.rename(columns = {'Views':'Daily Views', 'Likes':'Daily Likes', 'Comments':'Daily Comments'}, inplace = True)
    # adding a new line to the difference CSV
    difference_csv = difference_csv.append(difference, ignore_index=True)

    # exporting the difference CSV
    difference_csv.to_csv(difference_csv_name, index=False)

    print('Success.')

    return difference_csv    

In [72]:
get_difference_csv(yesterday_video_statistics, video_statistics)

Success.


  difference_csv = difference_csv.append(difference, ignore_index=True)


Unnamed: 0,Date,Title and Day Published,Days Since Published,Daily Views,Daily Likes,Daily Comments
0,2023-01-14,"If You Build It, I'll Pay For It! - (2022-12-31)",13,528810,9620,197
1,2023-01-14,World's Hardest Challenge! - (2022-12-16),28,134774,1603,46
2,2023-01-14,100 Youtuber Minecraft Battle Royale! - (2022-...,77,68938,1603,24
3,2023-01-14,"Extreme $1,000,000 Challenge! - (2022-10-12)",93,44078,665,2
4,2023-01-14,Minecraft with Ultra Realistic Graphics! - (20...,119,312637,5608,23
...,...,...,...,...,...,...
271,2023-01-16,"$10,000 Obstacle Course - Challenge - (2020-05...",969,5910,109,-3
272,2023-01-16,"Last to Survive Random Blocks wins $10,000 - C...",971,14571,259,1
273,2023-01-16,"Last to Survive Arena wins $10,000 - Challenge...",975,19377,230,0
274,2023-01-16,"$10,000 Bank robbery - Challenge - (2020-05-14)",977,10762,166,1


### Future Analysis
- Analyzing how MrBeast media appearances (podcasts, videos with other creators) impact channel views