### Loading Python Libraries

In [117]:
# loading necessary libraries
import pandas as pd
from googleapiclient.discovery import build
import datetime

### Accessing the YouTube API

##### Accessing the Channel

In [118]:
# key to access YouTube API (keep private)
api_key = "AIzaSyAcL_fq1YQz4tDxxTHmwkAsjub0yj0c6Zo"

# interacting with the API
api_service_name = "youtube"
api_version = "v3"

# storing channel unique ids
MrBeast = 'UCX6OQ3DkcsbYNE6H8uQQuVA'
MrBeastGaming = 'UCIPPMRA040LQr5QPyJEbmXA'

youtube = build(
    api_service_name, api_version, developerKey = api_key)

request = youtube.channels().list(
    part="snippet,contentDetails,statistics",

    # unique channel id that corresponds to the channel I'm interested in
    id=MrBeast
)
channel_response = request.execute()


##### Channel Statistics

In [119]:
number_of_subscribers = int(channel_response['items'][0]['statistics']['subscriberCount'])
number_of_views = int(channel_response['items'][0]['statistics']['viewCount'])
number_of_videos = int(channel_response['items'][0]['statistics']['videoCount'])
uploads_id = channel_response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

print('Here are some statistics about the channel, MrBeast Gaming:')
print("Number of subscribers:", "{:,}".format(number_of_subscribers))
print("Number of views:", "{:,}".format(number_of_views))
print("Number of videos:", "{:,}".format(number_of_videos))
print("Upload ID:", uploads_id)

Here are some statistics about the channel, MrBeast Gaming:
Number of subscribers: 128,000,000
Number of views: 21,509,793,024
Number of videos: 733
Upload ID: UUX6OQ3DkcsbYNE6H8uQQuVA


##### Accessing the Uploaded Videos

In [120]:
request = youtube.playlistItems().list(
        part="snippet,contentDetails",
        playlistId=uploads_id
    )
videos_response = request.execute()

videos = []
for item in videos_response['items']:
        videos.append(item['contentDetails']['videoId'])

next_page_token = videos_response.get('nextPageToken')
while next_page_token is not None:
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = uploads_id,
                maxResults = 50,
                pageToken = next_page_token)
    videos_response = request.execute()

    for item in videos_response['items']:
        videos.append(item['contentDetails']['videoId'])

    next_page_token = videos_response.get('nextPageToken')
print('We have successfully accessed', len(videos), 'videos from the channel.')
print("There are actually", number_of_videos, "videos on the channel.")
print('This is a difference of', number_of_videos - len(videos), 'videos.')

We have successfully accessed 733 videos from the channel.
There are actually 733 videos on the channel.
This is a difference of 0 videos.


##### Turning Video Information from a .JSON into a DataFrame

In [121]:
temp = []
for i in range(len(videos)):
    # getting the information about the ith video
    video_stats_request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id = videos[i]
        )
    video_stats_request = video_stats_request.execute()
    # getting the video type
    video_type = video_stats_request['items'][0]['kind'].split('#')[1]
    # getting the title
    title = video_stats_request['items'][0]['snippet']['title']
    # getting the publish date
    publish_date = video_stats_request['items'][0]['snippet']['publishedAt']
    # getting the number of views
    views = int(video_stats_request['items'][0]['statistics']['viewCount'])
    # getting the number of likes
    try:
        likes = int(video_stats_request['items'][0]['statistics']['likeCount'])
    except:
        likes = pd.NA
    # getting the number of comments
    try:
        comments = int(video_stats_request['items'][0]['statistics']['commentCount'])
    except:
        comments = pd.NA
    # getting the duration
    duration = video_stats_request['items'][0]['contentDetails']['duration']

    temp.append([title, publish_date, views, likes, comments, duration, video_type])
video_statistics = pd.DataFrame(temp, columns = ['Title', 'Publish Date', 'Views', 'Likes', 'Comments', 'Duration', 'Video Type'])

In [122]:
len(temp)

733

In [123]:
video_statistics.head()


Unnamed: 0,Title,Publish Date,Views,Likes,Comments,Duration,Video Type
0,I Survived 50 Hours In Antarctica,2022-12-24T20:59:59Z,79408982,3533170,133983,PT12M10S,video
1,Hydraulic Press Vs Lamborghini,2022-12-10T21:00:01Z,94497580,4089840,121292,PT10M25S,video
2,Would You Fly To Paris For A Baguette?,2022-12-08T20:12:27Z,457205770,25690316,54637,PT48S,video
3,"100 Kids Vs 100 Adults For $500,000",2022-12-03T21:00:00Z,99263866,3018711,90445,PT15M23S,video
4,Gordon Ramsay Tries Most Expensive Chocolate Bar!,2022-11-19T20:59:59Z,108386817,7753021,18893,PT41S,video


### Data Cleaning

In [124]:
video_statistics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733 entries, 0 to 732
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         733 non-null    object
 1   Publish Date  733 non-null    object
 2   Views         733 non-null    int64 
 3   Likes         726 non-null    object
 4   Comments      732 non-null    object
 5   Duration      733 non-null    object
 6   Video Type    733 non-null    object
dtypes: int64(1), object(6)
memory usage: 40.2+ KB


Out of the 733 rows, there is one case where the comments are not present and seven cases where the likes are not present

The above code...
- Gets the data types of each of the variables
- Shows us that there are no missing values, which makes our lives much easier.
- Show that 'Publish Date' is not in a date time format

In [125]:
video_statistics['Video Type'].value_counts()

video    733
Name: Video Type, dtype: int64

In [126]:
del video_statistics['Video Type']

All video are a video, none appear to be labeled specifically as shorts. Since all of the values are the same I decided to delete is since it doesn't give us much information.

In [127]:
# duration includes H
print(video_statistics[video_statistics['Duration'].str.contains('DT')].shape)
video_statistics[video_statistics['Duration'].str.contains('H')].shape

(1, 6)


(11, 6)

- 1 video is longer than a day!
- 11 videos are longer than one hour

The function below converts the data from the format it is in directly from the YouTube API (which is not usable unfortunately) into seconds (very usable).

In [128]:
# converting duration to seconds
def convert_to_seconds(duration):
    # sum of the total duration of the video in seconds
    duration_seconds = 0

    # handling the 1 to 9 day case
    if 'DT' in duration:
        duration = duration.split('DT')
        duration_seconds += int(duration[0][1:]) * 86400
        duration = duration[1]

    # remove the string 'PT' (which is present in every observation)
    else:
        duration = duration[2:]

    # If the H is present, which indicates the video is equal to or longer than an hour, add the amount of seconds to the duration_seconds variable
    duration = duration.split('H')
    if len(duration) == 1:
        duration = duration[0]

    elif len(duration) == 2:
        duration_seconds += int(duration[0]) * 3600
        duration = duration[1]

    # If the M is present, which indicates the video is equal to or longer than an minute, add the amount of seconds to the duration_seconds variable 
    duration = duration.split('M')
    if len(duration) == 1:
        duration = duration[0]

    elif len(duration) == 2:
        duration_seconds += int(duration[0]) * 60
        duration = duration[1]

    # add the number of seconds to the video (if present)
    if len(duration) > 0:
        duration_seconds += int(duration.split('S')[0])

    return duration_seconds

In [129]:
convert_to_seconds('PT11H48M40S')

42520

In [130]:
# apply the function to the duration column
video_statistics['Duration in Seconds'] = video_statistics['Duration'].apply(convert_to_seconds)

In [131]:
# getting duration in minutes
video_statistics['Duration in Minutes'] = round(video_statistics['Duration in Seconds'] / 60,3)

In [132]:
# deleting the original duration column
del video_statistics['Duration']

##### Converting the Publish Date to a Date Format and Determining the Days Since Published

Converting to a DateTime format is important so we can actually work with the date.

In [133]:
video_statistics['Publish Date'].head()

0    2022-12-24T20:59:59Z
1    2022-12-10T21:00:01Z
2    2022-12-08T20:12:27Z
3    2022-12-03T21:00:00Z
4    2022-11-19T20:59:59Z
Name: Publish Date, dtype: object

Above we can see that the code is not in DateTime format (dtype: object).

In [134]:
# converting the publish date to datetime
video_statistics['Publish Date'] = pd.to_datetime(video_statistics['Publish Date'])

In [135]:
video_statistics['Publish Date'].head()

0   2022-12-24 20:59:59+00:00
1   2022-12-10 21:00:01+00:00
2   2022-12-08 20:12:27+00:00
3   2022-12-03 21:00:00+00:00
4   2022-11-19 20:59:59+00:00
Name: Publish Date, dtype: datetime64[ns, UTC]

Now we have dtype: datetime64.

In [136]:
# converting the time zone from UTC to EST
video_statistics['Publish Date'] = video_statistics['Publish Date'].dt.tz_convert('EST')

In [137]:
video_statistics['Publish Date'].head()

0   2022-12-24 15:59:59-05:00
1   2022-12-10 16:00:01-05:00
2   2022-12-08 15:12:27-05:00
3   2022-12-03 16:00:00-05:00
4   2022-11-19 15:59:59-05:00
Name: Publish Date, dtype: datetime64[ns, EST]

The code below removes the time zone from the datetime observation.

In [138]:
video_statistics['Publish Date'] = video_statistics['Publish Date'].dt.tz_localize(None)

In [139]:
video_statistics['Publish Date'].tail()

728   2013-01-12 20:59:21
729   2013-01-12 18:35:45
730   2013-01-12 17:34:11
731   2012-03-09 18:29:03
732   2012-02-20 17:42:32
Name: Publish Date, dtype: datetime64[ns]

In [141]:
# getting the difference (in days) between the current time and the publish date from the column Publish Date
video_statistics['Days Since Published'] = (datetime.datetime.now() - video_statistics['Publish Date']).dt.days

In [142]:
video_statistics['Days Since Published']

0        24
1        38
2        40
3        45
4        59
       ... 
728    3657
729    3657
730    3657
731    3966
732    3984
Name: Days Since Published, Length: 733, dtype: int64

The below code shows us the comment to view ratio and the like to view ratio.

### Droping the rows with missing values

In [143]:
### Droping the rows with missing values
video_statistics = video_statistics.dropna()

In [144]:
video_statistics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 725 entries, 0 to 732
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Title                 725 non-null    object        
 1   Publish Date          725 non-null    datetime64[ns]
 2   Views                 725 non-null    int64         
 3   Likes                 725 non-null    object        
 4   Comments              725 non-null    object        
 5   Duration in Seconds   725 non-null    int64         
 6   Duration in Minutes   725 non-null    float64       
 7   Days Since Published  725 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(3)
memory usage: 51.0+ KB


In [147]:
video_statistics['Views'].isna().sum()

0

In [148]:
# comment to views ratio
video_statistics['Comment to View Ratio'] = video_statistics['Comments'] / video_statistics['Views']
# like to view ratio
video_statistics['Like to View Ratio'] = video_statistics['Likes'] / video_statistics['Views']

In [152]:
video_statistics.head()

Unnamed: 0,Title,Publish Date,Views,Likes,Comments,Duration in Seconds,Duration in Minutes,Days Since Published,Comment to View Ratio,Like to View Ratio,Views per Day
0,I Survived 50 Hours In Antarctica,2022-12-24 15:59:59,79408982,3533170,133983,730,12.167,24,0.001687,0.044493,3308707.6
1,Hydraulic Press Vs Lamborghini,2022-12-10 16:00:01,94497580,4089840,121292,625,10.417,38,0.001284,0.04328,2486778.4
2,Would You Fly To Paris For A Baguette?,2022-12-08 15:12:27,457205770,25690316,54637,48,0.8,40,0.00012,0.05619,11430144.2
3,"100 Kids Vs 100 Adults For $500,000",2022-12-03 16:00:00,99263866,3018711,90445,923,15.383,45,0.000911,0.030411,2205863.7
4,Gordon Ramsay Tries Most Expensive Chocolate Bar!,2022-11-19 15:59:59,108386817,7753021,18893,41,0.683,59,0.000174,0.071531,1837064.7


In [153]:
# new column which is the number of views per day
video_statistics['Views per Day'] = round(video_statistics['Views'] / video_statistics['Days Since Published'],1)

In [154]:
# renaming publish date to publish time (EST)
video_statistics.rename(columns = {'Publish Date':'Publish Time (EST)'}, inplace = True)


In [155]:
# taking pubslish time (EST), taking only the date, and storing it in a new column Publish Date
video_statistics['Publish Date'] = video_statistics['Publish Time (EST)'].dt.date

In [156]:
# adding a new column Title and Day Published which is the title and the publish date
video_statistics['Title and Day Published'] = video_statistics['Title'] + ' - (' + video_statistics['Publish Date'].astype(str) + ')'

In [157]:
# switching the order of the columns
video_statistics = video_statistics[['Title and Day Published', 'Title', 'Publish Date', 'Publish Time (EST)', 'Days Since Published', 'Views', 'Views per Day', 'Likes', 'Like to View Ratio', 'Comments', 'Comment to View Ratio', 'Duration in Seconds', 'Duration in Minutes']]

In [158]:
video_statistics.head()

Unnamed: 0,Title and Day Published,Title,Publish Date,Publish Time (EST),Days Since Published,Views,Views per Day,Likes,Like to View Ratio,Comments,Comment to View Ratio,Duration in Seconds,Duration in Minutes
0,I Survived 50 Hours In Antarctica - (2022-12-24),I Survived 50 Hours In Antarctica,2022-12-24,2022-12-24 15:59:59,24,79408982,3308707.6,3533170,0.044493,133983,0.001687,730,12.167
1,Hydraulic Press Vs Lamborghini - (2022-12-10),Hydraulic Press Vs Lamborghini,2022-12-10,2022-12-10 16:00:01,38,94497580,2486778.4,4089840,0.04328,121292,0.001284,625,10.417
2,Would You Fly To Paris For A Baguette? - (2022...,Would You Fly To Paris For A Baguette?,2022-12-08,2022-12-08 15:12:27,40,457205770,11430144.2,25690316,0.05619,54637,0.00012,48,0.8
3,"100 Kids Vs 100 Adults For $500,000 - (2022-12...","100 Kids Vs 100 Adults For $500,000",2022-12-03,2022-12-03 16:00:00,45,99263866,2205863.7,3018711,0.030411,90445,0.000911,923,15.383
4,Gordon Ramsay Tries Most Expensive Chocolate B...,Gordon Ramsay Tries Most Expensive Chocolate Bar!,2022-11-19,2022-11-19 15:59:59,59,108386817,1837064.7,7753021,0.071531,18893,0.000174,41,0.683


### Exporting the Data to a Daily CSV File & Calculating Daily Differences

In [159]:
from datetime import datetime, timedelta
# Get current date
now = datetime.now()

# Get date of 24 hours ago
yesterday = now - timedelta(days=1)

# Format dates as strings
now_str = now.strftime("%Y-%m-%d")
yesterday_str = yesterday.strftime("%Y-%m-%d")
print(now_str)
print(yesterday_str)

2023-01-18
2023-01-17


In [161]:
# write the current data to a csv file
video_statistics.to_csv("MrBeast/MrBeast_"+f"{now_str}.csv", index=False)

# pull in yesterday's data (24 hours ago)
yesterday_video_statistics = pd.read_csv("MrBeast/MrBeast_"+f"{yesterday_str}.csv")

In [162]:
# reusable function that does what is described above
def get_difference_csv(yesterday_video_statistics, video_statistics, difference_csv_name = 'MrBeast/DailyDifferenceMrBeast.csv'):
    # calculating the difference
    # Title and Day Published (Constant)
    # Days Since Published, Views, Likes Comments (Different)
    difference = video_statistics.copy()
    difference['Daily Views'] = video_statistics['Views'] - yesterday_video_statistics['Views']
    difference['Daily Likes'] = video_statistics['Likes'] - yesterday_video_statistics['Likes']
    difference['Daily Comments'] = video_statistics['Comments'] - yesterday_video_statistics['Comments']
    difference['Date'] = now_str

     # read in the difference CSV
    difference_csv = pd.read_csv(difference_csv_name)

    # subset with only these columns: Days Since Published, Views, Likes Comments
    difference = difference[['Date','Title and Day Published','Days Since Published', 'Daily Views', 'Daily Likes', 'Daily Comments']]
    #difference_csv = difference_csv[['Date','Title and Day Published','Days Since Published', 'Views', 'Likes', 'Comments']]
    
    # rename the columns to Daily Views, Daily Likes, Daily Comments
    difference_csv.rename(columns = {'Views':'Daily Views', 'Likes':'Daily Likes', 'Comments':'Daily Comments'}, inplace = True)
    # adding a new line to the difference CSV
    difference_csv = difference_csv.append(difference, ignore_index=True)

    # exporting the difference CSV
    difference_csv.to_csv(difference_csv_name, index=False)

    print('Success.')

    return difference_csv    

In [163]:
get_difference_csv(yesterday_video_statistics, video_statistics)

Success.


  difference_csv = difference_csv.append(difference, ignore_index=True)


Unnamed: 0,Date,Title and Day Published,Days Since Published,Daily Views,Daily Likes,Daily Comments
0,,,,,,
1,2023-01-18,I Survived 50 Hours In Antarctica - (2022-12-24),24.0,0.0,0.0,0.0
2,2023-01-18,Hydraulic Press Vs Lamborghini - (2022-12-10),38.0,0.0,0.0,0.0
3,2023-01-18,Would You Fly To Paris For A Baguette? - (2022...,40.0,0.0,0.0,0.0
4,2023-01-18,"100 Kids Vs 100 Adults For $500,000 - (2022-12...",45.0,0.0,0.0,0.0
...,...,...,...,...,...,...
721,2023-01-18,Scary minecraft pig skin! - (2013-01-13),3656.0,,,
722,2023-01-18,Most Epic minecraft skin EVER (Psy) - (2013-0...,3657.0,,,
723,2023-01-18,More birds IN MINECRAFT!! - (2013-01-12),3657.0,,,
724,2023-01-18,Boxy item mod Minecraft. EPIC - (2013-01-12),3657.0,,,


### Future Analysis
- Analyzing how MrBeast media appearances (podcasts, videos with other creators) impact channel views