# Youtube API Exploratory file

Do not run these cells again, it will change our video and channel data significantly. 

In [9]:
# import dependencies
import pandas as pd
from googleapiclient.discovery import build
# import config from parent directory
import sys
sys.path.append('../')
from config import api_key

import numpy as np



In [6]:
youtube = build('youtube', 'v3', developerKey=api_key)

In [10]:
 # First input, video_list, is a list of video IDs
    # Second input, title, is the title of the CATEGORY of videos being passed in
    # This will also save the output as a csv file

def channel_information_grabber(video_list, title):
    # create empty dataframe
    df = pd.DataFrame()

    # for loop to get channel details
    for channel in video_list:
        response = youtube.channels().list(
            part=['snippet', 'statistics', 'topicDetails', 'contentDetails'],
            id=channel
        ).execute()
        # append response to dataframe
        df = df.append(response['items'], ignore_index=True)

    # seperate the snippet column into its own dataframe
    snippet_df = pd.json_normalize(df['snippet'])
    snippet_df = snippet_df.drop(columns=['thumbnails.default.width', 'thumbnails.default.height', 'thumbnails.medium.url',
                                          'thumbnails.medium.width', 'thumbnails.medium.height', 'thumbnails.high.url',
                                          'thumbnails.high.width', 'thumbnails.high.height', 'localized.title', 'localized.description'])

    # seperate the statistics column into its own dataframe
    statistics_df = pd.json_normalize(df['statistics'])

    # seperate the topicDetails column into its own dataframe
    topicDetails_df = pd.json_normalize(df['topicDetails'])

    # seperate the contentDetails column into its own dataframe
    contentDetails_df = pd.json_normalize(df['contentDetails'])

    df = df.merge(snippet_df, left_index=True, right_index=True)
    df = df.merge(statistics_df, left_index=True, right_index=True)
    df = df.merge(topicDetails_df, left_index=True, right_index=True)
    df = df.merge(contentDetails_df, left_index=True, right_index=True)

    # change data types
    df['publishedAt'] = pd.to_datetime(df['publishedAt'])
    df['viewCount'] = df['viewCount'].astype('int64')
    df['subscriberCount'] = df['subscriberCount'].astype('int64')
    df['videoCount'] = df['videoCount'].astype('int64')

    # drop columns that are not needed
    df = df.drop(columns=['kind', 'etag', 'snippet', 'statistics', 'topicDetails',
                 'country', 'hiddenSubscriberCount', 'contentDetails', 'relatedPlaylists.likes'])

    # sort by view count
    df = df.sort_values(by='viewCount', ascending=False)

    # add 'title' column
    df['category_title'] = title

    # export as csv
    df.to_csv(f'../Dataset/csv/categories/{title}_df.csv', index=False)

    return df


In [11]:
def get_50_videos(channel_id):
    # This function takes in a channel ID and returns a list of the 50 most recent video IDs
    response = youtube.channels().list(
        part=['contentDetails'],
        id=channel_id
    ).execute()

    playlist_id = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

    response = youtube.playlistItems().list(
        part=['contentDetails'],
        playlistId=playlist_id,
        maxResults=50
    ).execute()

    # get the video ids
    video_ids = []
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])

    return video_ids

In [12]:
def video_details_grabber(video_id):
    # This function takes in a video ID and returns a dataframe with the video details
    response = youtube.videos().list(part=['snippet', 'statistics', 'topicDetails', 'contentDetails'], id=video_id).execute()
    # get channelID
    try:
        channel_id = response['items'][0]['snippet']['channelId']
    except:
        channel_id = np.nan
    # get video title
    try:
        video_title = response['items'][0]['snippet']['title']
        video_title_clean = ''.join(e for e in video_title if e.isalnum() or e == ' ')
        video_title_clean = video_title_clean.replace('  ', ' ')
    except:
        video_title = 'N/A'
        video_title_clean = 'N/A'
    # get video id
    try:
        video_id = response['items'][0]['id']
    except:
        video_id = np.nan
    # get the date time the video was published
    try:
        video_published = response['items'][0]['snippet']['publishedAt']
    except:
        video_published = None
    # get number of views
    try:
        video_views = response['items'][0]['statistics']['viewCount']
    except:
        video_views = 0
    # get video description
    try:
        video_description = response['items'][0]['snippet']['description']
    except:
        video_description = ''
    # get like count
    try:
        video_likes = response['items'][0]['statistics']['likeCount']
    except:
        video_likes = 0
    # get nummber of comments
    try:
        video_comment_count = response['items'][0]['statistics']['commentCount']
    except:
        video_comment_count = 0
    # get length of video
    try:
        video_length = response['items'][0]['contentDetails']['duration']
        video_length = video_length.replace('PT', '')
        video_length = video_length.replace('H', ':')
        video_length = video_length.replace('M', ':')
        video_length = video_length.replace('S', '')
    except:
        video_length = None

    # get video description
    try:
        video_description = response['items'][0]['snippet']['description']
    except:
        video_description = ''

    # get video tags
    try:
        video_tags = response['items'][0]['snippet']['tags']
    except:
        video_tags = ''

    # retrieve youtube video results
    # video_response = youtube.commentThreads().list(part='snippet,replies', videoId=video_id).execute()

    # create empty dataframe
    df = pd.DataFrame(columns=[
        'channel_id',
        'video_title', 'video_title_clean', 'video_id', 'published', 'video_views', 
        'video_likes', 'video_comment_count', 'video_length', 'video_description', 'video_tags'])

    # append video details to dataframe
    df = df.append({
        'channel_id': channel_id,
        'video_title': video_title,
        'video_title_clean': video_title_clean,
        'video_id': video_id,
        'published': video_published,
        'video_views': video_views,
        'video_likes': video_likes,
        'video_comment_count': video_comment_count,
        'video_length': video_length,
        'video_description': video_description,
        'video_tags': video_tags
    }, ignore_index=True)
    
    return df

In [5]:
top_channels = ['UCbCmjCuTUZos6Inko4u57UQ', 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'UCX6OQ3DkcsbYNE6H8uQQuVA', 'UCk8GzjMOrta8yxDcKfylJYw', 'UCJplp5SjeGSdVdwsfb9Q7lQ',
                'UCJ5v_MCY6GNUBTO8-D3XoAg', 'UCvlE5gTbOvjiolFlEm-c_Ow', 'UC295-Dw_tDNtZXFeAPAW6Aw', 'UCIwFjwMjI0y7PDBVEO9-bkQ', 'UCcdwLMPsaU2ezNSJU1nFoBQ',
                'UCRijo3ddMTht_IHyNSNXpNQ', 'UC3gNmTGu-TTbFPpfSs5kNkg', 'UCfM3zsQsOnfWNUppiycmBuw', 'UC0C-w0YjGpqDXGB8IHb662A', 'UC9CoOnJkIBMdeijd9qYoT_g',
                'UC4NALVCmcmL5ntpV0thoH6w', 'UCqECaJ8Gagnn7YCbPEzWH6g', 'UCRx3mKNUdl8QE06nEug7p6Q', 'UCiGm_E4ZwYSHV3bcW1pnSeQ', 'UC4rlAVgAK0SGk-yTfe48Qpw']


In [1]:
random_channels = [
    'UCEGGyGmo0NbAPmw1zVNdXbg',
    'UCIWC3bm2eKj0wLGFULhzKdA',
    'UC-yI6rR_EGSY8cVs4qHo0BQ',
    'UCGZrQMFEhuo6cV4wenQ6xLQ',
    'UCwo2cn9mViQ8Y9xKH3ZPKDA',
    'UC9QMc_tF7lIdTQLpezdOCxA',
    'UCIPX0CTQmfGb7PoRG0MNsnQ',
    'UCMffaLPtQHcuQ6UY83qtM0g',
    'UCECGTK7DkU7I8SSfBItn3uw',
    'UC1zhHVE2KIwnqfJ5r77cM7A',
    'UCwcEhBb53uJO4ogeBLLh2yQ',
    'UCPXW0F7HwfXhZY6-rZwJt4w',
    'UCfxUyLb5D-KpCt15hfGbtrA',
    'UCfV7GnI8E_RnxMjc5-x8mAQ',
    'UCykt3TNvH5xJwA3O_WVvQpA',
    'UCfKqi5ZUXqXxErUgTdMkSDQ',
    'UCLq4U2sExYRjYd0tMdmWUmA',
    'UC9LGi7FRFTxNT7WK6jr6OXw',
    'UC5TKWxKKl1IRfTZsVgS0ziA',
    'UCmHCmV4rVl_2_wg2l60nzjA',
    'UCMM9Z0_Ur9U_hpmTzGaHvaA',
    'UClgD65bB6SJPTPv20qIVKDw',
    'UCMQEmETLCqv088mFEIlLQyQ',
    'UCj5BN5C0lmsq3QptHFEvipg',
    'UCTNAGaw3TauzN7pkJTaZZtg',
    'UCXwFtLCUu10dJKKWD_TD2LQ',
    'UC8Ewe7WqGg01KRNjJCO5cjg',
    'UCL-yKZ_kCuKBHyeRfePpvuw',
    'UCs0Pz9jzByzbhQGsMoW4jcw',
    'UC_lf4_Wss_uW0KGny_A3erg',
    'UC_E5qa-s0VEocKVV1KPtNgw',
    'UCG-PXXY7gyI7LOMeE0rl_KA',
    'UCIzKAXlRpnPzulWndJ0SOSA',
    'UCzOwecflGQFfHvMp-vMJCgQ',
    'UCEHB7jMymvg63b6x30qx5Jw',
    'UCitSKTq6Ghg2bBKC4YdKd_Q',
    'UCaBFtf4I61T9UO9WKhUG-dA',
    'UCS-SLEeQ1F7k7mpmhnKVMKw',
    'UCdBjEJpySOp2fY6f3x6TfPg',
    'UCged4xNWHYsJTGPQ5rn6HmA',
    'UCf9Ua7W95KDm28d16otkm0Q',
    'UCjlgDApB1OrU_3-1dLMHOZg',
    'UCfOjz88tFouLzzDWqN9apKg',
    'UC22gUNj0sgOlBAR_zLfdh5A',
    'UCVaXclURQZlakiTMzuwHvRw',
    'UC6YN4FNhAKN3MDO5DbJSnOA',
    'UC4z1t_toTtWFG6PGpcgzWcw',
    'UCHU5LSiGQsCbdPKKreHcITw',
    'UCrWtfbfTrZn9penvY3xwWig',
    'UCAlZ-9e75wau2hY_wWFliNA',
    'UC2NTBsmAC2ePOrUkpRvogoA',
    'UCJDIvab5y2yIBzzkbIgy8WQ',
    'UCrdJ-n4brmIZy975U-4iwsQ',
    'UCxAIx1VAAKW0F7L5Xaqensg',
    'UCQqaNnVhS1w_iTeFaIJsXog',
    'UCf7J0vxbg6SsIjY9587PEiQ',
    'UCmTSflg4X32Qkxuz5Mw1k8w',
    'UCXQC_SxY3XXbvbombLEu5hg',
    'UCnODsMthEUVnqFSisw8E4ow',
    'UCitLqDolQHZ_tldN-bXGDwg',
    'UCtxFOnPpYdQuDog6fGlp1Eg',
    'UCBjoR2uNSYyBVRKq-ZLinuw',
    'UCGffDssIzCydUTNgFNIJyxQ',
    'UCTM1z1vpulAL3zt61PDDzkw',
    'UCjGC05n79MPgDcnhtsG5_cQ',
    'UCK45CFLlvd9pP7IzVYDWIMw',
    'UCx29pmyVOjk7ZinBNUDVgnQ',
    'UC2CtsATmHPFq8Ky5olCu3jQ',
    'UCaPrv_BwRFvpIsSnXSmGt0w',
    'UCC7jlYxfWti7WAW8r7ef1RQ'
]


In [4]:
len(random_channels)

70

In [6]:
top_education_channels = [
    'UC1zZE_kJ8rQHgLTVfobLi_g',
    'UCpVm7bg6pXKo1Pr6k5kxG9A',
    'UCX6b17PVsYBQ0ip5gyeme-Q',
    'UCsooa4yRKGN_zEE8iknghZA',
    'UCXhSCMRRPyxSoyLSPFxK7VA',
    'UCGi_crMdUZnrcsvkCa8pt-g',
    'UCZYTClx2T1of7BRZ86-8fow',
    'UCHnyfMqiRRG1u-2MsSQLbXA',
    'UCYenDLnIHsoqQ6smwKXQ7Hg',
    'UC4a-Gbdw7vOaccHmFo40b9g']
top_cooking_channels = [
    'UCJFp8uSYCjXOMnkUyb3CQ3Q',
    'UCYjk_zY-iYR8YNfJmuzd70A',
    'UCpSgg_ECBj25s9moCDfSTsA',
    'UCsP7Bpw36J666Fct5M8u-ZA',
    'UCJHA_jMfCvEnv-3kRjTCQXw',
    'UCNbngWUqL2eqRw12yAwcICg',
    'UC8gFadPgK2r1ndqLI04Xvvw',
    'UCRxAgfYexGLlu1WHGIMUDqw',
    'UCbpMy0Fg74eXXkvxJrtEn3w',
    'UCfyehHM_eo4g5JUyWmms2LA']
top_fitness_channels = [
    'UCiP6wD_tYlYLYh3agzbByWQ',
    'UCIJwWYOfsCfz6PjxbONYXSg',
    'UCM1Nde-9eorUhq-teaWlgUA',
    'UCBINFWq52ShSgUFEoynfSwg',
    'UCEtMRF1ywKMc4sf3EXYyDzw',
    'UCyqR7WkL8i1b6xtSssDmW9w',
    'UCGMOauU8dOd4mv2bT3Tx57w',
    'UCEQi1ZNJiw3YMRwni0OLsTQ',
    'UC4GJndVHEhdmqLFBHOCi97A',
    'UCiH4auDlkM0tgn9ewT3B1Vw']
top_history_channels = [
    'UC9MAhZQQd9egwWCxrwSIsJQ',
    'UClfEht64_NrzHf8Y0slKEjw',
    'UC510QYlOlKNyhy_zdQxnGYw',
    'UCNIuvl7V8zACPpTmmNIqP2A',
    'UCggHoXaj8BQHIiPmOxezeWA',
    'UC88lvyJe7aHZmcvzvubDFRg',
    'UCodbH5mUeF-m_BsNueRDjcw',
    'UCv_vLHiWVBh_FR9vbeuiY-A',
    'UCx-dJoP9hFCBloY9qodykvw',
    'UCHdluULl5c7bilx1x1TGzJQ']
top_science_channels = [
    'UCC552Sd-3nyi_tk2BudLUzA',
    'UCsXVk37bltHxD1rDPwtNM8Q',
    'UC6107grRI4m0o2-emgoDnAA',
    'UCUHW94eEFW7hkUMVaZz4eDg',
    'UC06E4Y_-ybJgBUMtXx8uNNw',
    'UCmQXOAse-VnzuXHebX5I77g',
    'UCxo8ooAqXiObjuaIy10ud0A',
    'UCvJiYiBUbw4tmpRSZT2r1Hw',
    'UCJcycnanWtyOGcz34jUlYZA',
    'UC9uD-W5zQHQuAVT2GdcLCvg']
top_news_channels = [
    'UCn8zNIfYAQNdrFRrr8oibKw',
    'UCttspZesZIDEwwpVIgoZtWQ',
    'UCfwx98Wty7LhdlkxL5PZyLA',
    'UCupvZG-5ko_eiXAupbDfxWw',
    'UCLXo7UDZvByw2ixzpQCufnA',
    'UCE2606prvXQc_noEqKxVJXA',
    'UC9k-yiEpRHMNVOnOi_aQK8w',
    'UCBi2mrWuNuyYy4gbM6fU18Q',
    'UC1yBKRuGpC1tSM73A0ZjYjQ',
    'UC16niRr50-MSBwiO3YDb3RA']
top_music_channels = [
    'UC0C-w0YjGpqDXGB8IHb662A',
    'UCfM3zsQsOnfWNUppiycmBuw',
    'UCYvmuw-JtVrTZQ-7Y4kd63Q',
    'UCqECaJ8Gagnn7YCbPEzWH6g',
    'UCb2HGwORFBo94DmRx4oLzow',
    'UC9CoOnJkIBMdeijd9qYoT_g',
    'UCpDJl2EmP7Oh90Vylx0dZtA',
    'UCa10nxShhzNrCE1o2ZOPztg',
    'UCoUM-UJ7rirJYP8CQ0EIaHA',
    'UCEdvpU2pFRCVqU6yIPyTpMQ']
top_comedy_channels = [
    'UCY30JRSgfhYXA6i6xX1erWg',
    'UCV9_KinVpV-snHe3C3n1hvA',
    'UC9gFih9rw0zNCK3ZtoKQQyA',
    'UC8-Th83bH_thdKZDJCrn88g',
    'UCxSz6JVYmzVhtkraHWZC7HQ',
    'UCfm4y4rHF5HGrSr-qbvOwOg',
    'UCPDis9pjXuqyI7RYLJ-TTSA',
    'UCB0d0JLn1WcGYcwwZ87d2LA',
    'UCPDXXXJj9nax0fr0Wfc048g',
    'UCi9cDo6239RAzPpBZO9y5SA']
top_travel_channels = [
    'UCHJuQZuzapBh-CuhRYxIZrg',
    'UCyEd6QBSgat5kkC6svyjudA',
    'UCdPambxHRj0kdFPNoJFM98A',
    'UCXsQlHGuoWqukC9vz-uonrg',
    'UCd5xLBi_QU6w7RGm5TTznyQ',
    'UCGaOvAFinZ7BCN_FDmw74fQ',
    'UC8hI77bH0VraIw6p2PHwivQ',
    'UC_ptyMRLOsS1Uj0a34a_xCA',
    'UCJsSEDFFnMFvW9JWU6XUn0Q',
    'UCchgIh8Tc4sTmBfnMQ5pDdg']


In [17]:
# Don't run this again, please.
'''
channel_information_grabber(top_channels, 'top_channels')
channel_information_grabber(top_news_channels, 'top_news_channels')
channel_information_grabber(top_music_channels, 'top_music_channels')
channel_information_grabber(top_comedy_channels, 'top_comedy_channels')
channel_information_grabber(top_travel_channels, 'top_travel_channels')
channel_information_grabber(top_cooking_channels, 'top_cooking_channels')
channel_information_grabber(top_education_channels, 'top_education_channels')
channel_information_grabber(top_fitness_channels, 'top_fitness_channels')
channel_information_grabber(top_history_channels, 'top_history_channels')
channel_information_grabber(top_science_channels, 'top_science_channels')
channel_information_grabber(random_channels, 'random_channels')
'''

"\nchannel_information_grabber(top_channels, 'top_channels')\nchannel_information_grabber(top_news_channels, 'top_news_channels')\nchannel_information_grabber(top_music_channels, 'top_music_channels')\nchannel_information_grabber(top_comedy_channels, 'top_comedy_channels')\nchannel_information_grabber(top_travel_channels, 'top_travel_channels')\nchannel_information_grabber(top_cooking_channels, 'top_cooking_channels')\nchannel_information_grabber(top_education_channels, 'top_education_channels')\nchannel_information_grabber(top_fitness_channels, 'top_fitness_channels')\nchannel_information_grabber(top_history_channels, 'top_history_channels')\nchannel_information_grabber(top_science_channels, 'top_science_channels')\nchannel_information_grabber(random_channels, 'random_channels')\n"

In [7]:
# Create a list of all the channels
all_channels = random_channels + top_channels + top_education_channels + top_cooking_channels + top_fitness_channels + top_history_channels + top_science_channels + top_news_channels + top_music_channels + top_comedy_channels + top_travel_channels

len(all_channels)


180

In [19]:
# Don't run this again.
# get the 50 videos from each channel in all_channels
video_list = []
for channel in all_channels:
    video_list.append(get_50_videos(channel))

# flatten the list
video_list = [item for sublist in video_list for item in sublist]

# convert to csv
df = pd.DataFrame(video_list)
df.to_csv('video_list.csv', index=False)

video_list




"\n# get the 50 videos from each channel in all_channels\nvideo_list = []\nfor channel in all_channels:\n    video_list.append(get_50_videos(channel))\n\n# flatten the list\nvideo_list = [item for sublist in video_list for item in sublist]\n\n# convert to csv\ndf = pd.DataFrame(video_list)\n# df.to_csv('video_list.csv', index=False)\n\nvideo_list\n"

In [17]:
# get information about each video

video_df = pd.DataFrame(columns=[
    'channel_id',
    'video_title', 'video_title_clean', 'video_id', 'published', 'video_views', 'video_madeforkids', 
    'video_likes', 'video_comment_count', 'video_length', 'video_description', 'video_tags'])

# get information about each video
for video in video_list:
    video_df = video_df.append(video_details_grabber(video), ignore_index=True)

video_df





Unnamed: 0,channel_id,video_title,video_title_clean,video_id,published,video_views,video_madeforkids,video_likes,video_comment_count,video_length,video_description,video_tags
0,UCEGGyGmo0NbAPmw1zVNdXbg,Python String Index Function #shorts #python #...,Python String Index Function shorts python pro...,YiqtVoky35M,2022-11-03T16:29:11Z,84,False,1,0,51,Python String Index Function #shorts #python...,"[python, python for beginners, python programm..."
1,UCEGGyGmo0NbAPmw1zVNdXbg,Python Split Function - Get last word of Stri...,Python Split Function Get last word of String...,T9_h8ImN7rI,2022-11-03T16:27:53Z,100,False,5,0,1:,Python Split Function - Get last word of Stri...,"[python, python for beginners, python programm..."
2,UCEGGyGmo0NbAPmw1zVNdXbg,Python unpacking operator * #shorts #python #p...,Python unpacking operator shorts python progra...,UpvvygeZm58,2022-11-03T15:43:52Z,46,False,2,0,1:,Python unpacking operator * #shorts #python \...,"[python, python for beginners, python programm..."
3,UCEGGyGmo0NbAPmw1zVNdXbg,Python using * with strings and numbers #short...,Python using with strings and numbers shorts p...,0LqgbbdcQlw,2022-11-03T15:40:05Z,96,False,2,0,58,Python using * with strings #shorts #python ...,"[python, python for beginners, python programm..."
4,UCEGGyGmo0NbAPmw1zVNdXbg,Python Multiply String with Number #shorts #py...,Python Multiply String with Number shorts pyth...,7kk1TFekZ_w,2022-11-03T15:36:12Z,36,False,0,0,33,Python Multiply String with Number #shorts #...,"[python, python for beginners, python programm..."
...,...,...,...,...,...,...,...,...,...,...,...,...
3268,UCC7jlYxfWti7WAW8r7ef1RQ,In Paris use the Navigo pass like locals and s...,In Paris use the Navigo pass like locals and s...,xRi8SGcRDOY,2022-03-28T15:00:24Z,311,False,4,0,9,,
3269,UCC7jlYxfWti7WAW8r7ef1RQ,How to save money in Paris by using the Paris ...,How to save money in Paris by using the Paris ...,C-34pIsWZPk,2022-03-27T15:00:04Z,4642,False,166,79,4:49,"We all want to save money while traveling, so ...","[how to save money in paris, using the paris m..."
3270,UCC7jlYxfWti7WAW8r7ef1RQ,Magical Malta should 100% be on your bucket li...,Magical Malta should 100 be on your bucket lis...,L2GdB1gB1ZM,2022-03-26T15:00:09Z,5091,False,133,7,16,Save for your bucket list! Malta is a must vis...,
3271,UCC7jlYxfWti7WAW8r7ef1RQ,Is France on your bucket list? #shorts #france...,Is France on your bucket list shorts france tr...,89IewFGQQ6E,2022-03-25T18:00:03Z,31,False,1,0,16,Is France on your bucket list? Be sure to slow...,


In [18]:
# export to csv
video_df.to_csv('../Dataset/csv/video_df.csv', index=False)


In [21]:
# import the video_df_success.csv
video_df = pd.read_csv('video_df_success.csv')

# drop null columns
video_df = video_df.dropna(axis=1, how='all')

print(video_df.columns)
print(video_df.shape)
print(video_df.describe())


Index(['video_id', 'channel_id', 'video_title', 'video_title_clean',
       'published', 'video_views', 'video_madeforkids', 'video_likes',
       'video_comment_count', 'video_length', 'video_description',
       'video_tags'],
      dtype='object')
(5420, 12)
        video_views   video_likes  video_comment_count
count  5.420000e+03  5.420000e+03          5420.000000
mean   6.159120e+06  1.385938e+05          6369.311624
std    3.796450e+07  5.579684e+05         26269.721757
min    0.000000e+00  0.000000e+00             0.000000
25%    6.177075e+04  1.663000e+03            69.000000
50%    3.383950e+05  1.104450e+04           447.500000
75%    2.354940e+06  5.700050e+04          2363.250000
max    2.073001e+09  1.451987e+07        728578.000000


In [27]:
# convert the published column to datetime
video_df['published'] = pd.to_datetime(video_df['published'])

# convert the video_length column to datetime
#video_df['video_length'] = pd.to_datetime(video_df['video_length'])

# convert the video_madeforkids column to boolean
video_df['video_madeforkids'] = video_df['video_madeforkids'].astype(bool)

# convert the video_views column to integer
video_df['video_views'] = video_df['video_views'].astype('int64')

# convert the video_likes column to integer
video_df['video_likes'] = video_df['video_likes'].astype('int64')

# convert the video_comment_count column to integer
video_df['video_comment_count'] = video_df['video_comment_count'].astype('int64')

#check datatypes
video_df.dtypes



video_id                            object
channel_id                          object
video_title                         object
video_title_clean                   object
published              datetime64[ns, UTC]
video_views                          int64
video_madeforkids                     bool
video_likes                          int64
video_comment_count                  int64
video_length                        object
video_description                   object
video_tags                          object
dtype: object

In [28]:
# export the cleaned csv
video_df.to_csv('../Dataset/csv/video_df_cleaned.csv', index=False)