In [1]:
import os
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [8]:
# data classes
@dataclass
class Channel:
    """One row of the Channel table: descriptive attributes of a channel.

    Every field defaults to None so an empty instance can be created just to
    read the column names (see init_table). Field order defines the column
    order of the CSV file.
    """
    # identifier of the channel the row describes
    channelId: Optional[str] = None
    # fields copied from the channel's API snippet
    title: Optional[str] = None
    # publication date, reformatted to TARGET_DATE_FORMAT by transform_channel
    publishedAt: Optional[str] = None
    country: Optional[str] = None
    description: Optional[str] = None
@dataclass
class Video:
    """One row of the Video table: descriptive attributes of a video.

    Every field defaults to None so an empty instance can be created just to
    read the column names (see init_table). Field order defines the column
    order of the CSV file.
    """
    # identifier of the video and of the channel it was extracted for
    video_id: Optional[str] = None
    channelId: Optional[str] = None
    # fields copied from the video's API snippet
    categoryId: Optional[str] = None
    title: Optional[str] = None
    # publication date, reformatted to TARGET_DATE_FORMAT by transform_video
    publishedAt: Optional[str] = None
    description: Optional[str] = None
@dataclass
class ChannelMetrics:
    """One row of the ChannelMetrics table: channel statistics on one extract date.

    Every field defaults to None so an empty instance can be created just to
    read the column names (see init_table). Field order defines the column
    order of the CSV file.
    """
    channelId: Optional[str] = None
    # date the statistics were extracted (TARGET_DATE_FORMAT)
    extract_date: Optional[str] = None
    # values taken as-is from the channel's API statistics
    subscriberCount: Optional[str] = None
    viewCount: Optional[str] = None
    videoCount: Optional[str] = None

@dataclass
class VideoMetrics:
    """One row of the VideoMetrics table: video statistics on one extract date.

    Every field defaults to None so an empty instance can be created just to
    read the column names (see init_table). Field order defines the column
    order of the CSV file.
    """
    video_id: Optional[str] = None
    # date the statistics were extracted (TARGET_DATE_FORMAT)
    extract_date: Optional[str] = None
    # values taken as-is from the video's API statistics
    viewCount: Optional[str] = None
    likeCount: Optional[str] = None
    commentCount: Optional[str] = None


In [9]:
# configuration

# Name of the environment variable that stores the YouTube API key
# (the variable's name, not the key itself).
API_KEY = "YT_API_KEY"

# Date handling: API timestamps are parsed with SOURCE_DATE_FORMAT and
# written to the tables with TARGET_DATE_FORMAT.
SOURCE_DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
TARGET_DATE_FORMAT = "%Y-%m-%d"
DATE_COLUMN = "publishedAt"

# Channels to process (each name is used as a channel search query) and the
# earliest publication date (TARGET_DATE_FORMAT) of the videos to extract.
CHANNELS = [
    "lekkostronniczy",
    "Marvecc",
]
START_DATE = "2023-04-10"

# Fields extracted for channels: from the snippet and from the statistics.
CHANNELS_SNIPPED = [
    "title",
    "publishedAt",
    "country",
    "description",
]
CHANNELS_METRICS = [
    "viewCount",
    "subscriberCount",
    "videoCount",
]

# Fields extracted for videos: from the snippet and from the statistics.
VIDEOS_SNIPPED = [
    "title",
    "categoryId",
    "publishedAt",
    "description",
]
VIDOS_METRICS = [
    "viewCount",
    "likeCount",
    "commentCount",
]

# Directory that holds one CSV file per table.
TABLES_DIRECTORY = "tables"

# Dataclasses that each back one CSV table; main() calls init_table on every
# entry, and the class name becomes the CSV file name.
TABLES = [Channel, Video, ChannelMetrics, VideoMetrics]

In [4]:
# Date stamp (TARGET_DATE_FORMAT) written as extract_date on every metrics row.
TODAY = datetime.today().strftime(TARGET_DATE_FORMAT)
# Build the YouTube service. API_KEY holds the NAME of the environment
# variable with the key, so the variable name is configured in one place only.
youtube = build('youtube', 'v3', developerKey=os.environ[API_KEY])

In [5]:
# table initialization
def init_table(table):
    """Create the CSV file backing `table` if it does not exist yet.

    Args:
        table: a dataclass type whose fields all have defaults; its class
            name is used as the file name and its field names as the header.

    The file is written to TABLES_DIRECTORY with a header row only. An
    existing file is left untouched.
    """
    # to_csv does not create missing directories, so make sure the target
    # directory exists before the first table is written.
    os.makedirs(TABLES_DIRECTORY, exist_ok=True)
    file_path = os.path.join(TABLES_DIRECTORY, f'{table.__name__}.csv')
    if not os.path.isfile(file_path):
        # vars() of a default instance lists the fields in declaration order
        columns = list(vars(table()))
        pd.DataFrame(columns=columns).to_csv(file_path, index=False)
# extract functions
def get_api_request(request):
    """Execute an API request object and return its response.

    Args:
        request: an object exposing `execute()`.

    Returns:
        Whatever `request.execute()` returns. If it raises HttpError, a
        formatted error STRING (status code and error details) is returned
        instead of raising.

    NOTE(review): the callers in this file index the result like a dict, so
    an error string surfaces there as a TypeError rather than here.
    """
    try:
        return request.execute()
    except HttpError as http_error:
        status = http_error.status_code
        details = http_error.error_details
        return 'Error response status code : {0}, reason : {1}'.format(status, details)
    
def get_channelId(channel_name):
    """Look up a channel id by searching for `channel_name`.

    Args:
        channel_name: text used as the channel search query.

    Returns:
        The channelId of the FIRST search hit. The search is a free-text
        query, so the first hit is not guaranteed to be the intended channel.

    Raises:
        RuntimeError: the API request failed (get_api_request returned its
            error string instead of a response).
        ValueError: the search returned no channel.
    """
    request = youtube.search().list(
        part='id',
        type='channel',
        q=channel_name
    )
    response = get_api_request(request)

    # get_api_request reports HTTP failures as a string; fail loudly here
    # instead of with "string indices must be integers" below.
    if isinstance(response, str):
        raise RuntimeError(f'Channel search for {channel_name!r} failed: {response}')

    items = response.get('items') or []
    if not items:
        raise ValueError(f'No channel found for search query {channel_name!r}')

    return items[0]['id']['channelId']

def extract_channel_data(channelId):
    """Request the snippet and statistics parts for one channel.

    Args:
        channelId: id of the channel to fetch.

    Returns:
        The result of get_api_request for the channels().list request.
    """
    channels_request = youtube.channels().list(id=channelId, part="snippet,statistics")
    return get_api_request(channels_request)

def extract_video_data(channelId, start_date):
    """Collect snippet and statistics for a channel's videos since `start_date`.

    The search endpoint is paged through (newest first, up to 50 ids per
    page); each page of video ids is then resolved with one videos().list
    call.

    Args:
        channelId: id of the channel whose videos are searched.
        start_date: earliest publication date, formatted as TARGET_DATE_FORMAT.

    Returns:
        A list with the `items` of all videos().list responses; empty when
        the search finds no videos.
    """
    max_results = 50
    published_after = datetime.strptime(start_date, TARGET_DATE_FORMAT).isoformat() + 'Z'
    videos_results = []
    next_page_token = None

    while True:
        search_request = youtube.search().list(
            channelId=channelId,
            part='id',
            order='date',
            type='video',
            maxResults=max_results,
            publishedAfter=published_after,
            pageToken=next_page_token
        )
        search_response = get_api_request(search_request)

        video_ids = [item['id']['videoId'] for item in search_response['items']]

        # An empty page (e.g. no videos since start_date) must not be sent
        # on: videos().list with an empty id filter is not a valid request.
        if video_ids:
            videos_request = youtube.videos().list(
                id=",".join(video_ids),
                part='snippet,statistics')
            videos_response = get_api_request(videos_request)
            videos_results.extend(videos_response['items'])

        # the search response carries a nextPageToken while more pages exist
        next_page_token = search_response.get('nextPageToken')
        if not next_page_token:
            break

    return videos_results
# transform functions
def transform_channel(channel_data, channelId):
    """Build the Channel record from a channels().list response.

    Args:
        channel_data: response whose first item carries the `snippet`.
        channelId: id stored on the record.

    Returns:
        dict with the CHANNELS_SNIPPED fields plus `channelId`. `publishedAt`
        is reformatted to TARGET_DATE_FORMAT and trailing whitespace is
        stripped from `description`. Fields absent from the snippet (for
        example a channel without `country`) are returned as None instead of
        raising KeyError.
    """
    snippet = channel_data['items'][0]['snippet']
    channel_info = {field: snippet.get(field) for field in CHANNELS_SNIPPED}

    published = channel_info.get('publishedAt')
    if published is not None:
        try:
            date_obj = datetime.strptime(published, SOURCE_DATE_FORMAT)
        except ValueError:
            # NOTE(review): timestamps with fractional seconds do not match
            # SOURCE_DATE_FORMAT; presumably some channels return them --
            # confirm against real responses.
            date_obj = datetime.strptime(published, '%Y-%m-%dT%H:%M:%S.%fZ')
        channel_info['publishedAt'] = date_obj.strftime(TARGET_DATE_FORMAT)

    if channel_info.get('description') is not None:
        channel_info['description'] = channel_info['description'].rstrip()

    channel_info['channelId'] = channelId

    return channel_info

def transform_channel_metrics(channel_data, channelId):
    """Build the ChannelMetrics record from a channels().list response.

    Args:
        channel_data: response whose first item carries the `statistics`.
        channelId: id stored on the record.

    Returns:
        dict with the CHANNELS_METRICS fields plus `channelId` and
        `extract_date` (TODAY). A statistic missing from the response is
        returned as None instead of raising KeyError.
    """
    # NOTE(review): presumably a statistic can be missing (e.g. a hidden
    # subscriber count) -- confirm against the API documentation.
    statistics = channel_data['items'][0].get('statistics', {})
    channel_stats = {field: statistics.get(field) for field in CHANNELS_METRICS}
    channel_stats['channelId'] = channelId
    channel_stats['extract_date'] = TODAY

    return channel_stats

def transform_video(video, channelId):
    """Build the Video record from one videos().list item.

    Args:
        video: item with an `id` and a `snippet`.
        channelId: id of the channel stored on the record.

    Returns:
        dict with the VIDEOS_SNIPPED fields (with `publishedAt` reformatted
        to TARGET_DATE_FORMAT) plus `video_id` and `channelId`.
    """
    snippet = video['snippet']
    record = {name: snippet[name] for name in VIDEOS_SNIPPED}

    published = datetime.strptime(record['publishedAt'], SOURCE_DATE_FORMAT)
    record['publishedAt'] = published.strftime(TARGET_DATE_FORMAT)

    record['video_id'] = video['id']
    record['channelId'] = channelId
    return record

def transform_video_metrics(video):
    """Build the VideoMetrics record from one videos().list item.

    Args:
        video: item with an `id` and (normally) a `statistics` mapping.

    Returns:
        dict with the VIDOS_METRICS fields plus `video_id` and `extract_date`
        (TODAY). A statistic missing from the item is returned as None
        instead of raising KeyError, so one such video no longer aborts the
        whole run.
    """
    # NOTE(review): presumably likeCount / commentCount are omitted when
    # likes are hidden or comments are disabled -- confirm against the API
    # documentation.
    statistics = video.get('statistics', {})
    video_metrics_record = {field: statistics.get(field) for field in VIDOS_METRICS}
    video_metrics_record['video_id'] = video['id']
    video_metrics_record['extract_date'] = TODAY

    return video_metrics_record
    
def tansform_video_data(videos_data, channelId):
    """Split raw video items into Video records and VideoMetrics records.

    Args:
        videos_data: iterable of videos().list items.
        channelId: id of the channel stored on every Video record.

    Returns:
        (videos, videos_metrics): two lists in the same order as
        `videos_data`, built with transform_video and transform_video_metrics.
    """
    # Transform each item into its (record, metrics) pair first so both
    # transforms run per video, in input order.
    pairs = [
        (transform_video(item, channelId), transform_video_metrics(item))
        for item in videos_data
    ]
    videos = [record for record, _ in pairs]
    videos_metrics = [metrics for _, metrics in pairs]
    return videos, videos_metrics

# load functions
def load_to_df(data):
    """Add one record to its table's CSV file, skipping exact duplicates.

    Args:
        data: a dataclass instance; its class name selects the CSV file in
            TABLES_DIRECTORY and its attributes become the row.

    Returns:
        0

    The new row is placed before the existing rows, identical rows are
    collapsed, and the file is rewritten.
    """
    file_path = os.path.join(TABLES_DIRECTORY, f'{type(data).__name__}.csv')

    # Read everything back as plain text. With pandas' default type
    # inference stored counts come back as ints and empty cells as NaN,
    # which never compare equal to the strings / None of the incoming record,
    # so drop_duplicates could not detect a repeated load.
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

    # Same normalisation for the new row: text values, missing -> ''.
    # (None and '' both end up as an empty CSV cell anyway.)
    record = {
        key: '' if value is None else str(value)
        for key, value in vars(data).items()
    }
    to_append = pd.DataFrame([record])

    df = pd.concat([to_append, df], ignore_index=True)
    df = df.drop_duplicates()
    df.to_csv(file_path, index=False)
    return 0

def load_data_list_to_df(data_list):
    """Load every record of `data_list` with load_to_df, in order.

    Args:
        data_list: iterable of dataclass instances.

    Returns:
        0
    """
    for record in data_list:
        load_to_df(record)
    return 0

# ETL process function 
def yt_etl(channel_name):
    """Run extract, transform and load for a single channel.

    Args:
        channel_name: name used to look up the channel id.

    Returns:
        0
    """
    # --- extract ---
    channelId = get_channelId(channel_name)
    raw_channel = extract_channel_data(channelId)
    raw_videos = extract_video_data(channelId, START_DATE)

    # --- transform: plain dicts first, then the table dataclasses ---
    channel_fields = transform_channel(raw_channel, channelId)
    channel_metrics_fields = transform_channel_metrics(raw_channel, channelId)
    video_fields, video_metrics_fields = tansform_video_data(raw_videos, channelId)

    channel_row = Channel(**channel_fields)
    channel_metrics_row = ChannelMetrics(**channel_metrics_fields)
    video_rows = [Video(**fields) for fields in video_fields]
    video_metrics_rows = [VideoMetrics(**fields) for fields in video_metrics_fields]

    # --- load: single records, then the per-video lists ---
    load_to_df(channel_row)
    load_to_df(channel_metrics_row)
    load_data_list_to_df(video_rows)
    load_data_list_to_df(video_metrics_rows)

    return 0

# main functions
def main():
    """Initialise every table file, then run the ETL for each configured channel."""
    # all table files must exist before the first record is loaded
    for table_cls in TABLES:
        init_table(table_cls)

    for name in CHANNELS:
        yt_etl(name)

# execute process: create missing table files, then run the ETL for every
# channel listed in CHANNELS
main()

In [7]:
## check results

# Show every table written to TABLES_DIRECTORY. The listing is sorted so the
# output order is stable between runs, and anything that is not a CSV file is
# skipped because read_csv cannot load it.
for file in sorted(os.listdir(TABLES_DIRECTORY)):
    if not file.endswith('.csv'):
        continue
    print(file)
    display(pd.read_csv(os.path.join(TABLES_DIRECTORY, file)))
    

Channel.csv


Unnamed: 0,channelId,title,publishedAt,country,description
0,UCK7nmg4JAgTwHf4wc3DWuVg,Marvecc,2014-09-04,PL,"Witajcie, jestem Marvecc. Zajmuję się lore, pu..."
1,UC8JbbaZ_jgdsoUqrZ2bXtQQ,Lekko Stronniczy,2011-02-24,PL,Lekko Stronniczy to codzienny program rozrywko...


ChannelMetrics.csv


Unnamed: 0,channelId,extract_date,subscriberCount,viewCount,videoCount
0,UCK7nmg4JAgTwHf4wc3DWuVg,2023-04-17,186000,60850239,1326
1,UC8JbbaZ_jgdsoUqrZ2bXtQQ,2023-04-17,400000,202234021,2089


Video.csv


Unnamed: 0,video_id,channelId,categoryId,title,publishedAt,description
0,Vs8ZpF7tyRA,UCK7nmg4JAgTwHf4wc3DWuVg,20,Kim jest Rengar?,2023-04-10,"Dzień dobry moi mili, dziś w kolejnym material..."
1,sr1VanBlt8g,UCK7nmg4JAgTwHf4wc3DWuVg,20,Zemsta Katariny i Pojedynek z Talonem,2023-04-13,"Dzień dobry moi mili, dziś porozmawiamy o 7 i ..."
2,jYuKqJuRRl8,UC8JbbaZ_jgdsoUqrZ2bXtQQ,24,Sprawdzamy ile zarabia Adam Glapiński z NBP i ...,2023-04-10,Dziś w odcinku sprawdzamy ile zarabia Adam Gla...
3,FIv3xtO8Hks,UC8JbbaZ_jgdsoUqrZ2bXtQQ,24,Mamy 2 miliardy na TVP? Wywiad z psem - to jes...,2023-04-11,Jak wiecie 2 miliardy na TVP (teraz to już 3 n...
4,sjcW4nxJOOk,UC8JbbaZ_jgdsoUqrZ2bXtQQ,24,Będzie serial Harry Potter! Kto zagra Harrego ...,2023-04-12,"Będzie serial Harry Potter, ktoś zagra Harrego..."
5,yR9nA1PN8oY,UC8JbbaZ_jgdsoUqrZ2bXtQQ,24,Apostoł Tomasz leczy chorych? Chat GPT 4 po po...,2023-04-13,Apostoł Tomasz Drożała leczy chorych. Chat GPT...
6,_Xbdu0nC9jw,UC8JbbaZ_jgdsoUqrZ2bXtQQ,24,Mr Beast: jak to robi najpopularniejszy YouTub...,2023-04-14,Mr Beast to najpopularniejszy YouTuber na świe...


VideoMetrics.csv


Unnamed: 0,video_id,extract_date,viewCount,likeCount,commentCount
0,Vs8ZpF7tyRA,2023-04-17,20352,1709,72
1,sr1VanBlt8g,2023-04-17,12698,1065,64
2,jYuKqJuRRl8,2023-04-17,57554,3238,129
3,FIv3xtO8Hks,2023-04-17,57185,2627,132
4,sjcW4nxJOOk,2023-04-17,60562,2876,189
5,yR9nA1PN8oY,2023-04-17,54055,2460,123
6,_Xbdu0nC9jw,2023-04-17,47818,2091,99
