In [None]:
! pip install pandas
! pip install googleapiclient

In [1]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import pandas as pd

In [2]:
############## FUNCTIONS ##############
# YouTube channels IDs of list of users
def get_channels(vtubers):
    channels = []

    for vtuber in vtubers['jp_name']:
        res = youtube.search().list(q = vtuber, 
                                    part = "snippet", 
                                    type = "channel", 
                                    maxResults = 1).execute()
        channel_id = res['items'][0]['id']['channelId']
        channels.append(channel_id)
        
    else:
        print('All channel IDs retrieved!')
        return channels

In [3]:
# retrieve meta-information and list of all videos for each YT channel in a list of channels
def get_channel_info(channels):
    debut = []
    subs = []
    views = []
    vid_cnt = []
    videos = {}
    
    for channel in channels:
        # create a key for each channel which takes a list of video IDs as its value
        videos[channel] = []

        # extract playlist ID including all uploaded videos
        res = youtube.channels().list(id = channel, 
                                      part = 'snippet, statistics, contentDetails').execute()
        all_uploads = res['items'][0]['contentDetails']['relatedPlaylists']['uploads']
        
        # record meta-information
        debut.append(res['items'][0]['snippet']['publishedAt'])
        subs.append(int(res['items'][0]['statistics']['subscriberCount']))
        views.append(int(res['items'][0]['statistics']['viewCount']))
        vid_cnt.append(int(res['items'][0]['statistics']['videoCount']))
        
        # record all video IDs
        next_page_token = None
        while True:
            res = youtube.playlistItems().list(playlistId = all_uploads, 
                                               part = 'contentDetails', 
                                               pageToken = next_page_token, 
                                               maxResults = 50).execute()
            for video in res['items']:
                videos[channel].append(video['contentDetails']['videoId'])
            next_page_token = res.get('nextPageToken')
            
            if next_page_token is None:
                break
    
    else:
        print('All channel information has been recorded!')
        return debut, subs, views, vid_cnt, videos

In [4]:
# generate dataframe containing video information from a list of video IDs
def get_video_info(videos):
    data = {'vid_id': [], 'title': [], 'description': [], 'category': [], 'date': [], 'duration': [], '2d3d': [], 
            'definition': [], 'views': [], 'likes': [], 'dislikes': [], 'favorites': [], 'num_comments': []}

    for video in videos:
        res = youtube.videos().list(id = video, 
                                    part = 'snippet, contentDetails, statistics').execute()

        data['vid_id'].append(video)
        data['title'].append(res['items'][0]['snippet']['title'])
        data['description'].append(res['items'][0]['snippet']['description'])
        data['category'].append(int(res['items'][0]['snippet']['categoryId']))    # 필요할까?
        data['date'].append(res['items'][0]['snippet']['publishedAt'])    # datetime
        data['duration'].append(res['items'][0]['contentDetails']['duration'])    # string
        data['2d3d'].append(res['items'][0]['contentDetails']['dimension'])
        data['definition'].append(res['items'][0]['contentDetails']['definition'])
        
        views = res['items'][0]['statistics'].get('viewCount')
        if views is None:
            views = 0
        data['views'].append(int(views))
        
        likes = res['items'][0]['statistics'].get('likeCount')
        if likes is None:
            likes = 0
        data['likes'].append(int(likes))
        
        dislikes = res['items'][0]['statistics'].get('dislikeCount')
        if dislikes is None:
            dislikes = 0
        data['dislikes'].append(int(dislikes))
        
        favorites = res['items'][0]['statistics'].get('favoriteCount')
        if favorites is None:
            favorites = 0
        data['favorites'].append(int(favorites))

        num_comments = res['items'][0]['statistics'].get('commentCount')
        if num_comments is None:
            num_comments = 0
        data['num_comments'].append(int(num_comments))

    else:
        video_info = pd.DataFrame(data)
        print('All video information extracted!')
        return video_info

In [5]:
def divide_and_conquer(batch, final_vtubers, channel2videos):
    # add channel ID to dataframe
    channels = get_channels(batch)
    batch['channel_id'] = channels

    # add meta-information to dataframe
    debut, subs, views, vid_cnt, videos = get_channel_info(channels)
    batch['debut'] = debut
    batch['total_subs'] = subs
    batch['total_views'] = views
    batch['vid_cnt'] = vid_cnt
    
    # add records to final version
    final_vtubers = final_vtubers.append(batch)
    channel2videos.update(videos)
    
    return final_vtubers, channel2videos

In [6]:
############## MAIN ##############
vtuber_rank_file = '[20200526] vtuber_all_ranking.csv'
top100 = pd.read_csv(vtuber_rank_file, encoding = 'utf-8', sep = ',', index_col = 'ranking')[:100]
# batches = [top100[:5], top100[5:10], top100[10:15], top100[15:20], top100[20:25], top100[25:30], top100[30:35], top100[35:40], 
#            top100[40:45], top100[45:50], top100[50:55], top100[55:60], top100[60:65], top100[65:70], top100[70:75], top100[75:80], 
#            top100[80:85], top100[85:90], top100[90:95], top100[95:100]]

In [7]:
# build Youtube Data v3 API
DEVELOPER_KEY = 'AIzaSyCWsLADiiwZg0zGRVKvhuzqDjHri56ZVk8'

API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

youtube = build(API_SERVICE_NAME, API_VERSION, developerKey = DEVELOPER_KEY)

In [20]:
# initialize base for data recording
final_vtubers = pd.DataFrame()
channel2videos = dict()
final_videos = pd.DataFrame()
lower = 90

In [99]:
# #### 이 아래 구문들은 for문 대신 수동으로 각  batch를 돌리길 권장합니다!!! (quota issues)
# for batch in batches:
#     final_vtubers, channel2videos = divide_and_conquer(batch, final_vtubers, channel2videos)

# for channel, videos in channel2videos:
#     video_info = get_video_info(videos)
#     video_info['channel_id'] = channel
#     final_videos = final_videos.append(video_info)

# # record collected data to an external file    
# final_vtubers.to_csv('final_vtubers_50_60.csv', encoding = 'utf-8')
# final_videos.to_csv('final_videos_50_60.csv', encoding = 'utf-8')

In [49]:
final_vtubers, channel2videos = divide_and_conquer(top100[lower:lower + 10], final_vtubers, channel2videos)

All channel IDs retrieved!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['channel_id'] = channels


All channel information has been recorded!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['debut'] = debut
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['total_subs'] = subs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['total_views'] = views
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

In [55]:
# conduct the following operation for アズマ リム(アズリム) 
# data error in api retrieval (system problem)

# final_vtubers['total_subs'][88] = 186000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_vtubers['total_subs'][88] = 186000


In [57]:
fname = f'final_vtubers_{lower}_{lower + 10}.csv'
final_vtubers.to_csv(fname, encoding = 'utf-8')

In [186]:
# extract channel IDs and check length of list
channels = list()
for channel in channel2videos:
    channels.append(channel)
print(len(channels))

10


In [187]:
# get video information from a single channel
video_info = get_video_info(channel2videos[channels[0]])
video_info['channel_id'] = channels[0]

# save video information to an external file
fname = f'final_videos_{lower}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


In [188]:
video_info = get_video_info(channel2videos[channels[1]])
video_info['channel_id'] = channels[1]

fname = f'final_videos_{lower + 1}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


In [189]:
video_info = get_video_info(channel2videos[channels[2]])
video_info['channel_id'] = channels[2]

fname = f'final_videos_{lower + 2}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


In [201]:
video_info = get_video_info(channel2videos[channels[3]])
video_info['channel_id'] = channels[3]

fname = f'final_videos_{lower + 3}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


In [202]:
video_info = get_video_info(channel2videos[channels[4]])
video_info['channel_id'] = channels[4]

fname = f'final_videos_{lower + 4}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


In [203]:
video_info = get_video_info(channel2videos[channels[5]])
video_info['channel_id'] = channels[5]

fname = f'final_videos_{lower + 5}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


In [204]:
video_info = get_video_info(channel2videos[channels[6]])
video_info['channel_id'] = channels[6]

fname = f'final_videos_{lower + 6}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


In [205]:
video_info = get_video_info(channel2videos[channels[7]])
video_info['channel_id'] = channels[7]

fname = f'final_videos_{lower + 7}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


In [208]:
video_info = get_video_info(channel2videos[channels[8]])
video_info['channel_id'] = channels[8]

fname = f'final_videos_{lower + 8}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


In [209]:
video_info = get_video_info(channel2videos[channels[9]])
video_info['channel_id'] = channels[9]

fname = f'final_videos_{lower + 9}.csv'
video_info.to_csv(fname, encoding = 'utf-8')

All video information extracted!


Data Accumulation

In [58]:
# accumulate vtuber data and record on external file
columns = ['ranking', 'jp_name', 'eng_name', 'agency', 'total_subs', 'total_views', 'channel_id', 'debut', 'vid_cnt']
final_vtubers = pd.DataFrame(columns = columns)

# files for rank1~50 (processed by 정걸)
fnames = ['final_vtubers_0_4.csv', 
          'final_vtubers_1_2.csv', 
          'final_vtubers_2_3.csv', 
          'final_vtubers_3_4.csv', 
          'final_vtubers_4_6.csv', 
          'final_vtubers_5_8.csv', 
          'final_vtubers_7_10.csv', 
          'final_vtubers_8_10.csv', 
          'final_vtubers_9_10.csv', 
          'final_vtubers_10_12.csv', 
          'final_vtubers_11_12.csv', 
          'final_vtubers_12_22.csv', 
          'final_vtubers_14_17.csv', 
          'final_vtubers_15_18.csv', 
          'final_vtubers_18_20.csv', 
          'final_vtubers_20_30.csv', 
          'final_vtubers_24_26.csv', 
          'final_vtubers_26_28.csv', 
          'final_vtubers_27_29.csv', 
          'final_vtubers_28_30.csv', 
          'final_vtubers_30_40.csv', 
          'final_vtubers_31_33.csv', 
          'final_vtubers_33_35.csv', 
          'final_vtubers_35_37.csv', 
          'final_vtubers_37_39.csv', 
          'final_vtubers_39_41.csv', 
          'final_vtubers_40_50.csv', 
          'final_vtubers_42_44.csv', 
          'final_vtubers_44_46.csv', 
          'final_vtubers_46_48.csv', 
          'final_vtubers_48_50.csv', 
          'final_vtubers_49_51.csv']

for fname in fnames:
    vtubers_temp = pd.read_csv(fname, encoding = 'utf-8', sep = ',').to_numpy()
    for row in vtubers_temp:
        if row[0] not in list(final_vtubers.ranking):
            final_vtubers = final_vtubers.append(pd.DataFrame([row], columns = columns))

# files for rank 51~100 (processed by 박형서)
lower = 50

while lower <= 90:
    vtubers_fname = f'final_vtubers_{lower}_{lower + 10}.csv'
    vtubers_temp = pd.read_csv(vtubers_fname, encoding = 'utf-8', sep = ',')
    final_vtubers = final_vtubers.append(vtubers_temp)
    lower += 10

final_vtubers = final_vtubers.set_index('ranking')
final_vtubers.to_csv('final_vtubers.csv', encoding = 'utf-8')

In [25]:
# accumulate video data and record on external file
final_videos = pd.DataFrame()
columns = ['vid_id', 'title', 'description', 'category', 'date', 'duration', '2d3d', 'definition', 
           'views', 'likes', 'dislikes', 'favorites', 'num_comments', 'channel_id']

i = 0    

while i < 100:
    videos_fname = f'final_videos_{i}.csv'
    videos_temp = pd.read_csv(videos_fname, encoding = 'utf-8', sep = ',')
    final_videos = final_videos.append(videos_temp)
    i += 1

final_videos = pd.DataFrame(final_videos, columns = columns).set_index('vid_id')
final_videos.to_csv('final_videos.csv', encoding = 'utf-8')