**Exploratory Data Analysing Using Youtube Video Data from Most YouTube Influencers in Germany**

Quelle für den Scraper Programmcode von https://github.com/thu-vu92/youtube-api-analysis, abgerufen am: 02.11.2022

In [1]:
from googleapiclient.discovery import build
import pandas as pd
from distutils import errors
from pathlib import Path

In [2]:
#remove before upload to GitHub
api_key = 'AIzaSyBJ7mfYocgj8mEx3Ts41FmLGVHcSNbTd6s' #Paula's Key

channel_ids = ["UCYJ61XIK64sp6ZFFS8sctxw", #Gronkh
               "UCTXeJ33DzXI2veQpKfrvaYw", #Julien Bam
               "UCcn4UOBvB0W2HjCcLFLuu2w", #ungespielt
               "UCLCb_YDL9XfSYsWpS5xrO5Q", #Rezo
               "UCpAMOlA_0hFXopIxMq8ar0w", #MontanaBlack
               "UCpZ_DI-ZugwMzXcqccaTVsg", #Dagi Bee
               "UCHfdTAyg5t4mb1G-3rJ6QsQ", #BibisBeautyPalace 
               "UCdFWqYbJHMZGbOs0efRwmsg", #xLL by Lisa & Lena
               "UChVRfsT_ASBZk10o0An7Ucg", #Pamela Reif
               "UCi3OE-aN09WOcN9d2stCvPg" ] #charli d'amelio

api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(api_service_name, api_version, developerKey=api_key)

In [3]:
def get_channel_stats(youtube, channel_ids):
    
    """
    Get channel stats
    
    Params:
    ------
    youtube: build object of Youtube API
    channel_ids: list of channel IDs
    
    Returns:
    ------
    dataframe with all channel stats for each channel ID
    
    """
    
    all_data = []
    
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel_ids)
    )
    response = request.execute()

    # loop through items
    for item in response['items']:
        data = {'channelName': item['snippet']['title'],
                'subscribers': item['statistics']['subscriberCount'],
                'views': item['statistics']['viewCount'],
                'totalVideos': item['statistics']['videoCount'],
                'playlistId': item['contentDetails']['relatedPlaylists']['uploads']
        }
        
        all_data.append(data)
        
    return(pd.DataFrame(all_data))

def get_video_ids(youtube, playlist_id):
    
    video_ids = []
    
    request = youtube.playlistItems().list(
        part="snippet,contentDetails",
        playlistId=playlist_id,
        maxResults = 50
    )
    response = request.execute()
 
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    while next_page_token is not None:
        request = youtube.playlistItems().list(
                    part='contentDetails',
                    playlistId = playlist_id,
                    maxResults = 50,
                    pageToken = next_page_token)
        response = request.execute()

        for item in response['items']:
            video_ids.append(item['contentDetails']['videoId'])

        next_page_token = response.get('nextPageToken')
        
    return video_ids

def get_video_details(youtube, video_ids):

    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics,topicDetails",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption'],
                             'topicDetails': ['topicCategories']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
    
    return pd.DataFrame(all_video_info)


def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """

    all_comments = []
    print('Got '+str(len(video_ids)) + ' video IDs ')
    percent = 0.0
    
    for video_id in video_ids:
        #next_page_token = response.get('nextPageToken')
        #while next_page_token is not None:
            try:   
                request = youtube.commentThreads().list(
                    part="snippet,replies",
                    videoId=video_id,
                    maxResults=100,
                    textFormat="plainText"
                    #pageToken = next_page_token
                )
                response = request.execute()
            
                comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items']]
                comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

                #next_page_token = response.get('nextPageToken')
                
            except: 
                # When error occurs - most likely because comments are disabled on a video
                print('Could not get comments for video ' + video_id)


        
    return pd.DataFrame(all_comments)  

In [23]:
def get_comment_threads(youtube, video_id):
    try:
        results = youtube.commentThreads().list(
            part="snippet",
            maxResults=150,
            videoId=video_id
        ).execute()

        return results
    except:
        # When error occurs - most likely because comments are disabled on a video
        print('Could not get comments for video ' + video_id)
        return None
    
    

In [13]:
def load_comments(result):
    comments = []

    for item in result["items"]:
        comment = item["snippet"]["topLevelComment"]
        author = comment["snippet"]["authorDisplayName"]
        text = comment["snippet"]["textDisplay"]
        # print("Comment by {}: {}".format(author, text))
        
        comments.append(text)
    
    return comments

In [4]:
channel_data = get_channel_stats(youtube, channel_ids)

In [5]:
channel_data

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,Dagi Bee,3960000,1061882356,644,UUpZ_DI-ZugwMzXcqccaTVsg
1,charli d'amelio,9470000,305646938,64,UUi3OE-aN09WOcN9d2stCvPg
2,BibisBeautyPalace,5880000,2875928915,956,UUHfdTAyg5t4mb1G-3rJ6QsQ
3,ungespielt,3760000,2118366292,4768,UUcn4UOBvB0W2HjCcLFLuu2w
4,rezo,1750000,191478344,49,UULCb_YDL9XfSYsWpS5xrO5Q
5,MontanaBlack,2910000,323532581,371,UUpAMOlA_0hFXopIxMq8ar0w
6,xLL by Lisa & Lena,887000,58660820,53,UUdFWqYbJHMZGbOs0efRwmsg
7,Gronkh,4920000,3653270484,15124,UUYJ61XIK64sp6ZFFS8sctxw
8,Pamela Reif,9230000,1581102771,186,UUhVRfsT_ASBZk10o0An7Ucg
9,Julien Bam,6000000,1588456718,238,UUTXeJ33DzXI2veQpKfrvaYw


In [6]:
# Create a dataframe with video statistics from all channels

video_df = pd.DataFrame()

for c in channel_data['channelName'].unique():
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
    
    # get video data
    video_data = get_video_details(youtube, video_ids)

    # append video data together and comment data toghether
    video_df = video_df.append(video_data, ignore_index=True)

Getting video information from channel: Dagi Bee


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: charli d'amelio


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: BibisBeautyPalace


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: ungespielt


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: rezo


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: MontanaBlack


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: xLL by Lisa & Lena


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Gronkh


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Pamela Reif


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Julien Bam


  video_df = video_df.append(video_data, ignore_index=True)


In [18]:
numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors = 'coerce', axis = 1)

In [14]:
video_data = video_df[0:0]

channel_title = video_df['channelTitle'].unique()[0]

# get videos for selected channel
temp = video_df.loc[video_df['channelTitle'] == channel_title]
# sort data by video view count
temp.sort_values('viewCount', ascending=False)
# only get top 30 videos
temp = temp.head(30)
temp.reset_index(inplace=True)
temp.apply(pd.to_numeric, errors = 'coerce', axis = 1)
# temp.drop(temp.columns[[0]], axis=1, inplace=True)

# append found videos to video_data dataframe
video_data = pd.concat([temp,video_data],  ignore_index=False)
# video_data.append(temp)

comments_query = get_comment_threads(youtube=youtube, video_id=temp['video_id'].to_list()[0])
comments = load_comments(result=comments_query)

In [28]:
# 22-11-19 : read comments by artist and video
from timeit import default_timer as timer

video_data = video_df[0:0]


for channel_title in video_df['channelTitle'].unique():
    print("== QUERYING DATA FOR CHANNEL ["+channel_title+"]")
    channel_comment_data = []
    
    # get videos for selected channel
    temp = video_df.loc[video_df['channelTitle'] == channel_title]
    # sort data by video view count
    temp.sort_values('viewCount', ascending=False)
    # only get top 30 videos
    temp = temp.head(30)
    temp.reset_index(inplace=True)
    temp.apply(pd.to_numeric, errors = 'coerce', axis = 1)
    # temp.drop(temp.columns[[0]], axis=1, inplace=True)

    # append found videos to video_data dataframe
    video_data = pd.concat([temp,video_data],  ignore_index=False)
    # video_data.append(temp)

    print(" - got video data for " + channel_title + "with " + str(len(video_data.index)) +" videos")

    # get 100 comments for each of the 30 retrieved videos
    for video in temp['video_id']:
        print(" - retrieving comment data for ID="+video +" [" + channel_title + "]")
        _start = timer()

        # video id for current video
        current_video_id = video
        # query comments for that video
        comments_query = get_comment_threads(youtube=youtube, video_id=current_video_id)

        if comments_query is None:
            comments = []
            print("got no comments")
        else:
            # retrieve queried comments as list of strings
            comments = load_comments(result=comments_query)

        _end = timer()

        print(" - queried "+ str(len(comments)) +" comments (took " + str(round((_end - _start),4)) + "s) from ID="+video +" [" + channel_title + "]")

        # append to all comment data we have so far from channel
        # for com in comments:
        #     channel_comment_data.append(com)
        # were appending a list of comments as strings to our total list of comments for this particular youtube star
        channel_comment_data += comments

    # we're done with comments for this channel
    # lets save them locally as csv
    csv_data = pd.Series(channel_comment_data)
    print(" - saving " + str(len(csv_data.index)) + "comments from creator [" + channel_title + "] to local harddrive")
    # save channel comments as csv file
    csv_data.to_csv('C:/Users/D074066/Downloads/'+channel_title+'_comments.csv', sep=';',encoding='utf-16')
    print(" - ... SAVED CSV SUCCESSFULLY")

video_data.to_csv('C:/Users/D074066/Downloads/'+'videos_top30.csv', sep=';')

== QUERYING DATA FOR CHANNEL [Dagi Bee]
 - got video data for Dagi Beewith 30 videos
 - retrieving comment data for ID=TfIY6KLnJgg [Dagi Bee]
 - queried 100 comments (took 0.6669s) from ID=TfIY6KLnJgg [Dagi Bee]
 - retrieving comment data for ID=3w-EyQ6P0sU [Dagi Bee]
 - queried 85 comments (took 0.2198s) from ID=3w-EyQ6P0sU [Dagi Bee]
 - retrieving comment data for ID=JBavdh_I3DQ [Dagi Bee]
 - queried 100 comments (took 0.2611s) from ID=JBavdh_I3DQ [Dagi Bee]
 - retrieving comment data for ID=DKAQorfKg_E [Dagi Bee]
 - queried 100 comments (took 0.2144s) from ID=DKAQorfKg_E [Dagi Bee]
 - retrieving comment data for ID=4WC1KE5vfuA [Dagi Bee]
 - queried 100 comments (took 0.2176s) from ID=4WC1KE5vfuA [Dagi Bee]
 - retrieving comment data for ID=iYJlDKPaQMk [Dagi Bee]
 - queried 100 comments (took 0.2296s) from ID=iYJlDKPaQMk [Dagi Bee]
 - retrieving comment data for ID=h3zYPkxHeX4 [Dagi Bee]
 - queried 100 comments (took 0.3898s) from ID=h3zYPkxHeX4 [Dagi Bee]
 - retrieving comment data 

In [27]:
csv_data.to_csv('C:/Users/D074066/Downloads/emojitest.csv', sep=';',encoding='utf-32')

In [None]:
video_data = video_df[0:0]


commentdata_per_channel = {}

# iterate over all channels
for channel_title in video_df['channelTitle'].unique():
    # get videos for selected channel
    temp = video_df.loc[video_df['channelTitle']==channel_title]
    # sort data by video view count
    temp.sort_values('viewCount', ascending=False)
    # only get top 30 videos
    temp = temp.head(30)
    temp.reset_index(inplace=True)
    temp.apply(pd.to_numeric, errors = 'coerce', axis = 1)
    # temp.drop(temp.columns[[0]], axis=1, inplace=True)

    # append found videos to video_data dataframe
    video_data = pd.concat([temp,video_data],  ignore_index=False)
    # video_data.append(temp)


    # get all comments for each video
    if False:
        # get comments for selected video by video id
        comments = get_comments_in_videos(youtube, video_ids=temp['video_id'].to_list())
        # store comments per youtube channel in dictionary
        commentdata_per_channel[channel_title] = comments
    else:
        comments_query = get_comment_threads(youtube=youtube, video_ids=temp['video_id'].to_list())
        comments = load_comments(result=comments_query)
        


    comments.to_csv('C:/Users/D074066/Downloads/'+channel_title+'_comments_top30.csv', sep=';')

video_data.to_csv('C:/Users/D074066/Downloads/'+'videos_top30.csv', sep=';')

Got 30 video IDs 
Got 30 video IDs 
Got 30 video IDs 
Could not get comments for video 1OzUV_GA_qM
Got 30 video IDs 
Got 30 video IDs 
Could not get comments for video l4IKrZymsXU
Got 30 video IDs 
Could not get comments for video Z2m2SMdEmzY
Got 30 video IDs 
Could not get comments for video LUJeN1CjH9s
Could not get comments for video rdWkXGArdjA
Could not get comments for video RV29eBiaVhc
Could not get comments for video dqr6eic6PBg
Could not get comments for video hAEth8ZwMKE
Could not get comments for video WOTARgOfTZ0
Could not get comments for video UKOmNgJylvA
Could not get comments for video L12I5xySoHU
Could not get comments for video uR2RhUg-zZM
Could not get comments for video BFlkLss3Y94
Could not get comments for video 3dr0VLWdadM
Could not get comments for video dilHrCtO670
Could not get comments for video eXKCngbGv2c
Could not get comments for video LlLJvls5ves
Could not get comments for video BuA9Siwwk8c
Could not get comments for video lj_htsgNDo0
Could not get comme

Preprocessing of comments

stopwords, lowercase, no emojis etc.

In [None]:
#remove emojis
pip install demoji
import demoji

demoji.download_codes()

def remove_em(text):
    dem = demoji.findall(text)
    for item in dem.keys():
        text = text.replace(item, "")
    return text

In [None]:
#remove puncations

#library that contains punctuation
import string
string.punctuation

#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
data['clean_msg']= data['v2'].apply(lambda x:remove_punctuation(x))
data.head()

In [None]:
#lowering 
data['msg_lower']= data['clean_msg'].apply(lambda x: x.lower())

In [None]:
#language detection
pip install langdetect

In [None]:
categories_splitted = video_data['topicCategories'].apply(pd.Series)
video_data['category'] = categories_splitted.iloc[:,0].astype(str)
video_data['category'] = video_data['category'].apply(lambda cat: cat.replace('https://en.wikipedia.org/wiki/','').split('_(')[0])

In [None]:
video_data.to_csv('C:/Users/D074066/Downloads/'+'videos_top30_clean.csv', sep=';')

### Playground

In [105]:
def clean_alt_list(list_):
    list_ = list_.replace(', ', '","')
    list_ = list_.replace('[', '["')
    list_ = list_.replace(']', '"]')
    return list_

video_data['category'] = video_data['topicCategories']
# video_data['category'] = video_data['category'].apply(clean_alt_list)
temp = video_data['category'].apply(pd.Series)
temp2 = temp.iloc[:,0]

  temp = video_data['category'].apply(pd.Series)
  temp = video_data['category'].apply(pd.Series)
  temp = video_data['category'].apply(pd.Series)
  temp = video_data['category'].apply(pd.Series)
  temp = video_data['category'].apply(pd.Series)


In [None]:
nltk.download('punkt');

def tokenize(column):
    """Tokenizes a Pandas dataframe column and returns a list of tokens.

    Args:
        column: Pandas dataframe column (i.e. df['text']).

    Returns:
        tokens (list): Tokenized list, i.e. [Donald, Trump, tweets]

    """

    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()] 

In [None]:
df['tokenized'] = df.apply(lambda x: tokenize(x['clean_punctations']), axis=1)
df

In [None]:
# remove stop words --> im Stemming mitdrin
#stops = set(stopwords.words('german'))
#german_stop_words = stopwords.words('german')
#df['without_stopwords'] = df['urls'].apply(lambda x: ' '.join([word for word in x.split() if word not in (german_stop_words)]))
#df

In [106]:
import re

SEARCHSTRING = 'https://en.wikipedia.org/wiki/Hallo_(mama)'

c = SEARCHSTRING.replace('https://en.wikipedia.org/wiki/','')
c = c.split('_(')[0]
print(c)

test = video_data.iloc[1]['topicCategories']

c = test[0]
print(c)

print(c.replace('https://en.wikipedia.org/wiki/',''))
print(c.split('_(')[0])


Hallo
https://en.wikipedia.org/wiki/Lifestyle_(sociology)
Lifestyle_(sociology)
https://en.wikipedia.org/wiki/Lifestyle
