# Data Scraping and Filtering from YouTube

- Input
    1. Youtube API Key
    2. Playlist ID
    3. Path for the directory
    4. Name of the playlist scraped
    5. Language of the videos scraped

- Output
    1. The original comments, as scraped from youtube.
       **Ex:** *1_NEGATIVE_ENGLISH_original_comments.csv*

    2. The cleaned and filtered CSV for labeling, in which we will label the entries; this file has no punctuation or stopwords.
       **Ex:** *1_NEGATIVE_ENGLISH_cleaned_and_filtered_comments_for_labeling_LABEL_HERE.csv*
       
    3. The cleaned and filtered helper CSV. It contains the same comments as the labeling file but still includes stopwords, punctuation, etc., so it is easier to read: we read the comments in this file and enter the labels in the other one.
       **Ex:** *1_NEGATIVE_ENGLISH_cleaned_and_filtered_comments_helper.csv*

## Install and load libraries
Remember to install the following packages before running the script

In [None]:
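# Use this cell to install the required libraries on your machine

# !pip install keras
# !pip install tf-keras  # needed when Keras 3 is installed: the TensorFlow models in transformers require it
# !pip install pandas nltk regex spacy matplotlib wordcloud transformers tensorflow numpy google-api-python-client
# !python -m spacy download en_core_web_sm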

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from nltk import ne_chunk
import regex as re
import spacy # We need spacy for german lemmatization
#import de_core_news_sm
import en_core_web_sm
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from transformers import pipeline
from transformers import DistilBertTokenizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
#from germansentiment import SentimentModel
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np
from googleapiclient.discovery import build
import pandas as pd
import getpass



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\giopa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\giopa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\giopa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Input

In [3]:
########### ENTER YOUR API KEY HERE ###########

# You can get your API key from the Google Cloud Console
# (https://console.cloud.google.com/apis/api/youtube.googleapis.com/credentials?hl=it&project=polished-watch-421614)
#
# SECURITY: do not paste the key into this notebook. Anything stored here is
# leaked as soon as the notebook is shared or committed. The key is read from
# the YOUTUBE_API_KEY environment variable; if that is not set you are prompted
# for it (the input is not echoed). Any key that was ever stored in this cell
# in plain text should be revoked and regenerated in the Cloud Console.
import os

api_key = os.environ.get('YOUTUBE_API_KEY') or getpass.getpass('YouTube Data API key: ')

########### ENTER THE PLAYLIST ID HERE ###########

# Make a playlist on YouTube and copy the playlist ID from the URL.
# Several playlists can be scraped in one run by adding more IDs to the list.

playlist_ids = ['PLi6PHirMO4RB17Ja_L2XZhVxdu_1OzD8m']

########### ENTER THE PATH TO THE FOLDER WHERE YOU WANT TO SAVE THE CSV FILE ###########

# Change the last number to the number of the playlist you are scraping
# Look at the Google Sheet to see which number corresponds to which playlist based on the color
#
# NOTE: this value is used as a file-name PREFIX (the code builds file names as
# path + csv_file + '<suffix>.csv'), so the trailing number becomes the first
# part of the file name, e.g. ...\Downloads\1_NEUTRAL_ENGLISH_original_comments.csv

path = r'C:\Users\giopa\Downloads\1' # Example for windows: r'C:\Users\danie\Downloads\1', r'C:\Users\danie\Downloads\2', r'C:\Users\danie\Downloads\3'
                                     # Example for mac: '/Users/danie/Downloads/1', '/Users/danie/Downloads/2', '/Users/danie/Downloads/3'

########### ENTER THE NAME OF THE PLAYLIST YOU ARE SCRAPING HERE ###########

# This is the name of the playlist you are scraping. It will be used to name the CSV file

csv_file = "_NEUTRAL_ENGLISH" # Example: "_POSITIVE_ENGLISH", "_NEUTRAL_ENGLISH", "_POSITIVE_GERMAN", "_NEGATIVE_GERMAN"

########### ENTER THE LANGUAGE FOR SCRAPING ###########

# Supported by the filtering pipeline below: 'english' or 'german'

language = 'english'


## Data Scraping

In [4]:
# Build the YouTube Data API v3 client; requests made through it use api_key as the developer key
youtube = build('youtube', 'v3', developerKey=api_key)

def get_all_video_ids_from_playlists(youtube, playlist_ids):
    """Collect the video IDs of every item in the given playlists.

    youtube : client exposing playlistItems().list(...).execute()
    playlist_ids : the playlists to walk through ; list of str

    Returns a single flat list of video IDs, playlist after playlist and
    page after page. A video listed more than once is returned more than once.
    """
    collected_ids = []

    for current_playlist in playlist_ids:
        # None asks for the first page; afterwards we follow 'nextPageToken'
        page_token = None
        has_more_pages = True

        while has_more_pages:
            response = youtube.playlistItems().list(
                part='contentDetails',
                playlistId=current_playlist,
                maxResults=50,
                pageToken=page_token,
            ).execute()

            for entry in response['items']:
                collected_ids.append(entry['contentDetails']['videoId'])

            # The last page carries no 'nextPageToken'
            page_token = response.get('nextPageToken')
            has_more_pages = page_token is not None

    return collected_ids

# Fetch all video IDs from the specified playlists (the API requests are sent when this line runs)
video_ids = get_all_video_ids_from_playlists(youtube, playlist_ids)



# Function to get replies for a specific comment
def get_replies(youtube, parent_id, video_id):
    """Download every reply posted under one top-level comment.

    youtube : client exposing comments().list(...).execute()
    parent_id : ID of the top-level comment whose replies are fetched ; str
    video_id : ID of the video the thread belongs to; only copied into the
               'VideoID' field of each returned row ; str

    Returns a list of dicts with the keys 'Timestamp', 'Username', 'VideoID',
    'Comment' and 'Date'. 'Date' is 'updatedAt' when the API returns it and
    'publishedAt' otherwise.
    """
    collected = []
    page_token = None  # None asks for the first page
    more_pages = True

    while more_pages:
        response = youtube.comments().list(
            part="snippet",
            parentId=parent_id,
            textFormat="plainText",
            maxResults=100,
            pageToken=page_token,
        ).execute()

        for entry in response['items']:
            snippet = entry['snippet']
            published = snippet['publishedAt']
            collected.append({
                'Timestamp': published,
                'Username': snippet['authorDisplayName'],
                'VideoID': video_id,
                'Comment': snippet['textDisplay'],
                'Date': snippet.get('updatedAt', published),
            })

        # A missing or empty token means this was the last page
        page_token = response.get('nextPageToken')
        more_pages = bool(page_token)

    return collected

# Function to get all comments (including replies) for a single video
def get_comments_for_video(youtube, video_id):
    """Download all top-level comments of a video together with their replies.

    youtube : client exposing commentThreads().list(...).execute()
    video_id : ID of the video to scrape ; str

    Returns a list of dicts with the keys 'Timestamp', 'Username', 'VideoID',
    'Comment' and 'Date'. Each top-level comment is directly followed by its
    replies (fetched with get_replies) when the thread reports at least one.
    """
    rows = []
    page_token = None  # None asks for the first page
    more_pages = True

    while more_pages:
        response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=page_token,
            textFormat="plainText",
            maxResults=100,
        ).execute()

        for thread in response['items']:
            thread_info = thread['snippet']
            top_level = thread_info['topLevelComment']
            snippet = top_level['snippet']
            published = snippet['publishedAt']

            rows.append({
                'Timestamp': published,
                'Username': snippet['authorDisplayName'],
                'VideoID': video_id,
                'Comment': snippet['textDisplay'],
                'Date': snippet.get('updatedAt', published),
            })

            # Threads without replies need no extra request
            if thread_info['totalReplyCount'] > 0:
                rows.extend(get_replies(youtube, top_level['id'], video_id))

        # A missing or empty token means this was the last page
        page_token = response.get('nextPageToken')
        more_pages = bool(page_token)

    return rows

# List to hold all comments (top-level comments and replies) from all videos
all_comments = []


# Scrape the videos one after the other; each call sends the API requests for that video
for video_id in video_ids:
    video_comments = get_comments_for_video(youtube, video_id)
    all_comments.extend(video_comments)

# Create DataFrame: one row per comment, built from the dicts returned by get_comments_for_video
comments_df = pd.DataFrame(all_comments)






In [5]:
########### ENTER EXPORT LOCATION HERE ###########

# Export whole dataset to the local machine as CSV File.
# The file name is built by plain string concatenation of the values from the
# Input cell: path + csv_file + '_original_comments.csv'. The DataFrame index is not written.
comments_df.to_csv(path + csv_file + '_original_comments.csv', index=False)

## Helper Functions

In [6]:
# Compiled once instead of on every call: remove_emoji is applied to each
# comment of the dataset, so the character class should not be rebuilt per row.
# Pattern taken from:
# https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002500-\U00002BEF"  # box drawing, shapes, misc. symbols, arrows
                            u"\U00002702-\U000027B0"  # dingbats
                            # NOTE(review): the next range is very wide; numerically it also
                            # contains e.g. the CJK and Hangul blocks, so text in those scripts
                            # is removed as well. Kept unchanged to preserve behaviour.
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"  # every code point above the BMP
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"  # zero width joiner
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # variation selector-16
                            u"\u3030"
                            "]+", flags=re.UNICODE)


def remove_emoji(comment):
    """Function to remove emojis.
        comment : data input ; str

        Returns the comment with every run of characters matched by
        _EMOJI_PATTERN deleted. Nothing is inserted in their place, so the
        whitespace around a removed emoji is kept as it was.
    """
    return _EMOJI_PATTERN.sub(r'', comment)


In [7]:
def P_data_reading(path_to_data):
    """Load the scraped comments and hand back only their text.
       path_to_data : where the data lives (anything pd.read_csv accepts) ; csv file

       Returns the 'Comment' column of the file, all rows ; pd.Series
    """
    return pd.read_csv(path_to_data)['Comment']

In [8]:
def P_data_cleaning(data, language, labelling):
    """Function to clean our data.
       data : data input ; pd.Series
       language : what language the comments are in (input in lowercase) ; str
       labelling : if we want to label, we keep punctuation & stopwords ; bool

       Returns the cleaned comments ; pd.Series. Rows are only dropped, never
       re-indexed, so the original index can be used to map between the
       different cleaned versions of the same comment.
    """

    # REMOVE NAN ENTRIES
    data = data.dropna()

    # REMOVE COMMENTS THAT EXCEED CERTAIN LENGTH (350 for now)
    data = data[data.str.len() <= 350]

    # FOR GERMAN DATA : Change ö , ä , ü to oe, ae, ue
    # The uppercase forms and ß are transliterated as well; otherwise the
    # punctuation step below would replace them with a space and split the word.
    transliterations = (("ö", "oe"), ("ä", "ae"), ("ü", "ue"),
                        ("Ö", "Oe"), ("Ä", "Ae"), ("Ü", "Ue"), ("ß", "ss"))
    for special_char, ascii_form in transliterations:
        data = data.str.replace(special_char, ascii_form, regex=False)

    # REMOVE NAMES FROM ANSWERS (in youtube comments scraper answers stored by @@)
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    data = data.str.replace(r'@@\w+', '', regex=True)

    # REMOVING PUNCTUATION
    # regex=True must be passed explicitly: since pandas 2.0 str.replace treats
    # the pattern as a literal string by default, which made this step a no-op.
    if not labelling:
        data = data.str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)

    # REMOVING EMOJIS
    data = data.apply(remove_emoji)

    # LOWERCASE
    data = data.str.lower()

    # REMOVING STOPWORDS
    if not labelling:
        # Load the stopword list once and use a set: the lookup runs for every
        # single word of every comment.
        stop_words = set(stopwords.words(language))
        data = data.apply(lambda text: ' '.join(word for word in text.split() if word not in stop_words))

    return data



  data = data.str.replace('@@\w+', '', regex=True)


In [9]:
def P_data_tokenization(comment, language, model):
    """
    Tokenize one comment with the tokenizer that belongs to the chosen model.
    comment : the current comment to analyze ; string
    language : the language for tokenization ; string (currently not used by the code)
    model : the tokenizer we are using (or from which model we are using the tokenizer from) ;
            compared case-insensitively, only 'distilbert' is implemented

    Returns the result of the tokenizer's encode_plus call (requested as PyTorch
    tensors) for 'distilbert', and None for every other model name.
    """
    # Only the distilBERT tokenizer is wired up; anything else falls through to None
    if model.lower() != 'distilbert':
        return None

    # Multilingual checkpoint, chosen so that German comments can be handled as well
    distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

    # NOTE(review): padding='longest' pads to the longest sequence of the batch, but a
    # single comment is encoded per call -- presumably no padding is added; confirm
    # whether a fixed length (padding='max_length') was intended.
    encoding_options = dict(
        max_length=128,       # upper bound on the sequence length
        padding='longest',
        truncation=True,      # cut comments that are longer than max_length
        return_tensors='pt',  # PyTorch tensors
    )

    return distilbert_tokenizer.encode_plus(comment, **encoding_options)


In [10]:
# spaCy model used for each supported language
_SPACY_MODEL_NAMES = {'german': 'de_core_news_sm', 'english': 'en_core_web_sm'}

# Pipelines that were already loaded, keyed by language. P_data_lemmatizing is
# applied to every single comment, so each model is loaded once and reused
# instead of being reloaded per comment.
_SPACY_PIPELINES = {}


def P_data_lemmatizing(comment, language):
    """FILTER APPROACH 1 : We first lemmatize so we get the base words of everything
       + we have less words in general and can build bigger groups
       - we will lose some accuracy in our sentiment analysis : words like best/better/good will all be just good

       To combat the negative effect, we will do the following : Build a mapping between the original input sentences
       and the lemmatized ones. We will just lemmatize to build the bigger groups and denoise our dataset. Then, when we
       done this, we map back to the original sentences and tokenize.

       Since we use pandas, we just won't reset indices. That way, we just keep the original pandas dataset (i.e. we save
       a copy of it after the cleaning steps and right before lemmatizing) and then use the indices for our mapping.

       comment : the current comment to analyze ; string
       language : the language for tokenization, 'german' or 'english' (case-insensitive) ; string

       Returns the lemmas of the comment joined by single spaces, lowercased ; string
       Raises ValueError if the language is not supported.
    """
    language_key = language.lower()

    if language_key not in _SPACY_MODEL_NAMES:
        raise ValueError(
            f"Unsupported language {language!r}; expected one of {sorted(_SPACY_MODEL_NAMES)}"
        )

    lemmatizer = _SPACY_PIPELINES.get(language_key)
    if lemmatizer is None:
        lemmatizer = spacy.load(_SPACY_MODEL_NAMES[language_key])
        _SPACY_PIPELINES[language_key] = lemmatizer

    lemmatized_comment = ' '.join(token.lemma_ for token in lemmatizer(comment))

    # After lemmatizing, some words are again higher cased
    return lemmatized_comment.lower()


In [11]:
def P_data_word_count(data):
    """FILTER APPROACH 1 : Count how often each word occurs over all comments.
       The most frequent words are the candidates for the buzz words that
       people use to express their sentiment; if a pattern shows up we can
       filter on them to remove noise from our data.
       data : data input (one comment per entry) ; pd.Series

       Returns (word_counts, words) : the counts sorted from most to least
       frequent ; pd.Series, and the words in that same order ; list
    """
    # One row per word: split every comment on whitespace, then explode the lists
    single_words = data.str.split().explode()

    # Count the occurrences and order them from most to least frequent
    occurrences = single_words.value_counts()
    ranked_counts = occurrences.sort_values(ascending=False)

    # We return the count as well as the (lemmatized) words themselves
    return ranked_counts, list(ranked_counts.index)


In [12]:
# Column layout of the per-word sentiment table built by P_data_filtering
_SENTIMENT_COLUMNS = ['word', 'sentiment_label', 'confidence_pos', 'confidence_neg',
                      'confidence_neutral', 'confidence_highest']


def _select_sentiment_words(words_sentiments_confidence, threshold, neutral_filter):
    """Keep the confidently positive/negative words plus the AI-related buzz words."""
    scores = words_sentiments_confidence

    # Nothing to filter (also avoids using the .str accessor on an empty frame)
    if scores.empty:
        return scores

    # Based on the threshold, we only keep the words with positive / negative sentiment with a confidence >= threshold.
    # The pre-trained models tend to give numbers a sentiment with high confidence, so we remove these as well.
    # Sometimes a single letter is classified as positive/negative; these are removed too.
    confident = scores['confidence_highest'] >= threshold
    not_neutral = scores['confidence_highest'] != scores['confidence_neutral']
    without_digits = ~scores['word'].str.contains(r'\d')
    longer_than_one_char = scores['word'].str.len() > 1
    strong_sentiment = scores[confident & not_neutral & without_digits & longer_than_one_char]

    # Finally, the buzz words that are AI related are kept regardless of their sentiment.
    buzz_words = scores[scores['word'].isin(neutral_filter)]

    # A buzz word with a strong sentiment is contained in both selections, hence drop_duplicates
    return pd.concat([strong_sentiment, buzz_words]).drop_duplicates()


def P_data_filtering(sentiment_words, model, language, threshold = 0.95):
    """FILTERING APPROACH 1 : We do pre-filtering on our data to remove noise.
       For this, we use pre-trained, state-of-the-art models to find the sentiments of different words in different languages.
       Next, we filter the data (see _select_sentiment_words)
       sentiment_words : list of words we want to use for filtering ; List of String
       model : which model to use ; string. Only checked for German, where it must be 'bert'
               (case-insensitive); the English branch always uses rabindralamsal/BERTsent
       language : the language for tokenization, 'english' or 'german' (case-insensitive) ; string
       threshold : threshold on the confidence level of sentiment predictions of the single words ; Float

       Returns the words that passed the filter with the columns 'word', 'sentiment_label',
       'confidence_pos', 'confidence_neg', 'confidence_neutral', 'confidence_highest' ; pd.DataFrame
       Raises ValueError for an unsupported language or (German only) an unsupported model.
    """
    language_key = language.lower()
    data = {column: [] for column in _SENTIMENT_COLUMNS}

    if language_key == 'english':
        # According to :
        # https://huggingface.co/rabindralamsal/BERTsent?text=I+like+you.+I+love+you
        tokenizer = AutoTokenizer.from_pretrained("rabindralamsal/BERTsent")
        classifier = TFAutoModelForSequenceClassification.from_pretrained("rabindralamsal/BERTsent")

        # Sentiments are stored as 0, 1, 2 in this model
        label_names = {0: 'negative', 1: 'neutral', 2: 'positive'}

        for word in sentiment_words:
            encoded_word = tokenizer.encode(word, return_tensors="tf")
            logits = classifier.predict(encoded_word)[0]
            # One word per call, so row 0 holds its three class probabilities
            probabilities = tf.nn.softmax(logits, axis=1).numpy()[0]
            label_index = int(np.argmax(probabilities))

            data['word'].append(word)
            data['sentiment_label'].append(label_names.get(label_index, label_index))
            data['confidence_pos'].append(probabilities[2])
            data['confidence_neg'].append(probabilities[0])
            data['confidence_neutral'].append(probabilities[1])
            data['confidence_highest'].append(max(probabilities[0], probabilities[1], probabilities[2]))

        # The table is built and filtered once, after all words have been scored.
        # (Doing this inside the loop rebuilt it for every word and left the
        # result undefined when sentiment_words was empty.)
        words_sentiments_confidence = pd.DataFrame(data, columns=_SENTIMENT_COLUMNS)

        # We only want to keep the neutral words that are somewhat related to AI.
        neutral_filter = ['ai', 'artificial', 'intelligence', 'machine', 'learning', 'robot']

        return _select_sentiment_words(words_sentiments_confidence, threshold, neutral_filter)

    if language_key == 'german':
        if model.lower() != 'bert':
            raise ValueError(f"Unsupported model {model!r} for German; expected 'bert'")

        # Imported here because the module-level import is commented out and the
        # package is only needed for German data.
        from germansentiment import SentimentModel

        classifier = SentimentModel() # Specifically trained on german texts !

        for word in sentiment_words:
            classes, probabilities = classifier.predict_sentiment([word], output_probabilities = True)
            # probabilities[0] holds one [label, probability] pair per class for this word;
            # the indices below are the ones the original code used for pos / neg / neutral.
            word_probabilities = probabilities[0]
            data['word'].append(word)
            data['sentiment_label'].append(classes[0])
            data['confidence_pos'].append(word_probabilities[0][1])
            data['confidence_neg'].append(word_probabilities[1][1])
            data['confidence_neutral'].append(word_probabilities[2][1])
            data['confidence_highest'].append(max(word_probabilities[0][1], word_probabilities[1][1], word_probabilities[2][1]))

        words_sentiments_confidence = pd.DataFrame(data, columns=_SENTIMENT_COLUMNS)

        # We only want to keep the neutral words that are somewhat related to AI.
        # The cleaning step rewrites ü as ue, so the transliterated spellings are
        # listed next to the original ones (which can no longer occur in cleaned data).
        neutral_filter = ['ai', 'künstlich', 'künstliche', 'kuenstlich', 'kuenstliche', 'intelligenz',
                          'ki', 'machine', 'learning', 'kunst', 'roboter', 'robot']

        return _select_sentiment_words(words_sentiments_confidence, threshold, neutral_filter)

    raise ValueError(f"Unsupported language {language!r}; expected 'english' or 'german'")


In [13]:
def P_data_remap(data_sentiments_filtered, data_lemmatized, data_only_cleaned_for_labeling, data_only_cleaned, path, csv_file):
    """
    FILTERING APPROACH 1: After we have found the words that show some strong sentiment or are connected to AI in some way,
    we now want to remap to the original sentences again
    data_sentiments_filtered : the final words with all the different sentiments scores, filtered ; pd.DataFrame
    data_lemmatized : our lemmatized (and cleaned) words ; pd.Series
    data_only_cleaned_for_labeling : cleaned data that still has punctuation & stopwords (easier to read) ; pd.Series
    data_only_cleaned : just cleaned data (no punctuation / stopwords) ; pd.Series
    path : prefix of the output files ; str
    csv_file : playlist name that is appended to the prefix ; str

    All three Series must share the same index, it is used to map between them.

    Writes two files:
      path + csv_file + '_cleaned_and_filtered_comments_helper.csv'                 <- data_only_cleaned_for_labeling
      path + csv_file + '_cleaned_and_filtered_comments_for_labeling_LABEL_HERE.csv' <- data_only_cleaned

    Returns (filtered data_only_cleaned_for_labeling, filtered data_only_cleaned) ; pd.Series, pd.Series
    """

    # We first create a set of all the words
    filtered_words = set(data_sentiments_filtered['word'])

    # Now we only want to keep the occurences where these words appear in our lemmatized version.
    # Whole words are compared: a substring test would let 'ai' match 'said' or
    # 'again' and thereby keep nearly every comment.
    contains_filtered_word = data_lemmatized.apply(
        lambda text: not filtered_words.isdisjoint(text.split())
    ).astype(bool)
    data_lemmatized_filtered = data_lemmatized[contains_filtered_word]

    # And then finally we map back to the unlemmatized ones, because we will be using tokenization
    kept_index = data_lemmatized_filtered.index
    data_cleaned_and_filtered = data_only_cleaned[data_only_cleaned.index.isin(kept_index)]
    data_cleaned_and_filtered_for_labeling = data_only_cleaned_for_labeling[data_only_cleaned_for_labeling.index.isin(kept_index)]

    # The readable version is the helper file; the labels are entered in the
    # file without punctuation / stopwords.
    data_cleaned_and_filtered_for_labeling.to_csv(path + csv_file + '_cleaned_and_filtered_comments_helper.csv')
    data_cleaned_and_filtered.to_csv(path + csv_file + '_cleaned_and_filtered_comments_for_labeling_LABEL_HERE.csv')

    return data_cleaned_and_filtered_for_labeling, data_cleaned_and_filtered



In [14]:
def V_word_cloud(data):
    """ Visualization tool. A word cloud so we can see what words appears most.
        data : contains the counts of each word (index = word, value = count) ; pd.Series

        Shows the figure with plt.show(); nothing is returned.
    """

    # Hand the counts to the word cloud directly. Joining only the index into
    # one text would make every word occur exactly once, so the counts would
    # have no influence on the picture.
    frequencies = {str(word): float(count) for word, count in data.items()}

    # Generate the word cloud (at most the 20 most frequent words are drawn)
    wordcloud = WordCloud(width=512, height=512, background_color='white', max_words=20).generate_from_frequencies(frequencies)

    # Display the word cloud
    plt.figure(figsize=(10, 8), facecolor='white', edgecolor='blue')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

## Filter

In [15]:
# Main function to run the whole pipeline

def main():
    """Run cleaning, lemmatizing, word scoring and filtering on the scraped comments.

    Reads the module-level settings `language`, `path` and `csv_file` from the
    Input cell and the file path + csv_file + '_original_comments.csv' written
    by the scraping step. The two filtered CSV files are written by P_data_remap.

    Returns (data_cleaned_and_filtered_for_labeling, data_cleaned_and_filtered) ; pd.Series, pd.Series
    Raises ValueError if `language` is neither 'german' nor 'english'.
    """

    # Set the language. The helper functions compare it case-insensitively, so
    # it is normalised once here and validated before any work is done.
    LANGUAGE = language.lower()
    if LANGUAGE not in ('german', 'english'):
        raise ValueError(f"Unsupported language {language!r}; expected 'german' or 'english'")

    # Read in the data
    data = P_data_reading(path + csv_file + '_original_comments.csv')

    # Clean the data
    data_cleaned = P_data_cleaning(data, language = LANGUAGE, labelling = False) # no punctuation / stopwords
    data_cleaned_for_labeling = P_data_cleaning(data, language = LANGUAGE, labelling = True) # We need this to map back to originals later on

    # The steps are the same for German and English; only the models chosen
    # inside the helper functions differ.

    # We first lemmatize the data
    data_cleaned_lemmatized = data_cleaned.apply(lambda x : P_data_lemmatizing(x, language = LANGUAGE))

    # We then count the words (only the word list is needed here)
    _, words = P_data_word_count(data_cleaned_lemmatized)

    # We then filter the words
    words_sentiments_filtered = P_data_filtering(words,
                                                 model = 'bert',
                                                 language = LANGUAGE)

    # We then remap the data
    data_cleaned_and_filtered_for_labeling, data_cleaned_and_filtered = P_data_remap(words_sentiments_filtered,
                                                                                     data_cleaned_lemmatized,
                                                                                     data_cleaned_for_labeling,
                                                                                     data_cleaned,
                                                                                     path,
                                                                                     csv_file)

    return data_cleaned_and_filtered_for_labeling, data_cleaned_and_filtered





In [16]:
# Run the whole pipeline: reads the exported comments CSV and writes the two filtered CSV files.
# NOTE: the recorded run below stopped with a RuntimeError asking for the tf-keras
# package (`pip install tf-keras`); install it before running this cell.
main()



tokenizer_config.json:   0%|          | 0.00/323 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

RuntimeError: Failed to import transformers.models.roberta.modeling_tf_roberta because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.