In [None]:
import csv
import json
import os
import re
from urllib.parse import parse_qs, urlparse

import matplotlib, seaborn, wordcloud, nltk
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

In [None]:
from google.colab import userdata
from google.colab import drive

In [None]:
pip install google-api-python-client



In [None]:
from googleapiclient.discovery import build

In [None]:
# Read the API key from the Colab secret named "YouTubeAPI" so it is not hardcoded in the notebook
youtube_api = userdata.get("YouTubeAPI")

In [None]:
# Build the YouTube Data API v3 client; get_YT_comments issues its requests through this object
youtube = build("youtube", "v3", developerKey=youtube_api)

#### Grab and save YouTube comments from each video

In [None]:
# Watch-page URLs of the two videos whose comments are collected below
youtube_beyonce_url = "https://www.youtube.com/watch?v=SDPITj1wlkg"
youtube_kendrick_lamar_url = "https://www.youtube.com/watch?v=KDorKy-13ak"

In [None]:
def extract_video_id(url):
    """Return the value of the ``v`` query parameter of a YouTube watch URL.

    The query string is parsed instead of splitting on "v=", so the ID stays
    clean when the URL carries extra parameters (e.g. ``&t=42s``).

    Args:
        url: A watch-page URL of the form ``https://www.youtube.com/watch?v=<id>``.

    Returns:
        The video ID as a string.

    Raises:
        ValueError: If the URL has no non-empty ``v`` parameter.
    """
    query = parse_qs(urlparse(url).query)
    if "v" not in query:
        raise ValueError(f"No video ID found in URL: {url!r}")
    return query["v"][0]


# Extract video IDs
videoID_beyonce = extract_video_id(youtube_beyonce_url)
videoID_kendrick = extract_video_id(youtube_kendrick_lamar_url)

In [None]:
def get_YT_comments(video_id, max_comments=None, client=None):
    """Collect the top-level comments of a video via ``commentThreads().list``.

    Pages are requested 100 threads at a time and followed through
    ``nextPageToken`` until the API stops returning one, or until
    ``max_comments`` rows have been gathered.

    Args:
        video_id: ID of the video whose comment threads are listed.
        max_comments: Optional cap on the number of rows returned. ``None``
            (the default) fetches every page.
        client: Optional API client exposing ``commentThreads().list(...).execute()``.
            Defaults to the notebook-level ``youtube`` client.

    Returns:
        A list of dicts with the keys ``video_id``, ``comment_id``, ``comment``,
        ``like_count``, ``reply_count`` and ``published_at``.
    """
    # A non-positive cap needs no API traffic at all.
    if max_comments is not None and max_comments <= 0:
        return []

    service = youtube if client is None else client
    comments_data = []
    next_page_token = None

    while True:
        response = service.commentThreads().list(
            part='snippet',
            videoId=video_id,
            pageToken=next_page_token,
            textFormat="plainText",
            maxResults=100  # optional: speeds up data retrieval in larger chunks
        ).execute()

        # Tolerate a page without an 'items' key instead of raising KeyError.
        for item in response.get('items', []):
            thread = item['snippet']
            snippet = thread['topLevelComment']['snippet']
            comments_data.append({
                'video_id': video_id,
                'comment_id': item['id'],
                'comment': snippet.get('textDisplay'),
                'like_count': snippet.get('likeCount'),
                'reply_count': thread.get('totalReplyCount'),
                'published_at': snippet.get('publishedAt')
            })
            # Stop as soon as the cap is reached so no further pages are requested.
            if max_comments is not None and len(comments_data) >= max_comments:
                return comments_data

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return comments_data

In [None]:
# Superseded earlier version of get_YT_comments, kept only as an inert string
# literal (nothing in it is executed). It collected just the textDisplay of each
# top-level comment into a flat list instead of one dict per comment.
'''
# Get YouTube comments
def get_YT_comments(video_id):
    comments = []
    next_page_token = None

    while True:
        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            pageToken=next_page_token,
            textFormat="plainText"
        ).execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return comments
    '''

In [None]:
# Crawl the comment threads of each video; each call pages through the API until exhausted.
# Beyonce Superbowl Performance
comments_beyonce = get_YT_comments(videoID_beyonce)
# Kendrick Lamar Performance
comments_kendrick = get_YT_comments(videoID_kendrick)

In [None]:
# Sanity check: show the first five fetched Kendrick entries
print(comments_kendrick[:5])

['ight im back here again', 'Today is 2 months ago....thats 60 million vieuws in a month', 'I think I watched this over 250 times.', 'THEY NOT LIKE US 😂😂😂✌✌', 'I keep coming back here, it really is such a phenomenal performance by K-Dot and the rest of the talented performers.']


In [None]:
# Put comments in data frame
# get_YT_comments stores the text under 'comment', while the sentiment cells read
# a 'Comments' column; rename here so both agree (a no-op if the column is absent).
beyonce_sentiment = pd.DataFrame(comments_beyonce).rename(columns={'comment': 'Comments'})

In [None]:
# get_YT_comments stores the text under 'comment', while the sentiment cells read
# a 'Comments' column; rename here so both agree (a no-op if the column is absent).
kendrick_sentiment = pd.DataFrame(comments_kendrick).rename(columns={'comment': 'Comments'})

In [None]:
# Preview the first rows of the Beyonce comments frame
beyonce_sentiment.head()

Unnamed: 0,Comments
0,Love it❤ 😍🎉😊
1,I love Beyonce more ❤❤❤❤❤❤❤❤❤😊🎉🎉❤
2,Greatest performer alive.
3,3:18 Beyonce ate
4,MY MOM THATS MY DAUGHTER UP THERE


In [None]:
# Preview the first rows of the Kendrick comments frame
kendrick_sentiment.head()

Unnamed: 0,Comments
0,ight im back here again
1,🎉🎉🎉🎉🎉🎉❤❤❤
2,Ive literally never once rewatched a half time...
3,Crazy that watching this performance is now a ...
4,the marching during ‘all the stars’ is so sati...


In [None]:
# Save data frames to CSV files
kendrick_sentiment.to_csv("/content/drive/MyDrive/kendrick_sentiment.csv")
beyonce_sentiment.to_csv("/content/drive/MyDrive/beyonce_sentiment.csv")

In [None]:
# Report how many rows each comments frame holds
print(f'Kendrick Comments: {len(kendrick_sentiment)}')
print(f'Beyonce Comments: {len(beyonce_sentiment)}')

Kendrick Comments: 134799
Beyonce Comments: 27302


In [None]:
# Draw a reproducible sample of Kendrick comments for manual labelling.
# Cap the size at the frame length so a smaller frame does not make sample() raise.
sample_size = min(3000, len(kendrick_sentiment))
sampled = (
    kendrick_sentiment
    .sample(n=sample_size, random_state=42)
    .reset_index(drop=True)
    .assign(label="", confidence="")  # empty columns to be filled in by the labeller
)
# Keep an existing comment_id (set by get_YT_comments); only fall back to the
# row position when the frame has no ID column.
if 'comment_id' not in sampled.columns:
    sampled['comment_id'] = sampled.index
sampled.to_csv('/content/drive/MyDrive/kcomments_to_label.csv', index=False)

#### Analysis

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Load model and tokenizer
# MODEL names the pretrained checkpoint; tokenizer and classifier are both loaded from it.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Sentiment labels, looked up in get_sentiment by the argmax index of the model scores.
# NOTE(review): the order is assumed to match the checkpoint's class indices — confirm against the model card.
labels = ['negative', 'neutral', 'positive']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
def get_sentiment(text):
    """Classify one piece of text with the notebook-level sentiment model.

    Args:
        text: The text to score.

    Returns:
        A ``(sentiment, scores)`` tuple: ``sentiment`` is the entry of
        ``labels`` at the highest-probability index, ``scores`` is a NumPy
        array of softmax probabilities in the model's output order.
    """
    # Tokenize and truncate long comments; keep the tensors on the model's device
    # so the function also works after the model has been moved to a GPU.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits[0]

    # torch.softmax avoids the overflow a hand-rolled exp/sum can hit on large logits.
    scores = torch.softmax(logits, dim=-1).cpu().numpy()

    sentiment = labels[int(np.argmax(scores))]
    return sentiment, scores

In [None]:
tqdm.pandas()

# Score every comment once and split the (label, scores) pair afterwards;
# applying get_sentiment separately per column runs the model twice per comment.
kendrick_scored = kendrick_sentiment['Comments'].progress_apply(lambda x: get_sentiment(str(x)))
kendrick_sentiment['Sentiment_Result'] = kendrick_scored.map(lambda pair: pair[0])
kendrick_sentiment['Sentiment_Scores'] = kendrick_scored.map(lambda pair: pair[1])

NameError: name 'tqdm' is not defined

In [None]:
tqdm.pandas()

# Add columns for sentiment and raw scores.
# Score every comment once and split the (label, scores) pair afterwards;
# applying get_sentiment separately per column runs the model twice per comment.
beyonce_scored = beyonce_sentiment['Comments'].progress_apply(lambda x: get_sentiment(str(x)))
beyonce_sentiment['Sentiment_Result'] = beyonce_scored.map(lambda pair: pair[0])
beyonce_sentiment['Sentiment_Scores'] = beyonce_scored.map(lambda pair: pair[1])

100%|██████████| 27298/27298 [43:36<00:00, 10.43it/s]
100%|██████████| 27298/27298 [43:05<00:00, 10.56it/s]


In [None]:
# Preview the Beyonce frame with the added sentiment columns
beyonce_sentiment.head()

Unnamed: 0,Comments,Sentiment_Result,Sentiment_Scores
0,Мои любимые❤,neutral,"[0.0150132105, 0.5154763, 0.46951053]"
1,Love it❤ 😍🎉😊,positive,"[0.001503222, 0.008867398, 0.9896294]"
2,I love Beyonce more ❤❤❤❤❤❤❤❤❤😊🎉🎉❤,positive,"[0.0015113754, 0.0057966304, 0.992692]"
3,Greatest performer alive.,positive,"[0.0028399553, 0.039102487, 0.9580575]"
4,3:18 Beyonce ate,neutral,"[0.015575977, 0.60946256, 0.3749615]"


In [None]:
# Persist the scored Beyonce frame; index=False leaves the row index out of the file.
# NOTE(review): Sentiment_Scores holds the arrays returned by get_sentiment, so the CSV
# will contain their string form — confirm that is acceptable for later reloading.
beyonce_sentiment.to_csv("/content/drive/MyDrive/beyonce_sentiment_results.csv", index=False)

In [None]:
def samples(df):
  """Prepare a comments frame for manual labelling, modifying it in place.

  Adds an empty ``label`` column, resets the index to 0..n-1 and makes sure
  a ``comment_id`` column exists. When the frame already carries a
  ``comment_id`` column, rows are de-duplicated on it and the IDs are kept;
  otherwise the new row position is used as the ID.

  Args:
    df: DataFrame of comments; mutated in place.

  Returns:
    None.
  """
  # De-duplicate on the existing IDs first: doing it after assigning the
  # (always unique) index as the ID could never remove a row.
  if 'comment_id' in df.columns:
    df.drop_duplicates(subset=['comment_id'], inplace=True)
  df['label'] = ""
  df.reset_index(drop=True, inplace=True)
  if 'comment_id' not in df.columns:
    df['comment_id'] = df.index


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): earlier cells already write under /content/drive/MyDrive, so on a
# fresh runtime this mount has to run before them.
#drive.flush_and_unmount()
drive.mount('/content/drive')

Mounted at /content/drive
