In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the project's data files
# under /content/drive are reachable from the cells below.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import google.colab.userdata
import pandas as pd
import re
import os

from googleapiclient.discovery import build

In [3]:
# Read the API key from the Colab secret store rather than hardcoding it.
API_KEY = google.colab.userdata.get('Youtube_API_key')

# Client for the YouTube Data API v3, authenticated with the developer key.
youtube = build(serviceName='youtube', version='v3', developerKey=API_KEY)

In [4]:
# Load the list of source video URLs (video_ids.csv) from Google Drive.
file_path = '/content/drive/MyDrive/NLP/toxic_comment_vietnamese/data/raw/video_ids.csv'

# Fail fast with an exception when the input is missing: raising stops this
# cell (and a "Run all") with a clear error, whereas exit() in a notebook
# targets the kernel/session instead of just halting the cell.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Lỗi: Không tìm thấy file '{file_path}'.")

# The file is parsed as tab-delimited, with the first row as column names.
df_urls = pd.read_csv(file_path, sep='\t', header=0)

In [5]:
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Supported URL shapes, tried in this order:
      * watch URLs carrying a ``v`` query parameter (``...watch?v=<id>``)
      * shorts URLs (``.../shorts/<id>``)
      * short links (``youtu.be/<id>``)
      * embed URLs (``.../embed/<id>``)

    Args:
        url: The URL to parse. Missing values (``None``/``NaN``) are accepted;
            other non-string values are converted with ``str()`` first.

    Returns:
        The video ID as a string, or ``None`` when ``url`` is missing or no
        supported pattern matches.
    """
    if pd.isna(url):
        return None

    # Cells read from a CSV may carry stray whitespace or be non-string.
    url = str(url).strip()

    # IDs are matched as runs of URL-safe characters so that trailing
    # "&param=...", "?param=..." or "#fragment" parts are never captured.
    id_chars = r'([A-Za-z0-9_-]+)'
    patterns = (
        # Anchor on "?" / "&" so only the real "v" parameter matches, not
        # another parameter whose name merely ends in "v" (e.g. "sv=1").
        r'[?&]v=' + id_chars,
        r'/shorts/' + id_chars,
        r'youtu\.be/' + id_chars,
        r'/embed/' + id_chars,
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

# Derive a video_id column from each row's source URL, then preview the frame.
df_urls['video_id'] = df_urls['source'].map(extract_video_id)
display(df_urls.head())

Unnamed: 0,id,source,topic,video_id
0,1,https://www.youtube.com/watch?v=64NKBn6XgCA,Việt Tân,64NKBn6XgCA
1,2,https://www.youtube.com/watch?v=F6lpMsA-LfU,Việt Tân,F6lpMsA-LfU
2,4,https://www.youtube.com/watch?v=owjfxRI2zI0,Việt Tân,owjfxRI2zI0
3,5,https://www.youtube.com/watch?v=ppMUX6WAQGM,Jack,ppMUX6WAQGM
4,6,https://www.youtube.com/watch?v=x18Ik_yO7A4,Sơn Tùng,x18Ik_yO7A4


In [6]:
def get_comments(video_id, youtube):
    """Collect the text of the top-level comments on a YouTube video.

    Pages through ``commentThreads().list`` (100 threads per request) until
    the response carries no ``nextPageToken``. Only each thread's top-level
    comment is read; replies are not requested.

    Args:
        video_id: ID of the video whose comment threads are listed.
        youtube: API client exposing ``commentThreads().list(...).execute()``.

    Returns:
        A list with the ``textOriginal`` string of every top-level comment
        retrieved. If any exception is raised while fetching or parsing, a
        message is printed and the comments gathered so far are returned.
    """
    collected = []
    page_token = None

    try:
        while True:
            response = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100,
                pageToken=page_token,
                textFormat='plainText',  # plain text is easier to extract than HTML
            ).execute()

            for thread in response.get('items', []):
                top_level = thread['snippet']['topLevelComment']['snippet']
                collected.append(top_level['textOriginal'])

            # A missing/empty token means the last page has been consumed.
            page_token = response.get('nextPageToken')
            if not page_token:
                break
    except Exception as e:
        # Best effort: report the failure and keep whatever was fetched.
        print(f"An error occurred while fetching comments for video {video_id}: {e}")

    return collected

In [7]:
# Gather the comments of every video into one flat list.
all_comments = []

# Rows whose URL yielded no video ID are skipped.
for video_id in df_urls['video_id'].dropna():
    all_comments.extend(get_comments(video_id, youtube))

print(f"Total comments fetched: {len(all_comments)}")

Total comments fetched: 8666


In [8]:
# Wrap the flat list of comment strings in a single-column frame and preview it.
df_comments = pd.DataFrame(data=all_comments, columns=['comment'])
display(df_comments.head())

Unnamed: 0,comment
0,Giống vụ con bé báo tao cháu tô lâm đây . CA b...
1,BIẾT BỐ MÀY LÀ AI KHÔNG .\nĐMCS .
2,Còn cái này là góp ý. Hy vọng là có đọc cmt. N...
3,Bò đỏ lên bình luận cái coi
4,"Duoc dao tao chinh qui,hoc truong cao cap ,duo..."


In [9]:
# Destination for the raw comment dump on Google Drive.
output_file_path = '/content/drive/MyDrive/NLP/toxic_comment_vietnamese/data/raw/youtube_comments.csv'

# The positional index is not written, so the file holds only the 'comment' column.
df_comments.to_csv(path_or_buf=output_file_path, index=False)
print(f"Comments saved successfully to '{output_file_path}'")

Comments saved successfully to '/content/drive/MyDrive/NLP/toxic_comment_vietnamese/data/raw/youtube_comments.csv'
