In [2]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [None]:
# YouTube API key
API_KEY = "YOUR-API-KEY" # YOUR-API-KEY
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [5]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [6]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        elif error_reason == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [7]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("data/youtube_comments.csv", index=False)

In [8]:
participants = ["흑백요리사","백종원","안성재","에드워드 리","나폴리 맛피아","트리플스타","요리하는 돌아이","최현석","장호준","여경래","안유성","정지선","최강록","조은주","오세득","파브리치오 페라리","이영숙","선경 롱게스트","김도윤","박준우"]

In [9]:
video_comments = {}
# Ex: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}

start = time.time()
query_baisic = "흑백요리사"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  0%|          | 0/20 [00:00<?, ?it/s]

  5%|▌         | 1/20 [00:12<04:02, 12.75s/it]

12.767638921737671s for query: 흑백요리사 흑백요리사


 10%|█         | 2/20 [00:23<03:33, 11.85s/it]

23.98433256149292s for query: 흑백요리사 백종원


 15%|█▌        | 3/20 [00:35<03:16, 11.57s/it]

35.21283006668091s for query: 흑백요리사 안성재


 20%|██        | 4/20 [00:46<03:06, 11.66s/it]

47.00940942764282s for query: 흑백요리사 에드워드 리


 25%|██▌       | 5/20 [00:57<02:48, 11.25s/it]

57.524434089660645s for query: 흑백요리사 나폴리 맛피아


 30%|███       | 6/20 [01:07<02:32, 10.91s/it]

67.78552746772766s for query: 흑백요리사 트리플스타


 35%|███▌      | 7/20 [01:18<02:20, 10.83s/it]

78.44955778121948s for query: 흑백요리사 요리하는 돌아이


 40%|████      | 8/20 [01:29<02:09, 10.82s/it]

89.24549412727356s for query: 흑백요리사 최현석


 45%|████▌     | 9/20 [01:38<01:54, 10.45s/it]

98.87991786003113s for query: 흑백요리사 장호준


 50%|█████     | 10/20 [01:48<01:42, 10.29s/it]

108.82306933403015s for query: 흑백요리사 여경래


 55%|█████▌    | 11/20 [01:59<01:33, 10.42s/it]

119.52593541145325s for query: 흑백요리사 안유성


 60%|██████    | 12/20 [02:09<01:23, 10.43s/it]

129.9796130657196s for query: 흑백요리사 정지선


 65%|██████▌   | 13/20 [02:21<01:15, 10.79s/it]

141.58497309684753s for query: 흑백요리사 최강록


 70%|███████   | 14/20 [02:32<01:04, 10.70s/it]

152.094176530838s for query: 흑백요리사 조은주


 75%|███████▌  | 15/20 [02:41<00:52, 10.46s/it]

An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=2iVC6EgKOHY&maxResults=100&textFormat=plainText&key=AIzaSyC43Ohag0uBkVRGx9vX1G9mZ1evWvW8qS4&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
161.98191285133362s for query: 흑백요리사 오세득


 80%|████████  | 16/20 [02:48<00:37,  9.40s/it]

168.91727375984192s for query: 흑백요리사 파브리치오 페라리


 85%|████████▌ | 17/20 [02:59<00:29,  9.69s/it]

179.2769148349762s for query: 흑백요리사 이영숙


 90%|█████████ | 18/20 [03:09<00:19,  9.99s/it]

189.97369170188904s for query: 흑백요리사 선경 롱게스트


 95%|█████████▌| 19/20 [03:19<00:09,  9.71s/it]

199.02794742584229s for query: 흑백요리사 김도윤


100%|██████████| 20/20 [03:29<00:00, 10.46s/it]

209.1494266986847s for query: 흑백요리사 박준우





### Merge youtube_comments with movie_rating_dataset

In [10]:
comments = pd.read_csv("data/youtube_comments.csv")

In [11]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,vebF7wUQLMo,"《흑백요리사: 요리 계급 전쟁》, 9월 17일 넷플릭스에서 시청하세요: https:..."
1,vebF7wUQLMo,백종원 지 주제에ㅋㅋ ㅋㅇㅋ
2,vebF7wUQLMo,빽햄요리사ㄷㄷ
3,vebF7wUQLMo,0:07
4,vebF7wUQLMo,백수저중에 옴진리교 교주가 있노 ㄷㄷㄷㄷ


## SKIP

In [None]:

import urllib.request
# download naver movie ratings dataset
# urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

In [None]:
movie_data = pd.read_table('data/ratings.txt')
movie_data.head()

In [None]:
comments.head()

In [None]:
print(f"movie data length: {len(movie_data)}")
print(f"comments data length: {len(comments)}")

In [None]:
# Merge two dataset because number of Comments dataset is not big enough to train word vectors.
df1_text = movie_data[['document']].rename(columns={'document': 'text'})
df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# merge movie_data and yt_comments_data
merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
merged_df

In [None]:
# NULL check
print(merged_df.isnull().values.any())

In [None]:
merged_df = merged_df.dropna(how = 'any') # drop rows with null values
print(merged_df.isnull().values.any()) 

In [None]:
print(len(merged_df)) 

In [None]:
# remove all characters other than Hangeul
merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

In [None]:
# SKIP END

In [12]:
# !pip install konlpy
# !pip install gensim

from konlpy.tag import Okt
okt = Okt()

In [None]:
# !pip install fugashi[unidic-lite]
# import fugashi
# tagger = fugashi.Tagger()

In [13]:
# NULL check
print(comments.isnull().values.any()) # => True

comments = comments.dropna(how = 'any') # drop rows with null values

print(comments.isnull().values.any()) # => False

True
False


In [14]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []

# merged_df['text'] => comments['Comment']
for sentence in tqdm.tqdm(comments['Comment']): 
    sentence = str(sentence).strip()
    
    if not sentence:  # 빈 문자열이면 건너뛰기
        continue
        
    tokenized_sentence = okt.morphs(sentence, stem=True)    # 토큰화
    
    stopwords_removed_sentence = [word for word in tokenized_sentence 
                                  if not word in stopwords  # 조건1
                                     and len(word) >= 2     # 조건2   
                                     and word.isalpha()]    # 한글이나 영어 
    
    if stopwords_removed_sentence:  # 빈 리스트가 아니라면 추가
        tokenized_data.append(stopwords_removed_sentence)


100%|██████████| 41800/41800 [01:00<00:00, 690.42it/s] 


In [15]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [16]:
model.wv.vectors.shape

(6269, 100)

In [17]:
print(model.wv.most_similar("백종원"))

[('기준', 0.9369398951530457), ('누가', 0.9333535432815552), ('누굴', 0.9310179352760315), ('사기꾼', 0.9259648323059082), ('무슨', 0.9180936813354492), ('저기', 0.916887104511261), ('의원', 0.9114497303962708), ('자격', 0.9107682704925537), ('참가자', 0.9106348156929016), ('위원', 0.900736927986145)]


In [22]:
print(model.wv.most_similar("최현석"))

[('안성재', 0.9994552731513977), ('에드워드', 0.9988729953765869), ('최강록', 0.9987758994102478), ('백종원', 0.9987531304359436), ('셰프', 0.9987066984176636), ('셰프님', 0.9986703991889954), ('쉐프', 0.9985906481742859), ('진짜', 0.9984706044197083), ('귀여워', 0.9982514977455139), ('너무', 0.9982216358184814)]


## Save W2V model

In [18]:
model.wv.save_word2vec_format('model/ko_w2v')

In [19]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

2025-05-13 10:40:48,674 - word2vec2tensor - INFO - running d:\apps\minicond3\envs\python-env-311\Lib\site-packages\gensim\scripts\word2vec2tensor.py --input ko_w2v --output ko_w2v
2025-05-13 10:40:48,674 - keyedvectors - INFO - loading projection weights from ko_w2v
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\apps\minicond3\envs\python-env-311\Lib\site-packages\gensim\scripts\word2vec2tensor.py", line 94, in <module>
    word2vec2tensor(args.input, args.output, args.binary)
  File "d:\apps\minicond3\envs\python-env-311\Lib\site-packages\gensim\scripts\word2vec2tensor.py", line 68, in word2vec2tensor
    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\apps\minicond3\envs\python-env-311\Lib\site-packages\gensim\models\keyedvectors.py"

## Visualization for embedding

In [None]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv