In [1]:
import json
import os
import time

import googleapiclient.discovery
import pandas as pd
import tqdm
from googleapiclient.errors import HttpError

In [3]:
# YouTube Data API key, read from the environment so the secret never lives in the notebook.
# NOTE(review): a literal key used to be hardcoded on this line; treat it as leaked and rotate it.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key."
    )
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [4]:
# Collect video ids for a search query. Each search request asks for at most 50 results,
# so larger result sets are gathered page by page.

def get_video_ids(query, max_results=100):
    """Return up to ``max_results`` video ids matching ``query``.

    Uses the module-level ``youtube`` client.

    Args:
        query: Search string sent as ``q`` to ``search().list``.
        max_results: Upper bound on the number of ids returned. Values <= 0 yield an
            empty list without any API call.

    Returns:
        A list of video id strings in the order the API returned them, truncated to
        ``max_results``.

    Raises:
        HttpError: Re-raised when the API error body reports ``quotaExceeded`` so the
            caller can persist what it has collected. Any other HTTP error is printed,
            paging stops, and the ids gathered so far are returned.
    """
    video_ids = []
    results_per_page = 50  # page size requested from the search endpoint
    pages = (max_results + results_per_page - 1) // results_per_page  # ceiling division
    next_page_token = None

    for _ in range(pages):  # at most one request per page
        try:
            response = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token,
            ).execute()
        except HttpError as e:
            # The machine-readable reason is in the JSON error body; e.resp only holds
            # the HTTP response headers, so looking up 'reason' there never matches.
            try:
                error_reason = json.loads(e.content)["error"]["errors"][0]["reason"]
            except (ValueError, KeyError, IndexError, TypeError, AttributeError):
                error_reason = None

            if error_reason == 'quotaExceeded':
                print("Quota exceeded while searching for videos.")
                raise
            print(f"An error occurred: {e}")
            break  # retrying the same page immediately would just repeat the failure

        # Only keep results that actually carry a video id.
        for item in response.get('items', []):
            if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                video_ids.append(item['id']['videoId'])

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break  # no further pages available

    return video_ids[:max_results]

In [5]:
# Fetch top-level comments for one video. A single commentThreads request is made, with a
# page size capped at 100, so at most 100 comments are returned per video.
def get_top_korean_comments(video_id, max_results=100):
    """Return the text of up to ``max_results`` top-level comments on ``video_id``.

    Uses the module-level ``youtube`` client and issues one ``commentThreads().list``
    request. No language filter and no ``order`` argument are sent, so despite the
    function name the comments are whatever the API returns by default.

    Args:
        video_id: Id of the video whose comment threads are requested.
        max_results: Requested number of comments. Values above 100 are clamped to 100;
            values <= 0 yield an empty list without any API call.

    Returns:
        A list of comment strings (``textDisplay`` of each top-level comment). Empty
        when comments are disabled or a non-quota HTTP error occurs.

    Raises:
        HttpError: Re-raised when the API error body reports ``quotaExceeded`` so the
            caller can persist what it has collected.
    """
    if max_results <= 0:
        return []

    comments = []
    try:
        response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=min(max_results, 100),  # larger page sizes are rejected by the API
            textFormat="plainText",
        ).execute()
    except HttpError as e:
        # The machine-readable reason is in the JSON error body; e.resp only holds
        # the HTTP response headers, so looking up 'reason' there never matches.
        try:
            error_reason = json.loads(e.content)["error"]["errors"][0]["reason"]
        except (ValueError, KeyError, IndexError, TypeError, AttributeError):
            error_reason = None

        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        elif error_reason == 'quotaExceeded':
            print("Quota exceeded while fetching comments.")
            raise
        else:
            print(f"An error occurred: {e}")
        return comments

    for item in response.get('items', []):
        comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])

    return comments

In [7]:
# Flatten the collected comments into a two-column table and write it to CSV.
def save_data_to_csv(video_comments, path="youtube_comments.csv"):
    """Write one CSV row per (video id, comment) pair.

    Args:
        video_comments: Mapping of video id to a list of comment strings, e.g.
            ``{"4DUYBXdUYzA": ["first comment", "second comment"]}``. Videos with an
            empty list contribute no rows.
        path: Destination file. Defaults to ``"youtube_comments.csv"``.

    Returns:
        None. The file has the columns ``Video_ID`` and ``Comment`` and no index column.
    """
    rows = [
        (video_id, comment)
        for video_id, comments in video_comments.items()
        for comment in comments
    ]

    # Passing the column names explicitly keeps the header even when there are no rows.
    df = pd.DataFrame(rows, columns=["Video_ID", "Comment"])

    df.to_csv(path, index=False)

In [8]:
# Search keywords, one per crawl query. The first entry is the show title itself.
participants = [
    "흑백요리사",
    "백종원",
    "안성재",
    "에드워드 리",
    "나폴리 맛피아",
    "트리플스타",
    "요리하는 돌아이",
    "최현석",
    "장호준",
    "여경래",
    "안유성",
    "정지선",
    "최강록",
    "조은주",
    "오세득",
    "파브리치오 페라리",
    "이영숙",
    "선경 롱게스트",
    "김도윤",
    "박준우",
]

In [13]:
# Maps video id -> list of comment strings, e.g. {"4DUYBXdUYzA": ["first comment", "second comment"]}
video_comments = {}

start = time.time()
query_baisic = "흑백요리사"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant

    try:
        video_ids = get_video_ids(query, max_results=50)

        for video_id in video_ids:
            if video_id in video_comments:
                # Already fetched through an earlier query; asking again only burns quota.
                continue
            video_comments[video_id] = get_top_korean_comments(video_id)
    except HttpError as e:
        # The machine-readable reason is in the JSON error body; e.resp only holds
        # the HTTP response headers, so looking up 'reason' there never matches.
        try:
            error_reason = json.loads(e.content)["error"]["errors"][0]["reason"]
        except (ValueError, KeyError, IndexError, TypeError, AttributeError):
            error_reason = None

        if error_reason == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            break  # leave the loop; the save below persists what was collected
        print(f"An error occurred for query {query!r}: {e}")

    # Elapsed time is measured from the start of the whole crawl, not per query.
    end = time.time()
    print(f"{end - start}s for query: {query}")

save_data_to_csv(video_comments)

  5%|▌         | 1/20 [00:15<04:49, 15.25s/it]

15.248111009597778s for query: 흑백요리사 흑백요리사


 10%|█         | 2/20 [00:28<04:17, 14.30s/it]

28.887962102890015s for query: 흑백요리사 백종원


 15%|█▌        | 3/20 [00:42<04:01, 14.19s/it]

42.94471049308777s for query: 흑백요리사 안성재


 20%|██        | 4/20 [00:57<03:51, 14.48s/it]

57.860220432281494s for query: 흑백요리사 에드워드 리


 25%|██▌       | 5/20 [01:14<03:46, 15.09s/it]

74.05014872550964s for query: 흑백요리사 나폴리 맛피아


 30%|███       | 6/20 [01:27<03:24, 14.58s/it]

87.63070130348206s for query: 흑백요리사 트리플스타


 35%|███▌      | 7/20 [01:41<03:04, 14.19s/it]

101.0162513256073s for query: 흑백요리사 요리하는 돌아이


 40%|████      | 8/20 [01:56<02:56, 14.70s/it]

116.82235884666443s for query: 흑백요리사 최현석


 45%|████▌     | 9/20 [02:12<02:44, 14.91s/it]

132.1999397277832s for query: 흑백요리사 장호준


 50%|█████     | 10/20 [02:25<02:23, 14.31s/it]

145.16038131713867s for query: 흑백요리사 여경래


 55%|█████▌    | 11/20 [02:39<02:08, 14.25s/it]

159.2785131931305s for query: 흑백요리사 안유성


 60%|██████    | 12/20 [02:53<01:54, 14.34s/it]

173.82319259643555s for query: 흑백요리사 정지선


 65%|██████▌   | 13/20 [03:09<01:42, 14.65s/it]

189.16910910606384s for query: 흑백요리사 최강록


 70%|███████   | 14/20 [03:22<01:25, 14.18s/it]

202.27476477622986s for query: 흑백요리사 조은주


 75%|███████▌  | 15/20 [03:34<01:08, 13.61s/it]

214.5624566078186s for query: 흑백요리사 오세득


 80%|████████  | 16/20 [03:46<00:52, 13.02s/it]

226.22314310073853s for query: 흑백요리사 파브리치오 페라리


 85%|████████▌ | 17/20 [03:59<00:39, 13.10s/it]

239.5098898410797s for query: 흑백요리사 이영숙


 90%|█████████ | 18/20 [04:12<00:25, 12.92s/it]

252.01423382759094s for query: 흑백요리사 선경 롱게스트


 95%|█████████▌| 19/20 [04:23<00:12, 12.37s/it]

263.0879054069519s for query: 흑백요리사 김도윤


100%|██████████| 20/20 [04:34<00:00, 13.72s/it]

274.48420906066895s for query: 흑백요리사 박준우





### Merge youtube_comments with movie_rating_dataset

In [14]:
# Load the comment table from "youtube_comments.csv" (the file name save_data_to_csv writes to).
comments = pd.read_csv("youtube_comments.csv")

In [15]:
# Preview the first rows of the loaded comments.
# NOTE(review): the saved output shows an "Unnamed: 0" column, which a file written with
# index=False would not contain — the output presumably comes from an older CSV; re-run to refresh.
comments.head()

Unnamed: 0,Video_ID,Comment
0,vebF7wUQLMo,"《흑백요리사: 요리 계급 전쟁》, 9월 17일 넷플릭스에서 시청하세요: https:..."
1,vebF7wUQLMo,빽햄요리사ㄷㄷ
2,vebF7wUQLMo,0:07
3,vebF7wUQLMo,백수저중에 옴진리교 교주가 있노 ㄷㄷㄷㄷ
4,vebF7wUQLMo,심사위원 등장씬은 대한민국 역대 등장씬 고트중에 하나다 ㄹㅇ


## SKIP (optional): merge with the Naver movie-review corpus — the cells down to `# SKIP END` were not run

In [None]:
import urllib.request
# Download the Naver movie ratings dataset from GitHub and store it locally as "ratings.txt".
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

In [None]:
# Read the downloaded ratings file (pd.read_table defaults to tab-separated) and preview it.
movie_data = pd.read_table('ratings.txt')
movie_data.head()

In [None]:
# Preview the YouTube comments next to the movie data shown in the previous cell.
comments.head()

In [None]:
# Compare the number of rows in the two datasets.
print(f"movie data length: {len(movie_data)}")
print(f"comments data length: {len(comments)}")

In [None]:
# The YouTube comments alone are too few to train word vectors, so they are pooled with
# the movie reviews. Each source is reduced to a single column renamed to "text".
df1_text = movie_data[['document']].rename(columns={'document': 'text'})
df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# Stack the two single-column frames and renumber the rows from 0.
merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
merged_df

In [None]:
# Report whether the merged frame contains any missing value.
print(merged_df.isnull().values.any())

In [None]:
# Drop every row that has a missing value, then confirm that none remain.
merged_df = merged_df.dropna(how = 'any')
print(merged_df.isnull().values.any()) 

In [None]:
# Number of rows left after dropping missing values.
print(len(merged_df)) 

In [None]:
# Keep only Hangul (jamo and syllable blocks) and spaces; every other character is removed.
# .assign builds a new frame instead of writing a column into the result of dropna(),
# which can trigger pandas' SettingWithCopyWarning.
merged_df = merged_df.assign(
    text=merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
)

In [None]:
# SKIP END

In [19]:
# !pip install konlpy

# from konlpy.tag import Okt
# okt = Okt()

# !pip install fugashi[unidic-lite]

In [16]:
import fugashi

# Tagger that the tokenization cell calls on every comment.
# NOTE(review): the commented-out install hint in this notebook is `fugashi[unidic-lite]`;
# unidic-lite is a Japanese dictionary, so Korean text is presumably split as unknown words
# rather than analysed morphologically — confirm, and consider a Korean analyser instead.
tagger = fugashi.Tagger()

In [17]:
# Check for missing values before tokenizing.
print(comments.isnull().values.any()) # => True

# Drop every row that has a missing value in any column.
comments = comments.dropna(how = 'any')

# Confirm that no missing values remain.
print(comments.isnull().values.any()) # => False

True
False


In [18]:
# Function words that are removed from every tokenized comment.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []

# Tokenize the YouTube comments (an earlier variant iterated over merged_df['text']).
for sentence in tqdm.tqdm(comments['Comment']):
    sentence = str(sentence).strip()

    if sentence == "":  # nothing left after trimming whitespace
        continue

    tokens = [node.surface for node in tagger(sentence)]

    # Keep a token only if it is not a stopword, has at least two characters,
    # and consists solely of alphabetic characters (str.isalpha).
    kept_tokens = []
    for token in tokens:
        if token not in stopwords and len(token) >= 2 and token.isalpha():
            kept_tokens.append(token)

    if len(kept_tokens) > 0:  # comments with no surviving tokens are dropped
        tokenized_data.append(kept_tokens)


100%|██████████| 43589/43589 [00:00<00:00, 60743.21it/s]


In [14]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   --- ------------------------------------ 2.4/24.0 MB 11.2 MB/s eta 0:00:02
   ------ --------------------------------- 4.2/24.0 MB 10.9 MB/s eta 0:00:02
   -------- ------------------------------- 5.2/24.0 MB 8.8 MB/s eta 0:00:03
   ---------- ----------------------------- 6.6/24.0 MB 7.7 MB/s eta 0:00:03
   ------------ --------------------------- 7.6/24.0 MB 7.3 MB/s eta 0:00:03
   -------------- ------------------------- 8.9/24.0 MB

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.


In [19]:
from gensim.models import Word2Vec

# Train 100-dimensional word vectors on the tokenized comments: context window of 5 tokens,
# tokens occurring fewer than 5 times are ignored, 4 worker threads, sg=0 selects CBOW.
model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [20]:
# (vocabulary size, vector dimension) of the trained embedding matrix.
model.wv.vectors.shape

(9225, 100)

In [21]:
# Words whose vectors are closest to "백종원".
# NOTE(review): in the saved output every neighbour scores about 0.999 and most are generic
# words, which suggests the vectors are barely differentiated — interpret with caution.
print(model.wv.most_similar("백종원"))

[('셰프', 0.9992114305496216), ('이게', 0.9991965293884277), ('ㅋㅋㅋ', 0.999139130115509), ('진짜', 0.9991201162338257), ('근데', 0.9991068840026855), ('쉐프', 0.9990993142127991), ('ㅈㄴ', 0.9990933537483215), ('아니', 0.9990734457969666), ('ㅋㅋ', 0.9990469813346863), ('그냥', 0.9989633560180664)]


In [22]:
# Words whose vectors are closest to "최현석".
print(model.wv.most_similar("최현석"))

[('안성재', 0.9994552731513977), ('에드워드', 0.9988729953765869), ('최강록', 0.9987758994102478), ('백종원', 0.9987531304359436), ('셰프', 0.9987066984176636), ('셰프님', 0.9986703991889954), ('쉐프', 0.9985906481742859), ('진짜', 0.9984706044197083), ('귀여워', 0.9982514977455139), ('너무', 0.9982216358184814)]


## Save W2V model

In [23]:
# Export the trained vectors in word2vec format to the file "ko_w2v".
model.wv.save_word2vec_format('ko_w2v')

In [24]:
# Convert the saved vectors into the TSV pair for the Embedding Projector;
# per the log below this writes ko_w2v_tensor.tsv and ko_w2v_metadata.tsv.
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

2025-05-02 14:36:00,846 - word2vec2tensor - INFO - running d:\apps\minicond3\envs\python-env-311\Lib\site-packages\gensim\scripts\word2vec2tensor.py --input ko_w2v --output ko_w2v
2025-05-02 14:36:00,846 - keyedvectors - INFO - loading projection weights from ko_w2v
2025-05-02 14:36:01,507 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (9225, 100) matrix of type float32 from ko_w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-05-02T14:36:01.483753', 'gensim': '4.3.3', 'python': '3.11.11 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:34:19) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'load_word2vec_format'}
2025-05-02 14:36:02,057 - word2vec2tensor - INFO - 2D tensor file saved to ko_w2v_tensor.tsv
2025-05-02 14:36:02,057 - word2vec2tensor - INFO - Tensor metadata file saved to ko_w2v_metadata.tsv
2025-05-02 14:36:02,058 - word2vec2tensor - INFO - finished running word2vec2tensor.py


## Visualization for embedding

In [None]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv