## 1. Data Loading

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
comment_df = pd.read_csv('./datasets/gd1Ab-TeGCY_comment_info.csv', index_col=0)
comment_list = list(comment_df['comment'])
len(comment_list)

1678

In [3]:
comment_list[:3]

['곽윤기 선수님 추월 진짜 스무스 하게 너무 잘하심,,, 매번 느끼고 보는 거지만 볼때마다 감탄함ㅠㅠㅠㅠㅠ 곽윤기 선수님, 김동욱 선수님, 황대헌 선수님, 이준서 선수님 4분 모두 수고하셨어요!!! 이준서 선수님 아프신거 빨리 나으시길 바라요ㅠㅠㅠㅠㅠㅠ',
 '곽윤기 진짜 대단함 진짜 스케이트를 사랑하는사람. 비슷한 나이때에 처음보고 이제 30대중반이 됐는데 아직도 현역뛰고 기량도 좋고 밑에서 치고 올라 올 때마다 자존심도 상할 수 있고 현타도 올텐데 다이겨내고 국대까지.... 진짜 멋지다.',
 '와 진짜...곽윤기선수님은 진짜 전설.... 인코스 추월이 쉽게 보일정도로 엄청나게 잘하심ㅠㅠ']

## 2. 텍스트 전처리

In [4]:
# 워드 임베딩
# TF
text = ''.join(comment_list)
text = re.sub(r'[^ ㄱ-ㅣ가-힣+]', '', text)
words = text.replace('.', '').split()
len(words)

10789

In [5]:
word_count = np.unique(words, return_counts=True)
word_count

(array(['+애민정신', 'ㄱ', 'ㄱㅇㄱ야ㅠㅜㅠ', ..., '힘을', '힘이면', '힘입니다'], dtype='<U83'),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64))

In [6]:
word_to_cnt = {}
for word, cnt in zip(*word_count):
    word_to_cnt[word] = cnt

word_to_cnt = sorted(word_to_cnt.items(), key=lambda x: x[1], reverse=True)
word_to_cnt

[('진짜', 206),
 ('곽윤기', 203),
 ('너무', 170),
 ('선수', 148),
 ('선수들', 72),
 ('정말', 57),
 ('윤기', 49),
 ('잘', 47),
 ('선수님', 44),
 ('역시', 43),
 ('금메달', 40),
 ('인코스', 38),
 ('다', 36),
 ('왜', 33),
 ('대한민국', 32),
 ('경기', 30),
 ('마지막', 30),
 ('쇼트트랙', 30),
 ('추월', 30),
 ('결승', 28),
 ('카메라', 28),
 ('꽉잡아', 24),
 ('화이팅', 23),
 ('넘', 22),
 ('우리', 22),
 ('우리나라', 22),
 ('사이로', 21),
 ('선수가', 21),
 ('ㅋㅋㅋ', 20),
 ('모두', 20),
 ('보고', 20),
 ('중국', 20),
 ('곽윤기선수', 19),
 ('꼭', 19),
 ('꽉잡아윤기', 19),
 ('때', 19),
 ('수', 19),
 ('이준서', 19),
 ('거', 18),
 ('귀신같이', 18),
 ('더', 18),
 ('마지막에', 18),
 ('와', 18),
 ('유튜버', 18),
 ('ㅋ', 17),
 ('다른', 17),
 ('다시', 17),
 ('좋은', 17),
 ('최고', 17),
 ('황대헌', 17),
 ('계속', 16),
 ('올림픽', 16),
 ('유튜버가', 16),
 ('이게', 16),
 ('인코스로', 16),
 ('김동욱', 15),
 ('너무너무', 15),
 ('보는', 15),
 ('유튜브', 15),
 ('이', 15),
 ('것', 14),
 ('이렇게', 14),
 ('존나', 14),
 ('항상', 14),
 ('ㅋㅋ', 13),
 ('곽윤기가', 13),
 ('그리고', 13),
 ('대박', 13),
 ('저렇게', 13),
 ('ㅈㄴ', 12),
 ('같이', 12),
 ('그', 12),
 ('그냥', 12),
 ('다들', 12),
 ('

In [9]:
# TDM
from sklearn.feature_extraction.text import CountVectorizer

new_comment_list = []
for comment in comment_list:
    comment = re.sub(r'[^ ㄱ-ㅣ가-힣+]', '', comment)
    new_comment_list.append(comment)

vector = CountVectorizer()
tdm_arr = vector.fit_transform(new_comment_list).toarray()
tf_dic = vector.vocabulary_

# print(tdm_arr)
# print(tf_dic)

In [10]:
tf_dic_sorted = dict(sorted(tf_dic.items(), key=lambda x : x[1], reverse=True))
tdm_df = pd.DataFrame(tdm_arr, columns=tf_dic_sorted.keys())
tdm_df

Unnamed: 0,힘입니다,힘이면,힘을,힘은,힘받지,힘들었을수도,힘들었을까,힘들듯,힘들겠다,힘든곳에서,...,ㄹㄱㄴ,ㄷㄷㄷ진짜,ㄷㄷㄷㄷㄷㄷㄷ,ㄷㄷㄷㄷㄷ,ㄷㄷㄷㄷ,ㄷㄷㄷ,ㄷㄷ,ㄲ아,ㄱ엔딩요정,ㄱㅇㄱ야ㅠㅜㅠ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1674,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1675,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1676,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
