# Make Corpus

In [1]:
import numpy as np; import pandas as pd
import pickle
from collections import defaultdict

raw = pickle.load(open('for_hyunse_re.pickle', 'rb'))

# 하나의 채널에 대한 데이터이고 Count가 지시하는 바가 동영상의 개수임
# Items는 결국 길이를 Count만큼 갖는 리스트가 되겠음.

In [2]:
re = []; rere = []
for item in raw: # 8개의 동영상에 대해서 돌게 됨
    for indiv_com in item['CommentThreads']: # 개별 댓글에 대해 돎
        if type(indiv_com) is dict:
            re.append(indiv_com['comment']['tidyed_text']) # comment는 댓글 관련 정보
            for rerep in indiv_com['replies']: # replies는 댓글 내의 대댓글 관련 정보쓰..
                rere.append(rerep['tidyed_text'])

In [3]:
total_replies = re + rere

In [18]:
len(total_replies)

529134

In [35]:
import re
hangul = re.compile('[^ ㄱ-ㅣ가-힣.'']+') # # 한글과 띄어쓰기 . ''""를 제외한 모든 글자

final_total =[]
for comments in total_replies:
    final_total.append(hangul.sub('', comments)) # 한글과 띄어쓰기를 제외한 모든 글자 제외

# Cohesion Probability

In [30]:
class CohesionProbability: # 김현중 님 코드 그대로 복붙
    def __init__(self, left_min_length=1, left_max_length=10, right_min_length=1, right_max_length=6):

        self.left_min_length = left_min_length
        self.left_max_length = left_max_length
        self.right_min_length = right_min_length
        self.right_max_length = right_max_length

        self.L = defaultdict(int)
        self.R = defaultdict(int)

    def get_cohesion_probability(self, word):

        if not word:
            return (0, 0, 0, 0)

        word_len = len(word)

        l_freq = 0 if not word in self.L else self.L[word]
        r_freq = 0 if not word in self.R else self.R[word]

        if word_len == 1:
            return (0, 0, l_freq, r_freq)

        l_cohesion = 0
        r_cohesion = 0

        # forward cohesion probability (L)
        if (self.left_min_length <= word_len) and (word_len <= self.left_max_length):

            l_sub = word[:self.left_min_length]
            l_sub_freq = 0 if not l_sub in self.L else self.L[l_sub]

            if l_sub_freq > 0:
                l_cohesion = np.power((l_freq / float(l_sub_freq)), (1 / (word_len - len(l_sub) + 1.0)))

        # backward cohesion probability (R)
        if (self.right_min_length <= word_len) and (word_len <= self.right_max_length):

            r_sub = word[-1 * self.right_min_length:]
            r_sub_freq = 0 if not r_sub in self.R else self.R[r_sub]

            if r_sub_freq > 0:
                r_cohesion = np.power((r_freq / float(r_sub_freq)), (1 / (word_len - len(r_sub) + 1.0)))

        return (l_cohesion, r_cohesion, l_freq, r_freq)

    def get_all_cohesion_probabilities(self):

        cp = {}
        words = set(self.L.keys())
        for word in self.R.keys():
            words.add(word)

        for word in words:
            cp[word] = self.get_cohesion_probability(word)

        return cp

    def counter_size(self):
        return (len(self.L), len(self.R))

    def prune_extreme_case(self, min_count):

        before_size = self.counter_size()
        self.L = defaultdict(int, {k: v for k, v in self.L.items() if v > min_count})
        self.R = defaultdict(int, {k: v for k, v in self.R.items() if v > min_count})
        after_size = self.counter_size()

        return (before_size, after_size)

    def train(self, sents, num_for_pruning=0, min_count=5):

        for num_sent, sent in enumerate(sents):
            for word in sent.split():

                if not word:
                    continue

                word_len = len(word)

                for i in range(self.left_min_length, min(self.left_max_length, word_len) + 1):
                    self.L[word[:i]] += 1

                # for i in range(self.right_min_length, min(self.right_max_length, word_len)+1):
                for i in range(self.right_min_length, min(self.right_max_length, word_len)):
                    self.R[word[-i:]] += 1

            if (num_for_pruning > 0) and ((num_sent + 1) % num_for_pruning == 0):
                self.prune_extreme_case(min_count)

        if (num_for_pruning > 0) and ((num_sent + 1) % num_for_pruning == 0):
            self.prune_extreme_case(min_count)

    def extract(self, min_count=5, min_cohesion=(0.05, 0), min_droprate=0.8, remove_subword=True):

        word_to_score = self.get_all_cohesion_probabilities()
        word_to_score = {word: score for word, score in word_to_score.items()
                         if (score[0] >= min_cohesion[0])
                         and (score[1] >= min_cohesion[1])
                         and (score[2] >= min_count)}

        if not remove_subword:
            return word_to_score

        words = {}

        for word, score in sorted(word_to_score.items(), key=lambda x: len(x[0])):
            len_word = len(word)
            if len_word <= 2:
                words[word] = score
                continue

            try:
                subword = word[:-1]
                subscore = self.get_cohesion_probability(subword)
                droprate = score[2] / subscore[2]

                if (droprate >= min_droprate) and (subword in words):
                    del words[subword]

                words[word] = score

            except:
                print(word, score, subscore)
                break

        return words

    def transform(self, docs, l_word_set):

        def left_match(word):
            for i in reversed(range(1, len(word) + 1)):
                if word[:i] in l_word_set:
                    return word[:i]
            return ''

        return [[left_match(word) for sent in doc.split('  ') for word in sent.split() if left_match(word)] for doc in
                docs]

    def load(self, fname):
        try:
            with open(fname, encoding='utf-8') as f:

                next(f)  # SKIP: parameters(left_min_length left_max_length ...
                token = next(f).split()
                self.left_min_length = int(token[0])
                self.left_max_length = int(token[1])
                self.right_min_length = int(token[2])
                self.right_max_length = int(token[3])

                next(f)  # SKIP: L count
                is_right_side = False

                for line in f:

                    if '# R count' in line:
                        is_right_side = True
                        continue

                    token = line.split('\t')
                    if is_right_side:
                        self.R[token[0]] = int(token[1])
                    else:
                        self.L[token[0]] = int(token[1])

        except Exception as e:
            print(e)

    def save(self, fname):
        try:
            with open(fname, 'w', encoding='utf-8') as f:

                f.write('# parameters(left_min_length left_max_length right_min_length right_max_length)\n')
                f.write('%d %d %d %d\n' % (
                self.left_min_length, self.left_max_length, self.right_min_length, self.right_max_length))

                f.write('# L count')
                for word, freq in self.L.items():
                    f.write('%s\t%d\n' % (word, freq))

                f.write('# R count')
                for word, freq in self.R.items():
                    f.write('%s\t%d\n' % (word, freq))

        except Exception as e:
            print(e)

    def words(self):
        words = set(self.L.keys())
        words = words.union(set(self.R.keys()))
        return words
      
      
class CohesionTokenizer:
    def __init__(self, cohesion):
        self.cohesion = cohesion
        self.range_l = cohesion.left_max_length

    def tokenize(self, sentence, max_ngram=4, length_penalty=-0.05, ngram=False, debug=False):

        def flatten(tokens):
            return [word for token in tokens for word in token]

        tokens = [self._recursive_tokenize(token, max_ngram, length_penalty, ngram, debug) for token in
                  sentence.split()]
        words = flatten(tokens)

        if not debug:
            tokens = [word if type(word) == str else word[0] for word in words]

        return tokens

    def _recursive_tokenize(self, token, max_ngram=4, length_penalty=-0.05, ngram=False, debug=False):

        length = len(token)
        if length <= 2:
            return [token]

        range_l = min(self.range_l, length)

        scores = self._initialize(token, range_l, length)
        if debug:
            pprint(scores)

        result = self._find(scores)

        adds = self._add_inter_subtokens(token, result)

        if result[-1][2] != length:
            adds += self._add_first_subtoken(token, result)

        if result[0][1] != 0:
            adds += self._add_last_subtoken(token, result)

        result = sorted(result + adds, key=lambda x: x[1])

        if ngram:
            result = self._extract_ngram(result, max_ngram, length_penalty)

        return result

    def _initialize(self, token, range_l, length):
        scores = []
        for b in range(0, length - 1):
            for r in range(2, range_l + 1):
                e = b + r

                if e > length:
                    continue

                subtoken = token[b:e]
                score = self.cohesion.get_cohesion_probability(subtoken)
                # (subtoken, begin, end, cohesion_l, frequency_l, range)
                scores.append((subtoken, b, e, score[0], score[2], r))

        return sorted(scores, key=lambda x: (x[3], x[5]), reverse=True)

    def _find(self, scores):
        result = []
        num_iter = 0

        while scores:
            word, b, e, cp_l, freq_l, r = scores.pop(0)
            result.append((word, b, e, cp_l, freq_l, r))

            if not scores:
                break

            removals = []
            for i, (_1, b_, e_, _2, _3, _4) in enumerate(scores):
                if (b_ < e and b < e_) or (b_ < e and e_ > b):
                    removals.append(i)

            for i in reversed(removals):
                del scores[i]

            num_iter += 1
            if num_iter > 100: break

        return sorted(result, key=lambda x: x[1])

    def _add_inter_subtokens(self, token, result):
        adds = []
        for i, base in enumerate(result[:-1]):
            if base[2] == result[i + 1][1]:
                continue

            b = base[2]
            e = result[i + 1][1]
            subtoken = token[b:e]
            adds.append((subtoken, b, e, 0, self.cohesion.L.get(subtoken, 0), e - b))

        return adds

    def _add_first_subtoken(self, token, result):
        b = result[-1][2]
        subtoken = token[b:]
        score = self.cohesion.get_cohesion_probability(subtoken)
        return [(subtoken, b, len(token), score[0], score[2], len(subtoken))]

    def _add_last_subtoken(self, token, result):
        e = result[0][1]
        subtoken = token[0:e]
        score = self.cohesion.get_cohesion_probability(subtoken)
        return [(subtoken, 0, e, score[0], score[2], e)]

    def _extract_ngram(self, words, max_ngram=4, length_penalty=-0.05):

        def ngram_average_score(words):
            words = [word for word in words if len(word) > 1]
            scores = [word[3] for word in words]
            return max(0, np.mean(scores) + length_penalty * len(scores))

        length = len(words)
        scores = []

        if length <= 1:
            return words

        for word in words:
            scores.append(word)

        for b in range(0, length - 1):
            for r in range(2, max_ngram + 1):
                e = b + r

                if e > length:
                    continue

                ngram = words[b:e]
                ngram_str = ''.join([word[0] for word in ngram])
                ngram_str_ = '-'.join([word[0] for word in ngram])

                ngram_freq = self.cohesion.L.get(ngram_str, 0)
                if ngram_freq == 0:
                    continue

                base_freq = min([word[4] for word in ngram])
                ngram_score = np.power(ngram_freq / base_freq, 1 / (r - 1)) if base_freq > 0 else 0
                ngram_score -= r * length_penalty

                scores.append((ngram_str_, words[b][1], words[e - 1][2], ngram_score, ngram_freq, 0))

        scores = sorted(scores, key=lambda x: x[3], reverse=True)
        return self._find(scores)

In [37]:
cohesion = CohesionProbability()
cohesion.train(total_replies)
cohesiontokenizer = CohesionTokenizer(cohesion)
cohesion_tokenized_reviews = [cohesiontokenizer.tokenize(i) for i in total_replies]

In [38]:
cohesion_tokenized_reviews

[['아', '개마싯갯다', 'ㄹㅇ'],
 ['너불님',
  '이',
  '펩시',
  '와',
  '부먹',
  '충인',
  '걸',
  '처음',
  '알았습니다',
  '..',
  '..',
  '..',
  '오늘',
  '구독',
  '취소',
  '눌렀',
  '습니다',
  '..',
  '..',
  '..',
  '.',
  '어이쿠',
  '한번',
  '더',
  '눌렀',
  '네'],
 ['이', '영상', '을', '천성', '부먹', '멋사', '님께서', '좋아', '하', '십니다', '.'],
 ['?', '?', '?', '?', '코카', '가', '더', '낫다', '코카', '일어나라', '!!', '!!'],
 ['코카', '편', 'ㅡㅡ', 'ㅡ안'],
 ['코카',
  '콜라',
  '넼',
  'ㅋㅋ',
  'ㅋㅋ',
  'ㅋㅋ',
  'ㅋㅋ',
  'ㅋㅋ',
  'ㅋㅋ',
  'ㅋㅋ',
  'ㅋㅋ',
  'ㅋ',
  '펩시',
  '파',
  '소리',
  '쥘러',
  '!!',
  '!!',
  '!!',
  '!'],
 ['자장', '면이', '되게진해보인다', 'ㅋㅋ', '근데', '탕수육', '은', '먹음직~'],
 ['코카',
  '콜라',
  '펩시',
  '부먹',
  '찍먹',
  '이런',
  '얘길',
  '해야',
  '돼나',
  '난',
  '너불님',
  '이',
  '먹는',
  '게',
  '너무',
  '흐뭇해서',
  '아무',
  '생각',
  '이',
  '안',
  '나는',
  '데',
  '안',
  '뜨거',
  '워요',
  '나만',
  '피식하고',
  '웃었나',
  '개',
  '귀여',
  '워',
  'ㅋㅋ',
  'ㅋ'],
 ['부먹', '..', '..', '.', '펩시', '..', '.', '코카', ',', '찍먹', '찬양', '해라', '~~~~'],
 ['너불님', '미안해요', '부먹', '이랑', '펩시', '소리', '듣고', '

In [37]:
from soynlp.tokenizer import RegexTokenizer

tokenizer = RegexTokenizer()
h=[]

for comments in final_total:
    h.append(' '.join(tokenizer.tokenize(comments)))
    
final_total = h
del h

In [20]:
from soynlp.word import WordExtractor

word_extractor = WordExtractor(min_frequency=10,
    min_cohesion_forward=0.05, 
    min_right_branching_entropy=0.0
)

word_extractor.train(final_total) # list of str or like
words = word_extractor.extract()

training was done. used memory 2.769 Gbse memory 3.079 Gb
all cohesion probabilities was computed. # words = 85654
all branching entropies was computed # words = 201833
all accessor variety was computed # words = 201833


In [21]:
scores = {word:score.cohesion_forward for word, score in words.items()}

In [8]:
scores2 = {word:max(score.cohesion_forward,score.cohesion_backward) for word, score in words.items()}

In [9]:
scores = scores2

In [36]:
final_total = final_total[100000:100100]

In [23]:
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer

tokenizer= LTokenizer(scores = scores)
tokenizer2 = MaxScoreTokenizer(scores=scores)
tmp = []; tmp2 = []
r_tmp = []; r_tmp2 = []

for i in final_total:
    tmp.append(tokenizer.tokenize(i))
    
tmp = sum(tmp, [])

for i in tmp:
    tmp2.append(tokenizer2.tokenize(i))
    
tmp2 = sum(tmp2, [])

In [24]:
tmp

['악',
 '미쳤다',
 '요즘',
 '언니',
 '진챠',
 '너무',
 '너무너무',
 '좋아',
 '서',
 '하루',
 '다섯',
 '시간씩',
 '언니',
 '영상',
 '보고있어요',
 '엉엉',
 '진짜',
 '방금',
 '도',
 '보는데',
 '또',
 '영상',
 '올라와서',
 '진짜',
 '행보',
 '케요',
 '라이',
 '프진진',
 '언니',
 '예전',
 '영상',
 '보다가',
 '오늘',
 '올라',
 '온',
 '영상',
 '보니까',
 '피부',
 '가',
 '너무',
 '좋아',
 '신거',
 '같아요',
 '이제',
 '아침',
 '저녁',
 '으로',
 '쌀쌀',
 '하니까',
 '언니',
 '가',
 '좋아',
 '하시는',
 '잠옷',
 '추천',
 '영상',
 '을',
 '보고싶어요',
 'ㅠㅠ',
 'ㅠ',
 '일본에서',
 '구독하구잇어요어늬너무이뻐요',
 'ㅠ',
 '그냥',
 '다해요',
 'ㅠㅠ',
 'ㅠㅠㅠ',
 '사랑해요',
 'ㅠ',
 '아',
 'ㅜㅜ',
 'ㅜ',
 '미모',
 '대방출',
 '너무',
 '예쁜',
 '거',
 '아닙니까',
 '들숨',
 '에',
 '재력',
 '을',
 '날숨에',
 '건강',
 '을',
 '..',
 '보라색',
 '아이',
 '섀도',
 '넘',
 '잘어울려',
 '여',
 '그리',
 '규',
 '파운데이션',
 '도',
 '코부분은',
 '놔두',
 '고',
 '볼이',
 '랑',
 '외곽은',
 '컨실러',
 '로',
 '살짝',
 '덮으니까',
 '중앙',
 '이',
 '밝아',
 '보이고',
 '잘어울리',
 '는거가태요',
 '오늘',
 '영상',
 '잘봤어',
 '여',
 'ㅎ',
 'ㅡ',
 'ㅎ',
 '언니',
 '아이라인',
 '가끔',
 '그리',
 '실',
 '때',
 '꼬리',
 '짧게',
 '위로',
 '빼면',
 '더',
 '자연스러',
 '울거가타여',
 '저도',
 '아라',
 '안그',
 '리고',
 '속눈썹',
 '집착'

In [19]:
for i in final_total:
    r_tmp.append(tokenizer2.tokenize(i))
    
r_tmp = sum(r_tmp, [])

for i in r_tmp:
    r_tmp2.append(tokenizer.tokenize(i))
    
r_tmp2 = sum(r_tmp2, [])

In [33]:
former = final_total
tmp = []

while 1:
    for i in former:
        tmp.append(tokenizer.tokenize(i))
    
    tmp = sum(tmp, [])
    
    if len(former) == len(tmp):
        break
    
    former = tmp; tmp = []

In [34]:
tmp

['악',
 '미쳤다',
 '요즘',
 '언니',
 '진챠',
 '너무',
 '너무',
 '너무',
 '좋아',
 '서',
 '하루',
 '다섯',
 '시간',
 '씩',
 '언니',
 '영상',
 '보고있어요',
 '엉엉',
 '진짜',
 '방금',
 '도',
 '보는데',
 '또',
 '영상',
 '올라와서',
 '진짜',
 '행보',
 '케요',
 '라이',
 '프진진',
 '언니',
 '예전',
 '영상',
 '보다가',
 '오늘',
 '올라',
 '온',
 '영상',
 '보니까',
 '피부',
 '가',
 '너무',
 '좋아',
 '신거',
 '같아요',
 '이제',
 '아침',
 '저녁',
 '으로',
 '쌀쌀',
 '하니까',
 '언니',
 '가',
 '좋아',
 '하시는',
 '잠옷',
 '추천',
 '영상',
 '을',
 '보고싶어요',
 'ㅠㅠ',
 'ㅠ',
 '일본에서',
 '구독',
 '하구',
 '잇어요',
 '어늬너무이뻐요',
 'ㅠ',
 '그냥',
 '다해',
 '요',
 'ㅠㅠ',
 'ㅠㅠ',
 'ㅠ',
 '사랑해요',
 'ㅠ',
 '아',
 'ㅜㅜ',
 'ㅜ',
 '미모',
 '대방출',
 '너무',
 '예쁜',
 '거',
 '아닙니까',
 '들숨',
 '에',
 '재력',
 '을',
 '날숨에',
 '건강',
 '을',
 '..',
 '보라색',
 '아이',
 '섀도',
 '넘',
 '잘어울려',
 '여',
 '그리',
 '규',
 '파운데이션',
 '도',
 '코부분은',
 '놔두',
 '고',
 '볼이',
 '랑',
 '외곽은',
 '컨실러',
 '로',
 '살짝',
 '덮으니까',
 '중앙',
 '이',
 '밝아',
 '보이고',
 '잘어울리',
 '는거',
 '가태',
 '요',
 '오늘',
 '영상',
 '잘봤어',
 '여',
 'ㅎ',
 'ㅡ',
 'ㅎ',
 '언니',
 '아이라인',
 '가끔',
 '그리',
 '실',
 '때',
 '꼬리',
 '짧게',
 '위로',
 '빼면',
 '더',
 '자연스러',
 '울거',

In [None]:
with open('straight.pickle', 'wb') as f:
    pickle.dump(tmp2, f, pickle.HIGHEST_PROTOCOL)
    
with open('reversed.pickle', 'wb') as f:
    pickle.dump(r_tmp2, f, pickle.HIGHEST_PROTOCOL)

In [41]:
print(len(tmp), len(tmp2))
print(len(r_tmp), len(r_tmp2))

46717 58668
58830 58831


In [51]:
# import math

# def word_score(score):
#     return (score.cohesion_forward * math.exp(score.right_branching_entropy))

# print('단어   (빈도수, cohesion, branching entropy)\n')
# for word, score in sorted(words.items(), key=lambda x:word_score(x[1]), reverse=True)[:30]:
#     print('%s     (%d, %.3f, %.3f)' % (
#             word, 
#             score.leftside_frequency, 
#             score.cohesion_forward,
#             score.right_branching_entropy
#             )
#          )

단어   (빈도수, cohesion, branching entropy)

..     (73, 0.901, 4.314)
...     (42, 0.720, 3.936)
광숙이     (547, 0.930, 3.573)
너불님     (942, 0.737, 3.709)
ㅋㅋ     (421, 0.966, 3.205)
ㅠㅠ     (175, 0.897, 3.197)
!!     (15, 0.714, 3.392)
진짜     (161, 0.778, 3.146)
ㅋㅋㅋ     (315, 0.850, 3.051)
??     (36, 0.720, 3.065)
영상     (176, 0.907, 2.750)
ㅋㅋㅋㅋ     (214, 0.789, 2.873)
....     (18, 0.606, 3.030)
많이     (125, 0.727, 2.699)
근데     (67, 0.870, 2.516)
라쿤     (299, 0.920, 2.439)
ㅎㅎ     (64, 0.842, 2.479)
ㅠㅠㅠ     (73, 0.612, 2.793)
코카     (446, 0.880, 2.374)
버거     (59, 0.738, 2.526)
역시     (47, 0.959, 2.245)
ㅋㅋㅋㅋㅋ     (146, 0.761, 2.378)
펩시     (285, 0.973, 2.099)
콜라     (89, 0.947, 2.104)
???     (21, 0.648, 2.465)
ㅜㅜ     (44, 0.815, 2.230)
햄버거     (169, 0.948, 2.026)
ㅠㅠㅠㅠ     (46, 0.618, 2.399)
봤는데     (17, 0.753, 2.161)
독감     (108, 0.982, 1.844)


# Konlpy

In [47]:
from konlpy.tag import Hannanum, Kkma, Komoran, Mecab, Twitter

use = final_total
han = []; kk = []; ko = []; me = []; twi = []

hannanum = Hannanum(); kkma = Kkma(); twitter = Twitter()

for i in use:
    han.append(hannanum.morphs(i))
    kk.append(kkma.morphs(i))
    #ko.append(komoran.morphs(i))
    #me.append(mecab.morphs(i))
    twi.append(twitter.morphs(i))

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [48]:
han

[['악',
  '미치',
  '었다',
  '요즘',
  '언니',
  '진챠',
  '너무너무너무',
  '좋',
  '아',
  '하루',
  '다섯시간',
  '씩',
  '언니',
  '영상',
  '보',
  '고',
  '있',
  '어',
  '요',
  '엉엉',
  '진짜',
  '방금',
  '도',
  '보',
  'ㄴ데',
  '또',
  '영상',
  '오르',
  '아',
  '오',
  '아',
  '진짜',
  '행보케요',
  '라이프진진'],
 ['언니',
  '예전',
  '영상',
  '보',
  '다가',
  '오늘',
  '오르',
  '아',
  '오',
  'ㄴ',
  '영상',
  '보',
  '니까',
  '피부',
  '가',
  '너무',
  '좋아신거',
  '같',
  '아',
  '요',
  '이제',
  '아침',
  '저녁',
  '으로',
  '쌀쌀',
  '하',
  '니까',
  '언니',
  '가',
  '좋',
  '아',
  '하',
  '시는',
  '잠옷',
  '추천',
  '영상',
  '을',
  '보',
  '고',
  '싶',
  '어',
  '요',
  'ㅠㅠㅠ'],
 ['일본에서구독하구잇어요어늬너무이뻐요',
  'ㅠ',
  '그냥다해요',
  'ㅠㅠㅠㅠㅠ',
  '사랑',
  '하',
  '어',
  '요',
  'ㅠ',
  '아',
  'ㅜㅜㅜ'],
 ['미모', '대방출', '너무', '예쁘', 'ㄴ', '것', '아니', 'ㅂ니까'],
 ['들숨', '에', '재력', '을', '날숨', '에', '건강', '을', '..'],
 ['보라색',
  '아이섀',
  '도',
  '널',
  'ㅁ',
  '잘어울려',
  '이',
  '어',
  '그리규',
  '파운데이션',
  '도',
  '코부분',
  '은',
  '놔두',
  '고',
  '볼',
  '이랑',
  '외곽',
  '은',
  '컨실러',
  '로',
  '살짝',
  '덮',
  '으니까',
  

In [49]:
kk

[['악',
  '미치',
  '었',
  '다',
  '요즘',
  '언니',
  '진',
  '챠',
  '너무',
  '너무너무',
  '좋',
  '아서',
  '하루',
  '다섯',
  '시간',
  '씩',
  '언니',
  '영상',
  '보',
  '고',
  '있',
  '어요',
  '엉엉',
  '진짜',
  '방금',
  '도',
  '보',
  '는데',
  '또',
  '영상',
  '올라오',
  '아서',
  '진짜',
  '행보',
  '하',
  '게요',
  '라이프',
  '진진'],
 ['언니',
  '예전',
  '영상',
  '보',
  '다가',
  '오늘',
  '올라오',
  'ㄴ',
  '영상',
  '보',
  '니까',
  '피부',
  '가',
  '너무',
  '좋',
  '아',
  '시',
  'ㄴ',
  '거',
  '같',
  '아요',
  '이제',
  '아침',
  '저녁',
  '으로',
  '쌀쌀',
  '하',
  '니까',
  '언니',
  '가',
  '좋아하',
  '시',
  '는',
  '잠옷',
  '추천',
  '영상',
  '을',
  '보',
  '고',
  '싶',
  '어요',
  'ㅠㅠㅠ'],
 ['일본',
  '에서',
  '구독',
  '하',
  '구',
  '잇',
  '어요',
  '어',
  '어',
  '늬',
  '너무',
  '이쁘',
  '어요',
  'ㅠ',
  '그냥',
  '다하',
  '어요',
  'ㅠㅠㅠㅠㅠ',
  '사랑',
  '하',
  '어요',
  'ㅠ',
  '아',
  'ㅜㅜㅜ'],
 ['미모', '대', '방출', '너무', '예쁘', 'ㄴ', '거', '아니', 'ㅂ니까'],
 ['들숨', '에', '재력', '을', '날숨', '에', '건강', '을', '..'],
 ['보라색',
  '아이섀도',
  '넘',
  '잘',
  '어울리',
  '어',
  '여',
  '그리',
  '규',
  '파운데이션',
  '도',

In [50]:
twi

[['악',
  '미쳤다',
  '요즘',
  '언니',
  '진챠',
  '너무',
  '너무',
  '너무',
  '좋아서',
  '하루',
  '다섯',
  '시간',
  '씩',
  '언니',
  '영상',
  '보고있어요',
  '엉엉',
  '진짜',
  '방금',
  '도',
  '보는데',
  '또',
  '영상',
  '올라와서',
  '진짜',
  '행',
  '보케',
  '요',
  '라이프',
  '진진'],
 ['언니',
  '예전',
  '영상',
  '보다가',
  '오늘',
  '올라온',
  '영상',
  '보',
  '니까',
  '피부',
  '가',
  '너무',
  '좋',
  '아신거',
  '같아요',
  '이제',
  '아침',
  '저녁',
  '으로',
  '쌀쌀',
  '하니까',
  '언니',
  '가',
  '좋아하시는',
  '잠옷',
  '추천',
  '영상',
  '을',
  '보고싶어요',
  'ㅠㅠㅠ'],
 ['일본',
  '에서',
  '구독',
  '하구',
  '잇어요',
  '어늬',
  '너무',
  '이뻐요',
  'ㅠ',
  '그냥',
  '다해',
  '요',
  'ㅠㅠㅠㅠㅠ',
  '사랑',
  '해요',
  'ㅠ',
  '아',
  'ㅜㅜㅜ'],
 ['미모', '대', '방출', '너무', '예쁜', '거', '아닙니까'],
 ['들숨', '에', '재력', '을', '날숨', '에', '건강', '을', '..'],
 ['보라색',
  '아이',
  '섀도',
  '넘',
  '잘',
  '어울려여',
  '그리',
  '규',
  '파운데이션',
  '도',
  '코',
  '부분',
  '은',
  '놔두고',
  '볼',
  '이랑',
  '외곽',
  '은',
  '컨실러',
  '로',
  '살짝',
  '덮으니까',
  '중앙',
  '이',
  '밝아',
  '보이',
  '고',
  '잘',
  '어울리는거가',
  '태',
  '요',
  '오늘',
  '영상',