In [None]:
import pandas as pd
import numpy as np

In [None]:
# 전체데이터를 KOBERT로 학습
# rating >= 3.0 긍정 (1) or 부정 (0)
# valid랑 train 나눠서 (BERT가 알아서)
# BERT 모델로 계산한 평균 vs 실제 평균

df = pd.read_csv('top_10_spell.csv')
df['spell'] = df['spell'].astype(str)
df.head()

In [None]:
print(df.name.unique())
print(df.info())
print(np.round(np.mean(df.rating), 3)) # 3.218

In [None]:
df[df['name']=='네이버']

In [None]:
# 추가 전처리
df['spell'] = df['spell'].apply(lambda x:x.replace('오라 밸', '워라밸'))
df['spell'] = df['spell'].apply(lambda x:x.replace('워라밸만', '워라밸'))
df['spell'] = df['spell'].apply(lambda x:x.replace('워라밸이', '워라밸'))
df['spell'] = df['spell'].apply(lambda x:x.replace('워라밸은', '워라밸'))

df['spell'] = df['spell'].apply(lambda x:x.replace('커리어', '커리어'))
df['spell'] = df['spell'].apply(lambda x:x.replace('커리', '커리어'))
df['spell'] = df['spell'].apply(lambda x:x.replace('커리 어', '커리어'))
df['spell'] = df['spell'].apply(lambda x:x.replace('직업', '커리어'))

### 키워드 분석 ###

1.   빈도수에 따른 명사형 형태소 OR 형용사 형태소

*   회사별 키워드 분석
*   전체 리뷰 키워드 분석

2.  TF-IDF를 이용한 회사별 & 전체 키워드

*   회사별 키워드 분석
*   전체 리뷰 키워드 분석

In [None]:
!apt-get update
!apt-get install g++ openjdk-8-jdk python-dev python3-dev # 오류 해결 명령어

!pip3 install JPype1-py3 # 파이썬 언어와 자바 언어의 중간 다리 JPype1-py3
!pip3 install konlpy

In [None]:
# 한국어 형태소 분석
from konlpy.tag import Kkma, Komoran, Hannanum, Okt
kkma = Kkma()
komoran = Komoran()
hannanum = Hannanum()
okt = Okt()

In [None]:
# 1.   빈도수에 따른 명사형 형태소 OR 형용사 형태소
# *   회사별 키워드 분석
# *   전체 리뷰 키워드 분석

def keyword(data, name=None):
  from collections import Counter
  from konlpy.tag import Kkma, Komoran, Hannanum, Okt
  if name == None:
    data = data
  else:
    data = data[data['name']==name]
  words = ' '.join(list(data['spell']))

  mode = Hannanum()
  # morphs_lst = komoran.morphs(words)
  pos_lst = mode.pos(words)

  # 불용어 제거
  stop_words = ['회사']

  cleaned_words = [word for word in pos_lst if word[0] not in stop_words]

  nouns = [word for word in cleaned_words
           if word[1].startswith('N') | word[1].startswith('V')]

  count_nouns = Counter(nouns)
  count_nouns_list = count_nouns.most_common(20)

  return count_nouns_list

# 2.  TF-IDF를 이용한 회사별 & 전체 키워드
# *   회사별 키워드 분석
# *   전체 리뷰 키워드 분석

def vectorize(data, name=None):
  from sklearn.feature_extraction.text import TfidfVectorizer

  if name == None:
    data = data
  else:
    data = data[data['name']==name]

  data = data[data['name']== name]['spell']
  tfidf_vectorizer = TfidfVectorizer(max_features= 1000)
  tfidf_vectorizer.fit(data)
  vocab = tfidf_vectorizer.vocabulary_
  text_tfidf = tfidf_vectorizer.transform(data).toarray()

  # 가중치 ?? 이상의 단어
  weight = 0.6
  vocab_c = {v:k for k,v in vocab.items()}
  words_lst = []
  for item in text_tfidf:
    words = []
    for index, value in enumerate(item):
      if value >= weight:
        words.append(vocab_c[index])
    words_lst.append(words)

  # best_keyword
  tfidf_top = [vocab_c[text.argmax()] for text in text_tfidf]

  return pd.DataFrame(data=zip(df[df['name']==name]['spell'], words_lst, tfidf_top),
                      columns = ['리뷰', 'Above_T', 'top_keyword'])

def collocation(df, name):
  import nltk
  from konlpy.tag import Kkma, Komoran, Hannanum, Okt
  from nltk.collocations import BigramAssocMeasures, TrigramAssocMeasures, QuadgramAssocMeasures
  from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder, QuadgramCollocationFinder

  komoran = Komoran()
  bigram_measures = BigramAssocMeasures()
  trigram_measures = TrigramAssocMeasures()
  fourgram_measures = QuadgramAssocMeasures()

  # list의 data -> 한 문장으로
  data = df[df['name']==name]['spell']
  data = ' '.join(data)

  words = komoran.morphs(data)

  bi_ngrams = BigramCollocationFinder.from_words(words)
  tri_ngrams = TrigramCollocationFinder.from_words(words)
  quad_ngrams = QuadgramCollocationFinder.from_words(words)

  bi_ngrams.apply_freq_filter(2) # 최소 2번 등장하는 n-gram 필터링
  tri_ngrams.apply_freq_filter(2) # 최소 2번 등장하는 n-gram 필터링
  quad_ngrams.apply_freq_filter(2) # 최소 2번 등장하는 n-gram 필터링

  bi_result = bi_ngrams.nbest(bigram_measures.pmi, 10)
  tri_result = tri_ngrams.nbest(trigram_measures.pmi, 10)
  quad_result = quad_ngrams.nbest(fourgram_measures.pmi, 10)

  result = pd.DataFrame({'Bi_gram': bi_result,
                       'Tri_gram': tri_result,
                       'Quad_gram': quad_result})

  return result

In [None]:
# 'HD현대중공업' 'HMM' 'KB국민은행' 'KB금융' 'KT&G' 'KT' 'LG' 'LG에너지솔루션' 'LG전자' 'LG화학'
#  'POSCO홀딩스' 'S-Oil' 'SK' 'SK이노베이션' 'SK텔레콤' 'SK하이닉스' '고려아연' '기아' '기업은행'
#  '두산에너빌리티' '메리츠금융지주' '삼성SDI' '삼성바이오로직스' '삼성생명' '삼성에스디에스' '삼성전기' '삼성화재'
#  '셀트리온' '셀트리온헬스케어' '신한지주' '에코프로' '에코프로머티' '에코프로비엠' '카카오뱅크' '크래프톤' '포스코DX'
#  '포스코인터내셔널' '포스코퓨처엠' '하나금융지주' '하이브' '한국전력' '한화오션' '현대모비스'

keyword(df, '네이버')

In [None]:
vectorize(df, '네이버')

In [None]:
collocation(df, '네이버')

In [None]:
df1 = pd.read_csv('/content/blind_deloitte.csv', index_col = 0)

df1['title'] = df1['title'].astype(str)
df1['title'] = df1['title'].apply(lambda x:x.replace('“', ''))
df1['title'] = df1['title'].apply(lambda x:x.replace('”', ''))
df1

In [None]:
df

In [None]:
import nltk
from konlpy.tag import Kkma, Komoran, Hannanum, Okt
from nltk.collocations import BigramAssocMeasures, TrigramAssocMeasures, QuadgramAssocMeasures
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder, QuadgramCollocationFinder

komoran = Komoran()
bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()
fourgram_measures = QuadgramAssocMeasures()

# list의 data -> 한 문장으로
data = df[df['name']=='현대자동차']['spell']
data = ' '.join(data)

words = komoran.morphs(data)

bi_ngrams = BigramCollocationFinder.from_words(words)
tri_ngrams = TrigramCollocationFinder.from_words(words)
quad_ngrams = QuadgramCollocationFinder.from_words(words)

bi_ngrams.apply_freq_filter(2) # 최소 2번 등장하는 n-gram 필터링
tri_ngrams.apply_freq_filter(2) # 최소 2번 등장하는 n-gram 필터링
quad_ngrams.apply_freq_filter(2) # 최소 2번 등장하는 n-gram 필터링

bi_result = bi_ngrams.nbest(bigram_measures.pmi, 10)
tri_result = tri_ngrams.nbest(trigram_measures.pmi, 10)
quad_result = quad_ngrams.nbest(fourgram_measures.pmi, 10)

result = pd.DataFrame({'Bi_gram': bi_result,
                      'Tri_gram': tri_result,
                      'Quad_gram': quad_result})
result

In [None]:
from collections import Counter
from konlpy.tag import Kkma, Komoran, Hannanum, Okt
data = df[df['name']=='현대자동차']['spell']
words = ' '.join(data)

mode = Hannanum()
# morphs_lst = komoran.morphs(words)
pos_lst = mode.pos(words)

# 불용어 제거
stop_words = ['회사']

cleaned_words = [word for word in pos_lst if word[0] not in stop_words]

nouns = [word for word in cleaned_words
          if word[1].startswith('N') | word[1].startswith('V')]

count_nouns = Counter(nouns)
count_nouns_list = count_nouns.most_common(20)

count_nouns_list

### 감정 분석 파트 ###

참고 자료

[korean-sentiment-analysis](https://github.com/ehsong/korean-sentiment-analysis)

[한국어 감성 분석기](https://github.com/mrlee23/KoreanSentimentAnalyzer)

In [2]:
import pandas as pd

df = pd.read_csv('./top_10_spell.csv') # file_path

df['spell'] = df['spell'].astype(str)
df['label'] = [1 if score>=3.0 else 0 for score in df['rating']]

df['spell'] = df['spell'].apply(lambda x:x.replace('오라 밸', '워라밸'))
df['spell'] = df['spell'].apply(lambda x:x.replace('워라밸만', '워라밸'))
df['spell'] = df['spell'].apply(lambda x:x.replace('워라밸이', '워라밸'))
df['spell'] = df['spell'].apply(lambda x:x.replace('워라밸은', '워라밸'))

df['spell'] = df['spell'].apply(lambda x:x.replace('커리어', '커리어'))
df['spell'] = df['spell'].apply(lambda x:x.replace('커리', '커리어'))
df['spell'] = df['spell'].apply(lambda x:x.replace('커리 어', '커리어'))
df['spell'] = df['spell'].apply(lambda x:x.replace('직업', '커리어'))


In [3]:
# SHOW dataset
df.tail(10)

Unnamed: 0,info,date,rating,title,name,spell,label
34387,연구개발(R&D),2020.08.07,4.0,차 좋아하는 사람한텐 최고,현대자동차,차 좋아하는 사람한텐 최고,1
34388,연구개발(R&D),2020.08.07,4.0,안정적인 회사,현대자동차,안정적인 회사,1
34389,연구개발(R&D),2020.08.07,4.0,공 같은 사기업,현대자동차,공 같은 사기업,1
34390,생산엔지니어·생산관리,2020.08.07,4.0,무난하고 좋은 회사,현대자동차,무난하고 좋은 회사,1
34391,고객서비스 전문가,2020.07.31,4.0,다닐만한 회사 남들이 부러워하는 회사,현대자동차,다닐만한 회사 남들이 부러워하는 회사,1
34392,연구개발(R&D),2020.07.31,4.0,동종 자동차 업계 기업들보다는 안정적인 느낌입니다,현대자동차,동종 자동차 업계 기업들보다는 안정적인 느낌입니다,1
34393,IT 엔지니어,2020.07.31,5.0,만족스럽습니다,현대자동차,만족스럽습니다,1
34394,생산엔지니어·생산관리,2020.07.31,4.0,글로벌 기업으로 보이고 싶은 회사 밖에서 바라보았을 땐 엄청 좋은 회사이나 실제 근...,현대자동차,글로벌 기업으로 보이고 싶은 회사 밖에서 바라보았을 땐 엄청 좋은 회사이나 실제 근...,1
34395,품질관리·보증 전문가,2020.07.31,4.0,불합리한 점이 없진 않지만 복지 연봉이 좋아 다닐만한 회사,현대자동차,불합리한 점이 없진 않지만 복지 연봉이 좋아 다닐만한 회사,1
34396,연구개발(R&D),2020.07.31,4.0,자동차 산업 환경 변화에 발맞추어 새로운 산업 패러다임으로의 전환을 준비하고 있지만...,현대자동차,자동차 산업 환경 변화에 발맞추어 새로운 산업 패러다임으로의 전환을 준비하고 있지만...,1


In [4]:
!pip install keybert

  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers>=0.3.8->keybert)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: keybert, sentence-transformers
  Building wheel for keybert (setup.py) ... [?25l[?25hdone
  Created wheel for keybert: filename=keybert-0.8.3-py3-none-any.whl size=39124 sha256=ffd8078f3ca057c39baf41b449978a5f9ef3deb8bdef6d7724b4ec07ccf2c1c6
  Stored in directory: /root/.cache/pip/wheels/70/88/07/1a3bc11fd1dd5f89924a02dcbca89a3015e25e8faa31f904dc
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=6e6734f6ed04b993f739056208ce77052d3a7d00180f8edeb24ce2824182a68b
  Stored

In [None]:
from keybert import KeyBERT
model = KeyBERT('distiluse-base-multilingual-cased-v1')

data = list(df[df['name']=='현대자동차']['spell'])

keywords = model.extract_keywords(data, keyphrase_ngram_range=(1,3), top_n=20)

review_kw = []
for review in keywords:
  if len(review) > 0:
    kw = review[0][0]
  else:
    kw = None
  review_kw.append(kw)

In [None]:
pd.DataFrame({'리뷰': data, '최고 감정': review_kw})

### KOBERT 모델 ###

In [1]:
!git clone https://github.com/monologg/KoBERT-Transformers.git

Cloning into 'KoBERT-Transformers'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 75 (delta 27), reused 58 (delta 17), pack-reused 0[K
Receiving objects: 100% (75/75), 21.17 KiB | 10.58 MiB/s, done.
Resolving deltas: 100% (27/27), done.


In [None]:
!pip3 install kobert-transformers

In [None]:
!pip install transformers

In [14]:
!pip install --upgrade transformers

In [18]:
!pip install --upgrade sentencepiece



In [27]:
from kobert_transformers.tokenization_kobert import KoBertTokenizer

tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert') # monologg/distilkobert도 동일
tokenizer.tokenize("[CLS] 한국어 모델을 공유합니다. [SEP]")
tokenizer.convert_tokens_to_ids(['[CLS]', '▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.', '[SEP]'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


AttributeError: ignored

In [20]:
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team and Jangwon Park
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for KoBERT model """

import logging
import os
import unicodedata
from shutil import copyfile

from transformers import PreTrainedTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "tokenizer_78b3253a26.model",
    "vocab_txt": "vocab.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/tokenizer_78b3253a26.model",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/tokenizer_78b3253a26.model",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/tokenizer_78b3253a26.model",
    },
    "vocab_txt": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/vocab.txt",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/vocab.txt",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/vocab.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "monologg/kobert": 512,
    "monologg/kobert-lm": 512,
    "monologg/distilkobert": 512,
}

PRETRAINED_INIT_CONFIGURATION = {
    "monologg/kobert": {"do_lower_case": False},
    "monologg/kobert-lm": {"do_lower_case": False},
    "monologg/distilkobert": {"do_lower_case": False},
}

SPIECE_UNDERLINE = "▁"


class KoBertTokenizer(PreTrainedTokenizer):
    """
    SentencePiece based tokenizer. Peculiarities:
        - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        vocab_txt,
        do_lower_case=False,
        remove_space=True,
        keep_accents=False,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        # Build vocab
        self.token2idx = dict()
        self.idx2token = []
        with open(vocab_txt, "r", encoding="utf-8") as f:
            for idx, token in enumerate(f):
                token = token.strip()
                self.token2idx[token] = idx
                self.idx2token.append(token)

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                "pip install sentencepiece"
            )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file
        self.vocab_txt = vocab_txt

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.idx2token)

    def get_vocab(self):
        return dict(self.token2idx, **self.added_tokens_encoder)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                "pip install sentencepiece"
            )
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize("NFKD", outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text):
        """Tokenize a string."""
        text = self.preprocess_text(text)
        pieces = self.sp_model.encode(text, out_type=str)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.token2idx.get(token, self.token2idx[self.unk_token])

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.idx2token[index]

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A KoBERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model
        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            return list(
                map(
                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
                    token_ids_0,
                )
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A KoBERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """Save the sentencepiece vocabulary (copy original file) and special tokens file
        to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        # 1. Save sentencepiece model
        out_vocab_model = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
            copyfile(self.vocab_file, out_vocab_model)

        # 2. Save vocab.txt
        index = 0
        out_vocab_txt = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_txt"])
        with open(out_vocab_txt, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return out_vocab_model, out_vocab_txt


In [23]:
!pip3 install kobert-transformers



In [24]:
from kobert_transformers import get_tokenizer
tokenizer = get_tokenizer()
tokenizer.tokenize("[CLS] 한국어 모델을 공유합니다. [SEP]")
tokenizer.convert_tokens_to_ids(['[CLS]', '▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.', '[SEP]'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


AttributeError: ignored

In [21]:
# Load the KoBERT tokenizer
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert') # monologg/distilkobert도 동일
tokenizer.tokenize("[CLS] 한국어 모델을 공유합니다. [SEP]")
# Access the token-to-index mapping
token_to_idx = tokenizer.vocab

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


AttributeError: ignored

In [None]:
def convert_data(data_df):
    global tokenizer

    SEQ_LEN = 64 #SEQ_LEN : 버트에 들어갈 인풋의 길이

    tokens, masks, segments, targets = [], [], [], []

    for i in tqdm(range(len(data_df))):
        # token : 문장을 토큰화함
        token = tokenizer.encode(data_df[''][i], truncation=True, padding='max_length', max_length=SEQ_LEN)

        # 마스크는 토큰화한 문장에서 패딩이 아닌 부분은 1, 패딩인 부분은 0으로 통일
        num_zeros = token.count(0)
        mask = [1]*(SEQ_LEN-num_zeros) + [0]*num_zeros

        # 문장의 전후관계를 구분해주는 세그먼트는 문장이 1개밖에 없으므로 모두 0
        segment = [0]*SEQ_LEN

        # 버트 인풋으로 들어가는 token, mask, segment를 tokens, segments에 각각 저장
        tokens.append(token)
        masks.append(mask)
        segments.append(segment)

        # 정답(긍정 : 1 부정 0)을 targets 변수에 저장해 줌
        targets.append(data_df[LABEL_COLUMN][i])

    # tokens, masks, segments, 정답 변수 targets를 numpy array로 지정
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    targets = np.array(targets)

    return [tokens, masks, segments], targets

# 위에 정의한 convert_data 함수를 불러오는 함수를 정의
def load_data(data):
    data_df = data
    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)
    data_df[LABEL_COLUMN] = data_df[LABEL_COLUMN].astype(int)
    data_x, data_y = convert_data(data_df)
    return data_x, data_y

### Training & Demonstration ###

In [None]:
SEQ_LEN = 64
BATCH_SIZE = 32
# 긍부정 문장을 포함하고 있는 칼럼
DATA_COLUMN = "spell"
# 긍정인지 부정인지를 (1=긍정,0=부정) 포함하고 있는 칼럼
LABEL_COLUMN = "label"

# train 데이터를 버트 인풋에 맞게 변환
train_x, train_y = load_data(df)

In [None]:
model = TFBertModel.from_pretrained('monologg/kobert', from_pt=True)

token_inputs = tf.keras.layers.Input((SEQ_LEN, ), dtypes=tf.int32, name='inputs_tokens')
mask_inputs = tf.keras.layers.Input((SEQ_LEN, ), dtypes=tf.int32, name='inputs_mask')
segment_inputs = tf.keras.layers.Input((SEQ_LEN, ), dtypes=tf.int32, name='input_segment')

outputs = model([token_inputs, mask_inputs, segment_inputs])

In [None]:
out