In [6]:
!pip install mxnet
!pip install gluonnlp==0.8.0
!pip install tqdm pandas
!pip install sentencepiece
!pip install transformers>=4.8.2
!pip install numpy==1.23.1
!pip install torch>=1.8.1
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-0tlyucy6/kobert-tokenizer_2369f6cbdcce47fbba8b75af5742c5a8
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-0tlyucy6/kobert-tokenizer_2369f6cbdcce47fbba8b75af5742c5a8
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [7]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import gluonnlp as nlp
from tqdm import tqdm, tqdm_notebook
import pandas as pd
import numpy as np
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [8]:
device = torch.device("cuda:0")

In [9]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [10]:
import pandas as pd
data = pd.read_csv('startup_ideas.csv')

In [11]:
datalabel = data.loc[:, ['Technology','Healthcare & Wellness','Education','Environment & Sustainability','Media & Entertainment','Culture & Arts','Finance & Business','Social Impact & Public Good','Consumer Products & Services','Transportation & Logistics']]

data_list = data.apply(lambda row: [row['Idea'], row[1:].tolist()], axis=1)

print(data_list[1])

['스마트 시티 솔루션 개발', [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [12]:
from sklearn.model_selection import train_test_split

dataset_train, dataset_test = train_test_split(data_list, test_size=0.25, random_state=0)

print(len(dataset_train))
print(len(dataset_test))

1500
500


In [13]:
class BERTSentenceTransform:
    r"""BERT style data transformation.

    Parameters
    ----------
    tokenizer : BERTTokenizer.
        Tokenizer for the sentences.
    max_seq_length : int.
        Maximum sequence length of the sentences.
    pad : bool, default True
        Whether to pad the sentences to maximum length.
    pair : bool, default True
        Whether to transform sentences or sentence pairs.
    """

    def __init__(self, tokenizer, max_seq_length,vocab, pad=True, pair=True):
        self._tokenizer = tokenizer
        self._max_seq_length = max_seq_length
        self._pad = pad
        self._pair = pair
        self._vocab = vocab

    def __call__(self, line):
        """Perform transformation for sequence pairs or single sequences.

        The transformation is processed in the following steps:
        - tokenize the input sequences
        - insert [CLS], [SEP] as necessary
        - generate type ids to indicate whether a token belongs to the first
        sequence or the second sequence.
        - generate valid length

        For sequence pairs, the input is a tuple of 2 strings:
        text_a, text_b.

        Inputs:
            text_a: 'is this jacksonville ?'
            text_b: 'no it is not'
        Tokenization:
            text_a: 'is this jack ##son ##ville ?'
            text_b: 'no it is not .'
        Processed:
            tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
            type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            valid_length: 14

        For single sequences, the input is a tuple of single string:
        text_a.

        Inputs:
            text_a: 'the dog is hairy .'
        Tokenization:
            text_a: 'the dog is hairy .'
        Processed:
            text_a: '[CLS] the dog is hairy . [SEP]'
            type_ids: 0     0   0   0  0     0 0
            valid_length: 7

        Parameters
        ----------
        line: tuple of str
            Input strings. For sequence pairs, the input is a tuple of 2 strings:
            (text_a, text_b). For single sequences, the input is a tuple of single
            string: (text_a,).

        Returns
        -------
        np.array: input token ids in 'int32', shape (batch_size, seq_length)
        np.array: valid length in 'int32', shape (batch_size,)
        np.array: input token type ids in 'int32', shape (batch_size, seq_length)

        """

        # convert to unicode
        text_a = line[0]
        if self._pair:
            assert len(line) == 2
            text_b = line[1]

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None

        if self._pair:
            tokens_b = self._tokenizer(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b,
                                    self._max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_length - 2:
                tokens_a = tokens_a[0:(self._max_seq_length - 2)]

        # The embedding vectors for `type=0` and `type=1` were learned during
        # pre-training and are added to the wordpiece embedding vector
        # (and position vector). This is not *strictly* necessary since
        # the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.

        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        #vocab = self._tokenizer.vocab
        vocab = self._vocab
        tokens = []
        tokens.append(vocab.cls_token)
        tokens.extend(tokens_a)
        tokens.append(vocab.sep_token)
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens.extend(tokens_b)
            tokens.append(vocab.sep_token)
            segment_ids.extend([1] * (len(tokens) - len(segment_ids)))

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The valid length of sentences. Only real  tokens are attended to.
        valid_length = len(input_ids)

        if self._pad:
            # Zero-pad up to the sequence length.
            padding_length = self._max_seq_length - valid_length
            # use padding tokens for the rest
            input_ids.extend([vocab[vocab.padding_token]] * padding_length)
            segment_ids.extend([0] * padding_length)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
            np.array(segment_ids, dtype='int32')

In [14]:
class BERTDataset(Dataset):
    def __init__(self, dataset, bert_tokenizer, vocab, max_len, pad, pair):
        transform = BERTSentenceTransform(bert_tokenizer, max_seq_length = max_len, vocab = vocab, pad=pad, pair=pair)

        self.sentences = [transform([txt[0]]) for txt in dataset]
        self.labels = torch.tensor([label[1] for label in dataset]).to(device)

    def __getitem__(self,i):
        return (self.sentences[i] + (self.labels[i],))

    def __len__(self):
        return(len(self.labels))

In [15]:
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 20
max_grad_norm = 1
log_interval = 10
learning_rate =  5e-5

In [16]:
data_train = BERTDataset(dataset_train, tokenizer, vocab, max_len, True, False)
data_test = BERTDataset(dataset_test, tokenizer, vocab, max_len, True, False)

In [17]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=0)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)

In [18]:
class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size = 768, num_classes=10, dr_rate=None, params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [19]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.BCEWithLogitsLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)



In [20]:
from sklearn.metrics import accuracy_score

def accuracy(y_true,y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred > 0.5, dtype=float)
    y_pred=y_pred.T
    y_true=y_true.T
    acc_list=[]
    for cate in range(0,y_pred.shape[0]):
        acc_list.append(accuracy_score(y_pred[cate],y_true[cate]))
    return sum(acc_list)/len(acc_list)

In [21]:
train_history = []
test_history = []
loss_history = []

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.float().to(device)
        out = model(token_ids, valid_length, segment_ids)

        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()

        train_acc = accuracy(label.cpu().detach().numpy(), nn.Sigmoid()(out).cpu().detach().numpy())

        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc))
            train_history.append(train_acc)
            loss_history.append(loss.data.cpu().numpy())

    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc = accuracy(label.cpu().detach().numpy(), out.cpu().detach().numpy())
    print("epoch {} acc {}".format(e+1, test_acc))
    test_history.append(test_acc)

epoch 1 batch id 1 loss 0.7419866919517517 acc 0.4453125
epoch 1 acc 0.8673076923076923
epoch 2 batch id 1 loss 0.6935362815856934 acc 0.5484375
epoch 2 acc 0.8673076923076923
epoch 3 batch id 1 loss 0.6004552841186523 acc 0.7703125
epoch 3 acc 0.8673076923076923
epoch 4 batch id 1 loss 0.5207034945487976 acc 0.8515625
epoch 4 acc 0.8673076923076923
epoch 5 batch id 1 loss 0.45216426253318787 acc 0.865625
epoch 5 acc 0.8673076923076923
epoch 6 batch id 1 loss 0.4061201214790344 acc 0.865625
epoch 6 acc 0.8673076923076923
epoch 7 batch id 1 loss 0.3689536154270172 acc 0.865625
epoch 7 acc 0.8673076923076923
epoch 8 batch id 1 loss 0.3369368016719818 acc 0.871875
epoch 8 acc 0.8673076923076923
epoch 9 batch id 1 loss 0.31327247619628906 acc 0.8734375
epoch 9 acc 0.8673076923076923
epoch 10 batch id 1 loss 0.29832932353019714 acc 0.8796875
epoch 10 acc 0.9096153846153847
epoch 11 batch id 1 loss 0.2755594253540039 acc 0.9
epoch 11 acc 0.9173076923076924
epoch 12 batch id 1 loss 0.24872942

In [22]:
def predict(predict_sentence):
    data = [predict_sentence, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, tokenizer, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size = batch_size, num_workers = 0)

    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length = valid_length
        label = label.long().to(device)

        sigmoid = nn.Sigmoid()

        out = model(token_ids, valid_length, segment_ids)
        out = sigmoid(out)

        test_eval = []
        for logits in out.cpu().detach().numpy()[0]:
            test_eval.append(int(logits > 0.5))

        result = []
        label = ['Technology','Healthcare & Wellness','Education','Environment & Sustainability','Media & Entertainment','Culture & Arts','Finance & Business','Social Impact & Public Good','Consumer Products & Services','Transportation & Logistics']

        for i in range(len(label)):
          if test_eval[i] == 1:
            result.append(label[i])
            print(label[i])

        return test_eval

In [24]:
while (True):
  sentence = input("문장을 입력해주세요 : ")
  print(predict(sentence))

문장을 입력해주세요 : 인공지능 챗봇을 이용한 헬스케어 서비스 아이디어
Technology
Healthcare & Wellness
Consumer Products & Services
[1, 1, 0, 0, 0, 0, 0, 0, 1, 0]
문장을 입력해주세요 : 친환경 안경 판매 서비스
Environment & Sustainability
Consumer Products & Services
[0, 0, 0, 1, 0, 0, 0, 0, 1, 0]
문장을 입력해주세요 : 변호사 매칭 서비스
Consumer Products & Services
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
문장을 입력해주세요 : 노년층을 위한 방송장비 제공 서비스
Consumer Products & Services
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
문장을 입력해주세요 : 다이어트 교육 플랫폼
Education
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
문장을 입력해주세요 : 다이어트 가르침 플랫폼
Education
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
문장을 입력해주세요 : 헬스케어 교육 플랫폼
Technology
Education
[1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
문장을 입력해주세요 : 헬스
Technology
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
문장을 입력해주세요 : 헬스케어 서비스
Technology
Healthcare & Wellness
Consumer Products & Services
[1, 1, 0, 0, 0, 0, 0, 0, 1, 0]
문장을 입력해주세요 : 헬스케어 교육 서비스
Technology
Consumer Products & Services
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
문장을 입력해주세요 : 수륙양용 자동차 배달 서비스
Consumer Products & Services
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
문장을 입력

KeyboardInterrupt: Interrupted by user