In [1]:
import pandas as pd
import numpy as np
import re

kiumSet = pd.read_csv('.\TrainSet _1차_복사.csv')
#print(pd.__file__)

In [2]:
'''
1. 모든 필드의 데이터에 줄넘김 '\n' 문자열이 존재. 이를 띄어쓰기(' ')로 변환한다.
2. Conclusion 필드의 값이 NULL이면 AcuteInfarction(진단 결과)는 모두 0, 검사 내용도 미비 (MRI...)
   -> 해당 데이터는 중요하지 않으니 제외시켜도 괜찮은 부분일까? - 아니면 결과 0처리 단독으로?
3. Findings 필드의 값이 NULL(NaN)이어도 Conclusion 설명이 적혀있으며 검사 결과도 0과 1로 구분된다.
4. Findings와 Conclusion 두 필드 모두 NULL인 경우는 없다.

5. 항목마다 번호 분류가 있다(ex. (1)(2)..., 1.2..., ). 정규표현식 사용해서 삭제처리.
6. 모든 문장 데이터를 소문자 변환 후 처리한다.
'''
kiumSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6190 entries, 0 to 6189
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Findings         4814 non-null   object
 1   Conclusion       6156 non-null   object
 2   AcuteInfarction  6190 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 145.2+ KB


In [3]:
df = pd.DataFrame(kiumSet)

In [4]:
'''
 Conclusion 값이 없는 데이터
 --> 소견이 없다면 뇌경색이 없다고 판단해도 되는 부분?
 1. MRI for radiosurgery
 2. MRI for GKRS, a malignant mass, left cerebellum.
 3. MRI for radiosurgery of brain metastasis
'''
df[df['Conclusion'].isnull()]

Unnamed: 0,Findings,Conclusion,AcuteInfarction
266,MRI for radiosurgery \r\n,,0
446,MRI for radiosurgery\r\n,,0
482,MRI for radiosurgery\r\n,,0
537,MRI for radiosurgery\r\n,,0
716,MRI for radiosurgery \r\n,,0
790,MRI for radiosurgery \r\n,,0
870,MRI for radiosurgery \r\n,,0
1068,MRI for radiosurgery \r\n,,0
1091,MRI for radiosurgery\r\n,,0
1164,MRI for radiosurgery \r\n,,0


In [5]:
# Findings에는 1376개의 NaN(결측치) 데이터 존재.
# Conclusion에는 34개의 NaN(결측치) 데이터존재.
print(f"Findings 결측값 = {df['Findings'].isnull().sum()}")
print(f"Conclusion 결측값 = {df['Conclusion'].isnull().sum()}")

# 모든 결측값에 빈 문자열 대체
df.fillna('', inplace=True)

Findings 결측값 = 1376
Conclusion 결측값 = 34


In [6]:
# 결측치 처리 결과
print(f"Findings 결측값 = {df['Findings'].isnull().sum()}")
print(f"Conclusion 결측값 = {df['Conclusion'].isnull().sum()}")

Findings 결측값 = 0
Conclusion 결측값 = 0


In [7]:
# '\n' 문자를 띄어쓰기 처리
# '\r' 문자를 삭제
# 별도의 특수문자(-, >, <, (, ) 삭제처리 및 항목 번호구조 ('1.', '2.', '1)', '2)'...) 삭제처리)
for i in range(df.shape[0]):
    row = df.iloc[i]
    Ftext = ' '.join(map(str, row['Findings'].split('\n'))).strip()
    Ftext = Ftext.replace('\r', '')
    Ctext = ' '.join(map(str, row['Conclusion'].split('\n'))).strip()
    Ctext = Ctext.replace('\r', '')
    
    Ftext = re.sub('[1-9]\.[^0-9]|[1-9]\)|[\-\<\>\(\)\:]', "", Ftext)
    Ctext = re.sub('[1-9]\.[^0-9]|[1-9]\)|[\-\<\>\(\)\:]', "", Ctext)

    Atext = int(str(row['AcuteInfarction']).strip())

    df.iloc[i] = [Ftext, Ctext, Atext]
    
print(df)

                                               Findings  \
0     Clinical information  두부외상 후 후유증 평가  Axial T1W...   
1     Clinical information  lung cancer Axial T1WI, ...   
2     Clinical information  Multiple Sclerosis  Axia...   
3     Clinical information  patient with DLBCL.  Axi...   
4     Clinical information  Transient cerebral ische...   
...                                                 ...   
6185  Clinical information  s/p Removal of vestibula...   
6186  CI, headache of sudden onset known UIA. Axial ...   
6187  Clinical information  patient with DLBCL.  Axi...   
6188  Clinical information  Lung cancer patient 임.  ...   
6189  CI, cerebellar mass metastatic carcinoma, a ne...   

                                             Conclusion  AcuteInfarction  
0     Encephalomalacic change in both frontal lobes,...                0  
1     No change of focal enhancing lesion in left ce...                0  
2     No significant interval change of abnormal hyp...           

In [8]:
# DataFrame에서 무작위로 표본 추출하기
# https://rfriend.tistory.com/602
'''
pd.DataFrame.sample(
    n = 추출할 표본 개수(1~정수)
    frac = 추출할 표본 비율 (위의 n이랑 둘 중 하나만 사용)
    replace = 복원 추출 유무 (True, False)
    weights = 가중치 부여 (column 이름)
    random_state = 난수 발생 초깃값 (재현 가능성을 위한 경우)
    axis = 0:인덱스 기준, 1:column 기준
)
'''
# reset_index = 뒤죽박죽된 이전의 인덱스를 초기화 시킴
df_shuffled = df.sample(frac=1).reset_index(drop=True)
print(df_shuffled)

                                               Findings  \
0                                                         
1     CI, stroke. Axial T2* GRE image, axial DWI 획득하...   
2                                                         
3     Clinical information  Spinal cord injury Axial...   
4                                                         
...                                                 ...   
6185  Clinical information  Aneurysm unruptured  Axi...   
6186  Clinical information  Metabolic encephalopathy...   
6187                                                      
6188                                                      
6189  Clinical information  lung cancer  Axial T1WI,...   

                                             Conclusion  AcuteInfarction  
0     MRI; mild microangiopathy  MRA  no specific in...                0  
1     No evidence of acute infarction. Encephalomala...                0  
2     no acute lesion bilateral maxillary, ethmoid a...           

In [9]:
#train data & test data 로드 
train = df_shuffled[:5190] # 5190개 train
test = df_shuffled[5190:] # 1000개 test

In [11]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cjsqh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cjsqh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
'''
http://www.koreascience.kr/article/JAKO202009135419336.pdf
BERT 분류 모델의 경우 각 문장의 앞마다 [CLS]를 붙여 문장 시작을 명시.
문장의 종료는 [SEP]. 
[CLS]을 인식함으로써 문장의 처음이라 알 수 있게 하고, 
[SEP]을 인식함으로써 문장의 끝을 알 수 있다. 

BERT의 pretrain 방법은 [SEP]를 인식하여 두 문장이 이어지는 문장인지, 관련 없는 문장인지 학습하는 것.
'''
train_sentences = []
for idx, Fs in enumerate(zip(train.Findings, train.Conclusion)):
    text = Fs[0] + Fs[1]
    sentences = sent_tokenize(text)
    Bert_sentences = "[CLS] "
    for s in sentences:
        Bert_sentences += s + " [SEP] "
    
    train_sentences.append(Bert_sentences)

print(train_sentences)

['[CLS] MRI; mild microangiopathy  MRA  no specific interval change      right inferior M2 focal severe stenosis      right superior M2 focal moderate stenosis. [SEP] both P2 stenosis. [SEP] ', '[CLS] CI, stroke. [SEP] Axial T2* GRE image, axial DWI 획득하였으며 조영증강은 시행하지 않았음.No evidence of acute infarction. [SEP] Encephalomalacia at the right frontal lobe. [SEP] ', '[CLS] no acute lesion bilateral maxillary, ethmoid and left sphenoid sinusitis [SEP] ', '[CLS] Clinical information  Spinal cord injury Axial T1WI, sagittal T1WI, axial T2WI, axial FLAIR, axial T2* GRE image, intracranial TOF MRA 획득하였으며 brain 및 neck MRA에 대해 조영증강을 시행함.Microbleed at right frontal lobe and left parietal lobe      r/o cavernous malformation at right parietal lobe Left maxillary sinusitis MRA      Focal severe stenosis of left proximal A1           probable artifact          rec F/U CT Angiography      Right VA hypoplasia [SEP] ', '[CLS] no specific interval change compare to the latest MR brain [SEP] ', '[CLS] Clin

In [13]:
len(train_sentences)

5190

In [16]:
# 정답지 라벨 저장
labels = train['AcuteInfarction'].values

print(labels, len(labels))

[0 0 0 ... 0 0 0] 5190


In [17]:
from transformers import BertTokenizer

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
MAX_LEN = 512
tokenized_texts = []
for s in train_sentences:
    t = tokenizer.tokenize(s)
    #MAX_LEN = max(MAX_LEN, len(t))
    
    tokenized_texts.append(t[:512])

In [None]:
print(train_sentences[3])
print(tokenized_texts[3])
print(MAX_LEN)

In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# !conda install tensorflow
# !conda install keras
# !conda install h5py

In [20]:
#https://blog.naver.com/qbxlvnf11/221945962124
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="pre", padding="pre")

In [21]:
print(input_ids[3000])

[     0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      

In [22]:
attention_masks = []

for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [None]:
print(attention_masks[5190])

In [27]:
import torch
import tensorflow as tf
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime


train_inputs, validation_inputs, train_labels, validation_labels = \
train_test_split(input_ids, labels, random_state=2000, test_size=0.1)
                                                
train_masks, validation_masks, _, _ = \
train_test_split(attention_masks, input_ids, random_state=2000, test_size=0.1)     
                                                       
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks) 

In [28]:
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [29]:
# 테스트 데이터 전처리
test_sentences = []
for idx, Fs in enumerate(zip(test.Findings, test.Conclusion)):
    text = Fs[0] + Fs[1]
    sentences = sent_tokenize(text)
    Bert_sentences = "[CLS] "
    for s in sentences:
        Bert_sentences += s + " [SEP] "
    
    test_sentences.append(Bert_sentences)

In [30]:
labels = test['AcuteInfarction'].values
#print(labels, len(labels))

In [31]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

tokenized_texts = []
for s in test_sentences:
    t = tokenizer.tokenize(s)
    tokenized_texts.append(t[:512])

In [32]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [33]:
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [34]:
# 파이토치 텐서로 변환
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

# 배치 사이즈 설정 및 데이터 설정
batch_size = 32
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [35]:
# https://projectlog-eraser.tistory.com/26
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.cpu()
#model.cuda()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [36]:
# 옵티마이저
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률(learning rate)
                  eps = 1e-8 
                )
# 에폭수
epochs = 4

# 총 훈련 스텝 : 배치반복 횟수 * 에폭
total_steps = len(train_dataloader) * epochs

# 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)



In [37]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)
    
    
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [39]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [40]:
#랜덤시드 고정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

#그래디언트 초기화
model.zero_grad()

# 학습
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...

  Average training loss: 0.15
  Training epcoh took: 8:58:10

Running Validation...
  Accuracy: 0.98
  Validation took: 0:17:50

Training...

  Average training loss: 0.05
  Training epcoh took: 8:49:27

Running Validation...
  Accuracy: 0.99
  Validation took: 0:18:01

Training...

  Average training loss: 0.04
  Training epcoh took: 9:15:53

Running Validation...
  Accuracy: 0.99
  Validation took: 0:18:55

Training...

  Average training loss: 0.03
  Training epcoh took: 9:02:30

Running Validation...
  Accuracy: 0.99
  Validation took: 0:17:42

Training complete!


In [47]:
torch.save(model, '.\model_save.pht')

In [48]:
#https://jimmy-ai.tistory.com/166
torch.save(model.state_dict(), '.\model_dict_save.pht')

In [49]:
torch.save(model, '.\model_save.pt')
torch.save(model.state_dict(), '.\model_dict_save.pt')

In [50]:
#시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(test_dataloader):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("Test took: {:}".format(format_time(time.time() - t0)))


Accuracy: 0.99
Test took: 0:38:27


In [None]:
# https://076923.github.io/posts/Python-pytorch-10/

In [None]:
df.to_csv('.\정제결과.csv', encoding='cp949', index=None)

In [None]:
df.shape

In [None]:
import re  #정규표현식

# 1. 순서 번호 삭제 (1., 2., 3., ...)
# 2. 특수문자 포함된 구조 삭제. 단 '.'은 소수점 표현 때문에 살림. ('-,<>()')
p = re.compile('[1-9]\.[^0-9]')

m = p.findall('''"1. No definite abnormal enhancing lesion on this MR.
2. Old infarctions at the right cerebellum, right temporal lobe, both BG.
3. Diffuse brain atrophy.
4. Microangiopathy.
5. Both maxillary sinusitis.''')

In [None]:
print(m)

In [None]:
str = '''"Two metastases in the brain.
 1) Rt parietal lobe: increased extent of enhancing portion (indeterminate change).
     -> probable tumor progression (DDx. radiation-induced change).
     Rec) F/U or MR Perfusion, if clinically necessary.
 2) Midbrain: slightly decreased extent of enhancing portion.'''
for t in m:
    result = re.sub('[1-9]\.[^0-9]|[1-9]\)|[\-\<\>\(\)\:]', "", str)
print(result)

In [None]:
#  !pip3 install --upgrade pip
#  !pip3 install tensorflow-cpu
#  !pip3 install transformers
# !pip3 install tensorflow==2.3.0

#!pip3 install transformers

In [None]:
import pandas as pd
from transformers import BertTokenizer
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # Bert-base의 토크나이저

In [None]:
tokenizer = BertTokenizer.from_pretrained("wiki_multilingual_cased")

result = tokenizer.tokenize('''CI, F/U for cerebral metastases, s/p GKRS. Axial T1WI, sagittal T1WI, axial T2WI, axial FLAIR, axial T2* GRE image 획득하였으며 조영증강을 시행함.  Brain, CSF space, and related findings Multiple cerebral metastases.   Rt frontal lobe 6 lesions     A. Middle frontal gyrus 9 mm  8mm.     B. Other smaller lesions all slightly decreased or no change in size   Rt occipital lobe  all slightly decreased in size.   Rt parietal lobe  slightly decreased in size.   Lt cerebellum  slightly decreased in size.  Slightly decreased extent of an indeterminate enhancement at the left subinsular area.    Rec F/U to exclude metastasis.  New appearance of an indeterminate enhancing lesion at the right frontal lobe Skull, PNS, orbits, and temporal Unremarkable.''')

In [None]:
print(result)

In [None]:
with open('vocabulary.txt', 'w', encoding='utf-8') as f:
  for token in tokenizer.vocab.keys():
    f.write(token + '\n')

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

In [None]:
text = '''조영증강 전후의 영상에서 뇌실질에 이상 신호는 관찰되지 않고, 백질회색질의 구분도 잘 되고 있다. 소뇌, 뇌간, 송과체, 뇌하수체, 해면정맥동, 시신경로  등에도 이상소견은 보이지 않는다. 뇌실을 포함한 뇌척수액공간에도 특이소견은 없으며, 축외 병변도 관찰되지 않는다. 부비동, 안와, 측두골에서도 비정상적인 소견은 보이지 않는다. 자기공명 뇌혈관 조영 영상에서 이상 소견은 보이지 않는다.'''

print(sent_tokenize(text))

In [None]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [None]:
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
english_stoprs = set(stopwords.words('english'))
text = '''CI, F/U for cerebral metastases, s/p GKRS. Axial T1WI, sagittal T1WI, axial T2WI, axial FLAIR, axial T2* GRE image 획득하였으며 조영증강을 시행함.  Brain, CSF space, and related findings Multiple cerebral metastases.   Rt frontal lobe 6 lesions     A. Middle frontal gyrus 9 mm  8mm.     B. Other smaller lesions all slightly decreased or no change in size   Rt occipital lobe  all slightly decreased in size.   Rt parietal lobe  slightly decreased in size.   Lt cerebellum  slightly decreased in size.  Slightly decreased extent of an indeterminate enhancement at the left subinsular area.    Rec F/U to exclude metastasis.  New appearance of an indeterminate enhancing lesion at the right frontal lobe Skull, PNS, orbits, and temporal Unremarkable.'''
words = text_to_word_sequence(text)
final = [word for word in words if word not in english_stoprs]
print(final)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words = None,
    filters = '"#$%&()*+,-.:;<=>?@[\]^_`{|}~\t\n→',
    lower=True,
    split=' ', 
    char_level=False, 
    oov_token=None, 
    document_count=0
)


sentences = [
  'CI, F/U for cerebral metastases, s/p GKRS. Axial T1WI, sagittal T1WI, axial T2WI, axial FLAIR, axial T2* GRE image 획득하였으며 조영증강을 시행함.',
  'Brain, CSF space, and related findings Multiple cerebral metastases.',
  'Rt frontal lobe 6 lesions A. Middle frontal gyrus 9 mm  8mm.',
  'B. Other smaller lesions all slightly decreased or no change in size   Rt occipital lobe  all slightly decreased in size.',
  'Rt parietal lobe  slightly decreased in size.',
  'Lt cerebellum  slightly decreased in size.',
  'Slightly decreased extent of an indeterminate enhancement at the left subinsular area.',
  '    Rec F/U to exclude metastasis.',
  '  New appearance of an indeterminate enhancing lesion at the right frontal lobe Skull, PNS, orbits, and temporal Unremarkable.'
]

#tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(sentences)
word_dic = tokenizer.word_index
print(word_dic)

sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

padded = pad_sequences(sequences)
print(padded)


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
from transformers import BertTokenizer
tz = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
sent = [
  'CI, F/U for cerebral metastases, s/p GKRS. Axial T1WI, sagittal T1WI, axial T2WI, axial FLAIR, axial T2* GRE image 획득하였으며 조영증강을 시행함.',
  'Brain, CSF space, and related findings Multiple cerebral metastases.',
  'Rt frontal lobe 6 lesions A. Middle frontal gyrus 9 mm  8mm.',
  'B. Other smaller lesions all slightly decreased or no change in size   Rt occipital lobe  all slightly decreased in size.',
  'Rt parietal lobe  slightly decreased in size.',
  'Lt cerebellum  slightly decreased in size.',
  'Slightly decreased extent of an indeterminate enhancement at the left subinsular area.',
  '    Rec F/U to exclude metastasis.',
  '  New appearance of an indeterminate enhancing lesion at the right frontal lobe Skull, PNS, orbits, and temporal Unremarkable.'
]

tz.tokenize(sent)
tz.convert_tokens_to_ids(tz.tokenize(sent))

In [None]:
import torch
import tensorflow as tf
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime


In [None]:
n_devices = torch.cuda.device_count()
print(n_devices)

for i in range(n_devices):
    print(torch.cuda.get_device_name(i))

In [None]:
#!pip3 install torch torchvision torchaudio
#!conda install torch torchvision torchaudio

In [None]:
sentences = '''Clinical information  Transient cerebral ischemic attack  Axial T1WI, sagittal T1WI, axial T2WI, axial FLAIR, axial T2* GRE image, axial DWI, intracranial TOF MRA 획득하였으며 neck MRA에 대해서 조영증강을 시행함.'''

tokenizer1 = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenizer2 = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
#tokenized_texts = [tokenizer1.tokenize(s) for s in sentences]
tokenized_texts = tokenizer1.tokenize(sentences)

In [None]:
print(sentences)  #토크나이징 전
print(tokenized_texts) #토크나이징 후
len(tokenized_texts)

In [None]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')