In [7]:
import pandas as pd
import numpy as np
import re

kiumSet = pd.read_csv('.\TrainSet _1차_복사.csv')

In [2]:
'''
1. 모든 필드의 데이터에 줄넘김 '\n' 문자열이 존재. 이를 띄어쓰기(' ')로 변환한다.
2. Conclusion 필드의 값이 NULL이면 AcuteInfarction(진단 결과)는 모두 0, 검사 내용도 미비 (MRI...)
   -> 해당 데이터는 중요하지 않으니 제외시켜도 괜찮은 부분일까? - 아니면 결과 0처리 단독으로?
3. Findings 필드의 값이 NULL(NaN)이어도 Conclusion 설명이 적혀있으며 검사 결과도 0과 1로 구분된다.
4. Findings와 Conclusion 두 필드 모두 NULL인 경우는 없다.

5. 항목마다 번호 분류가 있다(ex. (1)(2)..., 1.2..., ). 정규표현식 사용해서 삭제처리.
6. 모든 문장 데이터를 소문자 변환 후 처리한다.
'''
kiumSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6190 entries, 0 to 6189
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Findings         4814 non-null   object
 1   Conclusion       6156 non-null   object
 2   AcuteInfarction  6190 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 145.2+ KB


In [3]:
df = pd.DataFrame(kiumSet)

In [5]:
'''
 Conclusion 값이 없는 데이터
 --> 소견이 없다면 뇌경색이 없다고 판단해도 되는 부분?
 1. MRI for radiosurgery
 2. MRI for GKRS, a malignant mass, left cerebellum.
 3. MRI for radiosurgery of brain metastasis
'''
df[df['Conclusion'].isnull()]

Unnamed: 0,Findings,Conclusion,AcuteInfarction
266,MRI for radiosurgery \r\n,,0
446,MRI for radiosurgery\r\n,,0
482,MRI for radiosurgery\r\n,,0
537,MRI for radiosurgery\r\n,,0
716,MRI for radiosurgery \r\n,,0
790,MRI for radiosurgery \r\n,,0
870,MRI for radiosurgery \r\n,,0
1068,MRI for radiosurgery \r\n,,0
1091,MRI for radiosurgery\r\n,,0
1164,MRI for radiosurgery \r\n,,0


In [4]:
# Findings에는 1376개의 NaN(결측치) 데이터 존재.
# Conclusion에는 34개의 NaN(결측치) 데이터존재.
print(f"Findings 결측값 = {df['Findings'].isnull().sum()}")
print(f"Conclusion 결측값 = {df['Conclusion'].isnull().sum()}")

# 모든 결측값에 빈 문자열 대체
df.fillna('', inplace=True)

Findings 결측값 = 1376
Conclusion 결측값 = 34


In [5]:
# 결측치 처리 결과
print(f"Findings 결측값 = {df['Findings'].isnull().sum()}")
print(f"Conclusion 결측값 = {df['Conclusion'].isnull().sum()}")

Findings 결측값 = 0
Conclusion 결측값 = 0


In [18]:
# '\n' 문자를 띄어쓰기 처리
# '\r' 문자를 삭제
# 별도의 특수문자(-, >, <, (, ) 삭제처리 및 항목 번호구조 ('1.', '2.', '1)', '2)'...) 삭제처리)
for i in range(df.shape[0]):
    row = df.iloc[i]
    Ftext = ' '.join(map(str, row['Findings'].split('\n'))).strip()
    Ftext = Ftext.replace('\r', '')
    Ctext = ' '.join(map(str, row['Conclusion'].split('\n'))).strip()
    Ctext = Ctext.replace('\r', '')
    
    Ftext = re.sub('[1-9]\.[^0-9]|[1-9]\)|[\-\<\>\(\)\:]', "", Ftext)
    Ctext = re.sub('[1-9]\.[^0-9]|[1-9]\)|[\-\<\>\(\)\:]', "", Ctext)

    Atext = int(str(row['AcuteInfarction']).strip())

    df.iloc[i] = [Ftext, Ctext, Atext]
    
print(df)

                                               Findings  \
0     Clinical information  두부외상 후 후유증 평가  Axial T1W...   
1     Clinical information  lung cancer Axial T1WI, ...   
2     Clinical information  Multiple Sclerosis  Axia...   
3     Clinical information  patient with DLBCL.  Axi...   
4     Clinical information  Transient cerebral ische...   
...                                                 ...   
6185  Clinical information  s/p Removal of vestibula...   
6186  CI, headache of sudden onset known UIA. Axial ...   
6187  Clinical information  patient with DLBCL.  Axi...   
6188  Clinical information  Lung cancer patient 임.  ...   
6189  CI, cerebellar mass metastatic carcinoma, a ne...   

                                             Conclusion  AcuteInfarction  
0     Encephalomalacic change in both frontal lobes,...                0  
1     No change of focal enhancing lesion in left ce...                0  
2     No significant interval change of abnormal hyp...           

In [20]:
df.to_csv('.\정제결과.csv', encoding='cp949', index=None)

In [7]:
df.shape

(6190, 3)

In [8]:
import re  #정규표현식

# 1. 순서 번호 삭제 (1., 2., 3., ...)
# 2. 특수문자 포함된 구조 삭제. 단 '.'은 소수점 표현 때문에 살림. ('-,<>()')
p = re.compile('[1-9]\.[^0-9]')

m = p.findall('''"1. No definite abnormal enhancing lesion on this MR.
2. Old infarctions at the right cerebellum, right temporal lobe, both BG.
3. Diffuse brain atrophy.
4. Microangiopathy.
5. Both maxillary sinusitis.''')

In [9]:
print(m)

['1. ', '2. ', '3. ', '4. ', '5. ']


In [27]:
str = '''"Two metastases in the brain.
 1) Rt parietal lobe: increased extent of enhancing portion (indeterminate change).
     -> probable tumor progression (DDx. radiation-induced change).
     Rec) F/U or MR Perfusion, if clinically necessary.
 2) Midbrain: slightly decreased extent of enhancing portion.'''
for t in m:
    result = re.sub('[1-9]\.[^0-9]|[1-9]\)|[\-\<\>\(\)\:]', "", str)
print(result)

"Two metastases in the brain.
  Rt parietal lobe: increased extent of enhancing portion indeterminate change.
      probable tumor progression DDx. radiationinduced change.
     Rec F/U or MR Perfusion, if clinically necessary.
  Midbrain: slightly decreased extent of enhancing portion.


In [24]:
 !pip3 install --upgrade pip
 !pip3 install tensorflow-cpu
 !pip3 install transformers
# !pip3 install tensorflow==2.3.0

#!pip3 install transformers



ERROR: To modify pip, please run the following command:
c:\anaconda3\python.exe -m pip install --upgrade pip



Collecting pip
  Downloading pip-22.2.2-py3-none-any.whl (2.0 MB)
     ---------------------------------------- 2.0/2.0 MB 10.0 MB/s eta 0:00:00

[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import pandas as pd
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # Bert-base의 토크나이저

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [8]:
tokenizer = BertTokenizer.from_pretrained("wiki_multilingual_cased")

result = tokenizer.tokenize('''CI, F/U for cerebral metastases, s/p GKRS. Axial T1WI, sagittal T1WI, axial T2WI, axial FLAIR, axial T2* GRE image 획득하였으며 조영증강을 시행함.  Brain, CSF space, and related findings Multiple cerebral metastases.   Rt frontal lobe 6 lesions     A. Middle frontal gyrus 9 mm  8mm.     B. Other smaller lesions all slightly decreased or no change in size   Rt occipital lobe  all slightly decreased in size.   Rt parietal lobe  slightly decreased in size.   Lt cerebellum  slightly decreased in size.  Slightly decreased extent of an indeterminate enhancement at the left subinsular area.    Rec F/U to exclude metastasis.  New appearance of an indeterminate enhancing lesion at the right frontal lobe Skull, PNS, orbits, and temporal Unremarkable.''')

OSError: wiki_multilingual_cased is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [27]:
print(result)

['ci', ',', 'f', '/', 'u', 'for', 'cerebral', 'meta', '##sta', '##ses', ',', 's', '/', 'p', 'g', '##kr', '##s', '.', 'axial', 't', '##1', '##wi', ',', 'sa', '##git', '##tal', 't', '##1', '##wi', ',', 'axial', 't', '##2', '##wi', ',', 'axial', 'flair', ',', 'axial', 't', '##2', '*', 'gr', '##e', 'image', '[UNK]', 'ᄌ', '##ᅩ', '##ᄋ', '##ᅧ', '##ᆼ', '##ᄌ', '##ᅳ', '##ᆼ', '##ᄀ', '##ᅡ', '##ᆼ', '##ᄋ', '##ᅳ', '##ᆯ', 'ᄉ', '##ᅵ', '##ᄒ', '##ᅢ', '##ᆼ', '##ᄒ', '##ᅡ', '##ᆷ', '.', 'brain', ',', 'cs', '##f', 'space', ',', 'and', 'related', 'findings', 'multiple', 'cerebral', 'meta', '##sta', '##ses', '.', 'rt', 'frontal', 'lobe', '6', 'lesions', 'a', '.', 'middle', 'frontal', 'g', '##yr', '##us', '9', 'mm', '8', '##mm', '.', 'b', '.', 'other', 'smaller', 'lesions', 'all', 'slightly', 'decreased', 'or', 'no', 'change', 'in', 'size', 'rt', 'o', '##cci', '##pit', '##al', 'lobe', 'all', 'slightly', 'decreased', 'in', 'size', '.', 'rt', 'par', '##ie', '##tal', 'lobe', 'slightly', 'decreased', 'in', 'size', '

In [10]:
with open('vocabulary.txt', 'w', encoding='utf-8') as f:
  for token in tokenizer.vocab.keys():
    f.write(token + '\n')

In [2]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cjsqh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
text = '''조영증강 전후의 영상에서 뇌실질에 이상 신호는 관찰되지 않고, 백질회색질의 구분도 잘 되고 있다. 소뇌, 뇌간, 송과체, 뇌하수체, 해면정맥동, 시신경로  등에도 이상소견은 보이지 않는다. 뇌실을 포함한 뇌척수액공간에도 특이소견은 없으며, 축외 병변도 관찰되지 않는다. 부비동, 안와, 측두골에서도 비정상적인 소견은 보이지 않는다. 자기공명 뇌혈관 조영 영상에서 이상 소견은 보이지 않는다.'''

print(sent_tokenize(text))

['조영증강 전후의 영상에서 뇌실질에 이상 신호는 관찰되지 않고, 백질회색질의 구분도 잘 되고 있다.', '소뇌, 뇌간, 송과체, 뇌하수체, 해면정맥동, 시신경로  등에도 이상소견은 보이지 않는다.', '뇌실을 포함한 뇌척수액공간에도 특이소견은 없으며, 축외 병변도 관찰되지 않는다.', '부비동, 안와, 측두골에서도 비정상적인 소견은 보이지 않는다.', '자기공명 뇌혈관 조영 영상에서 이상 소견은 보이지 않는다.']


In [26]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [34]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cjsqh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [27]:
from nltk.corpus import stopwords
english_stoprs = set(stopwords.words('english'))
text = '''CI, F/U for cerebral metastases, s/p GKRS. Axial T1WI, sagittal T1WI, axial T2WI, axial FLAIR, axial T2* GRE image 획득하였으며 조영증강을 시행함.  Brain, CSF space, and related findings Multiple cerebral metastases.   Rt frontal lobe 6 lesions     A. Middle frontal gyrus 9 mm  8mm.     B. Other smaller lesions all slightly decreased or no change in size   Rt occipital lobe  all slightly decreased in size.   Rt parietal lobe  slightly decreased in size.   Lt cerebellum  slightly decreased in size.  Slightly decreased extent of an indeterminate enhancement at the left subinsular area.    Rec F/U to exclude metastasis.  New appearance of an indeterminate enhancing lesion at the right frontal lobe Skull, PNS, orbits, and temporal Unremarkable.'''
words = text_to_word_sequence(text)
final = [word for word in words if word not in english_stoprs]
print(final)

['ci', 'f', 'u', 'cerebral', 'metastases', 'p', 'gkrs', 'axial', 't1wi', 'sagittal', 't1wi', 'axial', 't2wi', 'axial', 'flair', 'axial', 't2', 'gre', 'image', '획득하였으며', '조영증강을', '시행함', 'brain', 'csf', 'space', 'related', 'findings', 'multiple', 'cerebral', 'metastases', 'rt', 'frontal', 'lobe', '6', 'lesions', 'middle', 'frontal', 'gyrus', '9', 'mm', '8mm', 'b', 'smaller', 'lesions', 'slightly', 'decreased', 'change', 'size', 'rt', 'occipital', 'lobe', 'slightly', 'decreased', 'size', 'rt', 'parietal', 'lobe', 'slightly', 'decreased', 'size', 'lt', 'cerebellum', 'slightly', 'decreased', 'size', 'slightly', 'decreased', 'extent', 'indeterminate', 'enhancement', 'left', 'subinsular', 'area', 'rec', 'f', 'u', 'exclude', 'metastasis', 'new', 'appearance', 'indeterminate', 'enhancing', 'lesion', 'right', 'frontal', 'lobe', 'skull', 'pns', 'orbits', 'temporal', 'unremarkable']


In [24]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words = None,
    filters = '"#$%&()*+,-.:;<=>?@[\]^_`{|}~\t\n→',
    lower=True,
    split=' ', 
    char_level=False, 
    oov_token=None, 
    document_count=0
)


sentences = [
  'CI, F/U for cerebral metastases, s/p GKRS. Axial T1WI, sagittal T1WI, axial T2WI, axial FLAIR, axial T2* GRE image 획득하였으며 조영증강을 시행함.',
  'Brain, CSF space, and related findings Multiple cerebral metastases.',
  'Rt frontal lobe 6 lesions A. Middle frontal gyrus 9 mm  8mm.',
  'B. Other smaller lesions all slightly decreased or no change in size   Rt occipital lobe  all slightly decreased in size.',
  'Rt parietal lobe  slightly decreased in size.',
  'Lt cerebellum  slightly decreased in size.',
  'Slightly decreased extent of an indeterminate enhancement at the left subinsular area.',
  '    Rec F/U to exclude metastasis.',
  '  New appearance of an indeterminate enhancing lesion at the right frontal lobe Skull, PNS, orbits, and temporal Unremarkable.'
]

#tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(sentences)
word_dic = tokenizer.word_index
print(word_dic)

sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

padded = pad_sequences(sequences)
print(padded)


{'slightly': 1, 'decreased': 2, 'axial': 3, 'lobe': 4, 'in': 5, 'size': 6, 'rt': 7, 'frontal': 8, 'f/u': 9, 'cerebral': 10, 'metastases': 11, 't1wi': 12, 'and': 13, 'lesions': 14, 'all': 15, 'of': 16, 'an': 17, 'indeterminate': 18, 'at': 19, 'the': 20, 'ci': 21, 'for': 22, 's/p': 23, 'gkrs': 24, 'sagittal': 25, 't2wi': 26, 'flair': 27, 't2': 28, 'gre': 29, 'image': 30, '획득하였으며': 31, '조영증강을': 32, '시행함': 33, 'brain': 34, 'csf': 35, 'space': 36, 'related': 37, 'findings': 38, 'multiple': 39, '6': 40, 'a': 41, 'middle': 42, 'gyrus': 43, '9': 44, 'mm': 45, '8mm': 46, 'b': 47, 'other': 48, 'smaller': 49, 'or': 50, 'no': 51, 'change': 52, 'occipital': 53, 'parietal': 54, 'lt': 55, 'cerebellum': 56, 'extent': 57, 'enhancement': 58, 'left': 59, 'subinsular': 60, 'area': 61, 'rec': 62, 'to': 63, 'exclude': 64, 'metastasis': 65, 'new': 66, 'appearance': 67, 'enhancing': 68, 'lesion': 69, 'right': 70, 'skull': 71, 'pns': 72, 'orbits': 73, 'temporal': 74, 'unremarkable': 75}
[[21, 9, 22, 10, 11, 23