In [None]:
# Mount Google Drive at /content/drive; later cells read kaggle.json from it
# and write the final CSV to it. Relies on the Colab-only google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Импорт необходимых библиотек и модулей

In [None]:
import pandas as pd
import numpy as np

## Скачиваем набор данных

In [None]:
# Switch the working directory to the Drive folder that holds kaggle.json,
# so the relative `cp kaggle.json ...` in the next cell can find it.
import os
os.chdir("/content/drive/MyDrive/files")

In [None]:
# Create the ~/.kaggle folder (-p: no error if it already exists)
!mkdir -p ~/.kaggle

# Copy kaggle.json from the current working directory into ~/.kaggle/
!cp kaggle.json ~/.kaggle/

In [None]:
# Go back to /content: the dataset is downloaded and extracted relative to the
# working directory, and the CSVs are later read from /content.
os.chdir("/content")

In [None]:
# Restrict kaggle.json to owner read/write only (mode 600)
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the Indiana University chest X-ray dataset as a zip archive into the
# current directory (the recorded run shows a 13.2G download).
!kaggle datasets download -d raddar/chest-xrays-indiana-university

Downloading chest-xrays-indiana-university.zip to /content
100% 13.2G/13.2G [02:16<00:00, 101MB/s] 
100% 13.2G/13.2G [02:16<00:00, 104MB/s]


In [None]:
# Unzip our dataset
from zipfile import ZipFile
from tqdm import tqdm


file_to_extract = "chest-xrays-indiana-university.zip"

# Extract the archive member by member (rather than in one call) so that tqdm
# can report progress; files land in the current working directory.
with ZipFile(file_to_extract) as zip_file:
    members = zip_file.namelist()
    for member in tqdm(members, total=len(members)):
        zip_file.extract(member)

100%|██████████| 7472/7472 [02:55<00:00, 42.66it/s]


## Загрузка данных

In [None]:
# Load the report table (per the preview: one row per `uid`, holding the
# MeSH/Problems labels and the free-text report fields) and preview it.
df_1 = pd.read_csv('/content/indiana_reports.csv')
df_1.head()

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression
0,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.
1,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.
2,3,normal,normal,Xray Chest PA and Lateral,"rib pain after a XXXX, XXXX XXXX steps this XX...",,,"No displaced rib fractures, pneumothorax, or p..."
3,4,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...","PA and lateral views of the chest XXXX, XXXX a...",XXXX-year-old XXXX with XXXX.,None available,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...
4,5,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,Xray Chest PA and Lateral,Chest and nasal congestion.,,The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary abnormality.


In [None]:
# Load the image table (per the preview: `uid`, image `filename`, and its
# `projection`, e.g. Frontal / Lateral) and preview it.
df_2 = pd.read_csv('/content/indiana_projections.csv')
df_2.head()

Unnamed: 0,uid,filename,projection
0,1,1_IM-0001-4001.dcm.png,Frontal
1,1,1_IM-0001-3001.dcm.png,Lateral
2,2,2_IM-0652-1001.dcm.png,Frontal
3,2,2_IM-0652-2001.dcm.png,Lateral
4,3,3_IM-1384-1001.dcm.png,Frontal


In [None]:
# Attach every image record to its report through the shared `uid` key.
# pd.merge defaults to an inner join, so only uids present in both tables are kept.
df = pd.merge(df_1, df_2, on='uid')
df.shape

(7466, 10)

In [None]:
# Count missing values per column of the merged table.
df.isnull().sum()

uid              0
MeSH             0
Problems         0
image            0
indication     159
comparison    1157
findings       997
impression      52
filename         0
projection       0
dtype: int64

## Обработка текстовых данных

In [None]:
import string
import regex as re
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources used below: the 'stopwords' corpus (read by
# remove_stopwords) and the 'punkt' tokenizer data.
# NOTE(review): 'punkt' is presumably what word_tokenize needs - confirm.
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Уберем сокращения

In [None]:
def decontraction(doc):
    """Expand common English contractions in ``doc`` and return the new string.

    The rewrite rules run one after another in the order listed, so the
    specific forms ("won't", "can't") are rewritten before the generic
    "n't" / "'t" rules get a chance to match. The last rule replaces every
    run of asterisks with the word "abuse".

    Args:
        doc: Input string.

    Returns:
        str: ``doc`` with all rules applied.
    """
    rewrite_rules = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ve", " have"),
        (r"\'ll", " will"),
        (r"\'m", " am"),
        (r"\*+", "abuse"),
    )

    expanded = doc
    for pattern, replacement in rewrite_rules:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

In [None]:
def remove_stopwords(data):
    """Drop English stop words from ``data`` while keeping "not" and "no".

    Args:
        data: Text to filter; it is split into tokens with ``word_tokenize``.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # "not" and "no" are deliberately excluded from the stop-word list
    # (presumably so negated statements keep their negation).
    # A set gives constant-time membership tests for every token, and set
    # difference does not raise if one of the two words is absent from the corpus.
    all_stopwords = set(stopwords.words('english')) - {'not', 'no'}
    text_tokens = word_tokenize(data)  # split the sentence into word tokens
    return " ".join(word for word in text_tokens if word not in all_stopwords)

In [None]:
def text_preprocess(data):
    """Normalize one free-text report field to lowercase, punctuation-free text.

    Steps, in order: cast to ``str`` and lowercase, delete special characters,
    delete the "XXXX" placeholders, delete digits and the leftover "yearold"
    token, collapse whitespace runs to one space and trim both ends.

    Args:
        data: Raw cell value. Any object is accepted because it goes through
            ``str()``; a missing value (float NaN) therefore comes out as the
            literal string "nan".

    Returns:
        str: The cleaned text.
    """
    text = str(data).lower()

    # Special characters are deleted, not replaced by a space, so "year-old"
    # turns into "yearold" and is removed by the rule further down.
    text = re.sub(r"[|'/?$.,;><!&)(:`\n\t@#=+%^*~-]", "", text)

    # Remove the irrelevant "XXXX" placeholder words. Only runs of two or more
    # "x" are dropped, so real words keep their letters ("pneumothorax" no
    # longer becomes "pneumothora").
    text = re.sub(r"x{2,}", "", text)

    # Remove the numbers, then the age token left behind by "NN-year-old".
    text = re.sub(r"[0-9]", "", text)
    text = re.sub(r"yearold", "", text)

    # Collapse whitespace runs and drop the leading/trailing space that the
    # deletions above can leave behind.
    text = re.sub(r"\s+", " ", text).strip()

    # NOTE(review): apostrophes and "*" were already deleted above, so no rule
    # in decontraction() can match here. The call is kept in its original
    # position on purpose: running it before the punctuation step would also
    # activate its "'s" -> " is" and "*" -> "abuse" rewrites.
    text = decontraction(text)

    return text

In [None]:
# Clean the three free-text report columns with the same routine.
for text_column in ("indication", "findings", "impression"):
    df[text_column] = df[text_column].apply(text_preprocess)

In [None]:
# Preview the merged table after text cleaning.
df.head()

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection
0,1,normal,normal,Xray Chest PA and Lateral,positive tb test,None.,the cardiac silhouette and mediastinum size ar...,normal chest,1_IM-0001-4001.dcm.png,Frontal
1,1,normal,normal,Xray Chest PA and Lateral,positive tb test,None.,the cardiac silhouette and mediastinum size ar...,normal chest,1_IM-0001-3001.dcm.png,Lateral
2,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",preop bariatric surgery,None.,borderline cardiomegaly midline sternotomy enl...,no acute pulmonary findings,2_IM-0652-1001.dcm.png,Frontal
3,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",preop bariatric surgery,None.,borderline cardiomegaly midline sternotomy enl...,no acute pulmonary findings,2_IM-0652-2001.dcm.png,Lateral
4,3,normal,normal,Xray Chest PA and Lateral,rib pain after a steps this pain to r back r e...,,,no displaced rib fractures pneumothora or pleu...,3_IM-1384-1001.dcm.png,Frontal


## Сгруппируем изображения

In [None]:
# Keep only the rows whose image is a frontal projection.
frontal_df = df.loc[df['projection'].eq('Frontal')]
frontal_df.head()

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection
0,1,normal,normal,Xray Chest PA and Lateral,positive tb test,None.,the cardiac silhouette and mediastinum size ar...,normal chest,1_IM-0001-4001.dcm.png,Frontal
2,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",preop bariatric surgery,None.,borderline cardiomegaly midline sternotomy enl...,no acute pulmonary findings,2_IM-0652-1001.dcm.png,Frontal
4,3,normal,normal,Xray Chest PA and Lateral,rib pain after a steps this pain to r back r e...,,,no displaced rib fractures pneumothora or pleu...,3_IM-1384-1001.dcm.png,Frontal
6,4,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...","PA and lateral views of the chest XXXX, XXXX a...",with,None available,there are diffuse bilateral interstitial and a...,bullous emphysema and interstitial fibrosis p...,4_IM-2050-1001.dcm.png,Frontal
8,5,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,Xray Chest PA and Lateral,chest and nasal congestion,,the cardiomediastinal silhouette and pulmonary...,no acute cardiopulmonary abnormality,5_IM-2117-1003002.dcm.png,Frontal


In [None]:
# Keep only the rows whose image is a lateral projection.
lateral_df = df.loc[df['projection'].eq('Lateral')]
lateral_df.head()

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection
1,1,normal,normal,Xray Chest PA and Lateral,positive tb test,None.,the cardiac silhouette and mediastinum size ar...,normal chest,1_IM-0001-3001.dcm.png,Lateral
3,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",preop bariatric surgery,None.,borderline cardiomegaly midline sternotomy enl...,no acute pulmonary findings,2_IM-0652-2001.dcm.png,Lateral
5,3,normal,normal,Xray Chest PA and Lateral,rib pain after a steps this pain to r back r e...,,,no displaced rib fractures pneumothora or pleu...,3_IM-1384-2001.dcm.png,Lateral
7,4,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...","PA and lateral views of the chest XXXX, XXXX a...",with,None available,there are diffuse bilateral interstitial and a...,bullous emphysema and interstitial fibrosis p...,4_IM-2050-2001.dcm.png,Lateral
9,5,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,Xray Chest PA and Lateral,chest and nasal congestion,,the cardiomediastinal silhouette and pulmonary...,no acute cardiopulmonary abnormality,5_IM-2117-1004003.dcm.png,Lateral


In [None]:
# Pair every frontal image with the lateral image(s) of the same study (uid).
# Each output row is:
#   [frontal filename, lateral filename, MeSH, Problems, indication, findings, impression]
# A frontal image with several laterals yields one row per lateral; a frontal
# image with no lateral is paired with itself so that it is not dropped.

# One pass over the lateral table: uid -> lateral filenames in table order.
# This replaces re-filtering lateral_df for every frontal row.
lateral_by_uid = {}
for uid, lateral_name in zip(lateral_df['uid'], lateral_df['filename']):
    lateral_by_uid.setdefault(uid, []).append(lateral_name)

report_columns = ['MeSH', 'Problems', 'indication', 'findings', 'impression']
frontal_rows = zip(
    frontal_df['uid'],
    frontal_df['filename'],
    *(frontal_df[column] for column in report_columns),
)

image_list = []
for uid, frontal_name, *report_fields in frontal_rows:
    # Fall back to the frontal file itself when the study has no lateral view.
    for lateral_name in lateral_by_uid.get(uid, [frontal_name]):
        image_list.append([frontal_name, lateral_name, *report_fields])

In [None]:
# Turn the paired rows into a DataFrame; the column order must match the order
# in which each row of image_list was assembled.
columns_in_new_df = ['Frontal', 'Lateral', 'MeSH', 'Problems', 'indication', 'findings', 'impression']
new_df = pd.DataFrame(image_list,columns=columns_in_new_df)
new_df.head()

Unnamed: 0,Frontal,Lateral,MeSH,Problems,indication,findings,impression
0,1_IM-0001-4001.dcm.png,1_IM-0001-3001.dcm.png,normal,normal,positive tb test,the cardiac silhouette and mediastinum size ar...,normal chest
1,2_IM-0652-1001.dcm.png,2_IM-0652-2001.dcm.png,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,preop bariatric surgery,borderline cardiomegaly midline sternotomy enl...,no acute pulmonary findings
2,3_IM-1384-1001.dcm.png,3_IM-1384-2001.dcm.png,normal,normal,rib pain after a steps this pain to r back r e...,,no displaced rib fractures pneumothora or pleu...
3,4_IM-2050-1001.dcm.png,4_IM-2050-2001.dcm.png,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...",with,there are diffuse bilateral interstitial and a...,bullous emphysema and interstitial fibrosis p...
4,5_IM-2117-1003002.dcm.png,5_IM-2117-1004003.dcm.png,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,chest and nasal congestion,the cardiomediastinal silhouette and pulmonary...,no acute cardiopulmonary abnormality


In [None]:
# Number of (frontal, lateral) pairs and columns.
new_df.shape

(3913, 7)

Заменим строковые значения 'nan' на np.nan

In [None]:
# text_preprocess casts every value with str(), which turns missing values into
# the literal string 'nan'; convert those back to real NaN.
for text_column in ('findings', 'indication', 'impression'):
    new_df[text_column] = new_df[text_column].replace('nan', np.nan)

In [None]:
# Count missing values per column now that 'nan' strings are real NaN again.
new_df.isnull().sum()

Frontal         0
Lateral         0
MeSH            0
Problems        0
indication     88
findings      525
impression     31
dtype: int64

In [None]:
# Persist the paired table to Google Drive; index=False leaves the row index out of the file.
new_df.to_csv('/content/drive/MyDrive/shikha version/data/final.csv', index=False)

In [None]:
# Read the saved file back and display it as a check of what was written.
pd.read_csv('/content/drive/MyDrive/shikha version/data/final.csv')

Unnamed: 0,Frontal,Lateral,MeSH,Problems,indication,findings,impression
0,1_IM-0001-4001.dcm.png,1_IM-0001-3001.dcm.png,normal,normal,positive tb test,the cardiac silhouette and mediastinum size ar...,normal chest
1,2_IM-0652-1001.dcm.png,2_IM-0652-2001.dcm.png,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,preop bariatric surgery,borderline cardiomegaly midline sternotomy enl...,no acute pulmonary findings
2,3_IM-1384-1001.dcm.png,3_IM-1384-2001.dcm.png,normal,normal,rib pain after a steps this pain to r back r e...,,no displaced rib fractures pneumothora or pleu...
3,4_IM-2050-1001.dcm.png,4_IM-2050-2001.dcm.png,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...",with,there are diffuse bilateral interstitial and a...,bullous emphysema and interstitial fibrosis p...
4,5_IM-2117-1003002.dcm.png,5_IM-2117-1004003.dcm.png,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,chest and nasal congestion,the cardiomediastinal silhouette and pulmonary...,no acute cardiopulmonary abnormality
...,...,...,...,...,...,...,...
3908,3995_IM-2046-1001.dcm.png,3995_IM-2046-2001.dcm.png,Lung/hyperdistention/mild;Diaphragm/bilateral/...,Lung;Diaphragm;Cicatrix;Pulmonary Atelectasis;...,nausea vomiting weeks dialysis patient,the cardiomediastinal silhouette and pulmonary...,interval resolution of bibasilar airspace dis...
3909,3996_IM-2047-1001.dcm.png,3996_IM-2047-3001.dcm.png,Spine/degenerative,Spine,,the lungs are clear heart size is normal no pn...,clear lungs no acute cardiopulmonary abnormality
3910,3997_IM-2048-1001.dcm.png,3997_IM-2048-1002.dcm.png,Opacity/lung/upper lobe/right/round/small;Gran...,Opacity;Granuloma,male with positive ppd,heart size within normal limits small nodular ...,no acute findings no evidence for active tb
3911,3998_IM-2048-1001.dcm.png,3998_IM-2048-1002.dcm.png,normal,normal,tuberculosis positive ppd,,heart size is normal and the lungs are clear
