<a href="https://colab.research.google.com/github/pan7ae/scientific_work/blob/main/name.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Chest X-ray (Indiana University) dataset from kaggle

In [42]:
# Install kaggle
!pip install -q kaggle

In [None]:
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change directory
import os
os.chdir("/content/drive/MyDrive/Nauchka")

In [None]:
# Create a kaggle folder
!mkdir -p ~/.kaggle

# Copy kaggle.json to created folder
!cp kaggle.json ~/.kaggle/

In [None]:
os.chdir("/content")

In [None]:
# Permission for the json to act
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the required dataset
!kaggle datasets download -d raddar/chest-xrays-indiana-university

Downloading chest-xrays-indiana-university.zip to /content
100% 13.2G/13.2G [02:47<00:00, 101MB/s] 
100% 13.2G/13.2G [02:47<00:00, 84.2MB/s]


In [None]:
# Unzip our dataset
from zipfile import ZipFile
from tqdm import tqdm


file_to_extract = "chest-xrays-indiana-university.zip"

# Open your .zip file
with ZipFile(file=file_to_extract) as zip_file:

    # Loop over each file and extract them
    for file in tqdm(iterable=zip_file.namelist(), total=len(zip_file.namelist())):
        zip_file.extract(member=file)

100%|██████████| 7472/7472 [03:59<00:00, 31.18it/s]


# Preprocessing the image and the text data

## Prepare photo data

In [None]:
!pip install Keras-Preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Keras-Preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Keras-Preprocessing
Successfully installed Keras-Preprocessing-1.1.2


In [None]:
from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras_preprocessing.image import load_img
from keras_preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model

In [None]:
# extract features from each photo in the directory
def extract_features(directory):
	# load the model
	model = VGG16()
	# re-structure the model
	model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
	# summarize
	print(model.summary())
	# extract features from each photo
	features = dict()
	for name in listdir(directory):
		# load an image from file
		filename = directory + '/' + name
		image = load_img(filename, target_size=(224, 224))
		# convert the image pixels to a numpy array
		image = img_to_array(image)
		# reshape data for the model
		image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
		# prepare the image for the VGG model
		image = preprocess_input(image)
		# get features
		feature = model.predict(image, verbose=0)
		# get image id
		image_id = name.split('.')[0]
		# store feature
		features[image_id] = feature
		print('>%s' % name)
	return features

In [None]:
# extract features from all images
directory = "/content/images/images_normalized"
features = extract_features(directory)
print(f"Extracted Features: {len(features)}")
# save to file
dump(features, open('/content/drive/MyDrive/Nauchka/features.pkl', 'wb'))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
>3712_IM-1854-1001.dcm.png
>1368_IM-0237-2001.dcm.png
>725_IM-2285-1001.dcm.png
>2071_IM-0705-2001.dcm.png
>623_IM-2205-3003.dcm.png
>2462_IM-0995-1001.dcm.png
>3190_IM-1505-1001.dcm.png
>735_IM-2294-3001.dcm.png
>3793_IM-1907-1001.dcm.png
>2213_IM-0819-1001.dcm.png
>3993_IM-2044-1002.dcm.png
>638_IM-2217-1001.dcm.png
>3328_IM-1594-1002.dcm.png
>1049_IM-0036-1001.dcm.png
>3330_IM-1594-2001.dcm.png
>2180_IM-0793-2001.dcm.png
>3324_IM-1590-2001.dcm.png
>925_IM-2425-2001.dcm.png
>1497_IM-0321-1001.dcm.png
>1443_IM-0286-2001.dcm.png
>2850_IM-1259-2002.dcm.png
>1816_IM-0528-1001.dcm.png
>2018_IM-0665-1001.dcm.png
>1431_IM-0278-1001.dcm.png
>1647_IM-0424-1001.dcm.png
>3352_IM-1608-2001.dcm.png
>2352_IM-0918-1002.dcm.png
>3978_IM-2037-0001-0001.dcm.png
>3812_IM-1922-2001.dcm.png
>3959_IM-2023-1001.dcm.png
>3851_IM-1948-1001.dcm.png
>468_IM-2096-2001.dcm.png
>453_IM-2084-2001.dcm.png
>2541_IM-1053-3001.dcm.png
>1349_IM-0227-2001.

# Prepare csv files

In [None]:
import pandas as pd

In [None]:
# Read in the reports data
reports_df = pd.read_csv("/content/indiana_reports.csv")
reports_df.head()

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression
0,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.
1,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.
2,3,normal,normal,Xray Chest PA and Lateral,"rib pain after a XXXX, XXXX XXXX steps this XX...",,,"No displaced rib fractures, pneumothorax, or p..."
3,4,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...","PA and lateral views of the chest XXXX, XXXX a...",XXXX-year-old XXXX with XXXX.,None available,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...
4,5,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,Xray Chest PA and Lateral,Chest and nasal congestion.,,The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary abnormality.


In [None]:
# Read in the projections data
projections_df = pd.read_csv("/content/indiana_projections.csv")
projections_df.head()

Unnamed: 0,uid,filename,projection
0,1,1_IM-0001-4001.dcm.png,Frontal
1,1,1_IM-0001-3001.dcm.png,Lateral
2,2,2_IM-0652-1001.dcm.png,Frontal
3,2,2_IM-0652-2001.dcm.png,Lateral
4,3,3_IM-1384-1001.dcm.png,Frontal


In [None]:
# Merge the projections and reports data on the UID column
reports = pd.merge(projections_df, reports_df, on="uid")

In [None]:
# Проверка записи по имени файла (для себя)
# pd.reset_option('display.max_colwidth')
pd.set_option('display.max_colwidth', -1)
reports[reports["filename"]=="501_IM-2120-1001.dcm.png"]["impression"]

  pd.set_option('display.max_colwidth', -1)


935    1. There is minimal streaky opacity in the posterior lungs, possibly cyst, scarring, or pneumonia. 2. Heart size and pulmonary XXXX appear normal 3. Mediastinal contours are normal
Name: impression, dtype: object

In [None]:
frontal_reports = reports[reports["projection"]=="Frontal"]
lateral_reports = reports[reports["projection"]=="Lateral"]

## Prepare text data

In [None]:
import re
from typing import List, Dict, Union

In [None]:
# в предложении следующего типа:
# x = "1. Moderate left basilar lung consolidation with mild right basilar opacities, which may represent infection and/or atelectasis. 2. Bilateral rib fractures, most of which appear old. 3. Interval vertebral body XXXX deformity in the lumbar spine since XXXX. ."
# y = "1. There is minimal streaky opacity in the posterior lungs, possibly cyst, scarring, or pneumonia. 2. Heart size and pulmonary XXXX appear normal 3. Mediastinal contours are normal"

dic = {}

for i in range(len(reports)):
    filename = reports.loc[i, "filename"]
    impression = reports.loc[i, "impression"]
    if isinstance(impression, str) and re.match(r'^\d+\.', impression):
        dic.setdefault(filename, []).extend(j for j in impression.split(". ") if len(j) > 1 and not j.isnumeric())

<h3>Extract descriptions for images</h3>

In [None]:
def load_descriptions(dataframe: pd.DataFrame) -> Dict[str, List]:
    mapping = {}
    # process lines
    for i in range(len(dataframe)):
        filename = dataframe.loc[i, "filename"]
        impression = dataframe.loc[i, "impression"]
        # create the list if needed
        if filename not in mapping:
            mapping[filename] = []
        # store description:
        # starts with integer
        if isinstance(impression, str) and re.match(r'^\d+\.', impression):
            mapping[filename].append(impression.split(". ")[1])
        else:
            # if mapping[filename] contains impressions
            if mapping[filename]:
                mapping[filename][-1] += " " + impression
            else:
                mapping[filename].append(impression)
    return mapping

In [76]:
descriptions = load_descriptions(reports)
print('Loaded: %d ' % len(descriptions))

Loaded: 7466 


<h3>Clean the description text

In [66]:
import string

In [77]:
def clean_descriptions(descriptions: Dict[str, List]) -> Dict[str, List]:
    cleaned_descriptions = {}
    # prepare translation table for removing punctuation
    # table = str.maketrans('', '', string.punctuation)
    # как вариант попробовать это
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            # check type of each description
            if isinstance(desc, str):
                # tokenize
                desc = desc.split()
                # convert to lower case
                desc = [word.lower() for word in desc]
                # remove punctuation from each token (if remove following line, then vocabulary's size increases)
                desc = [w.translate(table) for w in desc]
                # remove extra spaces
                desc = [word.strip() for word in desc]
                # remove hanging 's' and 'a' (according to the article: len(word) > 1)
                desc = [word for word in desc if len(word) > 1]
                # remove tokens with numbers in them
                desc = [word for word in desc if word.isalpha]
                # store as string
                desc_list[i] = " ".join(desc)
                # save to a new dictionary
                if key not in cleaned_descriptions:
                    cleaned_descriptions[key] = list()
                cleaned_descriptions[key].append(desc_list[i])
    return cleaned_descriptions                                

In [78]:
descriptions = clean_descriptions(descriptions)

<h3>Convert the loaded descriptions into a vocabulary of words

In [79]:
def to_vocabulary(descriptions):
    # build a list of all description strings
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc

In [80]:
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 1352


<h3> Save descriptions to file, one per line

In [None]:
def save_descriptions(descriptions, filename):
	lines = list()
	for key, desc_list in descriptions.items():
		for desc in desc_list:
			lines.append(key + ' ' + desc)
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

In [None]:
# save descriptions
save_descriptions(descriptions, '/content/drive/MyDrive/Nauchka/descriptions_1891.txt')

In [None]:
save_descriptions(descriptions, 'descriptions.txt')

In [43]:
descriptions

{'1_IM-0001-4001.dcm.png': ['normal chest x-xxxx.'],
 '1_IM-0001-3001.dcm.png': ['normal chest x-xxxx.'],
 '2_IM-0652-1001.dcm.png': ['no acute pulmonary findings.'],
 '2_IM-0652-2001.dcm.png': ['no acute pulmonary findings.'],
 '3_IM-1384-1001.dcm.png': ['no displaced rib fractures, pneumothorax, or pleural effusion identified. well-expanded and clear lungs. mediastinal contour within normal limits. no acute cardiopulmonary abnormality identified.'],
 '3_IM-1384-2001.dcm.png': ['no displaced rib fractures, pneumothorax, or pleural effusion identified. well-expanded and clear lungs. mediastinal contour within normal limits. no acute cardiopulmonary abnormality identified.'],
 '4_IM-2050-1001.dcm.png': ['bullous emphysema and interstitial fibrosis'],
 '4_IM-2050-2001.dcm.png': ['bullous emphysema and interstitial fibrosis'],
 '5_IM-2117-1003002.dcm.png': ['no acute cardiopulmonary abnormality.'],
 '5_IM-2117-1004003.dcm.png': ['no acute cardiopulmonary abnormality.'],
 '6_IM-2192-1001.d