# This code is to creat folders and move the images marked as Train and Test and giving labels as Nomal and Cardiomegaly, according to the list provided in the data source 
https://nihcc.app.box.com/v/ChestXray-NIHCC

In [1]:
# General libraries
import os
import shutil
import numpy as np
import pandas as pd 
import random
import cv2
import matplotlib.pyplot as plt
%matplotlib inline

# Deep learning libraries
import keras.backend as K
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, SeparableConv2D, MaxPool2D, LeakyReLU, Activation
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import tensorflow as tf

# Setting seeds for reproducibility
seed = 232
np.random.seed(seed)
tf.random.set_seed(seed)

Using TensorFlow backend.


In [2]:
# Create directory, if directory exists remove and create one 

def create_directory(directory_path):
    if os.path.exists(directory_path):
        shutil.rmtree(directory_path) 
        os.mkdir(directory_path)
    else:
        os.mkdir(directory_path)

In [3]:
#moving files from one folder to other,     
def move_allfiles (src,dst):
    if os.path.exists(src):
        files = os.listdir(src) 
        for f in files:
            shutil.copyfile(src+f,dst+f)

In [4]:
#moving list of files from one folder to other,
def move_listfiles (src,dst,img_list):
    if os.path.exists(src):
        files = img_list
        for f in files:
            shutil.copyfile(src+f,dst+f)

In [5]:
#reading the data
df=pd.read_csv("Data_Entry_2017.csv") 

In [6]:
#renaming the columns names
df.rename(columns={'Image Index':'img_index','Finding Labels':'labels','Follow-up #':'follow_up','Patient ID':'pa_id',
                   'Patient Age':'pa_age','Patient Gender':'pa_gender','View Position':'view_position'}, inplace=True)

In [7]:
df.columns

Index(['img_index', 'labels', 'follow_up', 'pa_id', 'pa_age', 'pa_gender',
       'view_position', 'OriginalImage[Width', 'Height]',
       'OriginalImagePixelSpacing[x', 'y]'],
      dtype='object')

In [9]:
#convert the files in train_val_list.txt to a python list 

train_val_list= open('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/train_val_list.txt').read().split()
train_val_list[1:3]

['00000001_001.png', '00000001_002.png']

In [10]:
len(train_val_list)

86524

In [11]:
#Selecting the images under train_val_list from Data_Entry_2017.csv file

df_train_img = df[df.img_index.isin(train_val_list)]

In [12]:
df_train_img.head()
#following are the images to be trained

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
12,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,0.168,0.168


In [13]:
len(df_train_img)

86524

In [14]:
#checking images with various labels
print(df_train_img['labels'].unique()[0:5])

['Cardiomegaly' 'Cardiomegaly|Emphysema' 'Cardiomegaly|Effusion'
 'No Finding' 'Mass|Nodule']


In [15]:
#assign 1 for 'No Finding' and 2 for 'Cardiomegaly'
df_train_img.loc[df_train_img.labels.str.contains('No Finding'), 'marker'] = '1'

df_train_img.loc[df_train_img.labels.str.contains('Cardiomegaly'), 'marker'] = '2'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [16]:
df_train_img.head()

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],marker
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,2.0
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,2.0
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,2.0
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,1.0
12,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,0.168,0.168,


In [17]:
df_train_img.describe()

Unnamed: 0,follow_up,pa_id,pa_age,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
count,86524.0,86524.0,86524.0,86524.0,86524.0,86524.0,86524.0
mean,5.096621,13795.359415,46.870071,2641.784129,2505.106121,0.155384,0.155384
std,8.656941,8261.548258,16.835434,337.139196,403.957467,0.015615,0.015615
min,0.0,1.0,1.0,1143.0,966.0,0.115,0.115
25%,0.0,6773.75,35.0,2500.0,2048.0,0.143,0.143
50%,2.0,13352.5,49.0,2520.0,2544.0,0.143,0.143
75%,6.0,20185.0,59.0,2992.0,2991.0,0.168,0.168
max,108.0,30801.0,413.0,3550.0,3166.0,0.1988,0.1988


In [18]:
df_train_normal=df_train_img[df_train_img['marker'] == '1']

In [19]:
df_train_cardiomegaly=df_train_img[df_train_img['marker'] == '2']

In [20]:
df_train_normal.head()

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],marker
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,1
13,00000005_000.png,No Finding,0,5,69,F,PA,2048,2500,0.168,0.168,1
14,00000005_001.png,No Finding,1,5,69,F,AP,2500,2048,0.168,0.168,1
15,00000005_002.png,No Finding,2,5,69,F,AP,2500,2048,0.168,0.168,1
16,00000005_003.png,No Finding,3,5,69,F,PA,2992,2991,0.143,0.143,1


In [21]:
df_train_cardiomegaly.head()

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],marker
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,2
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,2
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,2
23,00000008_000.png,Cardiomegaly,0,8,69,F,PA,2048,2500,0.171,0.171,2
175,00000033_000.png,Atelectasis|Cardiomegaly|Fibrosis,0,33,72,F,PA,2992,2745,0.143,0.143,2


In [22]:
#number of images which do not come under normal or cardiomegaly
print('Number of images which do not come under normal or cardiomegaly:',len(df_train_img)-(len(df_train_normal)+len(df_train_cardiomegaly)))

Number of images which do not come under normal or cardiomegaly: 34317


In [23]:
#number of raw images being analysed
print('Number of raw images being analysed:',len(df_train_normal)+len(df_train_cardiomegaly))


Number of raw images being analysed: 52207


In [24]:
#making list of images under normal
train_normal_img_list = df_train_normal.img_index.tolist()
train_normal_img_list[1:3]

['00000005_000.png', '00000005_001.png']

In [25]:
#making list of images under cardiomegaly
train_cardiomegaly_img_list = df_train_cardiomegaly.img_index.tolist()
train_cardiomegaly_img_list[1:3]

['00000001_001.png', '00000001_002.png']

In [26]:
#create a folder train_img to collect all the images belonging to train_val_list.txt
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/train_img/')

In [27]:
#move the images in train_img
move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/img_all/','/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/train_img/',train_val_list)

In [28]:
#create folder train_normal_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/train_normal_img/')

In [29]:
#moving images marked as normal from train_img to train_normal_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/train_img/','/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/train_normal_img/',train_normal_img_list)

In [31]:
#create folder train_cardiomegaly_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/train_cardiomegaly_img/')

In [32]:
#moving images marked as cardiomegaly from train_img to train_cardiomegaly_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/train_img/','/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/train_cardiomegaly_img/',train_cardiomegaly_img_list)

In [33]:
#create a folder test_img to collect all the images belonging to test_list.txt
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_img/')


In [34]:
#convert the files in test_list.txt to a python list 

test_list= open('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_list.txt').read().split()
test_list[1:3]

['00000003_001.png', '00000003_002.png']

In [35]:
len(test_list)

25596

In [36]:
#Selecting the images under test_list from Data_Entry_2017.csv file

df_test_img = df[df.img_index.isin(test_list)]

In [37]:
len(df_test_img)

25596

In [38]:
df_test_img.head()

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143
5,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168,0.168
6,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168,0.168
7,00000003_003.png,Hernia|Infiltration,3,3,76,F,PA,2698,2991,0.143,0.143
8,00000003_004.png,Hernia,4,3,77,F,PA,2500,2048,0.168,0.168


In [39]:
#assign 1 for 'No Finding' and 2 for 'Cardiomegaly'
df_test_img.loc[df_test_img.labels.str.contains('No Finding'), 'marker'] = '1'

df_test_img.loc[df_test_img.labels.str.contains('Cardiomegaly'), 'marker'] = '2'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [40]:
df_test_normal=df_test_img[df_test_img['marker'] == '1']

In [41]:
df_test_cardiomegaly=df_test_img[df_test_img['marker'] == '2']

In [42]:
#number of raw images being tested
print('Number of raw images being tested:',len(df_test_normal)+len(df_test_cardiomegaly))


Number of raw images being tested: 10930


In [43]:
#making list of images under normal
test_normal_img_list = df_test_normal.img_index.tolist()
test_normal_img_list[1:3]

['00000013_008.png', '00000013_014.png']

In [44]:
#making list of images under cardiomegaly
test_cardiomegaly_img_list = df_test_cardiomegaly.img_index.tolist()
test_cardiomegaly_img_list[1:3]

['00000013_026.png', '00000013_027.png']

In [45]:
#create a folder test_img to collect all the images belonging to test_list.txt
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_img/')

In [46]:
#move the images in test_img
move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/img_all/','/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_img/',test_list)


In [47]:
#create folder test_normal_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_normal_img/')

In [49]:
#moving images marked as normal from test_img to test_normal_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_img/','/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_normal_img/',test_normal_img_list)

In [50]:
#create folder test_cardiomegaly_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_cardiomegaly_img/')

In [51]:
#moving images marked as cardiomegaly from test_img to test_cardiomegaly_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_img/','/Users/neeharikasinha/Documents/datascience-course/Capstone1/Chestxray-cardiomegaly3/test_cardiomegaly_img/',test_cardiomegaly_img_list)

The models will be created and app