<a href="https://colab.research.google.com/github/nikhilvkth/ANN_using_R/blob/main/Data_exploration%26image_preprocessing_expla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline #magic command plt.show()
import os #provides a way to interact with the operating system
import seaborn as sns  #statistical graphics
sns.set() #set the default style and context for plots


# Read the csv file containing the training data
# NOTE(review): "file" looks like a placeholder path — point it at the real
# training CSV before running.
train_df = pd.read_csv("file")
# Report the size of the data frame (f-strings evaluate the expressions inside {})
print(f'There are {train_df.shape[0]} rows and {train_df.shape[1]} columns in this data frame')
# Example output: There are 1000 rows and 16 columns in this data frame

# Print first 5 rows. A bare `train_df.head()` is only rendered when it is the
# last expression of a notebook cell, so print it explicitly.
print(train_df.head())

# Look at the data type of each column and whether null values are present
train_df.info()
#<class 'pandas.core.frame.DataFrame'>
#RangeIndex: 1000 entries, 0 to 999
#Data columns (total 16 columns):
#Image                 1000 non-null object
#Atelectasis           1000 non-null int64
#Cardiomegaly          1000 non-null int64
#Consolidation         1000 non-null int64
#Edema                 1000 non-null int64
#Effusion              1000 non-null int64
#Emphysema             1000 non-null int64
#Fibrosis              1000 non-null int64
#Hernia                1000 non-null int64
#Infiltration          1000 non-null int64
#Mass                  1000 non-null int64
#Nodule                1000 non-null int64
#PatientId             1000 non-null int64
#Pleural_Thickening    1000 non-null int64
#Pneumonia             1000 non-null int64
#Pneumothorax          1000 non-null int64
#dtypes: int64(15), object(1)
#memory usage: 125.1+ KB



In [None]:
# Compare the number of PatientId entries with the number of distinct ids;
# a smaller distinct count means some patients appear in more than one row.
patient_ids = train_df['PatientId']
total_ids = patient_ids.count()
unique_ids = patient_ids.nunique()
print(f"The total patient ids are {total_ids}, from those the unique ids are {unique_ids} ")
#The total patient ids are 1000, from those the unique ids are 928


In [None]:
# Start from every column name in the data frame
columns = train_df.columns.tolist()
print(columns)

#['Image', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia',
#'Infiltration', 'Mass', 'Nodule', 'PatientId', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
# Drop the columns that are not condition labels
for non_label in ('Image', 'PatientId'):
    columns.remove(non_label)
# What is left is one column per class
print(f"There are {len(columns)} columns of labels for these conditions: {columns}")

# Report the number of positive labels for each class
for condition in columns:
    positives = train_df[condition].sum()
    print(f"The class {condition} has {positives} samples")

#The class Atelectasis has 106 samples
#The class Cardiomegaly has 20 samples.....

# Extract numpy values from Image column in data frame
images = train_df['Image'].values
#Data Visualization
#Using the image names listed in the csv file, you can retrieve the image associated with each row of data in your dataframe.
#Run the cell below to visualize a random selection of images from the dataset.
# Pick up to 9 random entries. Sampling without replacement keeps the same
# entry from being drawn twice, and capping the sample size avoids the
# ValueError numpy raises when asked for more samples than there are items.
n_samples = min(9, len(images))
random_images = np.random.choice(images, size=n_samples, replace=False)

# Location of the image dir
img_dir = 'data/nih/images-small/'

print('Display Random Images')

# Adjust the size of your images
plt.figure(figsize=(20,10))

# Iterate and plot the random images on a 3x3 grid (subplot slots are 1-based)
for position, image_name in enumerate(random_images, start=1):
    plt.subplot(3, 3, position)
    img = plt.imread(os.path.join(img_dir, image_name))
    plt.imshow(img, cmap='gray')
    plt.axis('off')

# Adjust subplot parameters to give specified padding
plt.tight_layout()

#Investigating a Single Image
#Run the cell below to look at the first image in the dataset and print out some details of the image contents.
# Get the first image that was listed in the train_df dataframe
# (positional lookup, so it does not depend on the index labels)
sample_img = train_df['Image'].iloc[0]
raw_image = plt.imread(os.path.join(img_dir, sample_img))
# Start a new figure; otherwise the image is drawn into whichever axes is
# current, i.e. the last subplot of the random-image grid plotted in this cell.
plt.figure()
plt.imshow(raw_image, cmap='gray')
plt.colorbar()
plt.title('Raw Chest X Ray Image')
# Image arrays are indexed (row, column): shape[0] is the height and shape[1] the width
print(f"The dimensions of the image are {raw_image.shape[1]} pixels width and {raw_image.shape[0]} pixels height, one single color channel")
print(f"The maximum pixel value is {raw_image.max():.4f} and the minimum is {raw_image.min():.4f}")
print(f"The mean value of the pixels is {raw_image.mean():.4f} and the standard deviation is {raw_image.std():.4f}")
#The dimensions of the image are 1024 pixels width and 1024 pixels height, one single color channel
#The maximum pixel value is 0.9804 and the minimum is 0.0000
#The mean value of the pixels is 0.4796 and the standard deviation is 0.2757

#Investigating Pixel Value Distribution
#Run the cell below to plot up the distribution of pixel values in the image shown above.

# Use a fresh figure so the histogram is not drawn onto the axes that
# currently hold the raw image.
plt.figure()
# Plot a histogram of the distribution of the pixels
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot; kept here to stay compatible with the seaborn version this
# notebook was written against — confirm before upgrading.
sns.distplot(raw_image.ravel(),
             label=f'Pixel Mean {np.mean(raw_image):.4f} & Standard Deviation {np.std(raw_image):.4f}', kde=False)
plt.legend(loc='upper center')
plt.title('Distribution of Pixel Intensities in the Image')
plt.xlabel('Pixel Intensity')
plt.ylabel('# Pixels in Image')
#Text(0, 0.5, '# Pixels in Image')


In [None]:
#2. #Image Preprocessing in Keras
# Import data generator from keras
from keras.preprocessing.image import ImageDataGenerator

# Build the generator that standardizes each image on its own.
# flow_from_dataframe below is called on this instance, so it has to exist
# before that call.
image_generator = ImageDataGenerator(
    samplewise_center=True,  # set each sample mean to 0
    samplewise_std_normalization=True  # divide each input by its standard deviation
)
#2.1 Standardization
#The image_generator created above will act to adjust your image data such that the new mean of the data will be zero, and
#the standard deviation of the data will be 1.
#In other words, the generator will replace each pixel value in the image with a new value calculated by subtracting the mean and
#dividing by the standard deviation.
#Run the next cell to pre-process your data using the image_generator. In this step you will also be reducing the image size down to 320x320 pixels.

# Flow from directory with specified batch size and target image size
generator = image_generator.flow_from_dataframe(
        dataframe=train_df,
        directory="data/nih/images-small/",
        x_col="Image", # features
        # Let's say we build a model for mass detection
        y_col= ['Mass'], # labels
        class_mode="raw", # 'Mass' column should be in train_df
        batch_size= 1, # images per batch
        shuffle=False, # shuffle the rows or not
        target_size=(320,320) # (height, width) of output image
)
#Found 1000 validated image filenames.

# Plot a processed image
sns.set_style("white")
# Fetch the first batch by indexing the generator rather than calling the
# dunder method directly.
generated_image, label = generator[0]
# Draw on a new figure so this plot never lands on axes left over from an earlier plot
plt.figure()
plt.imshow(generated_image[0], cmap='gray')
plt.colorbar()
# This is the generator output, not the raw file, so label it accordingly
plt.title('Preprocessed Chest X Ray Image')
# Batches are laid out as (batch, height, width, channels) with the default
# channels_last layout, so shape[1] is the height and shape[2] the width.
print(f"The dimensions of the image are {generated_image.shape[2]} pixels width and {generated_image.shape[1]} pixels height")
print(f"The maximum pixel value is {generated_image.max():.4f} and the minimum is {generated_image.min():.4f}")
print(f"The mean value of the pixels is {generated_image.mean():.4f} and the standard deviation is {generated_image.std():.4f}")

#The dimensions of the image are 320 pixels width and 320 pixels height
#The maximum pixel value is 1.7999 and the minimum is -1.7404
#The mean value of the pixels is 0.0000 and the standard deviation is 1.0000

#Run the cell below to see a comparison of the distribution of pixel values in the new pre-processed image versus the raw image.
# Include a histogram of the distribution of the pixels
sns.set()
plt.figure(figsize=(10, 7))

# Plot histogram for original image
# (min/max use the same fixed 4-decimal format as the mean and standard deviation)
sns.distplot(raw_image.ravel(),
             label=f'Original Image: mean {np.mean(raw_image):.4f} - Standard Deviation {np.std(raw_image):.4f} \n '
             f'Min pixel value {np.min(raw_image):.4f} - Max pixel value {np.max(raw_image):.4f}',
             color='blue',
             kde=False)

# Plot histogram for generated image
sns.distplot(generated_image[0].ravel(),
             label=f'Generated Image: mean {np.mean(generated_image[0]):.4f} - Standard Deviation {np.std(generated_image[0]):.4f} \n'
             f'Min pixel value {np.min(generated_image[0]):.4f} - Max pixel value {np.max(generated_image[0]):.4f}',
             color='red',
             kde=False)

# Place legends
plt.legend()
plt.title('Distribution of Pixel Intensities in the Image')
plt.xlabel('Pixel Intensity')
plt.ylabel('# Pixel')
#Text(0, 0.5, '# Pixel')

