# Image Quality
Use of isolation random forest on CellProfiler ImageQuality features to detect maximum distorted images i.e. images containing illumination/saturation or mechanical problems (e.g. scratches in well)

### Included Libraries

In [1]:
import pandas as pd
import MySQLdb
from matplotlib import pylab as plt
import numpy as np
import os


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

### Routinely used functions

In [2]:
def ensure_dir(file_path):
    '''
    Function to ensure a file path exists, else creates the path

    :param file_path:
    :return:
    '''
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

### Main code

#### Find problematic images

In [10]:
#make connection
db = MySQLdb.connect("menchelabdb.int.cemm.at","root","cqsr4h","ImageAnalysisDDI" )

# DEFINE BATCH TO ANALYSE:
batch_ = 1


# Get features
# select all features that correspond to image quality features
string = "select COLUMN_NAME from INFORMATION_SCHEMA.COLUMNS where TABLE_NAME='IsabelCLOUPAC_Per_Image'"
all_features = list(pd.read_sql(string, con=db)['COLUMN_NAME'])

i = 0
#Remove MetadataFeatures such as height of image, Exclude Mitotracker from ImageQuality (precipitation won't affect segmentation)
features = []
for f in all_features:
    if  'ImageQuality' in f and ('Max' not in f) and ('Mitotracker' not in f):
        features.append(f)
        print str(i)+'\t'+f +'\tQuality'
        i+=1
    if 'Image' not in f:
        #print str(i) + '\t' + f + '\tMeasurement'
        i += 1
features.sort()
# DONE selecting proper features


# Get ImageQuality values for the individual images
string = 'select ImageNumber,' +','.join(features)+' from IsabelCLOUPAC_Per_Image;'
ImageQuality = pd.read_sql(string, con=db)

ImageQuality.head()
db.close()

0	Image_ImageQuality_Correlation_BetaTubulin_30	Quality
1	Image_ImageQuality_Correlation_BetaTubulin_75	Quality
2	Image_ImageQuality_Correlation_DAPI_30	Quality
3	Image_ImageQuality_Correlation_DAPI_75	Quality
4	Image_ImageQuality_FocusScore_BetaTubulin	Quality
5	Image_ImageQuality_FocusScore_DAPI	Quality
6	Image_ImageQuality_LocalFocusScore_BetaTubulin_30	Quality
7	Image_ImageQuality_LocalFocusScore_BetaTubulin_75	Quality
8	Image_ImageQuality_LocalFocusScore_DAPI_30	Quality
9	Image_ImageQuality_LocalFocusScore_DAPI_75	Quality
10	Image_ImageQuality_MADIntensity_BetaTubulin	Quality
11	Image_ImageQuality_MADIntensity_DAPI	Quality
12	Image_ImageQuality_MeanIntensity_BetaTubulin	Quality
13	Image_ImageQuality_MeanIntensity_DAPI	Quality
14	Image_ImageQuality_MedianIntensity_BetaTubulin	Quality
15	Image_ImageQuality_MedianIntensity_DAPI	Quality
16	Image_ImageQuality_MinIntensity_BetaTubulin	Quality
17	Image_ImageQuality_MinIntensity_DAPI	Quality
18	Image_ImageQuality_PercentMinimal_BetaTubuli

#### Perform Scaler
- StandardScaler

In [11]:
x = ImageQuality.loc[:, features]
x = StandardScaler().fit_transform(x)

y = ImageQuality.loc[:,'ImageNumber'].values

#### Isolation Random Forest

In [12]:
print 'Train Random Forest'
clf = IsolationForest(n_jobs=1,n_estimators=1000, max_samples='auto',behaviour='new', contamination='auto',random_state=1000)
clf.fit(x)
y_pred_train = clf.predict(x)
outliers = y_pred_train
print 'Finished Training'

Train Random Forest
Finished Training


#### Make Output

In [14]:
#make a PCA with ImageQuality Features
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
expl_variance = pca.explained_variance_ratio_

# Create a PCA and Output File
colors = []
ok_image = 0
bad_image = 0
bad_images = []
for o,n in zip(outliers,y):
#for length,n in zip(vectorlength,names):

    if o == -1:
        colors.append('red')
        bad_image += 1
        bad_images.append(n)
    else:
        colors.append('grey')
        ok_image += 1
print 'Number of OK images: %d' %ok_image
print 'Number of bad images: %d' %bad_image
print 'Percent: %.2f' %(float(bad_image)/(ok_image+bad_image))

# Make Output file
ensure_dir('../results/IsabelCLOUPAC_Per_Image/BadImages/BadImages.csv')
# create file containing all measurements and thresholds
fp_out = open('../results/IsabelCLOUPAC_Per_Image/BadImages/BadImages.csv', 'w')
fp_out.write('ImageNumber\n')
for b in bad_images:
    fp_out.write(str(b)+'\n')
fp_out.close()
# DONE making Output file
###


#Make PCA output
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1 [%.2f]' %expl_variance[0], fontsize = 15)
ax.set_ylabel('Principal Component 2 [%.2f]' %expl_variance[1], fontsize = 15)
ax.set_title('ImageQuality PCA', fontsize = 20)
ax.scatter(principalComponents[:,0],principalComponents[:,1],c = colors, alpha=0.1)
plt.savefig('../results/IsabelCLOUPAC_Per_Image/BadImages/Overview_PCA.pdf',format='pdf')
plt.close()
# DONE making PCA output
###


Number of OK images: 49720
Number of bad images: 2312
Percent: 0.04
