# Biofilm Classification

### In this Notebook

- Data Augmentation
- PCA Dimension Reduction
- SVM Classification

In [1]:
# Additional python packages

from __future__ import print_function

from time import time
import logging
import os, urllib, io, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import PIL
from PIL import Image
import cv2
# from ggplot import *
# import umap

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.manifold import TSNE

from image_processing_functions import * # Our functions


## Loading day 4 and day 5 images

In [2]:

size = 150
path = 'final_dataset/'


def biofilm_day(filename):
    """Return the day number encoded in ``filename``, or ``None``.

    The day is read as the integer between the first and second underscore
    of the file name (the text before the first underscore is used as the
    label further down). Names that do not follow this pattern -- for
    example ``.DS_Store`` or ``README.txt`` -- yield ``None`` instead of
    raising, so stray files in the dataset folder are skipped.
    """
    try:
        return int(filename.split('_')[1])
    except (IndexError, ValueError):
        return None


# Collect every image taken after day 3, resized to (size, size).
all_images = []
all_names = []
for root, dirs, files in os.walk(path):
    # Sorting makes the traversal order independent of the filesystem, so the
    # seeded train/test split downstream selects the same images everywhere.
    dirs.sort()
    for name in sorted(files):
        day = biofilm_day(name)
        if day is None or day <= 3:
            continue
        mypath = os.path.join(root, name)
        # The context manager closes the opened file itself; the original code
        # rebound `img` to the resized copy and closed only that copy.
        with Image.open(mypath, mode='r') as img:
            arr = np.array(img.resize((size, size))).astype('uint8')
        all_images.append(arr)
        all_names.append(name.split('.')[0])

dic_features = {'biofilm': all_names}
# The label is the part of the file name before the first underscore.
labels = np.array([name.split('_')[0] for name in all_names])

## Data augmentation

The biofilm labels are invariant under rotation and transposition. It is therefore easy to multiply the size of our training dataset by 8: 4 rotations * 2 transpositions. We use this method to improve our SVM. We fit the PCA on the non-expanded train set for computational efficiency. With this method, our f1 score on the train set jumps from 83% to 90%.

In [3]:
# Flatten every image into a 1-D pixel vector and stack the vectors row-wise.
X = np.asarray([image.flatten() for image in all_images])

In [4]:

# Split the train and test sets before augmenting. Only the training set is
# augmented, so no transformed copy of a test image is seen during training.
X_train, X_test, y_train, y_test = train_test_split(
    all_images, labels, test_size=0.25, random_state=42)


# Augment training set: every image yields 8 variants (4 rotations, each with
# and without a left-right flip), all carrying the label of the source image.
X_expanded = []
y_expanded = []
rotation = [0, 90, 180, 270]
for i, x in enumerate(X_train):
    img = Image.fromarray(x)  # loop-invariant, so built once per source image
    for angle in rotation:
        img_rotate = img.rotate(angle)
        img_transpose = img_rotate.transpose(Image.FLIP_LEFT_RIGHT)
        arr_rotate = np.array(img_rotate).astype('uint8')
        arr_transpose = np.array(img_transpose).astype('uint8')
        X_expanded.append(arr_rotate)
        X_expanded.append(arr_transpose)
        y_expanded.append(y_train[i])
        y_expanded.append(y_train[i])

# flatten
X_train = np.array([x.flatten() for x in X_expanded])
X_test = np.array([x.flatten() for x in X_test])

# shuffle
# A dedicated, seeded generator keeps the sample order identical on every run;
# the unseeded np.random.shuffle used before changed it each time.
# NOTE(review): after shuffling, the 8 variants of one source image are
# scattered across the training set -- keep this in mind when cross-validating.
shuffle_rng = np.random.RandomState(42)
indices = shuffle_rng.permutation(len(X_train))
X_train = X_train[indices]
y_train = np.array(y_expanded)[indices]



### Computing PCA

We choose to use 50 principal components, as larger values increase processing time significantly with little improvement in classification accuracy.

In [None]:
# Compute a PCA
n_components = 50

print("Extracting the top %d eigenvectors from %d biofilms"
      % (n_components, X_train.shape[0]))
t0 = time()
# The randomized SVD solver is stochastic; fixing random_state makes the
# fitted components (and everything derived from them) repeatable.
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True, random_state=42).fit(X_train)
print("done in %0.3fs" % (time() - t0))


# Project both sets onto the basis learned from the training set only.
print("Projecting the input data on the eigen orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

## Training the SVM Classifier

In [5]:

print("Fitting the classifier to the training set")
t0 = time()

# Candidate hyper-parameters for the RBF-kernel SVM.
param_grid = dict(
    C=[1e3, 5e3, 1e4, 5e4, 1e5],
    gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
)

# Exhaustive search over the grid with 5-fold cross-validation.
rbf_svc = SVC(kernel='rbf', class_weight='balanced')
clf = GridSearchCV(rbf_svc, param_grid, cv=5)
clf.fit(X_train_pca, y_train)

elapsed = time() - t0
print("done in %0.3fs" % elapsed)
print("Best estimator found by grid search:")
print(clf.best_estimator_)


Extracting the top 50 eigenvectors from 3072 biofilms
done in 5.441s
Projecting the input data on the eigen orthonormal basis
done in 0.982s
Fitting the classifier to the training set
done in 143.376s
Best estimator found by grid search:
SVC(C=50000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


### Making Predictions on test set

In [7]:
print("Predicting mutant type based on the test set")
t0 = time()

# Project the held-out images onto the PCA basis, then classify them.
X_test_pca = pca.transform(X_test)
y_pred = clf.predict(X_test_pca)
elapsed = time() - t0
print("done in %0.3fs" % elapsed)

# Per-class precision / recall / F1, followed by the confusion matrix.
for summary in (classification_report(y_test, y_pred),
                confusion_matrix(y_test, y_pred)):
    print(summary)

Predicting mutant type based on the test set
done in 0.079s
             precision    recall  f1-score   support

       4133       0.79      0.94      0.86        16
   cco1cco2       1.00      0.95      0.97        19
       dipA       0.78      1.00      0.88        14
       pas9       0.87      0.87      0.87        23
        phz       1.00      0.94      0.97        18
       rmcA       1.00      0.93      0.97        15
         wt       0.95      0.78      0.86        23

avg / total       0.92      0.91      0.91       128

[[15  0  0  1  0  0  0]
 [ 1 18  0  0  0  0  0]
 [ 0  0 14  0  0  0  0]
 [ 1  0  1 20  0  0  1]
 [ 0  0  1  0 17  0  0]
 [ 1  0  0  0  0 14  0]
 [ 1  0  2  2  0  0 18]]
