In [4]:
from PIL import Image
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

### Lecture des données 

In [8]:
data = {
    "raw": [],
    "cn": [],
    "cna": [],
    "cnacn": []
}

labels = {
    "raw": [],
    "cn": [],
    "cna": [],
    "cnacn": []
}

# TODO: Fix CNACN?
del data["cnacn"]
del labels["cnacn"]

# Data paths
DIRECTORY_RAW = '../images/data_raw/'
DIRECTORY_CN = '../images/data_cn/'
DIRECTORY_CNA = '../images/data_cna/'
DIRECTORY_CNACN = '../images/data_cnacn/'

In [2]:
def read_images(directory, img_size = 0):
    list_img = []
    labels = []

    for name in os.listdir(directory + 'yes'):
        if name == "Thumbs.db":
            continue
        img = Image.open(directory + 'yes/'+ name)
        if img_size != 0:
            img = img.resize((img_size, img_size))
        if directory == DIRECTORY_RAW:
            img = img.convert('L').convert('RGB')
        list_img.append(np.asarray(img).flatten())
        labels.append(1)
        
    for name in os.listdir(directory + 'no'):
        if name == "Thumbs.db":
            continue
        img = Image.open(directory + 'no/'+ name)
        if img_size != 0:
            img = img.resize((img_size, img_size))
        if directory == DIRECTORY_RAW:
            img = img.convert('L').convert('RGB')
        list_img.append(np.asarray(img).flatten())
        labels.append(0)

    return list_img, labels

### Transformation des données

In [20]:
import sys

sys.path.insert(1, '../webservices/crop/')
from ImageCropper import ImageCropper

sys.path.insert(1, '../webservices/data_augmentation/')
from DataAugmentation import DataAugmentation

sys.path.insert(1, '../webservices/normalization/')
from ImageNormalizer import ImageNormalizer

sys.path.insert(1, '')

##### Crop + normalized

In [22]:
imc2 = ImageCropper(DIRECTORY_RAW, DIRECTORY_CN, ['yes', 'no'])
imc2.createOutputDirectory()
imc2.cropImages()

imgnorm2 = ImageNormalizer(DIRECTORY_CN, DIRECTORY_CN)
imgnorm2.loadImagesData()
imgnorm2.resizeImages(
    mode = ImageNormalizer.MODE_RESIZING_KEEP_RATIO,
    background_color = 'black',
    shape = ImageNormalizer.SHAPE_SQUARE,
    square_size = 1000
)
imgnorm2.convertImages2GrayscaleMode()
imgnorm2.convertImages2RGBMode()
imgnorm2.saveImages()

../images/data_cn/ already exists
../images/data_cn/yes already exists
../images/data_cn/no already exists


20:33:42 - DEBUG - STREAM b'IHDR' 16 13
20:33:42 - DEBUG - STREAM b'IDAT' 41 8192
20:33:43 - DEBUG - STREAM b'IHDR' 16 13
20:33:43 - DEBUG - STREAM b'IDAT' 41 8192


##### Crop + normalized + data augmentation

In [23]:
da = DataAugmentation(DIRECTORY_CN, max_augmentation=1000, directory_to=DIRECTORY_CNA)
da.run()

20:33:58 - INFO - data_augmentation.init
20:33:58 - INFO - ../images/data_cna/ already exists
20:33:58 - INFO - ../images/data_cna/yes already exists
20:33:58 - INFO - ../images/data_cna/no already exists
20:34:06 - INFO - data_augmentation.equilibrate
20:34:06 - INFO - data_augmentation.compute_equilibrate
20:34:06 - INFO - Adding 57 images to ../images/data_cna/no/
20:34:06 - INFO - data_augmentation.compute_flip
20:34:08 - INFO - data_augmentation.augmentation
20:34:08 - INFO - data_augmentation.perform_rotate
20:34:08 - INFO - data_augmentation.apply_filters


### Chargement des données

In [6]:
list_img, list_labels = read_images(DIRECTORY_RAW, 240)
data["raw"] = list_img
labels["raw"] = list_labels

In [7]:
# Saving the data to dictionnary
list_img, list_labels = read_images(DIRECTORY_CN, 240)
data["cn"] = list_img
labels["cn"] = list_labels

In [8]:
# Saving the data to dictionnary
list_img, list_labels = read_images(DIRECTORY_CNA, 240)
data["cna"] = list_img
labels["cna"] = list_labels

### KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [34]:
def knn_best(args):
    (xtrain, xtest, ytrain, ytest) = args
    max_score = 0
    for k in range(2, 20):
        clf = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
        clf.fit(xtrain, ytrain)
        score = clf.score(xtest, ytest)
        if score > max_score:
            max_score = score
            max_k = k
       
    return max_score, max_k

In [61]:
for key in data:        
    list_img = data[key]
    list_labels = labels[key]
    
    score, k = knn_best( train_test_split(list_img, list_labels, test_size=0.2) )
    print("Max score using %s: %.4f with %d neighbors" %(key, score, k))

KeyboardInterrupt: 

### GradientBoostingClassifier

In [38]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [41]:
for key in data:       
    list_img = data[key]
    list_labels = labels[key]
    xtrain, xtest, ytrain, ytest = train_test_split(list_img, list_labels, test_size=0.2)
    
    gbc = GradientBoostingClassifier(n_estimators=50)
    gbc.fit(xtrain, ytrain)
    score = gbc.score(xtest, ytest)
    print("Max score using %s: %.4f" %(key, score))

Max score using raw: 0.7925
Max score using cn: 0.7900
Max score using cna: 0.7850


 - Max score using raw: 0.7925
 - Max score using cn: 0.7900
 - Max score using cna: 0.7850

### RandomForestClassifier

In [67]:
for key in data:
    list_img = data[key]
    list_labels = labels[key]
    xtrain, xtest, ytrain, ytest = train_test_split(list_img, list_labels, test_size=0.3)    
    
    rfc = RandomForestClassifier(n_estimators=2000, n_jobs=-1)
    rfc.fit(xtrain, ytrain)
    score = rfc.score(xtest, ytest)
    print("Max score using %s: %.4f" %(key, score))

Max score using raw: 0.8816
Max score using cn: 0.6974
Max score using cna: 0.8433


 - Max score using raw: 0.8816
 - Max score using cn: 0.6974
 - Max score using cna: 0.8433

###  SVM

In [15]:
DIRECTORY_RAW = "../images/dataset/"
imgs, lbls = read_images(DIRECTORY_RAW, 240)

In [16]:
from sklearn.model_selection import GridSearchCV
svm = SVC(gamma='auto', random_state=0, probability=True)
grid = {
    'kernel': ['poly', 'linear', 'rbf', 'sigmoid']
}
gs = GridSearchCV(svm, grid, verbose=2, cv=5, n_jobs=-1)
gs.fit(imgs, lbls)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  9.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'kernel': ['poly', 'linear', 'rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [27]:
p = gs.best_params_
print(p)
svm = SVC(**p, gamma='auto', random_state=0, probability=True)
svm.fit(imgs, lbls)
print(svm.predict(imgs[20]))

{'kernel': 'poly'}


ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 1. ... 7. 7. 7.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [1]:
from sklearn.svm import SVC

In [11]:
def best_SVM(args):
    (xtrain, xtest, ytrain, ytest) = args
    best_score = 0

    for k in ('poly', 'linear', 'rbf', 'sigmoid'):
        print("Testing %s kernel.." %(k))
        svm = SVC(kernel=k, gamma='auto')
        svm.fit(xtrain, ytrain)
        score = svm.score(xtest,ytest)
        
        if score > best_score:
            best_score = score
            best_kernel = k
    
    return best_score, best_kernel

In [None]:
from sklearn.model_selection import GridSearchCV
svm = SVC(gamma='auto', random_state=0, probability=True)
grid = {
    'kernel' = ['poly', 'linear', 'rbf', 'sigmoid']
    'C' = [10**-4, 10**-2, 1, 10]
}
GridSearchCV(svm, grid, verbose=2, cv=2, n_jobs=-1).fit(xtrain, ytrain)

In [12]:
for key in data:
    list_img = data[key]
    list_labels = labels[key]
    score, kernel = best_SVM( train_test_split(list_img, list_labels, test_size=0.3) )
    print("Max score using %s: %.4f with %s kernel" %(key, score, kernel))

Testing poly kernel..
Testing linear kernel..
Testing rbf kernel..
Testing sigmoid kernel..
Max score using raw: 0.8026 with poly kernel
Testing poly kernel..
Testing linear kernel..
Testing rbf kernel..
Testing sigmoid kernel..
Max score using cn: 0.6447 with linear kernel
Testing poly kernel..
Testing linear kernel..
Testing rbf kernel..
Testing sigmoid kernel..
Max score using cna: 0.7517 with poly kernel


 - Max score using raw: 0.8026 with poly kernel
 - Max score using cn: 0.6447 with linear kernel
 - Max score using cna: 0.7517 with poly kernel

### Fully connected NN

##### Data compression
Data is reloaded but with the size of the images set to **32x32x3** (RBG)
This reduces the size of the input layer to **3084 nodes** instead of 127800

In [25]:
list_img, list_labels = read_images(DIRECTORY_RAW, 64)
data["raw"] = list_img
labels["raw"] = list_labels

In [26]:
# Saving the data to dictionnary
list_img, list_labels = read_images(DIRECTORY_CN, 64)
data["cn"] = list_img
labels["cn"] = list_labels

In [27]:
# Saving the data to dictionnary
list_img, list_labels = read_images(DIRECTORY_CNA, 64)
data["cna"] = list_img
labels["cna"] = list_labels

In [28]:
from sklearn import model_selection, neural_network

In [29]:
def best_nn(args):
    (xtrain, xtest, ytrain, ytest) = args
    best_score = 0
    nb_nodes = [32, 64, 128, 256] # Number of nodes per hidden layer
    nb_layers = [2,10,50,100] # Number of hidden layers
    
    for nb_node in nb_nodes:
        for nb_layer in nb_layers:
            print("Testing %d layers of %d nodes.." %(nb_layer, nb_node))
            nn = neural_network.MLPClassifier(hidden_layer_sizes=tuple([nb_node for i in range(nb_layer)]) )
            nn.fit(xtrain, ytrain)
            score = nn.score(xtest, ytest)
            
            if score > best_score:
                best_score = score
                best_node = nb_node
                best_layer = nb_layer
            
    return best_score, best_node, best_layer

In [30]:
for key in data:
    list_img = data[key]
    list_labels = labels[key]
    score, nodes, layers = best_nn( train_test_split(list_img, list_labels, test_size=0.3) )
    print("Max score using %s: %.4f with %d layers of size %d" %(key, score, nodes, layers))

Testing 2 layers of 32 nodes..
Testing 10 layers of 32 nodes..
Testing 50 layers of 32 nodes..
Testing 100 layers of 32 nodes..
Testing 2 layers of 64 nodes..
Testing 10 layers of 64 nodes..
Testing 50 layers of 64 nodes..
Testing 100 layers of 64 nodes..
Testing 2 layers of 128 nodes..
Testing 10 layers of 128 nodes..
Testing 50 layers of 128 nodes..
Testing 100 layers of 128 nodes..
Testing 2 layers of 256 nodes..
Testing 10 layers of 256 nodes..
Testing 50 layers of 256 nodes..
Testing 100 layers of 256 nodes..
Max score using raw: 0.8158 with 128 layers of size 2
Testing 2 layers of 32 nodes..
Testing 10 layers of 32 nodes..
Testing 50 layers of 32 nodes..
Testing 100 layers of 32 nodes..
Testing 2 layers of 64 nodes..
Testing 10 layers of 64 nodes..
Testing 50 layers of 64 nodes..
Testing 100 layers of 64 nodes..
Testing 2 layers of 128 nodes..
Testing 10 layers of 128 nodes..
Testing 50 layers of 128 nodes..
Testing 100 layers of 128 nodes..
Testing 2 layers of 256 nodes..
Testin

##### Using a size of 32x32x3

- Max score using raw: 0.8421 with 10 layers of size 64
- Max score using cn: 0.7895 with 10 layers of size 32
- Max score using cna: 0.7700 with 10 layers of size 256

##### Using a size of 64x64x3
 - Max score using raw: 0.8158 with 128 layers of size 2
 - Max score using cn: 0.7237 with 128 layers of size 2
 - Max score using cna: 0.7183 with 128 layers of size 10