# Notebook de modelisation

### Chris Hozé

#### On utilise tensorflow donc on se place dans l'environnement tensor_env paramétré avec tensor_gpu


In [None]:
# Import des 

# Import des packages 
import pandas as pd
import numpy as np
import pathlib
import os
#from scipy import sparse

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

#import cv2

# Keras et tensorflow
import tensorflow as tf

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dropout 
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D 
from tensorflow.keras.layers import MaxPooling2D

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator

# # sklearn
# from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import KFold, train_test_split   
# from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay


In [2]:
# On vérifie que la gpu fonctionne

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# En repartant du dataset tabulaire : c'est possible mais pas ideal car il est long à créer
Cela pourrait tout de même être intéressant pour pouvoir utiliser l info URL

In [None]:
# Charger le dataset tabulaire préprocessé ne fonctionne pas pour utilisation de tensorflow

# file = r"C:\Users\Inrae\Documents\Projet_Data_Science\radio_tab.csv"

# # Dataset complet mais reduit en 100*100

# df = pd.read_csv(file)
# # On cree un jeu de données réduit en respectant les proportions initiales 

# df.rename(columns = {'Unnamed: 0':'FILENAME'}, inplace = True)

# df = df.set_index("FILENAME")

# #df = df.rename(columns = {'Unnamed : 0' : 'FILENAME'})
# df.head()


# On recode la variable source

#cat_url = pd.get_dummies(df["URL"], prefix="url")

#df = pd.concat([cat_url, df], axis=1)

#X = df.drop(["URL","SIZE","FORMAT","TYPE","num"], axis=1)

#X.head()


# En repartant du notebook de MME pour l'import des données

In [4]:
# On recrée un répertoire de données préprocessées, redimensionnées en 224*224 avec filtre gaussien et masquage

path_to_data = r"C:\Users\Inrae\Documents\Projet_Data_Science"
data_folder_path = os.path.join(path_to_data,"COVID-19_Radiography_Dataset")
output_path = os.path.join(path_to_data,"processed")
folder_to_process = ["Lung_Opacity","COVID","Normal","Viral_Pneumonia"]

make_tiny=False
tiny_size=1000
final_size=(224,224)

for img_type in  folder_to_process:
    print(f"Processing folder: {img_type}")

    img_folder_path = os.path.join(data_folder_path,img_type,"images")
    mask_folder_path = os.path.join(data_folder_path,img_type,"masks")

    output_folder_path = os.path.join(output_path, img_type)
    #output_folder_path.mkdir(parents=True, exist_ok=True)

    nb_image_done = 0
    for image_name, mask_name in zip(os.listdir(img_folder_path),
                                     os.listdir(mask_folder_path)):

        image_path = os.path.join(img_folder_path, image_name)
        mask_path = os.path.join(mask_folder_path, mask_name)

        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        # resized to mask size
        image = cv2.resize(image, dsize = (256, 256)) # fixed size to mask size

        # masking
        res =  cv2.bitwise_and(image, image, mask=mask)
        res = cv2.GaussianBlur(res, ksize = (3,3), sigmaX=0.2)
        res = cv2.resize(res, dsize = final_size)

        # Write masked image
        output_image_name = image_name + '_masked.png'
        output_file =  os.path.join(output_folder_path, output_image_name)
        cv2.imwrite(output_file, res)

        nb_image_done += 1
        if make_tiny and nb_image_done >= tiny_size:
            break

    print(f"Processing folder: {img_type} done.")


Processing folder: Lung_Opacity
Processing folder: Lung_Opacity done.
Processing folder: COVID
Processing folder: COVID done.
Processing folder: Normal
Processing folder: Normal done.
Processing folder: Viral_Pneumonia
Processing folder: Viral_Pneumonia done.


In [None]:

path_to_data = r"C:\Users\Inrae\Documents\Projet_Data_Science"
data_folder_path = os.path.join(path_to_data,"COVID-19_Radiography_Dataset")
output_path = os.path.join(path_to_data,"processed")
final_size=(224,224)

# Import des données on sépare entre apprentissage et validation
batch_size=32
data_dir=output_path
size=final_size

# On laisse en couleur pour pouvoir utiliser les modeles preentrainés

train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.5,
    subset="training",
    #color_mode= "grayscale",
    seed=42,
    image_size=size,
    batch_size=batch_size)


val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.5,
    subset="validation",
    #color_mode= "grayscale",
    seed=42,
    image_size=size,
    batch_size=batch_size)



Found 21165 files belonging to 4 classes.
Using 10583 files for training.
Found 21165 files belonging to 4 classes.
Using 10582 files for validation.


In [25]:
# Création X_train, X_test, y_train, y_test

# For train data
all_images = []
all_labels = []

for images, labels in train_ds.take(-1):  # -1 takes all
    all_images.append(images.numpy())
    all_labels.append(labels.numpy())

X_train = np.concatenate(all_images)
y_train = np.concatenate(all_labels)

# For test data
all_images = []
all_labels = []

for images, labels in val_ds.take(-1):  # -1 takes all
    all_images.append(images.numpy())
    all_labels.append(labels.numpy())

X_test = np.concatenate(all_images)
y_test = np.concatenate(all_labels)

# normalization
X_train = X_train / 255
X_test = X_test / 255

# encoding
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)


In [71]:
# On cree un generateur d image

train_datagen = ImageDataGenerator(
    shear_range = 0.1, # random application of shearing
    zoom_range = 0.1,
    horizontal_flip = False,
    brightness_range = (0.4, 0.6),
    width_shift_range=0.1,
    height_shift_range=0.1  
    ) 

test_datagen = ImageDataGenerator()

# Augmenter respectivement les jeu de données d'entrainement
train_dataset = train_datagen.flow(X_train,y_train,   batch_size = 64)

test_dataset = test_datagen.flow(X_test, y_test, batch_size = 64)



# Modèle de DL
## Test d un premier modèle from scratch

In [76]:
# Construction d un modèle classique

model = Sequential()

# Couche d'entrée pour les images 224x224 avec 3 canaux (RGB) 
model.add(Input(shape=(224, 224, 3))) 

# Première couche de convolution 
model.add(Conv2D(16, (3, 3), activation='relu', padding='same')) 
model.add(MaxPooling2D((2, 2))) 

# Deuxième couche de convolution 
model.add(Conv2D(32, (3, 3), activation='relu', padding='same')) 
model.add(MaxPooling2D((2, 2))) 

# Troisième couche de convolution 
model.add(Conv2D(64, (3, 3), activation='relu', padding='same')) 
model.add(MaxPooling2D((2, 2)))

# Couche flatten pour transformer les cartes de caractéristiques en un vecteur 
model.add(Flatten()) 

# Ajouter des couches fully connected (denses) 

model.add(Dense(60, activation='relu')) 
model.add(Dense(4, activation='softmax')) 

#Compiler le modèle model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Définir le learning rate désiré 
learning_rate = 0.0001 #

#Créer une instance de l'optimiseur Adam avec le learning rate personnalisé  
optimizer = Adam(learning_rate=learning_rate) 

# Compiler le modèle avec l'optimiseur personnalisé 
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=["accuracy"])



In [77]:
#Entrainement du modèle, utiliser le jeu de données augmenté, et préciser les callbacks 

batch_size = 64
epochs = 10 
model_history = model.fit(
            train_dataset, # use augmented images for train 
            steps_per_epoch=X_train.shape[0] // batch_size,
            validation_data = test_dataset, # use augmented images for test
            epochs = epochs,
            verbose=True)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### A partir de RESNET50 préentrainé sur image net
#### Avec le preprocessing initial et en freezant les parametres

In [78]:
# On importe un modele préentrainé : REsnet50
# https://keras.io/api/applications/#finetune-inceptionv3-on-a-new-set-of-classes
#import h5py

from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions

base_model = ResNet50(weights='imagenet')

base_model.summary()





Model: "resnet50"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, 230, 230, 3)  0           ['input_11[0][0]']               
                                                                                                  
 conv1_conv (Conv2D)            (None, 112, 112, 64  9472        ['conv1_pad[0][0]']              
                                )                                                                 
                                                                                           

In [79]:

# add a global spatial average pooling layer
x = base_model.output

#x = MaxPooling4D(4,4)(x)

# Couche flatten pour transformer les cartes de caractéristiques en un vecteur 
x = Flatten()(x)

# Ajouter des couches fully connected (denses) 

x = Dense(60, activation='relu')(x) 

predictions = Dense(4, activation='softmax')(x) 


# On adapte le preprocessing des données


# img_path = 'elephant.jpg'
# img = keras.utils.load_img(img_path, target_size=(224, 224))
# x = keras.utils.img_to_array(img)
# x = np.expand_dims(x, axis=0)
# x = preprocess_input(x)

# preds = model.predict(x)



# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

# first:  freeze all convolutional InceptionV3 layers
for layer in base_model.layers:
    layer.trainable = False

# compile the model (should be done *after* setting layers to non-trainable)
model.compile(optimizer='Adam', loss='categorical_crossentropy',   metrics = ["accuracy"])

# train the model on the new data for a few epochs
model.fit(train_dataset, # use augmented images for train 
            steps_per_epoch=X_train.shape[0] // batch_size,
            validation_data = test_dataset, # use augmented images for test
            epochs = 10,
            verbose=True)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15927b3b100>

On degele les parametres des 10 dernieres couches

In [81]:
# ## Vu la precision on unfreeze certaine couche

# let's visualize layer names and layer indices to see how many layers
# we should freeze:
#for i, layer in enumerate(base_model.layers):
#   print(i, layer.name)

# we chose to train the top 2 inception blocks, i.e. we will freeze
# the first 249 layers and unfreeze the rest:
for layer in model.layers[:165]:
   layer.trainable = False
for layer in model.layers[165:]:
   layer.trainable = True

# we need to recompile the model for these modifications to take effect
# we use SGD with a low learning rate
#from tenkeras.optimizers import SGD
#model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss='categorical_crossentropy')

#model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=["accuracy"])

# Définir le learning rate désiré 
learning_rate = 0.0001 #

#Créer une instance de l'optimiseur Adam avec le learning rate personnalisé  
optimizer = Adam(learning_rate=learning_rate) 

# Compiler le modèle avec l'optimiseur personnalisé 
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=["accuracy"])

# train the model on the new data for a few epochs
model.fit(train_dataset, # use augmented images for train 
            steps_per_epoch=X_train.shape[0] // batch_size,
            validation_data = test_dataset, # use augmented images for test
            epochs = 10,
            verbose=True)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15927ee1a80>

#### En reprocessant les données

In [84]:
# Définir les paramètres pour la séparation des données 
train_ratio = 0.6 # Pourcentage de données d'entraînement 
validation_ratio = 0.4 # Pourcentage de données de validation

# On reprocesse les jeux de données d'entrainement et de validation
  
# Pas ideal car on applique un image generator sur le jeu de validation

train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    shear_range = 0.1, # random application of shearing
    zoom_range = 0.1,
    horizontal_flip = False,
    brightness_range = (0.4, 0.6),
    width_shift_range=0.1,
    height_shift_range=0.1,
    fill_mode="nearest",
    validation_split=validation_ratio,
    ) 

# test_datagen = ImageDataGenerator(
#     preprocessing_function=preprocess_input,
#     validation_split=validation_ratio,
#     ) 

# Augmenter respectivement les jeu de données d'entrainement et test ?

train_dataset = train_datagen.flow_from_directory(data_dir,batch_size = 64, class_mode="categorical", target_size=(224,224), subset='training', seed=42)

test_dataset = train_datagen.flow_from_directory(data_dir, batch_size = 64, class_mode= "categorical", target_size=(224,224), subset='validation', seed=42)

learning_rate = 0.001 #

# On recrée le modèle de zéro car sinon les poids s ajustent au fur et à mesure

base_model = ResNet50(weights='imagenet')
x = base_model.output
x = Flatten()(x)
x = Dense(60, activation='relu')(x) 
predictions = Dense(4, activation='softmax')(x) 

model = Model(inputs=base_model.input, outputs=predictions)

# on degele les dix dernieres couches:
for layer in model.layers[:165]:
   layer.trainable = False
for layer in model.layers[165:]:
   layer.trainable = True

#Créer une instance de l'optimiseur Adam avec le learning rate personnalisé  
optimizer = Adam(learning_rate=learning_rate) 

# Compiler le modèle avec l'optimiseur personnalisé 
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=["accuracy"])

# train the model on the new data for a few epochs
model.fit(train_dataset, # use augmented images for train 
            steps_per_epoch=train_dataset.n // batch_size,
            validation_data = test_dataset, # use augmented images for test
            epochs = 10,
            verbose=True)

Found 12701 images belonging to 4 classes.
Found 8464 images belonging to 4 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 44/198 [=====>........................] - ETA: 2:08 - loss: 0.2654 - accuracy: 0.9002

KeyboardInterrupt: 

attention les précisions sont élevées ici car je pense que j'ai réentrainé le modèle plusieurs fois : multiplication du nombre d epoch à reprendre .

In [None]:
# On essaye sans data augmentation
# Définir les paramètres pour la séparation des données 
train_ratio = 0.6 # Pourcentage de données d'entraînement 
validation_ratio = 0.4 # Pourcentage de données de validation

# On reprocesse les jeux de données d'entrainement et de validation avec moins d augmentation de données
  

train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    shear_range = 0.1, # random application of shearing
    zoom_range = 0.1,
    fill_mode="nearest",
    validation_split=validation_ratio,
    ) 

# test_datagen = ImageDataGenerator(
#     preprocessing_function=preprocess_input,
#     validation_split=validation_ratio,
#     ) 

# Augmenter respectivement les jeu de données d'entrainement

train_dataset = train_datagen.flow_from_directory(data_dir,batch_size = 64, class_mode="categorical", target_size=(224,224), subset='training', seed=42)

test_dataset = train_datagen.flow_from_directory(data_dir, batch_size = 64, class_mode= "categorical", target_size=(224,224), subset='validation', seed=42)

learning_rate = 0.001 #

# On recrée le modèle

base_model = ResNet50(weights='imagenet')
x = base_model.output
x = Flatten()(x)
x = Dense(60, activation='relu')(x) 
predictions = Dense(4, activation='softmax')(x) 

model = Model(inputs=base_model.input, outputs=predictions)

# on degele les dix dernieres couches:
for layer in model.layers[:165]:
   layer.trainable = False
for layer in model.layers[165:]:
   layer.trainable = True


#Créer une instance de l'optimiseur Adam avec le learning rate personnalisé  
optimizer = Adam(learning_rate=learning_rate) 

# Compiler le modèle avec l'optimiseur personnalisé 
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=["accuracy"])

# train the model on the new data for a few epochs
model.fit(train_dataset, # use augmented images for train 
            steps_per_epoch=train_dataset.n // batch_size,
            validation_data = test_dataset, # use augmented images for test
            epochs = 10,
            verbose=True)
