In [1]:
!pip install -U opencv-python tensorflow scikit-learn pandas matplotlib tensorflow_datasets requests

Collecting requests
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Downloading requests-2.32.4-py3-none-any.whl (64 kB)
Installing collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 2.32.3
    Uninstalling requests-2.32.3:
      Successfully uninstalled requests-2.32.3
Successfully installed requests-2.32.4
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


# IMPORTATION DES LIBRAIRIES

In [13]:
import pandas as pd
import numpy as np
from PIL import Image
import requests
import zipfile
import os
import time
import psutil
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
if device.type == 'cuda':
    print(f'GPU Name: {torch.cuda.get_device_name(0)}')

Using device: cuda
GPU Name: NVIDIA RTX 6000 Ada Generation


# Chargement du dataset

In [15]:
def telecharger_dezip(url, chemin_sauv="plant_village_dataset.zip", extract_path="."):
    print(" Début du téléchargement")
    try:
        response=requests.get(url, stream=True)
        response.raise_for_status()

        #Taille totale du fichier pour la barre de progression
        total_size=int(response.headers.get('content-length',0))
        block_size=1064
        bar_progression = tqdm(total=total_size, unit='iB', unit_scale=True)

        #Téléchargement
        with open(chemin_sauv, 'wb') as file:
            for data in response.iter_content(block_size):
                bar_progression.update(len(data))
                file.write(data)
        bar_progression.close()

        if total_size != 0 and bar_progression.n != total_size:
            print("ERREUR, quelque chose s'est mal passé pendant le téléchargement.")
            return

        print(f"Téléchargement terminé. Fichier sauvegardé sous : {chemin_sauv}")

        # Créer le dossier d'extraction s'il n'existe pas
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)

        # Décompresser le fichier ZIP
        print(f"Décompression du fichier dans le dossier : {extract_path}")
        with zipfile.ZipFile(chemin_sauv, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

        print("Décompression terminée.")

        # Optionnel : Supprimer le fichier .zip après extraction pour économiser de l'espace
        print(f"Suppression du fichier {chemin_sauv}...")
        os.remove(chemin_sauv)
        print("Opération terminée avec succès !")

    except requests.exceptions.RequestException as e:
        print(f"Une erreur de réseau est survenue: {e}")
    except zipfile.BadZipFile:
        print("Erreur: Le fichier téléchargé n'est pas un fichier ZIP valide.")
    except Exception as e:
        print(f"Une erreur inattendue est survenue: {e}")

In [16]:
URL = "https://data.mendeley.com/datasets/tywbtsjrjv/1/files/b4e3a32f-c0bd-4060-81e9-6144231f2520/file_downloaded"

In [17]:
extract_folder = "plant_village_dataset"

In [18]:
telecharger_dezip(URL, "PlantVillage.zip", extract_folder)

 Début du téléchargement
Une erreur inattendue est survenue: name 'tqdm' is not defined


In [19]:
path="/workspace/plant_village_dataset/Plant_leave_diseases_dataset_with_augmentation"

In [20]:
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

In [21]:
data_gen=ImageDataGenerator(rescale=1./255)

In [22]:
data=data_gen.flow_from_directory(
    path,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="categorical"
)

Found 61486 images belonging to 39 classes.


# MODELISATION

## VGG11

In [23]:
# --------- 1. Préparer les données ---------
filepaths = []
labels = []
folds = os.listdir(path)
for fold in folds:
    f_path = os.path.join(path, fold)
    if not os.path.isdir(f_path):
        continue
    for file in os.listdir(f_path):
        filepaths.append(os.path.join(f_path, file))
        labels.append(fold)

df = pd.DataFrame({'filepaths': filepaths, 'labels': labels})
print(f"Total des images trouvées : {len(df)}")

Total des images trouvées : 61486


In [24]:
# Split 80/20 avec stratification
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['labels']
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['labels']
)

In [25]:
# Mapping des classes en indices
class_names = sorted(df['labels'].unique())
class_to_idx = {cls: idx for idx, cls in enumerate(class_names)}
num_classes = len(class_names)

In [26]:
# --------- 2. Dataset personnalisé ---------
class CustomImageDataset(Dataset):
    def __init__(self, df, class_to_idx, transform=None):
        self.df = df.reset_index(drop=True)
        self.class_to_idx = class_to_idx
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = self.df.loc[idx, 'filepaths']
        label_name = self.df.loc[idx, 'labels']
        label = self.class_to_idx[label_name]
        
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label

In [27]:
# --------- 3. Data augmentation et loaders ---------
train_transforms = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.RandomRotation(30),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.RandomResizedCrop(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

val_transforms = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.CenterCrop(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [28]:
train_dataset = CustomImageDataset(train_df, class_to_idx, transform=train_transforms)
val_dataset = CustomImageDataset(val_df, class_to_idx, transform=val_transforms)
test_dataset = CustomImageDataset(test_df, class_to_idx, transform=val_transforms)

In [29]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

In [30]:
# Charger VGG11 pré-entraîné
model = models.vgg11(weights=models.VGG11_Weights.DEFAULT)

Downloading: "https://download.pytorch.org/models/vgg11-8a719046.pth" to /root/.cache/torch/hub/checkpoints/vgg11-8a719046.pth


100%|██████████| 507M/507M [00:05<00:00, 91.9MB/s] 


In [31]:
for param in model.features.parameters():
    param.requires_grad = False

In [32]:
# Modifier le classificateur final pour notre nombre de classes
num_ftrs = model.classifier[6].in_features
model.classifier[6] = nn.Linear(num_ftrs, num_classes)

In [33]:
model = model.to(device)

In [34]:
optimizer = optim.Adam(model.classifier[6].parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
criterion = nn.CrossEntropyLoss()

In [35]:
def train_model(model, criterion, optimizer, scheduler, num_epochs, train_loader, val_loader, device):
    best_val_accuracy = 0.0
    best_model_path = '/workspace/models/best_model_vgg11.pth'
    start_time = time.time()
    
    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        
        # Phase d'entraînement
        model.train()
        train_loss = 0.0
        train_corrects = 0
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * inputs.size(0)
            train_corrects += torch.sum(preds == labels.data)
            
        scheduler.step()
        
        train_loss = train_loss / len(train_loader.dataset)
        train_accuracy = train_corrects.double() / len(train_loader.dataset)
        
        # Phase de validation
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item() * inputs.size(0)
                val_corrects += torch.sum(preds == labels.data)
        
        val_loss = val_loss / len(val_loader.dataset)
        val_accuracy = val_corrects.double() / len(val_loader.dataset)
        
        # Affichage des métriques de performance
        print(f'Epoch {epoch+1}/{num_epochs} - '
              f'Temps: {time.time()-epoch_start_time:.2f}s | '
              f'Train Loss: {train_loss:.4f} Acc: {train_accuracy:.4f} | '
              f'Val Loss: {val_loss:.4f} Acc: {val_accuracy:.4f}')

        # Sauvegarde du meilleur modèle
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), best_model_path)
            print(f'Meilleur modèle sauvegardé avec une précision de validation de {best_val_accuracy:.4f}')
            
    total_time = time.time() - start_time
    print(f'Formation terminée en {total_time:.2f} secondes.')
    print(f'Meilleure précision de validation : {best_val_accuracy:.4f}')
    
    return model

In [36]:
NUM_EPOCHS = 20
trained_model = train_model(model, criterion, optimizer, scheduler, NUM_EPOCHS, train_loader, val_loader, device)

Epoch 1/20 - Temps: 83.81s | Train Loss: 1.1313 Acc: 0.6722 | Val Loss: 0.5499 Acc: 0.8259
Meilleur modèle sauvegardé avec une précision de validation de 0.8259
Epoch 2/20 - Temps: 81.59s | Train Loss: 0.9628 Acc: 0.7167 | Val Loss: 0.4824 Acc: 0.8491
Meilleur modèle sauvegardé avec une précision de validation de 0.8491
Epoch 3/20 - Temps: 89.97s | Train Loss: 0.9431 Acc: 0.7266 | Val Loss: 0.4809 Acc: 0.8513
Meilleur modèle sauvegardé avec une précision de validation de 0.8513
Epoch 4/20 - Temps: 77.68s | Train Loss: 0.9526 Acc: 0.7271 | Val Loss: 0.4447 Acc: 0.8652
Meilleur modèle sauvegardé avec une précision de validation de 0.8652
Epoch 5/20 - Temps: 71.22s | Train Loss: 0.9504 Acc: 0.7313 | Val Loss: 0.4503 Acc: 0.8631
Epoch 6/20 - Temps: 79.00s | Train Loss: 0.9592 Acc: 0.7318 | Val Loss: 0.4802 Acc: 0.8556
Epoch 7/20 - Temps: 86.37s | Train Loss: 0.9572 Acc: 0.7337 | Val Loss: 0.4292 Acc: 0.8710
Meilleur modèle sauvegardé avec une précision de validation de 0.8710
Epoch 8/20 - 

### Evaluation

In [38]:
trained_model.load_state_dict(torch.load('/workspace/models/best_model_vgg11.pth'))
trained_model.eval()

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
 

In [39]:
all_preds = []
all_labels = []

In [40]:
start_time = time.time()
with torch.no_grad():
    for i, (inputs, labels) in enumerate(test_loader):
        batch_start = time.time()
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = trained_model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # --- Profiling ---
        cpu_usage = psutil.cpu_percent(interval=None)
        ram = psutil.virtual_memory()
        if torch.cuda.is_available():
            gpu_mem = torch.cuda.memory_allocated() / 1024**2
        else:
            gpu_mem = 0.0
        print(f"[Batch {i+1}] Time: {time.time()-batch_start:.2f}s | CPU: {cpu_usage:.1f}% | RAM: {ram.used/1024**3:.2f}GB | GPU: {gpu_mem:.2f}MB")

end_time = time.time()

[Batch 1] Time: 0.03s | CPU: 7.2% | RAM: 130.15GB | GPU: 984.31MB
[Batch 2] Time: 0.02s | CPU: 3.8% | RAM: 130.15GB | GPU: 984.31MB
[Batch 3] Time: 0.02s | CPU: 3.6% | RAM: 130.16GB | GPU: 984.31MB
[Batch 4] Time: 0.01s | CPU: 4.8% | RAM: 130.16GB | GPU: 984.31MB
[Batch 5] Time: 0.02s | CPU: 4.3% | RAM: 130.17GB | GPU: 984.31MB
[Batch 6] Time: 0.02s | CPU: 4.0% | RAM: 130.17GB | GPU: 984.31MB
[Batch 7] Time: 0.02s | CPU: 5.0% | RAM: 130.16GB | GPU: 984.31MB
[Batch 8] Time: 0.02s | CPU: 5.1% | RAM: 130.16GB | GPU: 984.31MB
[Batch 9] Time: 0.02s | CPU: 4.1% | RAM: 130.15GB | GPU: 984.31MB
[Batch 10] Time: 0.02s | CPU: 4.8% | RAM: 130.17GB | GPU: 984.31MB
[Batch 11] Time: 0.02s | CPU: 4.0% | RAM: 130.19GB | GPU: 984.31MB
[Batch 12] Time: 0.01s | CPU: 5.0% | RAM: 130.20GB | GPU: 984.31MB
[Batch 13] Time: 0.02s | CPU: 11.3% | RAM: 130.23GB | GPU: 984.31MB
[Batch 14] Time: 0.01s | CPU: 5.3% | RAM: 130.25GB | GPU: 984.31MB
[Batch 15] Time: 0.01s | CPU: 5.7% | RAM: 130.27GB | GPU: 984.31MB
[Ba

In [41]:
total_time = end_time - start_time
print(f"\nTemps Test Total: {total_time:.2f} sec")
print(f"Throughput: {len(test_dataset) / total_time:.2f} images/sec")


Temps Test Total: 8.39 sec
Throughput: 732.65 images/sec


In [42]:
# Rapport complet
print("=== Rapport complet d'évaluation sur l'ensemble de test (VGG11) ===")
print(classification_report(all_labels, all_preds, target_names=class_names))

=== Rapport complet d'évaluation sur l'ensemble de test (VGG11) ===
                                               precision    recall  f1-score   support

                           Apple___Apple_scab       0.99      0.68      0.80       100
                            Apple___Black_rot       0.88      0.88      0.88       100
                     Apple___Cedar_apple_rust       0.99      0.90      0.94       100
                              Apple___healthy       0.92      0.85      0.88       164
                    Background_without_leaves       0.98      0.97      0.98       114
                          Blueberry___healthy       0.81      0.96      0.88       150
                      Cherry___Powdery_mildew       0.99      0.70      0.82       105
                             Cherry___healthy       0.93      0.87      0.90       100
   Corn___Cercospora_leaf_spot Gray_leaf_spot       0.97      0.77      0.86       100
                           Corn___Common_rust       1.00     