# Préparation des données

In [2]:
import pandas as pd
import os

# Chemins des fichiers
images_dir = r"C:\Users\mauge\Openclassrooms\Projet 7 - Git repository\POC BLIP\Data\Images"
captions_file = r"C:\Users\mauge\Openclassrooms\Projet 7 - Git repository\POC BLIP\Data\captions\captions.txt"

# Charger les légendes
captions = pd.read_csv(captions_file, header=None, names=["image", "caption"])

# Filtrer les images présentes dans le dossier
available_images = set(os.listdir(images_dir))
captions = captions[captions["image"].isin(available_images)]

In [3]:
captions.head()

Unnamed: 0,image,caption
1,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
2,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
3,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
4,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
5,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [4]:
import re
from sklearn.model_selection import train_test_split

# Fonction de nettoyage des légendes
def clean_caption(caption):
    caption = caption.lower()
    caption = re.sub(r"[^a-z0-9\s]", "", caption)
    caption = caption.strip()
    return caption

# Nettoyer les légendes
captions["caption"] = captions["caption"].apply(clean_caption)

# Séparer en ensembles d'entraînement et de validation
train_captions, val_captions = train_test_split(captions, test_size=0.2, random_state=42)


In [5]:
from torchvision import transforms
from PIL import Image

# Transformation des images
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Exemple de chargement et transformation d'une image
def load_image(image_path):
    image = Image.open(image_path).convert("RGB")
    return image_transforms(image)

# Charger une image pour tester
image_path = os.path.join(images_dir, train_captions.iloc[0]["image"])
image_tensor = load_image(image_path)
print("Image Tensor Shape:", image_tensor.shape)


Image Tensor Shape: torch.Size([3, 224, 224])


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenisation des légendes
def tokenize_captions(captions, num_words=5000):
    # Créer un tokenizer
    tokenizer = Tokenizer(num_words=num_words, oov_token="<UNK>")
    tokenizer.fit_on_texts(captions)

    # Convertir les légendes en séquences
    sequences = tokenizer.texts_to_sequences(captions)

    # Obtenir la longueur maximale pour le padding
    max_length = max(len(seq) for seq in sequences)

    # Ajouter des zéros pour rendre les séquences de même longueur
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post")

    return tokenizer, padded_sequences, max_length


In [22]:
# Récupérer le dictionnaire mot → indice
word_index = tokenizer.word_index
print("Word Index:", word_index)



In [7]:
# Extraire les textes des légendes pour la tokenisation
train_texts = train_captions["caption"].tolist()
val_texts = val_captions["caption"].tolist()

# Tokeniser les légendes
tokenizer, train_sequences, max_length = tokenize_captions(train_texts)
_, val_sequences, _ = tokenize_captions(val_texts)

print("Exemple de séquence tokenisée :", train_sequences[0])
print("Longueur maximale des séquences :", max_length)


Exemple de séquence tokenisée : [  2  10   7   2   8  16  38   5   2 145 142 108 552  45  19   4 172   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
Longueur maximale des séquences : 33


# Modèle classique

CNN pour l’extraction de caractéristiques       
ResNet50 comme extracteur de caractéristiques :

In [8]:
from torchvision import models
import torch.nn as nn
import torch

# Charger ResNet50 avec un chemin local pour les poids
weights_path = r"C:\Users\mauge\.cache\torch\hub\checkpoints\resnet50-0676ba61.pth"

# Charger ResNet50 avec des poids téléchargés localement
class ImageEncoder(nn.Module):
    def __init__(self, embed_size, weights_path=None):
        super(ImageEncoder, self).__init__()
        
        # Charger ResNet50 sans poids si aucun chemin n'est fourni
        if weights_path:
            resnet = models.resnet50()
            state_dict = torch.load(weights_path, map_location=torch.device('cpu'))  # Charger les poids
            resnet.load_state_dict(state_dict)
        else:
            resnet = models.resnet50(pretrained=True)
        
        # Supprimer la dernière couche
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        
        # Ajouter une couche linéaire pour ajuster la sortie
        self.fc = nn.Linear(resnet.fc.in_features, embed_size)

    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.fc(features)
        return features



RNN pour la génération de légendes      
LSTM pour générer des légendes à partir des caractéristiques visuelles

In [9]:
class CaptionGenerator(torch.nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(CaptionGenerator, self).__init__()
        self.embed = torch.nn.Embedding(vocab_size, embed_size)
        self.lstm = torch.nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        embeddings = self.embed(captions)
        inputs = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        outputs, _ = self.lstm(inputs)
        outputs = self.fc(outputs)
        return outputs


Initialisation et configuration des hyperparamètres

In [14]:
import torch
# Paramètres
embed_size = 256
hidden_size = 512
vocab_size = len(tokenizer.word_index) + 1  # Taille du vocabulaire
num_epochs = 10
batch_size = 32
learning_rate = 0.001
# Configurer le device : GPU si disponible, sinon CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Modèles
image_encoder = ImageEncoder(embed_size)
caption_generator = CaptionGenerator(embed_size, hidden_size, vocab_size)

# Optimiseur et fonction de perte
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignorer les tokens de padding
optimizer = torch.optim.Adam(list(image_encoder.parameters()) + list(caption_generator.parameters()), lr=learning_rate)


In [15]:
from torch.utils.data import DataLoader, Dataset

# Dataset personnalisé
class FlickrDataset(Dataset):
    def __init__(self, captions, images_dir, transform, tokenizer, max_length):
        self.captions = captions
        self.images_dir = images_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        caption = self.captions.iloc[idx]["caption"]
        image_name = self.captions.iloc[idx]["image"]

        # Charger et transformer l'image
        image_path = os.path.join(self.images_dir, image_name)
        image = load_image(image_path)

        # Tokeniser la légende
        sequence = self.tokenizer.texts_to_sequences([caption])[0]
        sequence = pad_sequences([sequence], maxlen=self.max_length, padding="post")[0]

        return torch.tensor(image), torch.tensor(sequence)

In [30]:
from torch.utils.data import DataLoader, Dataset
# Réduire les datasets pour un test rapide
train_captions_small = train_captions.sample(n=100, random_state=42)
val_captions_small = val_captions.sample(n=50, random_state=42)

# Préparer les datasets réduits
train_dataset_small = FlickrDataset(train_captions_small, images_dir, image_transforms, tokenizer, max_length)
val_dataset_small = FlickrDataset(val_captions_small, images_dir, image_transforms, tokenizer, max_length)

# Préparer les DataLoaders réduits
train_loader_small = DataLoader(train_dataset_small, batch_size=2, shuffle=True)
val_loader_small = DataLoader(val_dataset_small, batch_size=2, shuffle=False)


In [31]:
for epoch in range(num_epochs):
    image_encoder.train()
    caption_generator.train()
    total_loss = 0

    for images, captions in train_loader_small:
        # Déplacer les données sur le bon device
        images, captions = images.to(device), captions.to(device)

        # Extraire les caractéristiques des images
        features = image_encoder(images)

        # Prédire les légendes
        outputs = caption_generator(features, captions[:, :-1])

        # Décaler les légendes cibles
        targets = captions[:, 1:]  # Décalage pour correspondre aux prédictions

        # Troncature des sorties pour correspondre aux cibles
        outputs = outputs[:, :targets.shape[1], :]  # (batch_size, seq_len, vocab_size)

        # Convertir les cibles en type Long
        targets = targets.long()

        # Vérifier les dimensions après ajustement
        print(f"Adjusted Outputs shape: {outputs.shape}")  # (batch_size, seq_len, vocab_size)
        print(f"Targets shape: {targets.shape}")          # (batch_size, seq_len)

        # Calcul de la perte
        loss = criterion(
            outputs.reshape(-1, outputs.size(-1)),  # (batch_size * seq_len, vocab_size)
            targets.reshape(-1)                    # (batch_size * seq_len)
        )

        # Backpropagation et optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Afficher la perte moyenne pour l'époque
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader_small):.4f}")




  return torch.tensor(image), torch.tensor(sequence)


Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: torch.Size([2, 32])
Adjusted Outputs shape: torch.Size([2, 32, 8058])
Targets shape: 

In [32]:
print(f"Targets shape: {targets.shape}")  # (batch_size, embed_size)
print(f"Outputs shape: {outputs.shape}")    # (batch_size, seq_len, vocab_size)
print(f"Captions shape: {captions.shape}")  # (batch_size, seq_len)


Targets shape: torch.Size([2, 32])
Outputs shape: torch.Size([2, 32, 8058])
Captions shape: torch.Size([2, 33])


In [33]:
train_loader_small

<torch.utils.data.dataloader.DataLoader at 0x1c3273c6a50>

In [34]:
# Retranscrire en language naturel
predicted_indices = outputs.argmax(dim=-1)  # Forme : (batch_size, seq_len)

In [35]:
predicted_indices.shape

torch.Size([2, 32])

In [36]:
def indices_to_sentence(indices, index_word, end_token="<END>"):
    words = []
    for idx in indices:
        word = index_word.get(idx, "<UNK>")  # Convertir les indices en mots
        if word == end_token:  # Arrêter si le token de fin est atteint
            break
        words.append(word)
    return " ".join(words)


In [37]:
# Convertir les indices en phrase
for sequence_indices in predicted_indices:
    sentence = indices_to_sentence(sequence_indices.tolist(), word_index)
    print("Phrase générée :", sentence)

Phrase générée : <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Phrase générée : <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>


In [38]:
import gradio as gr

description = "Story generation with GPT-2"
title = "Generate your own story"
examples = [["Adventurer is approached by a mysterious stranger in the tavern for a new quest."]]

interface = gr.Interface.load("huggingface/pranavpsv/gpt2-genre-story-generator",
            description=description,
            examples=examples
)

interface.launch()



ModuleNotFoundError: No module named 'gradio'