<a href="https://colab.research.google.com/github/ludoveltz/test_github_fev25/blob/main/Daily_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Imports nécessaires
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from transformers.models.bert.modeling_bert import BertEncoder
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from google.colab import drive

# Monter Google Drive
print("Montage de Google Drive...")
drive.mount('/content/drive')

# Configuration du device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Utilisation de : {device}")
print(f"GPU disponible : {torch.cuda.get_device_name(0)}")

# Chemins des fichiers dans Google Drive
TRAIN_PATH = '/content/drive/MyDrive/DATASET/train_essays.csv'
TEST_PATH = '/content/drive/MyDrive/DATASET/test_essays.csv'
PROMPT_PATH = '/content/drive/MyDrive/DATASET/train_prompts.csv'

# Vérification de l'existence des fichiers
import os
for path in [TRAIN_PATH, TEST_PATH, PROMPT_PATH]:
    if not os.path.exists(path):
        print(f"Attention: Le fichier {path} n'existe pas!")
    else:
        print(f"Fichier trouvé: {path}")

# Chargement des données
print("\nChargement des données...")
try:
    src_train = pd.read_csv(TRAIN_PATH)
    src_prompt = pd.read_csv(PROMPT_PATH)
    src_sub = pd.read_csv(TEST_PATH)
    print("Données chargées avec succès!")

    # Affichage des statistiques
    print("\nStatistiques du dataset d'entraînement:")
    print(src_train['generated'].value_counts(normalize=True))

except FileNotFoundError as e:
    print(f"Erreur lors du chargement des fichiers: {e}")
    raise

# Hyperparamètres adaptés au contexte luxe
train_batch_size = 8  # Conservateur pour la stabilité initiale
test_batch_size = 16
lr = 2e-5
beta1 = 0.5
nz = 100
num_epochs = 10
num_hidden_layers = 2
train_ratio = 0.8

# Préparation des modèles
print("\nInitialisation des modèles BERT...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
pretrained_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
embedding_model = pretrained_model.bert.to(device)


# Dataset personnalisé
class GANDAIGDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = float(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label)
        }

# Préparation des données
all_num = len(src_train)
train_num = int(all_num * train_ratio)
test_num = all_num - train_num

# Split train/test
train_set = src_train.iloc[:train_num]
test_set = src_train.iloc[train_num:]

# Création des datasets
train_dataset = GANDAIGDataset(train_set['text'].values, train_set['generated'].values, tokenizer)
test_dataset = GANDAIGDataset(test_set['text'].values, test_set['generated'].values, tokenizer)

# DataLoaders
train_loader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True
)

print("Initialisation terminée!")




Montage de Google Drive...
Mounted at /content/drive
Utilisation de : cuda
GPU disponible : NVIDIA A100-SXM4-40GB
Fichier trouvé: /content/drive/MyDrive/DATASET/train_essays.csv
Fichier trouvé: /content/drive/MyDrive/DATASET/test_essays.csv
Fichier trouvé: /content/drive/MyDrive/DATASET/train_prompts.csv

Chargement des données...
Données chargées avec succès!

Statistiques du dataset d'entraînement:
generated
0    0.997823
1    0.002177
Name: proportion, dtype: float64

Initialisation des modèles BERT...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialisation terminée!


In [5]:
# Configuration BERT optimisée
config = BertConfig(
    num_hidden_layers=num_hidden_layers,
    hidden_size=768,
    num_attention_heads=12,
    intermediate_size=3072
)

# Définition du Generator
class Generator(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, 256 * 128)

        # Architecture convolutionnelle sophistiquée
        self.conv_net = nn.Sequential(
            # Première couche: expansion
            nn.ConvTranspose1d(256, 512, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.3),

            # Deuxième couche: raffinement
            nn.ConvTranspose1d(512, 384, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm1d(384),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.3),

            # Couche finale: adaptation BERT
            nn.ConvTranspose1d(384, 768, kernel_size=4, stride=2, padding=1),
            nn.Tanh()
        )

        self.bert_encoder = BertEncoder(config)

    def forward(self, x):
        # Transformation du vecteur latent
        x = self.fc(x)
        x = x.view(-1, 256, 128)

        # Génération des features
        x = self.conv_net(x)
        x = x.transpose(1, 2)

        # Préparation du masque d'attention
        attention_mask = torch.ones(
            (x.size(0), x.size(1)),
            dtype=torch.long,
            device=x.device
        )
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = (1.0 - extended_attention_mask.float()) * -10000.0

        # Passage dans l'encodeur BERT
        return self.bert_encoder(
            hidden_states=x,
            attention_mask=extended_attention_mask
        )

# Pooler optimisé pour la détection
class SumBertPooler(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, hidden_states):
        sum_hidden = hidden_states.sum(dim=1)
        sum_mask = sum_hidden.sum(1).unsqueeze(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        return sum_hidden / sum_mask

# Discriminator avec attention particulière aux nuances stylistiques
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert_encoder = BertEncoder(config)
        self.bert_encoder.layer = nn.ModuleList([
            layer for layer in pretrained_model.bert.encoder.layer[:6]
        ])
        self.pooler = SumBertPooler()

        # Classification sophistiquée
        self.classifier = nn.Sequential(
            nn.Linear(768, 384),
            nn.LayerNorm(384),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(384, 192),
            nn.LayerNorm(192),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(192, 1)
        )

    def forward(self, input):
        out = self.bert_encoder(input)
        out = self.pooler(out.last_hidden_state)
        out = self.classifier(out)
        return torch.sigmoid(out).view(-1)

# Initialisation des modèles
print("Initialisation des modèles GAN...")
netG = Generator(nz).to(device)
netD = Discriminator().to(device)

# Optimiseurs
criterion = nn.BCELoss()
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))

print("Modèles GAN initialisés avec succès!")


Initialisation des modèles GAN...
Modèles GAN initialisés avec succès!


In [8]:
# Fonction d'évaluation AUC
def eval_auc(model):
    model.eval()
    predictions = []
    actuals = []

    print("Évaluation du modèle...")
    with torch.no_grad():
        for batch in tqdm(test_loader):
            # Préparation des données
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Extraction des embeddings
            with torch.no_grad():
                embeded = embedding_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                ).last_hidden_state

            # Prédiction
            outputs = model(embeded)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    auc = roc_auc_score(actuals, predictions)
    print(f"Score AUC: {auc:.4f}")
    return auc

# Fonction de préparation des embeddings
def preparation_embedding(batch):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        embeded = embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).last_hidden_state

    return embeded

# Fonction d'étape GAN
def GAN_step(optimizerG, optimizerD, netG, netD, real_data, label, epoch, i):
    # Training du Discriminateur
    netD.zero_grad()
    batch_size = real_data.size(0)

    # Real data
    output = netD(real_data)
    errD_real = criterion(output, label)
    errD_real.backward()
    D_x = output.mean().item()

    # Fake data
    noise = torch.randn(batch_size, nz, device=device)
    fake_data = netG(noise).last_hidden_state
    label.fill_(1)
    output = netD(fake_data.detach())
    errD_fake = criterion(output, label)
    errD_fake.backward()
    D_G_z1 = output.mean().item()
    errD = errD_real + errD_fake
    optimizerD.step()

    # Training du Générateur
    netG.zero_grad()
    label.fill_(0)
    output = netD(fake_data)
    errG = criterion(output, label)
    errG.backward()
    D_G_z2 = output.mean().item()
    optimizerG.step()

    # Logging
    if i % 50 == 0:
        print(f'[{epoch}/{num_epochs}][{i}/{len(train_loader)}] '
              f'Loss_D: {errD.item():.4f} Loss_G: {errG.item():.4f} '
              f'D(x): {D_x:.4f} D(G(z)): {D_G_z1:.4f}/{D_G_z2:.4f}')

    return optimizerG, optimizerD, netG, netD

def get_model_info_dict(model, epoch, auc_score):
    """
    Sauvegarde l'état du modèle et ses métriques de performance
    """
    current_device = next(model.parameters()).device
    model.to('cpu')

    model_info = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'auc_score': auc_score,
    }

    model.to(current_device)
    return model_info

# Boucle d'entraînement principale
print("Début de l'entraînement...")
model_infos = []

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    netD.train()
    netG.train()

    # Métriques par epoch
    epoch_d_losses = []
    epoch_g_losses = []

    for i, batch in enumerate(tqdm(train_loader)):
        # Préparation des données
        embeded = preparation_embedding(batch)
        labels = batch['labels'].float().to(device)

        # Étape GAN
        optimizerG, optimizerD, netG, netD = GAN_step(
            optimizerG=optimizerG,
            optimizerD=optimizerD,
            netG=netG,
            netD=netD,
            real_data=embeded,
            label=labels,
            epoch=epoch,
            i=i
        )

        # Nettoyage mémoire GPU
        if i % 10 == 0:
            torch.cuda.empty_cache()

    # Évaluation de l'epoch
    auc_score = eval_auc(netD)
    model_infos.append(get_model_info_dict(netD, epoch, auc_score))

    # Sauvegarde du meilleur modèle avec timestamp
    if auc_score == max(info['auc_score'] for info in model_infos):
        model_path = f'bootcamp_detector_epoch_{epoch}_auc_{auc_score:.4f}.pt'
        torch.save({
            'epoch': epoch,
            'model_state_dict': netD.state_dict(),
            'optimizer_state_dict': optimizerD.state_dict(),
            'auc_score': auc_score,
        }, model_path)
        print(f"Meilleur modèle sauvegardé: {model_path}")

print('Entraînement terminé!')

# Analyse des résultats
best_model_info = max(model_infos, key=lambda x: x['auc_score'])
print(f"\nMeilleure performance :")
print(f"Epoch : {best_model_info['epoch']}")
print(f"AUC Score : {best_model_info['auc_score']:.4f}")


Début de l'entraînement...

Epoch 1/10


  1%|          | 1/138 [00:00<01:08,  1.99it/s]

[0/10][0/138] Loss_D: 0.0671 Loss_G: 3.8866 D(x): 0.0119 D(G(z)): 0.9466/0.9741


 37%|███▋      | 51/138 [00:25<00:43,  1.98it/s]

[0/10][50/138] Loss_D: 0.0431 Loss_G: 3.9931 D(x): 0.0096 D(G(z)): 0.9672/0.9801


 73%|███████▎  | 101/138 [00:50<00:18,  1.98it/s]

[0/10][100/138] Loss_D: 0.0440 Loss_G: 4.1712 D(x): 0.0084 D(G(z)): 0.9652/0.9828


100%|██████████| 138/138 [01:09<00:00,  1.99it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.76it/s]


Score AUC: 0.9491
Meilleur modèle sauvegardé: bootcamp_detector_epoch_0_auc_0.9491.pt

Epoch 2/10


  1%|          | 1/138 [00:00<01:14,  1.83it/s]

[1/10][0/138] Loss_D: 0.0283 Loss_G: 4.0257 D(x): 0.0083 D(G(z)): 0.9802/0.9814


 37%|███▋      | 51/138 [00:25<00:43,  1.99it/s]

[1/10][50/138] Loss_D: 0.0266 Loss_G: 4.4531 D(x): 0.0096 D(G(z)): 0.9832/0.9876


 73%|███████▎  | 101/138 [00:51<00:18,  1.98it/s]

[1/10][100/138] Loss_D: 0.0308 Loss_G: 4.7179 D(x): 0.0096 D(G(z)): 0.9791/0.9900


100%|██████████| 138/138 [01:09<00:00,  1.98it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.78it/s]


Score AUC: 0.7818

Epoch 3/10


  1%|          | 1/138 [00:00<01:15,  1.81it/s]

[2/10][0/138] Loss_D: 0.0221 Loss_G: 4.6203 D(x): 0.0059 D(G(z)): 0.9840/0.9897


 37%|███▋      | 51/138 [00:25<00:43,  1.98it/s]

[2/10][50/138] Loss_D: 0.0232 Loss_G: 4.4797 D(x): 0.0069 D(G(z)): 0.9839/0.9880


 73%|███████▎  | 101/138 [00:51<00:18,  1.98it/s]

[2/10][100/138] Loss_D: 0.0174 Loss_G: 4.4629 D(x): 0.0063 D(G(z)): 0.9890/0.9877


100%|██████████| 138/138 [01:09<00:00,  1.98it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.77it/s]


Score AUC: 0.5091

Epoch 4/10


  1%|          | 1/138 [00:00<01:14,  1.83it/s]

[3/10][0/138] Loss_D: 0.0198 Loss_G: 4.8780 D(x): 0.0063 D(G(z)): 0.9866/0.9915


 37%|███▋      | 51/138 [00:25<00:43,  1.98it/s]

[3/10][50/138] Loss_D: 0.0146 Loss_G: 5.0075 D(x): 0.0050 D(G(z)): 0.9904/0.9931


 73%|███████▎  | 101/138 [00:51<00:18,  1.98it/s]

[3/10][100/138] Loss_D: 0.0125 Loss_G: 5.4493 D(x): 0.0057 D(G(z)): 0.9932/0.9954


100%|██████████| 138/138 [01:09<00:00,  1.98it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.72it/s]


Score AUC: 0.7527

Epoch 5/10


  1%|          | 1/138 [00:00<01:18,  1.74it/s]

[4/10][0/138] Loss_D: 0.0134 Loss_G: 5.6008 D(x): 0.0067 D(G(z)): 0.9934/0.9960


 37%|███▋      | 51/138 [00:25<00:43,  1.98it/s]

[4/10][50/138] Loss_D: 0.0130 Loss_G: 5.1622 D(x): 0.0051 D(G(z)): 0.9922/0.9931


 73%|███████▎  | 101/138 [00:50<00:18,  1.98it/s]

[4/10][100/138] Loss_D: 0.0097 Loss_G: 5.1642 D(x): 0.0051 D(G(z)): 0.9954/0.9940


100%|██████████| 138/138 [01:09<00:00,  1.98it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.63it/s]


Score AUC: 0.7964

Epoch 6/10


  1%|          | 1/138 [00:00<01:14,  1.84it/s]

[5/10][0/138] Loss_D: 0.0107 Loss_G: 5.7076 D(x): 0.0067 D(G(z)): 0.9961/0.9965


 37%|███▋      | 51/138 [00:25<00:43,  1.98it/s]

[5/10][50/138] Loss_D: 0.0076 Loss_G: 5.6425 D(x): 0.0035 D(G(z)): 0.9960/0.9957


 73%|███████▎  | 101/138 [00:50<00:18,  1.99it/s]

[5/10][100/138] Loss_D: 0.0081 Loss_G: 5.5630 D(x): 0.0044 D(G(z)): 0.9964/0.9957


100%|██████████| 138/138 [01:09<00:00,  1.98it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.77it/s]


Score AUC: 0.9855
Meilleur modèle sauvegardé: bootcamp_detector_epoch_5_auc_0.9855.pt

Epoch 7/10


  1%|          | 1/138 [00:00<01:16,  1.79it/s]

[6/10][0/138] Loss_D: 0.0110 Loss_G: 5.5404 D(x): 0.0050 D(G(z)): 0.9940/0.9958


 37%|███▋      | 51/138 [00:25<00:43,  1.98it/s]

[6/10][50/138] Loss_D: 0.0090 Loss_G: 5.2793 D(x): 0.0043 D(G(z)): 0.9954/0.9948


 73%|███████▎  | 101/138 [00:50<00:18,  1.99it/s]

[6/10][100/138] Loss_D: 0.0094 Loss_G: 5.2105 D(x): 0.0045 D(G(z)): 0.9951/0.9934


100%|██████████| 138/138 [01:09<00:00,  1.99it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.76it/s]


Score AUC: 0.9636

Epoch 8/10


  1%|          | 1/138 [00:00<01:11,  1.91it/s]

[7/10][0/138] Loss_D: 0.0088 Loss_G: 5.6846 D(x): 0.0057 D(G(z)): 0.9970/0.9961


 37%|███▋      | 51/138 [00:25<00:43,  1.98it/s]

[7/10][50/138] Loss_D: 0.0082 Loss_G: 5.7321 D(x): 0.0030 D(G(z)): 0.9948/0.9960


 73%|███████▎  | 101/138 [00:51<00:18,  1.97it/s]

[7/10][100/138] Loss_D: 0.0090 Loss_G: 5.4446 D(x): 0.0046 D(G(z)): 0.9956/0.9952


100%|██████████| 138/138 [01:09<00:00,  1.98it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.75it/s]


Score AUC: 0.9564

Epoch 9/10


  1%|          | 1/138 [00:00<01:17,  1.78it/s]

[8/10][0/138] Loss_D: 0.0077 Loss_G: 5.3889 D(x): 0.0049 D(G(z)): 0.9972/0.9950


 37%|███▋      | 51/138 [00:25<00:44,  1.98it/s]

[8/10][50/138] Loss_D: 0.0088 Loss_G: 5.3149 D(x): 0.0037 D(G(z)): 0.9949/0.9948


 73%|███████▎  | 101/138 [00:51<00:18,  1.98it/s]

[8/10][100/138] Loss_D: 0.0075 Loss_G: 6.0473 D(x): 0.0037 D(G(z)): 0.9962/0.9975


100%|██████████| 138/138 [01:09<00:00,  1.98it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.75it/s]


Score AUC: 0.7018

Epoch 10/10


  1%|          | 1/138 [00:00<01:13,  1.86it/s]

[9/10][0/138] Loss_D: 0.0092 Loss_G: 5.8985 D(x): 0.0037 D(G(z)): 0.9945/0.9968


 37%|███▋      | 51/138 [00:25<00:44,  1.96it/s]

[9/10][50/138] Loss_D: 0.0074 Loss_G: 5.9272 D(x): 0.0035 D(G(z)): 0.9962/0.9971


 73%|███████▎  | 101/138 [00:51<00:18,  1.98it/s]

[9/10][100/138] Loss_D: 0.0086 Loss_G: 5.9513 D(x): 0.0042 D(G(z)): 0.9956/0.9973


100%|██████████| 138/138 [01:09<00:00,  1.98it/s]


Évaluation du modèle...


100%|██████████| 18/18 [00:03<00:00,  5.73it/s]


Score AUC: 0.6436
Entraînement terminé!

Meilleure performance :
Epoch : 5
AUC Score : 0.9855


In [14]:
import torch
import numpy as np
from torch.serialization import add_safe_globals

# Ajout des globals nécessaires pour le chargement sécurisé
add_safe_globals([np.dtype])

print("Configuration du chargement du modèle...")
best_model_path = 'bootcamp_detector_epoch_5_auc_0.9855.pt'

try:
    # Chargement avec les paramètres de sécurité appropriés
    checkpoint = torch.load(
        best_model_path,
        map_location=device,
        weights_only=False  # Désactivation de la restriction pour les fichiers de confiance
    )

    # Extraction et chargement du state dict
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
    else:
        state_dict = checkpoint

    # Chargement dans le modèle avec gestion des erreurs
    try:
        netD.load_state_dict(state_dict, strict=False)
        print("Modèle chargé avec succès!")
    except RuntimeError as e:
        print(f"Attention: Certains paramètres n'ont pas pu être chargés: {str(e)}")

    # Affichage des informations du checkpoint
    if isinstance(checkpoint, dict):
        if 'epoch' in checkpoint:
            print(f"Époque: {checkpoint['epoch']}")
        if 'auc_score' in checkpoint:
            print(f"Score AUC: {checkpoint['auc_score']}")

except Exception as e:
    print(f"Erreur critique lors du chargement: {str(e)}")
    raise

# Configuration pour l'inférence
netD.eval()
print("Modèle configuré pour l'inférence")

# Vérification rapide
print("\nVérification de la structure du modèle:")
total_params = sum(p.numel() for p in netD.parameters())
print(f"Nombre total de paramètres: {total_params:,}")



Configuration du chargement du modèle...
Modèle chargé avec succès!
Époque: 5
Score AUC: 0.9854545454545455
Modèle configuré pour l'inférence

Vérification de la structure du modèle:
Nombre total de paramètres: 42,897,793


In [17]:
from google.colab import drive
import os
import pandas as pd
from tqdm import tqdm

# Monter Google Drive
print("Montage de Google Drive...")
drive.mount('/content/drive')

# Définir le chemin vers le dossier DATASET
dataset_path = '/content/drive/MyDrive/DATASET/'
print(f"Accès au dossier : {dataset_path}")

# Chargement des données de test
print("Chargement des données de test...")
test_df = pd.read_csv(os.path.join(dataset_path, 'test_essays.csv'))

# Initialisation de la liste des prédictions
predictions = []

# Traitement des données
print("\nTraitement des données de test...")
try:
    for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
        # Prédiction pour chaque texte
        pred = process_test_data(row['text'], tokenizer, netD, device)
        predictions.append(pred)

        # Nettoyage mémoire périodique
        if idx % 100 == 0:
            torch.cuda.empty_cache()

except Exception as e:
    print(f"Erreur pendant le traitement : {str(e)}")
    raise

# Création du fichier de soumission
submission = pd.DataFrame({
    'id': test_df.index,
    'generated': predictions
})

# Sauvegarde des résultats
output_path = os.path.join(dataset_path, 'submission.csv')
submission.to_csv(output_path, index=False)
print(f"\nRésultats sauvegardés dans : {output_path}")

# Affichage des statistiques
print("\nStatistiques des prédictions:")
print(pd.Series(predictions).describe())

# Vérification supplémentaire suggérée
def verify_predictions(predictions, threshold=0.5):
    print("Distribution des prédictions :")
    print(f"Textes classés comme IA : {sum(p > threshold for p in predictions)}/{len(predictions)}")
    print(f"Textes classés comme humains : {sum(p <= threshold for p in predictions)}/{len(predictions)}")



Montage de Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Accès au dossier : /content/drive/MyDrive/DATASET/
Chargement des données de test...

Traitement des données de test...


100%|██████████| 3/3 [00:00<00:00, 53.73it/s]


Résultats sauvegardés dans : /content/drive/MyDrive/DATASET/submission.csv

Statistiques des prédictions:
count    3.000000
mean     0.997230
std      0.000008
min      0.997221
25%      0.997227
50%      0.997234
75%      0.997234
max      0.997234
dtype: float64



