In [1]:
import logging

from torch.nn.modules.pooling import MaxPool1d
logging.basicConfig(level=logging.INFO)

import heapq
from pathlib import Path
import gzip

from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import sentencepiece as spm

import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from torch.utils.tensorboard import SummaryWriter

from tp8_preprocess import TextDataset




NameError: name '__file__' is not defined

In [4]:
# Use tp8_preprocess to generate the BPE vocabulary and
# the dataset in a compact format

# --- Configuration

# Vocabulary size; it is interpolated into the names of the tokenizer
# model file ("wp<size>.model") and of the dataset files read below
vocab_size = 1000
MAINDIR = Path(".")

# Load the SentencePiece tokenizer trained for that vocabulary size

tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(f"wp{vocab_size}.model")

# Number of pieces known to the tokenizer
ntokens = len(tokenizer)

def loaddata(mode, size=None):
    """Load one pre-processed dataset split from a gzip-compressed torch file.

    The file is looked up relative to the current working directory under
    the name ``"{mode}-{size}.pth"``.

    Parameters
    ----------
    mode : str
        Split name used as the file-name prefix (this notebook uses
        ``"train"`` and ``"test"``).
    size : int, optional
        Vocabulary size embedded in the file name. When omitted, the
        module-level ``vocab_size`` is read at call time. Passing it
        explicitly protects against ``vocab_size`` being reassigned by a
        later cell.

    Returns
    -------
    object
        Whatever object was stored in the file, as returned by ``torch.load``.
    """
    if size is None:
        size = vocab_size
    # NOTE: torch.load unpickles the file content, which can execute arbitrary
    # code; only open files that were produced locally by the preprocessing step.
    with gzip.open(f"{mode}-{size}.pth", "rb") as fp:
        return torch.load(fp)


test = loaddata("test")
train = loaddata("train")
TRAIN_BATCHSIZE=500
TEST_BATCHSIZE=500
# Fixed seed so that the train/validation split and the shuffling order of
# the training batches are the same on every "Restart & Run All"
SEED = 0


# --- Load the train, validation and test datasets

val_size = 1000
train_size = len(train) - val_size
# Hold out `val_size` examples of the training file for validation.
# After this line `train` and `val` are torch.utils.data.Subset views of the
# loaded dataset (they expose `.dataset` and `.indices`, not the dataset's own attributes).
train, val = torch.utils.data.random_split(
    train, [train_size, val_size], generator=torch.Generator().manual_seed(SEED)
)

logging.info("Datasets: train=%d, val=%d, test=%d", train_size, val_size, len(test))
logging.info("Vocabulary size: %d", vocab_size)
# Only the training loader is shuffled: batches are re-drawn in a new order at
# every epoch, while evaluation loaders keep a fixed order.
train_iter = torch.utils.data.DataLoader(
    train,
    batch_size=TRAIN_BATCHSIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
    collate_fn=TextDataset.collate,
)
val_iter = torch.utils.data.DataLoader(val, batch_size=TEST_BATCHSIZE, collate_fn=TextDataset.collate)
test_iter = torch.utils.data.DataLoader(test, batch_size=TEST_BATCHSIZE, collate_fn=TextDataset.collate)


#  TODO: 


INFO:root:Datasets: train=1599000, val=1000, test=359
INFO:root:Vocabulary size: 1000


In [1]:
# Show the first example of every split, prefixed by the split name
for split_name, split in (("train", train), ("val", val), ("test", test)):
    print(split_name, split[0])

NameError: name 'train' is not defined

In [None]:
# TODO: test from here onwards

In [None]:
class TextCNN(nn.Module):
    """Convolutional text classifier with parallel filters of several heights.

    Token ids are embedded; each convolution slides a window of ``fs`` tokens
    (spanning the whole embedding width) along the sequence; every feature map
    is max-pooled over the sequence axis; the pooled features of all filter
    sizes are concatenated and mapped to class logits by a linear layer.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table.
    embedding_dim : int
        Size of each embedding vector (also the width of every filter).
    num_filters : int
        Number of output channels of each convolution.
    filter_sizes : iterable of int
        Heights (in tokens) of the convolution windows, one convolution each.
    num_classes : int
        Number of output logits.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, num_classes):
        super().__init__()
        # Materialise once so that any iterable (list, tuple, generator) is accepted
        filter_sizes = list(filter_sizes)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)
        # Shortest sequence every convolution can process (plain int, not part of the state dict)
        self.min_seq_len = max(filter_sizes, default=0)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Parameters
        ----------
        x : torch.Tensor
            Integer tensor of shape ``(batch, seq_len)``.

        Returns
        -------
        torch.Tensor
            Logits of shape ``(batch, num_classes)``.
        """
        x = self.embedding(x)  # (batch, seq_len, embedding_dim)
        # A convolution cannot be applied to a sequence shorter than its window:
        # right-pad the embedded sequence with zeros up to the largest window.
        missing = self.min_seq_len - x.size(1)
        if missing > 0:
            x = nn.functional.pad(x, (0, 0, 0, missing))
        x = x.unsqueeze(1)  # add the channel dimension: (batch, 1, seq_len, embedding_dim)
        # Each filter covers the full embedding width, so the last axis has size 1 and is dropped
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max-pool over the whole remaining sequence axis: one value per filter
        x = [torch.max_pool1d(feat, feat.size(2)).squeeze(2) for feat in x]
        x = torch.cat(x, 1)  # concatenate the features of all filter sizes
        x = self.fc(x)  # class logits
        return x


In [None]:
class MajorityClassBaseline:
    """Trivial classifier that always predicts the most frequent training label.

    Parameters
    ----------
    majority_class : optional
        Label to predict. When given, ``fit`` leaves it untouched; when
        ``None``, ``fit`` determines it from the training labels.
    """

    def __init__(self, majority_class=None):
        self.majority_class = majority_class

    def fit(self, X, y):
        """
        Fit the baseline model.

        Parameters:
        - X: Input data (not used in this baseline)
        - y: Target labels (tensor, list or array; used to determine the majority class)

        Returns:
        - self, so that calls can be chained

        Raises:
        - ValueError: if the majority class must be inferred and `y` is empty
        """
        if self.majority_class is None:
            # Accept any array-like, not only tensors
            labels = torch.as_tensor(y)
            if labels.numel() == 0:
                raise ValueError("Cannot determine the majority class from an empty label set.")
            # torch.unique returns sorted values, argmax the first maximum:
            # ties are therefore resolved in favour of the smallest label.
            unique_classes, counts = torch.unique(labels, return_counts=True)
            # Store a plain Python number rather than a 0-dim tensor
            self.majority_class = unique_classes[counts.argmax()].item()
        return self

    def predict(self, X):
        """
        Make predictions using the baseline model.

        Parameters:
        - X: Input data (only its length is used)

        Returns:
        - predictions: Long tensor of length len(X) filled with the majority class

        Raises:
        - ValueError: if no majority class has been set or fitted
        """
        if self.majority_class is None:
            raise ValueError("Majority class is not set. Fit the model with training data first or provide it explicitly.")

        # Return predictions of the majority class
        predictions = torch.full((len(X),), self.majority_class, dtype=torch.long)
        return predictions

In [None]:
# `train` is a torch.utils.data.Subset produced by random_split: it has no
# `.labels` attribute of its own, so the labels are read from the wrapped
# dataset and restricted to the indices that belong to the training split.
train_labels = train.dataset.labels[torch.as_tensor(train.indices, dtype=torch.long)]

majority_class_baseline = MajorityClassBaseline()
majority_class_baseline.fit(train, train_labels)
# `test` was not split, so its `.labels` can be used directly
predictions = majority_class_baseline.predict(test)
print("Majority class baseline accuracy:", (predictions == test.labels).float().mean())



In [None]:
# Metrics of the majority-class baseline on the validation split.
# `val` is a torch.utils.data.Subset produced by random_split and has no
# `.labels` attribute: read the labels of its indices from the wrapped dataset.
val_targets = val.dataset.labels[torch.as_tensor(val.indices, dtype=torch.long)].numpy()
baseline_val_predictions = majority_class_baseline.predict(val)
baseline_val_accuracy = accuracy_score(val_targets, baseline_val_predictions.numpy())
# A constant predictor may never predict the positive class; zero_division=0
# reports the undefined precision/F1 as 0 without emitting a warning.
baseline_val_precision = precision_score(val_targets, baseline_val_predictions.numpy(), zero_division=0)
baseline_val_recall = recall_score(val_targets, baseline_val_predictions.numpy(), zero_division=0)
baseline_val_f1 = f1_score(val_targets, baseline_val_predictions.numpy(), zero_division=0)

In [None]:
# Confusion matrix of the majority-class baseline on the validation split.
# `val` is a torch.utils.data.Subset (no `.labels` attribute): take the labels
# of its indices from the wrapped dataset.
confusion_matrix_baseline = confusion_matrix(
    val.dataset.labels[torch.as_tensor(val.indices, dtype=torch.long)].numpy(),
    baseline_val_predictions.numpy(),
)
# NOTE: a matrix named `confusion_matrix_cnn` used to be built here from the
# *baseline* predictions. No CNN prediction exists at this point of the
# notebook, so it has to be computed after the CNN has been evaluated on the
# validation split instead of here.

In [None]:
# Hyperparameters of the first CNN; the embedding table is sized from the tokenizer
vocab_size = ntokens
embedding_dim, num_filters = 100, 100
filter_sizes = [3, 4, 5]
# NOTE(review): 3 output classes here, while the later multi-model cell uses 2 —
# confirm which one matches the label set of the dataset.
num_classes = 3
model1 = TextCNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    num_filters=num_filters,
    filter_sizes=filter_sizes,
    num_classes=num_classes,
)

In [None]:
# Define training parameters
learning_rate = 0.001
epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Every batch is moved to `device` below, so the model parameters must live on
# the same device; do it before the optimizer is built from those parameters.
model1.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model1.parameters(), lr=learning_rate)

# Step 1b: Training Loop
for epoch in range(epochs):
    model1.train()
    running_loss, n_batches = 0.0, 0
    for batch in train_iter:
        inputs, labels = batch.text.to(device), batch.labels.to(device)
        optimizer.zero_grad()
        outputs = model1(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        n_batches += 1
    # One log line per epoch so that the progress of training is visible
    logging.info(
        "Epoch %d/%d - mean training loss: %.4f",
        epoch + 1, epochs, running_loss / max(n_batches, 1),
    )

# Step 1c: Evaluate on Validation Dataset
model1.eval()
val_predictions = []
val_labels = []
with torch.no_grad():
    for batch in val_iter:
        inputs, labels = batch.text.to(device), batch.labels.to(device)
        outputs = model1(inputs)
        # Predicted class = index of the largest logit
        predicted = outputs.argmax(dim=1)
        val_predictions.extend(predicted.cpu().tolist())
        val_labels.extend(labels.cpu().tolist())

# Calculate validation metrics
# NOTE(review): precision/recall/F1 use scikit-learn's default binary averaging,
# which only accepts two classes — confirm against the number of model outputs.
val_accuracy = accuracy_score(val_labels, val_predictions)
val_precision = precision_score(val_labels, val_predictions, zero_division=0)
val_recall = recall_score(val_labels, val_predictions, zero_division=0)
val_f1 = f1_score(val_labels, val_predictions, zero_division=0)

# Confusion matrix of the CNN, built from the CNN's own validation predictions
confusion_matrix_cnn = confusion_matrix(val_labels, val_predictions)


In [None]:
# Define hyperparameters and configurations for multiple models
# The embedding table must match the tokenizer, so its size comes from
# `ntokens` rather than from a hard-coded value.
vocab_size = ntokens
embedding_dim_list = [100, 200, 300]  # Different embedding dimensions
num_filters_list = [50, 100, 150]    # Different numbers of filters
# Each entry is the collection of filter heights of ONE model. TextCNN already
# extends every filter over the embedding width, so only heights are listed
# (an entry such as (3, 100) would create a second filter 100 tokens high).
filter_sizes_list = [(3,), (4,), (5,)]  # Different filter sizes
num_classes = 2  # Number of output classes (e.g., binary classification)

# Initialize one model per configuration (embedding dim x number of filters x
# filter sizes, in that nesting order). Comprehension-local names are used so
# that the hyperparameters of previously built models are not overwritten.
models = [
    TextCNN(vocab_size, emb_dim, n_filters, sizes, num_classes)
    for emb_dim in embedding_dim_list
    for n_filters in num_filters_list
    for sizes in filter_sizes_list
]