In [2]:
from transformers import AutoModel, AutoTokenizer
AutoTokenizer.from_pretrained('almanach/camembert-base')
AutoModel.from_pretrained('almanach/camembert-base')

CamembertModel(
  (embeddings): CamembertEmbeddings(
    (word_embeddings): Embedding(32005, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): CamembertEncoder(
    (layer): ModuleList(
      (0-11): 12 x CamembertLayer(
        (attention): CamembertAttention(
          (self): CamembertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): CamembertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
 

In [3]:
tokenize = AutoTokenizer.from_pretrained("almanach/camembert-base")


tok_sent = tokenize(["Arturo","devra","retenter","demain"], is_split_into_words=True, return_tensors='pt')
print(tok_sent['input_ids'])


model=AutoModel.from_pretrained("almanach/camembert-base")
emb_sent = model(**tok_sent)['last_hidden_state'][0]
print(emb_sent.shape)

tok_sent.word_ids()
print(tokenize.convert_ids_to_tokens(tok_sent['input_ids'][0]))


tensor([[    5,  2082, 10394,  2613,   343,  3889,   108,  2385,     6]])
torch.Size([9, 768])
['<s>', '▁Art', 'uro', '▁devra', '▁re', 'tent', 'er', '▁demain', '</s>']


In [4]:
# Extraction de sequoia
import sys
sys.path.append("../TP1/")
from conllu import parse_incr
import numpy as np

phrases = []
for sent in parse_incr(open("../TP1/sequoia-ud.parseme.frsemcor.simple.small", encoding='UTF-8')):
    mots = [tok["form"] for tok in sent if tok["upos"] in ["NOUN", "PROPN", "NUM"]]
    if mots:
        phrases.append(mots)

print(phrases[:10])


tokenize = AutoTokenizer.from_pretrained("almanach/camembert-base")

embeddings = []
for phrase in phrases:
    tok_sents = tokenize(phrase, is_split_into_words=True, return_tensors='pt', padding=True, truncation=True)
    model = AutoModel.from_pretrained("almanach/camembert-base")
    emb_sent = model(**tok_sents)['last_hidden_state'][0]
    embeddings.append(emb_sent)

tok_sents.word_ids() 
word_embeddings = []
for i, word_id in enumerate(tok_sents.word_ids()):
    if word_id is not None and phrase[word_id] in mots:
        word_embeddings.append(emb_sent[i].detach().numpy())

# Moyennage des embeddings des sub-tokens pour chaque mot
final_embeddings = []
for word in mots:
    word_embs = [emb for i, emb in enumerate(word_embeddings) if tok_sents.word_ids()[i] == mots.index(word)]
    if word_embs:
        final_embeddings.append(np.mean(word_embs, axis=0))

print(final_embeddings)


[['association', 'décors', 'aide', 'bénévoles', 'tableaux', 'thème', 'Cinq', 'siècles', 'activité', 'région', 'Ancerville'], ['Ouverture', 'jours', 'lundi', '14', 'h', '30', '18', 'h'], ['sous-préfet', 'énergie', 'réalisation'], ['membres', 'club', 'occasion', 'après-midi', '14', 'h', 'salle', 'Jean-Mathieu'], ['M.', 'Hosneld', '44', 'ans'], ['cas', 'Joseph', 'Bari', 'comité', 'vedette', 'colombophilie', '1962', 'époque', 'pigeonnier', 'Châtenois-les-Forges', 'espace', 'Grand-Charmont'], ['soirée', 'François', 'Puel', 'chercheur', 'laboratoire', 'astrophysique', 'Besançon'], ['Princes', 'nombre', 'roi', 'Pierre', 'public', 'souvenirs', 'enfance', 'temps', 'représentation'], ['événement', 'deux', 'établissements', 'mercredi', '23', 'juin', 'occasion', 'assemblée', 'Offices', 'tourisme', 'Syndicats', 'initiative', 'Doubs', 'jour', 'présidence', 'Edmond', 'Maire', 'Saline'], ['Suzanne', 'Collin', 'Brauvilliers', 'Meuse', '21', 'janvier', '1924', 'famille', 'cinq', 'enfants']]
[array([ 5.2

In [5]:
# Extraction de sequoia
import sys
sys.path.append("../TP1/")
from conllu import parse_incr
import numpy as np
import torch

phrases = []
for sent in parse_incr(open("../TP1/sequoia-ud.parseme.frsemcor.simple.small", encoding='UTF-8')):
    mots = [tok["form"] for tok in sent if tok["upos"] in ["NOUN", "PROPN", "NUM"]]
    if mots:
        phrases.append(mots)

print(phrases[:10])

# Assuming super_sense_labels is a list of super-sense labels corresponding to each word in 'mots'
super_sense_labels = ["label1", "label2", "label3", "label4", "label5", "label6", "label7", "label8", "label9", "label10", "label11", "label12", "label13", "label14"]

# Convert super-sense labels to indices
label_to_index = {label: idx for idx, label in enumerate(set(super_sense_labels))}
super_sense_indices = [label_to_index[label] for label in super_sense_labels]

print(super_sense_indices)

# Tokenize and embed phrases
final_embeddings = []
embeddings = []
tokenize = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = AutoModel.from_pretrained("almanach/camembert-base")

for phrase in phrases:
    tok_sents = tokenize(phrase, is_split_into_words=True, return_tensors='pt', padding=True, truncation=True)
    emb_sent = model(**tok_sents)['last_hidden_state'][0]
    embeddings.append(emb_sent)

    word_embeddings = []
    for i, word_id in enumerate(tok_sents.word_ids()):
        if word_id is not None and phrase[word_id] in mots:
            word_embeddings.append(emb_sent[i].detach().numpy())

    # Moyennage des embeddings des sub-tokens pour chaque mot
    for word in phrase:
        word_embs = [emb for i, emb in enumerate(word_embeddings) if tok_sents.word_ids()[i] == phrase.index(word)]
        if word_embs:
            final_embeddings.append(np.mean(word_embs, axis=0))

print(final_embeddings)
print(np.asarray(final_embeddings).shape)

# Freeze embeddings with torch.no_grad()
with torch.no_grad():
    for phrase in phrases:
        tok_sents = tokenize(phrase, is_split_into_words=True, return_tensors='pt', padding=True, truncation=True)
        emb_sent = model(**tok_sents)['last_hidden_state'][0]
        embeddings.append(emb_sent)

[['association', 'décors', 'aide', 'bénévoles', 'tableaux', 'thème', 'Cinq', 'siècles', 'activité', 'région', 'Ancerville'], ['Ouverture', 'jours', 'lundi', '14', 'h', '30', '18', 'h'], ['sous-préfet', 'énergie', 'réalisation'], ['membres', 'club', 'occasion', 'après-midi', '14', 'h', 'salle', 'Jean-Mathieu'], ['M.', 'Hosneld', '44', 'ans'], ['cas', 'Joseph', 'Bari', 'comité', 'vedette', 'colombophilie', '1962', 'époque', 'pigeonnier', 'Châtenois-les-Forges', 'espace', 'Grand-Charmont'], ['soirée', 'François', 'Puel', 'chercheur', 'laboratoire', 'astrophysique', 'Besançon'], ['Princes', 'nombre', 'roi', 'Pierre', 'public', 'souvenirs', 'enfance', 'temps', 'représentation'], ['événement', 'deux', 'établissements', 'mercredi', '23', 'juin', 'occasion', 'assemblée', 'Offices', 'tourisme', 'Syndicats', 'initiative', 'Doubs', 'jour', 'présidence', 'Edmond', 'Maire', 'Saline'], ['Suzanne', 'Collin', 'Brauvilliers', 'Meuse', '21', 'janvier', '1924', 'famille', 'cinq', 'enfants']]
[1, 6, 4, 10

In [8]:
# Create super sense embeddings
super_sense_embeddings = []
for idx in super_sense_indices:
    super_sense_embeddings.append(final_embeddings[idx])

print(super_sense_embeddings)

[array([ 5.27295955e-02, -1.00408226e-01,  2.64570028e-01,  1.86985582e-02,
       -4.12009656e-04,  8.07704628e-02, -6.44614622e-02,  4.50569466e-02,
       -1.83928423e-02, -7.88034685e-03, -1.95240915e-01,  2.16381520e-01,
        1.11946762e-01,  1.08703554e-01, -2.58674473e-02, -4.17003036e-03,
        1.40289247e-01, -8.38452056e-02, -1.47874251e-01,  6.03525378e-02,
       -2.42318884e-02, -6.74317554e-02,  7.30022192e-02, -4.67245877e-02,
       -6.12574890e-02, -4.57514599e-02, -2.77687222e-01, -7.53597170e-03,
        4.18729112e-02, -1.86406434e-01,  9.46238637e-04,  7.42191374e-02,
       -7.24270754e-03, -2.88519077e-03,  4.66724336e-02,  1.58115268e-01,
       -4.74037975e-03, -1.51580289e-01,  7.17360079e-02,  1.09147079e-01,
       -1.57622010e-01,  6.95470124e-02,  8.84627998e-02,  8.24192166e-02,
        8.79816636e-02,  1.25562176e-01,  1.81594770e-03,  3.35046798e-02,
       -7.97078684e-02,  1.18692759e-02, -1.58338472e-02, -3.07781808e-03,
       -8.68041739e-02, 

In [6]:
from transformers import CamembertModel

import torch.nn as nn
import torch.nn.functional as F

class SimpleCNNWithCamembert(nn.Module):
    def __init__(self):
        super(SimpleCNNWithCamembert, self).__init__()
        self.camembert = CamembertModel.from_pretrained("almanach/camembert-base")
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7 + 768, 128)  # 768 is the hidden size of CamemBERT
        self.fc2 = nn.Linear(128, 10)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, input_ids, attention_mask):
        camembert_outputs = self.camembert(input_ids=input_ids, attention_mask=attention_mask)
        camembert_pooled_output = camembert_outputs.pooler_output
        
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 64 * 7 * 7)
        
        x = torch.cat((x, camembert_pooled_output), dim=1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# Instantiate the model
model = SimpleCNNWithCamembert()
print(model)


import torch
from torch.utils.data import DataLoader, TensorDataset

import torch.optim as optim

# Assuming final_embeddings and super_sense_indices are already defined
embeddings_tensor = torch.tensor(final_embeddings, dtype=torch.float32)
labels_tensor = torch.tensor(super_sense_indices, dtype=torch.long)

# Create a TensorDataset and DataLoader
dataset = TensorDataset(embeddings_tensor, labels_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the model, loss function, and optimizer
model = SimpleCNNWithCamembert()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        
        # Forward pass
        # Adjust the input size for the convolutional layers
        inputs = inputs.unsqueeze(1)
        inputs = F.interpolate(inputs, size=(32, 32))  # Resize to 32x32

        # Calculate the correct size for the fully connected layer
        conv_output = model.pool(F.relu(model.conv2(model.pool(F.relu(model.conv1(inputs)))))).view(inputs.size(0), -1)
        conv_output_size = conv_output.size(1)
        model.fc1 = nn.Linear(conv_output_size + 768, 128)

        # Tokenize the input phrases
        tok_sents = tokenize(phrase, is_split_into_words=True, return_tensors='pt', padding=True, truncation=True)
        tok_sents = {k: v.to(inputs.device) for k, v in tok_sents.items()}  # Move tokenized inputs to the same device as inputs
        outputs = model(inputs, input_ids=tok_sents['input_ids'], attention_mask=tok_sents['attention_mask'])  # Adjust inputs as needed
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        # Calculate loss and accuracy
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct / total
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Save the model, label vocabulary, and hyper-parameters
torch.save({
    'model_state_dict': model.state_dict(),
    'label_to_index': label_to_index,
    'hyper_parameters': {
        'pretrained_model_name': 'almanach/camembert-base',
        'learning_rate': 0.001,
        'num_epochs': num_epochs
    }
}, 'model_checkpoint.pth')

SimpleCNNWithCamembert(
  (camembert): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
   

ValueError: Input and output must have the same number of spatial dimensions, but got input with spatial dimensions of [768] and output size of (32, 32). Please provide input tensor in (N, C, d1, d2, ...,dK) format and output size in (o1, o2, ...,oK) format.