In [76]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# premier test

In [77]:
torch.manual_seed(0)
seq_1 = torch.randn(5, 128)  # 5 tweets, 128-dim embeddings
seq_2 = torch.randn(3, 128)  # 3 tweets, 128-dim embeddings
seq_3 = torch.randn(7, 128)  # 7 tweets, 128-dim embeddings

# Pad sequences so they are the same length
padded_seqs = pad_sequence([seq_1, seq_2, seq_3], batch_first=True) 
# shape = b_size, max_n_tweets, 128
# avec b_size=3 et max_n_tweets=7

# Create masks to identify the real data points
lengths = [seq_1.size(0), seq_2.size(0), seq_3.size(0)]
mask = torch.arange(padded_seqs.size(1)).unsqueeze(0) < torch.tensor(lengths).unsqueeze(1)

# Example of applying a weighted average (mask-aware)
weights = torch.randn_like(padded_seqs)
weighted_sum = torch.sum(weights * padded_seqs * mask.unsqueeze(2), dim=1)
average = weighted_sum / mask.sum(dim=1, keepdim=True)

In [78]:
average[0] - (torch.sum((weights[0, :5] * seq_1), dim=0) / 5)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])

# vrai donnees

In [82]:
np.random.seed(0)
data = {
    'f1': [0.1, 0.2, 0.3, 0.4, 0.5, 0.2, 0.3, 3.3, 3.4, 3.5],
    'f2': [1.1, 1.2, 1.3, 3.3, 3.4, 3.5, 1.4, 1.5, 0.2, 0.3],
    'm1': [2.1, 2.2, 2.3, 2.4, 3.3, 3.4, 3.5, 2.5, 0.2, 0.3],
    'm2': [3.3, 3.4, 3.5, 3.1, 3.2, 3.3, 3.4, 3.5, 0.2, 0.1]
}

# Example index values (dates and IDs)
index = pd.MultiIndex.from_tuples([
    ('2024-10-01', '1'),
    ('2024-10-01', '1'),
    ('2024-10-01', '2'),
    ('2024-10-01', '2'),
    ('2024-10-01', '1'),
    ('2024-10-02', '2'),
    ('2024-10-02', '1'),
    ('2024-10-02', '1'),
    ('2024-10-03', '3'),
    ('2024-10-03', '1'),
], names=['date', 'ID_QI'])

df = pd.DataFrame(data, index=index)
df['c2cdr'] = np.random.rand(len(df))
df['c2cdr'] = df['c2cdr'].groupby(['date', 'ID_QI']).mean().round(2) # make sure that the target is fixed for a given (date, ID_QI) pair
df

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2,m1,m2,c2cdr
date,ID_QI,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-10-01,1,0.1,1.1,2.1,3.3,0.56
2024-10-01,1,0.2,1.2,2.2,3.4,0.56
2024-10-01,2,0.3,1.3,2.3,3.5,0.57
2024-10-01,2,0.4,3.3,2.4,3.1,0.57
2024-10-01,1,0.5,3.4,3.3,3.2,0.56
2024-10-02,2,0.2,3.5,3.4,3.3,0.65
2024-10-02,1,0.3,1.4,3.5,3.4,0.66
2024-10-02,1,3.3,1.5,2.5,3.5,0.66
2024-10-03,3,3.4,0.2,0.2,0.2,0.96
2024-10-03,1,3.5,0.3,0.3,0.1,0.38


In [83]:
grouped = df.groupby(['date', 'ID_QI'])

sequences = []
targets = []

for (date, ID_QI), group in grouped:
    features = group[['f1', 'f2', 'm1', 'm2']].values
    assert group['c2cdr'].nunique() == 1
    target = group['c2cdr'].iloc[0]  # Single target for the group
    
    sequences.append(features)
    targets.append(target)

sequences = [torch.tensor(seq, dtype=torch.float32) for seq in sequences]
targets = torch.tensor(targets, dtype=torch.float32)

In [96]:
class TweetDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]
    

def collate_fn(batch):
    sequences, targets = zip(*batch)
    padded_sequences = pad_sequence(sequences, batch_first=True)
    embeddings = padded_sequences[:, :, :2]  # TODO: Select f1, f2 (embeddings)
    meta_features = padded_sequences[:, :, 2:]  # TODO: Select m1, m2 (meta features)
    lengths = torch.tensor([len(seq) for seq in sequences])
    targets = torch.tensor(targets, dtype=torch.float32)
    return embeddings, meta_features, lengths, targets

tweet_dataset = TweetDataset(sequences, targets)
data_loader = DataLoader(tweet_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [111]:
class BatchedTweetAttentionModel(nn.Module):
    def __init__(self, embedding_dim=2, meta_dim=2, mlp_dim=32):
        super(BatchedTweetAttentionModel, self).__init__()
        self.intermediary_layer = nn.Linear(embedding_dim + meta_dim, embedding_dim + meta_dim)
        self.attention_layer = nn.Linear(embedding_dim + meta_dim, 1)
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, 1)
        )

    def forward(self, embeddings, meta_features, lengths):
        """
        embeddings: tensor of shape (batch_size, L_max, embedding_dim) (padded)
        meta_features: tensor of shape (batch_size, L_max, meta_dim) (padded)
        lengths: tensor of shape (batch_size,) indicating the actual length of each sequence
        """
        batch_size, L_max, _ = embeddings.shape
        concat_features = torch.cat([embeddings, meta_features], dim=-1)  # Shape: (batch_size, L_max, embedding_dim + meta_dim)
        transformed_features = torch.tanh(self.intermediary_layer(concat_features))  # Shape: (batch_size, L_max, embedding_dim + meta_dim)
        w_l = torch.sigmoid(self.attention_layer(transformed_features)).squeeze(-1)  # Shape: (batch_size, L_max)
        mask = torch.arange(L_max).expand(batch_size, L_max) < lengths.unsqueeze(1)  # Shape: (batch_size, L_max)
        mask = mask.to(embeddings.device)
        w_l = w_l.masked_fill(~mask, -float('inf'))  # Mask out padded positions by setting large negative values
        alpha_l = F.softmax(w_l, dim=1)  # Shape: (batch_size, L_max)
        weighted_embedding = torch.sum(alpha_l.unsqueeze(-1) * embeddings, dim=1)  # Shape: (batch_size, embedding_dim)
        prediction = self.mlp(weighted_embedding).squeeze(-1)  # Shape: (batch_size,)
        return prediction
    

class BatchedTweetAverageModel(nn.Module): # Only for comparison with the regular model
    def __init__(self, embedding_dim=2, mlp_dim=32):
        super(BatchedTweetAverageModel, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, 1)  # Final output layer
        )

    def forward(self, embeddings, lengths):
        """
        embeddings: tensor of shape (batch_size, L_max, embedding_dim) (padded)
        lengths: tensor of shape (batch_size,) indicating the actual length of each sequence
        """
        batch_size, L_max, _ = embeddings.shape
        mask = torch.arange(L_max).expand(batch_size, L_max).to(embeddings.device) < lengths.unsqueeze(1)  # Shape: (batch_size, L_max)
        masked_embeddings = embeddings * mask.unsqueeze(-1)
        sum_embeddings = masked_embeddings.sum(dim=1)  # Shape: (batch_size, embedding_dim)
        avg_embeddings = sum_embeddings / lengths.unsqueeze(1).float()  # Shape: (batch_size, embedding_dim)
        print(avg_embeddings)
        prediction = self.mlp(avg_embeddings).squeeze(-1)  # Shape: (batch_size,)
        return prediction

In [116]:
model = BatchedTweetAttentionModel()
criterion = nn.MSELoss()

for embeddings, meta_features, lengths, target_batch in data_loader:
    prediction = model(embeddings, meta_features, lengths)
    loss = criterion(prediction, target_batch)
    break


# TODO: test if this model gives exactly the same results as the regular model
model = BatchedTweetAverageModel()
criterion = nn.MSELoss()

for embeddings, meta_features, lengths, target_batch in data_loader:
    prediction = model(embeddings, lengths)
    loss = criterion(prediction, target_batch)
    break

tensor([[0.2667, 1.9000],
        [1.8000, 1.4500],
        [3.5000, 0.3000],
        [0.3500, 2.3000]])


In [117]:
# torch.cat((embeddings, meta_features), dim=2)
embeddings

tensor([[[0.1000, 1.1000],
         [0.2000, 1.2000],
         [0.5000, 3.4000]],

        [[0.3000, 1.4000],
         [3.3000, 1.5000],
         [0.0000, 0.0000]],

        [[3.5000, 0.3000],
         [0.0000, 0.0000],
         [0.0000, 0.0000]],

        [[0.3000, 1.3000],
         [0.4000, 3.3000],
         [0.0000, 0.0000]]])

In [120]:
target_batch

tensor([0.5600, 0.6600, 0.3800, 0.5700])

In [119]:
np.mean([1.1, 1.2, 3.4])

1.8999999999999997