In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Dataset

In [18]:
class EmotionSequenceDataset(Dataset):
    def __init__(self, df):
        df = df.copy()
        
        df['deceptive'] = df['deceptive'].astype(int)
        self.sample_ids = df['id'].unique().tolist()
        self.data = []

        for sample_id in self.sample_ids:
            subset = df[df['id'] == sample_id]
            X = subset.drop(columns=['id', 'frame', 'deceptive']).values.astype(np.float32)
            y = subset['deceptive'].iloc[0]
            self.data.append((X, y))

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        X, y = self.data[idx]
        features = torch.tensor(X, dtype=torch.float32)
        label = torch.tensor(y, dtype=torch.float32)
        return features, label
    
def collate_fn(batch):
    sequences, labels = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    padded_sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    return padded_sequences, torch.tensor(lengths), torch.tensor(labels)

# BiLSTM with attention

In [19]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, 1)

    def forward(self, lstm_out, mask):
        scores = self.attn(lstm_out).squeeze(-1)
        scores = scores.masked_fill(~mask, float('-inf'))
        weights = torch.softmax(scores, dim=1).unsqueeze(-1)
        context = torch.sum(weights * lstm_out, dim=1)
        return context, weights

class BiLSTMAttention(nn.Module):
    def __init__(self, input_dim=7, hidden_dim=64, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=True, batch_first=True)
        self.attn = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x, lengths):
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        mask = torch.arange(out.size(1), device=out.device)[None, :] < lengths[:, None].to(out.device)
        context, weights = self.attn(out, mask)
        logits = self.fc(context).squeeze(1)
        return logits, weights

In [22]:
df = pd.read_csv("processed_data/silesian_deception_dataset/emotions.csv")
samples = df['id'].unique().tolist()
train_samples, test_samples = train_test_split(samples, test_size=0.2, random_state=42)
train_df = df[df['id'].isin(train_samples)]
test_df = df[df['id'].isin(test_samples)]

train_dataset = EmotionSequenceDataset(train_df)
test_dataset = EmotionSequenceDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=8, collate_fn=collate_fn, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn, shuffle=False, drop_last=True)

print(f"Global Class Distribution: {df['deceptive'].value_counts().to_dict()}")
print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")


Global Class Distribution: {True: 129497, False: 61599}
Train samples: 737, Test samples: 185


In [21]:
model = BiLSTMAttention().to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(10):
    model.train()
    total_loss = 0
    for X, lengths, y in train_loader:
        X = X.to(device)
        y = y.to(device)
        opt.zero_grad()
        logits, _ = model(X, lengths)
        loss = criterion(logits, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, loss={total_loss/len(train_loader):.4f}")

Epoch 1, loss=0.6448
Epoch 2, loss=0.6362
Epoch 3, loss=0.6340
Epoch 4, loss=0.6359
Epoch 5, loss=0.6338
Epoch 6, loss=0.6338
Epoch 7, loss=0.6329
Epoch 8, loss=0.6327
Epoch 9, loss=0.6323
Epoch 10, loss=0.6329


In [6]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X, lengths, y in test_loader:
        X = X.to(device)
        y = y.to(device)

        logits, _ = model(X, lengths)
        preds = torch.sigmoid(logits)
        preds = (preds > 0.5).float()

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
cm = confusion_matrix(all_labels, all_preds)

print(f"✅ Test accuracy: {acc:.4f}")
print(f"✅ F1 score: {f1:.4f}")
print("✅ Confusion matrix:")
print(cm)

✅ Test accuracy: 0.7283
✅ F1 score: 0.8428
✅ Confusion matrix:
[[  0  50]
 [  0 134]]


# Purely emotion-based dataset analysis

In [28]:
df = pd.read_csv("processed_data/silesian_deception_dataset/emotions.csv")
df.groupby("deceptive").mean()

Unnamed: 0_level_0,id,frame,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral
deceptive,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,452.546502,3778.415965,0.130773,0.120608,0.121079,0.134827,0.148551,0.12061,0.223552
True,459.625536,6431.186514,0.129634,0.119907,0.12053,0.132197,0.152211,0.119913,0.225608


In [29]:
df.groupby("deceptive").std()

Unnamed: 0_level_0,id,frame,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral
deceptive,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,273.434383,3758.070614,0.040251,0.010359,0.012758,0.046865,0.060667,0.010363,0.085442
True,270.189444,2612.202414,0.039913,0.00998,0.013235,0.043731,0.064583,0.00999,0.085189


Minimal difference between truth/lie sequences. The model cannot learn anything, which is shown by the stable loss.