In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Dataset

In [None]:
class EmotionSequenceDataset(Dataset):
    def __init__(self, df):
        self.df = df.copy()
        self.df['deceptive'] = self.df['deceptive'].astype(int)
        self.features_cols = df.columns.difference(['id', 'frame', 'deceptive'])
        self.sample_ids = self.df['id'].unique().tolist()

    def __len__(self):
        return len(self.sample_ids)

    def __getitem__(self, idx):
        sample_id = self.sample_ids[idx]
        subset = self.df[self.df['id'] == sample_id]
        X = subset[self.features_cols].values.astype(np.float32)
        y = subset['deceptive'].iloc[0]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)
    
def collate_fn(batch):
    sequences, labels = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    padded_sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=-10.0)
    return padded_sequences, torch.tensor(lengths), torch.tensor(labels, dtype=torch.float32)

# BiLSTM with attention

In [None]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, 1)

    def forward(self, lstm_out, mask):
        scores = self.attn(lstm_out).squeeze(-1)
        scores = scores.masked_fill(~mask, -1e4)
        weights = torch.softmax(scores, dim=1).unsqueeze(-1)
        # weights = torch.nan_to_num(weights, nan=0.0)
        context = torch.sum(weights * lstm_out, dim=1)
        return context, weights

class BiLSTMAttention(nn.Module):
    def __init__(self, input_dim=967, reduced_dim=64, hidden_dim=64, num_layers=2, dropout=0.3):
        super().__init__()

        self.feature_extractor = nn.Sequential(
            nn.Conv1d(input_dim, reduced_dim * 2, kernel_size=3, padding=1),
            nn.BatchNorm1d(reduced_dim * 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Conv1d(reduced_dim * 2, reduced_dim, kernel_size=3, padding=1),
            nn.BatchNorm1d(reduced_dim),
            nn.ReLU()
        )

        self.lstm = nn.LSTM(reduced_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=True, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        
        self.attn = Attention(hidden_dim)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 1)
        )

    def forward(self, x, lengths):
        x = x.permute(0, 2, 1)
        x = self.feature_extractor(x)
        x = x.permute(0, 2, 1)

        packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        mask = torch.arange(out.size(1), device=out.device)[None, :] < lengths[:, None].to(out.device)
        context, weights = self.attn(out, mask)

        logits = self.classifier(context).squeeze(1)
        return logits, weights

In [None]:
df = pd.read_csv("processed_data/silesian_deception_dataset/emotions_landmarks_flow.csv")
bad_rows = df[df.isna().any(axis=1)]
bad_ids = bad_rows['id'].unique().tolist()
df = df[~df['id'].isin(bad_ids)].reset_index(drop=True)

samples = df['id'].unique().tolist()
sample_labels = df.groupby('id')['deceptive'].first().values
train_samples, test_samples = train_test_split(samples, test_size=0.2, random_state=42, stratify=sample_labels)
train_df = df[df['id'].isin(train_samples)].copy()
test_df = df[df['id'].isin(test_samples)].copy()

landmarks_cols = [f'lm_{i}' for i in range(468*2)]
flow_cols = ['flow_mean_x', 'flow_mean_y', 'flow_std_x', 'flow_std_y']
emotion_cols = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

scalers = {}
for cols in [landmarks_cols, flow_cols, emotion_cols]:
    scaler = MinMaxScaler(feature_range=(-1, 1))
    train_df[cols] = scaler.fit_transform(train_df[cols])
    test_df[cols] = scaler.transform(test_df[cols])
    scalers.update({col: scaler for col in cols})

train_dataset = EmotionSequenceDataset(train_df)
test_dataset = EmotionSequenceDataset(test_df)

labels = train_df.groupby("id")["deceptive"].first().values
class_sample_count = np.array([len(np.where(labels==0)[0]), len(np.where(labels==1)[0])])
weight = 1. / class_sample_count
samples_weight = np.array([weight[int(l)] for l in labels])
samples_weight = torch.from_numpy(samples_weight).float()
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn, sampler=sampler, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn, shuffle=False, drop_last=True)

670
122


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[cols] = scaler.fit_transform(train_df[cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[cols] = scaler.transform(test_df[cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[cols] = scaler.fit_transform(train_df[cols])
A value is trying to be set on a copy of a slice 

In [5]:
num_pos_videos = sum([train_dataset[i][1].item() == 1 for i in range(len(train_dataset))])
num_neg_videos = len(train_dataset) - num_pos_videos
pos_weight = torch.tensor(num_neg_videos / num_pos_videos, dtype=torch.float32).to(device)

print(f"Number of positive samples: {num_pos_videos}, Number of negative samples: {num_neg_videos}")

Number of positive samples: 501, Number of negative samples: 232


In [None]:
def evaluate(model, loader):
    model.eval()
    all_preds, all_labels, all_probs = [], [], []
    with torch.no_grad():
        for X, lengths, y in loader:
            X, y = X.to(device), y.to(device)
            logits = model(X, lengths)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()
            all_probs.append(probs.cpu())
            all_preds.append(preds.cpu())
            all_labels.append(y.cpu())
    
    y_true = torch.cat(all_labels)
    y_pred = torch.cat(all_preds)
    y_prob = torch.cat(all_probs)
    
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    try:
        auc = roc_auc_score(y_true, y_prob)
    except:
        auc = 0.5
    return acc, f1, auc

# Training

In [7]:
subset_ids = np.random.choice(train_dataset.sample_ids, size=50, replace=False)
small_train_ids = subset_ids[:35]
small_val_ids = subset_ids[35:]

small_train_df = train_df[train_df['id'].isin(small_train_ids)].reset_index(drop=True)
small_val_df = train_df[train_df['id'].isin(small_val_ids)].reset_index(drop=True)

small_train_dataset = EmotionSequenceDataset(small_train_df)
small_val_dataset = EmotionSequenceDataset(small_val_df)
small_train_loader = DataLoader(
    small_train_dataset,
    batch_size=2,
    collate_fn=collate_fn,
    shuffle=True
)

small_val_loader = DataLoader(
    small_val_dataset,
    batch_size=2,
    collate_fn=collate_fn,
    shuffle=False
)

In [None]:
model = BiLSTMAttention(hidden_dim=64, dropout=0.3).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
criterion = nn.BCEWithLogitsLoss()
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=3, verbose=True)

# early stopping parameters
early_stop_patience = 10
best_val_f1 = float('-inf')
epochs_no_improve = 0

for epoch in range(50):
    model.train()
    total_loss = 0

    for X, lengths, y in train_loader:
        X = X.to(device)
        y = y.to(device)

        opt.zero_grad()

        logits, _ = model(X, lengths)
        loss = criterion(logits, y)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        opt.step()
        
        total_loss += loss.item()
    
    train_loss = total_loss / len(train_loader)
    val_acc, val_f1, val_auc = evaluate(model, test_loader)
    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f} | val_acc={val_acc:.4f} | val_f1={val_f1:.4f} | val_auc={val_auc:.4f}")

    # scheduler.step(train_loss)

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        epochs_no_improve = 0
        torch.save(model.state_dict(), "model_weights/best_model.pt")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= early_stop_patience:
            print("Early stopping.")
            break

Epoch 1: train_loss=0.6982, val_acc=0.6534, val_f1=0.7845
Epoch 2: train_loss=0.6917, val_acc=0.6932, val_f1=0.8176
Epoch 3: train_loss=0.6925, val_acc=0.6818, val_f1=0.8082
Epoch 4: train_loss=0.6902, val_acc=0.4261, val_f1=0.4294
Epoch 5: train_loss=0.6941, val_acc=0.5284, val_f1=0.6498
Epoch 6: train_loss=0.6925, val_acc=0.5455, val_f1=0.6774
Epoch 7: train_loss=0.6902, val_acc=0.5057, val_f1=0.5915
Epoch 8: train_loss=0.6916, val_acc=0.6420, val_f1=0.7758
Epoch 00008: reducing learning rate of group 0 to 5.0000e-01.
Epoch 9: train_loss=0.6852, val_acc=0.5795, val_f1=0.7016
Epoch 10: train_loss=0.6778, val_acc=0.4091, val_f1=0.4286
Epoch 11: train_loss=0.6820, val_acc=0.4489, val_f1=0.5403
Epoch 12: train_loss=0.6856, val_acc=0.4886, val_f1=0.5833
Early stopping triggered.


# Purely emotion-based dataset analysis

In [None]:
df = pd.read_csv("processed_data/silesian_deception_dataset/emotions.csv")
df.groupby("deceptive").mean()

Unnamed: 0_level_0,id,frame,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral
deceptive,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,452.546502,3778.415965,0.130773,0.120608,0.121079,0.134827,0.148551,0.12061,0.223552
True,459.625536,6431.186514,0.129634,0.119907,0.12053,0.132197,0.152211,0.119913,0.225608


In [None]:
df.groupby("deceptive").std()

Unnamed: 0_level_0,id,frame,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral
deceptive,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,273.434383,3758.070614,0.040251,0.010359,0.012758,0.046865,0.060667,0.010363,0.085442
True,270.189444,2612.202414,0.039913,0.00998,0.013235,0.043731,0.064583,0.00999,0.085189


Minimal difference between truth/lie sequences. The model cannot learn anything, which is shown by the stable loss.