In [20]:
import pandas as pd
import numpy as np
from sklearn import metrics
import transformers
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os

### TRIAL DATA

In [24]:
# PREPROCESSING 
# Extract the data from the files and create df with all the transcriptions labeled 0 for deceptive and 1 for truthful
path_to_lies = "data/trial/transcription/deceptive"
path_to_truths = "data/trial/transcription/truthful"

def read_files_from_directory(directory_path, label):
    data_list = []
    for file_name in os.listdir(directory_path):
        if file_name.startswith('trial_lie_') and label == 0 or file_name.startswith('trial_truth_') and label == 1:
            file_path = os.path.join(directory_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()
                data_list.append((content, label))
    return data_list

data = read_files_from_directory(path_to_lies, 0) + read_files_from_directory(path_to_truths, 1)
df = pd.DataFrame(data, columns=['text', 'label'])
#df = df.sample(frac=1).reset_index(drop=True)
print(df.head())
print(df['label'].value_counts())

                                                text  label
0  No sir I did not. I absolutely did not. No sir...      0
1  ... and she approached me, and at that time th...      0
2                      No sir I was not, not at all.      0
3  He had told me that he had had a dream that, a...      0
4  And he told me that, ammm … he was trying to f...      0
label
0    61
1    60
Name: count, dtype: int64


In [25]:
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    print(f'The GPU {torch.cuda.get_device_name(0)} is available')
    torch.cuda.empty_cache()
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead')

The GPU NVIDIA GeForce GTX 1050 is available


In [26]:
# from here based on https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb
# Sections of config
MAX_LEN = 200
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [27]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [28]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (121, 2)
TRAIN Dataset: (97, 2)
TEST Dataset: (24, 2)


In [29]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [30]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 1) # add classification layer
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)


model = BERTClass()
model.to(device)
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [31]:
def train(epoch):
    model.train()
    total_loss = 0
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids).squeeze()
        
 
        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        total_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch: {epoch}, Average Loss:  {total_loss / len(training_loader)}')
    
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Average Loss:  0.7200074613094329
Epoch: 1, Average Loss:  0.6604593575000763
Epoch: 2, Average Loss:  0.6352026581764221
Epoch: 3, Average Loss:  0.5767881214618683
Epoch: 4, Average Loss:  0.5058984190225602
Epoch: 5, Average Loss:  0.4046668171882629
Epoch: 6, Average Loss:  0.3273684769868851
Epoch: 7, Average Loss:  0.2511323481798172
Epoch: 8, Average Loss:  0.20757670402526857
Epoch: 9, Average Loss:  0.15708430707454682


In [32]:
model.eval()
fin_targets=[]
fin_outputs=[]
with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask, token_type_ids).squeeze()
        fin_targets.extend(targets.cpu().detach().numpy().tolist())
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())


outputs = np.array(fin_outputs) >= 0.5
accuracy = metrics.accuracy_score(fin_targets, outputs)
f1_score_micro = metrics.f1_score(fin_targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(fin_targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")




Accuracy Score = 0.625
F1 Score (Micro) = 0.625
F1 Score (Macro) = 0.6243478260869565


### BAG OF LIES DATA

In [34]:
df1 = pd.read_csv("data/bagoflies/BagOfLies/audio/metadata.csv")  
df2 = pd.read_csv("data/bagoflies/BagOfLies/transcription/transcriptions.csv")  
df2['file_name'] = df2['file_name'].str.split('\\').str[-1]
merged_df = pd.merge(df1, df2, on='file_name')
df = merged_df[['text', 'label']]
df.head()

Unnamed: 0,text,label
0,There is a river and a bridge crossing over i...,1
1,A Labrador has jumped to catch a Frisbee and ...,0
2,A plane is trying to land in the river. The p...,0
3,There is a girl wearing a blue dress and she ...,0
4,A boy is drinking coffee and reading a newspa...,0


In [43]:
# Creating the dataset and dataloader for the neural network
VALID_BATCH_SIZE = 5
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

model = BERTClass()
model.to(device)
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    train(epoch)

FULL Dataset: (325, 2)
TRAIN Dataset: (260, 2)
TEST Dataset: (65, 2)




Epoch: 0, Average Loss:  0.6973642202524039
Epoch: 1, Average Loss:  0.6460712804244115
Epoch: 2, Average Loss:  0.5657660445341697
Epoch: 3, Average Loss:  0.4574064497764294
Epoch: 4, Average Loss:  0.34386261380635774
Epoch: 5, Average Loss:  0.2546419587272864
Epoch: 6, Average Loss:  0.17458916856692389
Epoch: 7, Average Loss:  0.11282079065075287
Epoch: 8, Average Loss:  0.08208147035195278
Epoch: 9, Average Loss:  0.063583032729534


In [44]:
model.eval()
fin_targets=[]
fin_outputs=[]
with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask, token_type_ids).squeeze()
        fin_targets.extend(targets.cpu().detach().numpy().tolist())
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())


outputs = np.array(fin_outputs) >= 0.5
accuracy = metrics.accuracy_score(fin_targets, outputs)
f1_score_micro = metrics.f1_score(fin_targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(fin_targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")



Accuracy Score = 0.7384615384615385
F1 Score (Micro) = 0.7384615384615385
F1 Score (Macro) = 0.738213693437574
