In [2]:
import pandas as pd
import numpy as np
# from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

from transformers import BertModel, BertTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# import spacy
from textblob import TextBlob

In [3]:
df = pd.read_csv('train.tsv', delimiter='\t', header = None)

df.rename({0: 'id', 1: 'label', 2: 'statement', 3: 'subject', 4: 'speaker', 5: 'job-title',
           6: 'state_info', 7: 'party_affiliation', 8: 'barely_true_counts', 9: 'false_counts',
           10: 'half_true_counts', 11: 'mostly_true_counts', 12: 'pants_on_fire_counts', 13: 'context'
          }, axis = 1, inplace = True)

mapping = {'false': 0, 'half-true': 1, 'mostly-true': 2, 'true': 3, 'barely-true': 4,
       'pants-fire': 5}

df['label'] = df['label'].replace(mapping)

In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['subject'] = label_encoder.fit_transform(df['subject'])

party_encoder = LabelEncoder()
df['party_affiliation'] = party_encoder.fit_transform(df['party_affiliation'])

In [5]:
def get_sentiment(text):
    blob = TextBlob(text)
    return (blob.sentiment.polarity + 1) / 2

df['sentiment'] = df['statement'].apply(get_sentiment)

In [6]:
#removing stop words, neglecting casing

df['statement'] = df['statement'].str.lower()

import nltk
from nltk.corpus import stopwords

# Download the stopwords dataset
nltk.download('stopwords')
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(sentence)
    filtered_sentence = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_sentence)

df['statement_cleaned'] = df['statement'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ociolli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Credibility Scores

In [7]:
cs_lookup = {}

for speaker in set(df['speaker'].unique()) - {np.nan}:
    try:
        limited = df[df['speaker'] == speaker]
        vals = limited.iloc[0]
       
        MTC = vals.loc['mostly_true_counts']
        HTC = vals.loc['half_true_counts']
        BTC = vals.loc['barely_true_counts']
        FC = vals.loc['false_counts']
        PFC = vals.loc['pants_on_fire_counts']

        #exclude true counts = weight 0
        cs = (0.2 * MTC) + (0.5 * HTC) + (0.75 * BTC) + (0.9 * FC) + (1 * PFC)

        cs_lookup[speaker] = cs / (MTC + HTC + BTC + FC + PFC)
        
    except:
        print(speaker)

mean_cs = np.nanmean(list(cs_lookup.values()))

  cs_lookup[speaker] = cs / (MTC + HTC + BTC + FC + PFC)


In [8]:
for key, val in cs_lookup.items():
    if np.isnan(val):
        cs_lookup[key] = mean_cs
    else: continue
        

def impute_cred(val):
    if not isinstance(val, str) or (val not in cs_lookup.keys()):
        return mean_cs

    else:
        return cs_lookup[val]

In [9]:
df['credibility'] = df['speaker'].apply(impute_cred)

# Model

In [10]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('cuda')
else:
    device = torch.device("cpu")

cuda


In [11]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, df):
        self.encodings = tokenizer(df['statement'].tolist(), 
                                   truncation=True, padding='max_length', return_tensors='pt', max_length=768)
        self.party_affiliations = torch.tensor(df['party_affiliation'].values)
        self.credibilities = torch.tensor(df['credibility'].values)
        self.sentiments = torch.tensor(df['sentiment'].values)
        self.subjects = torch.tensor(df['subject'].values)

        self.labels = torch.tensor(df['label'].values)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['party_affiliation'] = self.party_affiliations[idx]
        item['credibility'] = self.credibilities[idx]
        item['sentiment'] = self.sentiments[idx]
        item['subject'] = self.subjects[idx]
        item['label'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2)

train_dataset = CustomDataset(tokenizer, train_df)
val_dataset = CustomDataset(tokenizer, val_df)

# Create DataLoader instances
train_data_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [12]:
class TransformerModel(nn.Module):
    def __init__(self, hidden_size=128, num_classes=6, num_heads=8, num_layers=1):
        super(TransformerModel, self).__init__()

        # Statement layers
        self.transformer_layer = nn.TransformerEncoderLayer(
            d_model=768, 
            nhead=num_heads, 
            dim_feedforward=hidden_size,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.transformer_layer, 
            num_layers=num_layers,
        )
        self.dropout = torch.nn.Dropout(p=0.15)
        self.state_fc = nn.Linear(768, 128)

        # Feature layers
        self.party_embed = torch.nn.Embedding(24, 32)
        self.subject_embed = torch.nn.Embedding(3828, 256)

        self.feature_fc = nn.Linear(290, 128)
        self.dropout2 = torch.nn.Dropout(p=0.25)
        
        self.combined_fc_1 = nn.Linear(256, 64)
        self.combined_fc = nn.Linear(64, num_classes)

    def forward(self, input_seq, party_affiliation, subject, credibility, sentiment):
        # Statement branch
        statement_out = self.transformer_encoder(input_seq.float()) 
        statement_out = self.dropout(statement_out)
        statement_out = F.relu(self.state_fc(statement_out))
        statement_out = self.dropout(statement_out)

        # Feature branch
        party_affiliation = self.party_embed(party_affiliation)
        subject = self.subject_embed(subject)

        feature_vec = torch.cat([party_affiliation.float(), subject.float(),
                                 credibility.unsqueeze(1), sentiment.unsqueeze(1)], dim=1)
        feature_out = F.relu(self.feature_fc(feature_vec.float()))

    
        combined = torch.cat([statement_out, feature_out], dim=1)
        combined = self.combined_fc_1(combined)
        combined = self.combined_fc(combined)

        return F.softmax(combined, dim=1)

In [21]:
model = TransformerModel()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)


num_epochs = 15
for epoch in range(num_epochs):
    model.train()

    for batch in train_data_loader:

        for key in batch:
            batch[key] = batch[key].to(device)

        optimizer.zero_grad()

        # Forward pass
        output = model(batch['input_ids'],
                       batch['party_affiliation'],
                       batch['subject'], 
                       batch['credibility'], 
                       batch['sentiment'])

        loss = criterion(output, batch['label'])

        # Backward pass
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = []
        val_labels = []

        
        for val_batch in val_data_loader:

            for key in val_batch:
                val_batch[key] = val_batch[key].to(device)

            # Forward pass
            val_output = model(val_batch['input_ids'],
                                val_batch['party_affiliation'],
                                val_batch['subject'],
                                val_batch['credibility'],
                                val_batch['sentiment'])
            val_outputs.append(val_output)
            val_labels.append(val_batch['label'])

        val_outputs = torch.cat(val_outputs, dim=0)
        val_labels = torch.cat(val_labels, dim=0)

        # Calculate validation accuracy
        _, predicted = torch.max(val_outputs, 1)
        val_accuracy = accuracy_score(val_labels.cpu().numpy(), predicted.cpu().numpy())

        print(f"Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}")

print("Training complete!")

Epoch 1/15, Validation Accuracy: 0.2104
Epoch 2/15, Validation Accuracy: 0.2241
Epoch 3/15, Validation Accuracy: 0.2261
Epoch 4/15, Validation Accuracy: 0.2300
Epoch 5/15, Validation Accuracy: 0.2271
Epoch 6/15, Validation Accuracy: 0.2354
Epoch 7/15, Validation Accuracy: 0.2231
Epoch 8/15, Validation Accuracy: 0.2271
Epoch 9/15, Validation Accuracy: 0.2324
Epoch 10/15, Validation Accuracy: 0.2275
Epoch 11/15, Validation Accuracy: 0.2300
Epoch 12/15, Validation Accuracy: 0.2261
Epoch 13/15, Validation Accuracy: 0.2319
Epoch 14/15, Validation Accuracy: 0.2363
Epoch 15/15, Validation Accuracy: 0.2314
Training complete!
