In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

from transformers import BertModel, BertTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import spacy
from textblob import TextBlob

In [9]:
df = pd.read_csv('train.tsv', delimiter='\t', header = None)

df.rename({0: 'id', 1: 'label', 2: 'statement', 3: 'subject', 4: 'speaker', 5: 'job-title',
           6: 'state_info', 7: 'party_affiliation', 8: 'barely_true_counts', 9: 'false_counts',
           10: 'half_true_counts', 11: 'mostly_true_counts', 12: 'pants_on_fire_counts', 13: 'context'
          }, axis = 1, inplace = True)

mapping = {'false': 0, 'half-true': 1, 'mostly-true': 2, 'true': 3, 'barely-true': 4,
       'pants-fire': 5}

df['label'] = df['label'].replace(mapping)

# Data engineering

In [10]:
def clean_party(val):
    
    left = {'democrat', 'green', 'democratic-farmer-labor', 'ocean-state-tea-party-action'}
    right = {'republican', 'libertarian', 'tea-party-member', 'Moderate',
       'constitution-party'}
    
    center_none = {'none', 'organization', 'independent',
       'columnist', 'activist', 'talk-show-host',
       'newsmaker', 'journalist', 'labor-leader', 'state-official',
       'business-leader', 'education-official', 'tea-party-member', np.NaN,
       'liberal-party-canada', 'government-body', 'Moderate',
       }
    
    if val in left:
        return 0
    elif val in center_none:
        return 1
    else:
        return 2

df['party_affiliation'] = df['party_affiliation'].apply(clean_party)

In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['subject'] = label_encoder.fit_transform(df['subject'])

In [12]:
def get_sentiment(text):
    blob = TextBlob(text)
    return (blob.sentiment.polarity + 1) / 2

df['sentiment'] = df['statement'].apply(get_sentiment)

In [13]:
#removing stop words, neglecting casing

df['statement'] = df['statement'].str.lower()

import nltk
from nltk.corpus import stopwords

# Download the stopwords dataset
nltk.download('stopwords')
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(sentence)
    filtered_sentence = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_sentence)

df['statement_cleaned'] = df['statement'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ociolli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Glove embeddings

In [14]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec


# Path to the downloaded GloVe embeddings file
glove_file = 'glove.6B.100d.txt'

word2vec_file = 'glove.6B.100d.word2vec'  # Any path and filename you prefer

# Convert GloVe format to Word2Vec format
glove2word2vec(glove_file, word2vec_file)

# Load the Word2Vec model
glove_model = KeyedVectors.load_word2vec_format(word2vec_file, binary=False)


  glove2word2vec(glove_file, word2vec_file)


In [15]:
def sentence_embedding(sentence, model, dim=100):
    # Tokenize the sentence into words
    words = nltk.word_tokenize(sentence)
    
    # Filter out words that are not in the model's vocabulary
    words_in_vocab = [word for word in words if word in model.key_to_index]
    
    # Check if there are words in the sentence that are in the model's vocabulary
    if words_in_vocab:
        # Compute the mean of word embeddings for the words in the sentence
        embedding = sum(model[word] for word in words_in_vocab) / len(words_in_vocab)
        return embedding
    else:
        # If none of the words in the sentence are in the model's vocabulary, return None
        return None


In [16]:
df['glove'] = df['statement_cleaned'].apply(lambda x: sentence_embedding(x, glove_model))


# Credibility score

In [17]:
cs_lookup = {}

for speaker in set(df['speaker'].unique()) - {np.nan}:
    try:
        limited = df[df['speaker'] == speaker]
        vals = limited.iloc[0]
       
        MTC = vals.loc['mostly_true_counts']
        HTC = vals.loc['half_true_counts']
        BTC = vals.loc['barely_true_counts']
        FC = vals.loc['false_counts']
        PFC = vals.loc['pants_on_fire_counts']

        #exclude true counts = weight 0
        cs = (0.2 * MTC) + (0.5 * HTC) + (0.75 * BTC) + (0.9 * FC) + (1 * PFC)

        cs_lookup[speaker] = cs / (MTC + HTC + BTC + FC + PFC)
        
    except:
        print(speaker)
    

  cs_lookup[speaker] = cs / (MTC + HTC + BTC + FC + PFC)


In [18]:
mean_cs = np.nanmean(list(cs_lookup.values()))

In [19]:
for key, val in cs_lookup.items():
    if np.isnan(val):
        cs_lookup[key] = mean_cs
    else: continue

In [37]:
def impute_cred(val):
    if not isinstance(val, str) or (val not in cs_lookup.keys()):
        return mean_cs

    else:
        return cs_lookup[val]

In [21]:
df['credibility'] = df['speaker'].apply(impute_cred)

# Model

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('cuda')
else:
    device = torch.device("cpu")

cuda


In [20]:
#sentiment, party_affiliation (subject later)

In [22]:
class seqModel(nn.Module):
    def __init__(self, hidden_size=128, num_classes=6):
        super(seqModel, self).__init__()
        
        self.lstm = torch.nn.LSTM(100, hidden_size)
        self.dropout = torch.nn.Dropout(p = 0.15)
        self.state_fc = nn.Linear(hidden_size, 64)

        self.feature_fc = nn.Linear(3, 64)

        self.combined_fc = nn.Linear(128, 6)

            
    def forward(self, input_seq, party_affiliation, credibility, sentiment):
        
        # statement branch
        statement_out, _ = self.lstm(input_seq)
#         statement_out = statement_out[:, -1, :]
        statement_out = self.dropout(statement_out)
        statement_out = F.relu(self.state_fc(statement_out))
        statement_out = self.dropout(statement_out)

        # feature branch
        feature_vec = torch.cat([party_affiliation.unsqueeze(1),
                                 credibility.unsqueeze(1), sentiment.unsqueeze(1)], dim = 1)
        
        feature_out = F.relu(self.feature_fc(feature_vec))

        combined = torch.cat([statement_out, feature_out], dim = 1)
        combined = self.combined_fc(combined)

        return F.softmax(combined, dim = 1)


In [23]:
class CustomDataset(Dataset):
    def __init__(self, df):
        self.labels = torch.tensor(df['label'].values, dtype=torch.long)
        self.glove_embeddings = torch.stack([torch.tensor(embedding) for embedding in df['glove']], dim=0)
        self.party_affiliation = torch.tensor(df['party_affiliation'].values, dtype=torch.long)
        self.credibility = torch.tensor(df['credibility'].values, dtype=torch.float)
        self.sentiment = torch.tensor(df['sentiment'].values, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'label': self.labels[idx],
            'glove_embedding': self.glove_embeddings[idx],
            'party_affiliation': self.party_affiliation[idx],
            'credibility': self.credibility[idx],
            'sentiment': self.sentiment[idx]
        }

In [24]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = CustomDataset(train_df)
val_dataset = CustomDataset(val_df)

train_data_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

# Training and eval

In [50]:
model = seqModel()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [51]:
# Training loop
num_epochs = 30

for epoch in range(num_epochs):
    model.train()

    for batch in train_data_loader:
        # Move data to device
        for key in batch:
            batch[key] = batch[key].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        
        # Forward pass
        output = model(batch['glove_embedding'], batch['party_affiliation'],
                       batch['credibility'], batch['sentiment'])

        # Calculate loss
        loss = criterion(output, batch['label'])

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = []
        val_labels = []

        for val_batch in val_data_loader:
            # Move data to device
            for key in val_batch:
                val_batch[key] = val_batch[key].to(device)

            # Forward pass
            val_output = model(val_batch['glove_embedding'], val_batch['party_affiliation'],
                               val_batch['credibility'], val_batch['sentiment'])
            val_outputs.append(val_output)
            val_labels.append(val_batch['label'])

        val_outputs = torch.cat(val_outputs, dim=0)
        val_labels = torch.cat(val_labels, dim=0)

        # Calculate validation accuracy
        _, predicted = torch.max(val_outputs, 1)
        val_accuracy = accuracy_score(val_labels.cpu().numpy(), predicted.cpu().numpy())

        print(f"Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}")

print("Training complete!")

Epoch 1/30, Validation Accuracy: 0.2290
Epoch 2/30, Validation Accuracy: 0.2607
Epoch 3/30, Validation Accuracy: 0.2729
Epoch 4/30, Validation Accuracy: 0.2856
Epoch 5/30, Validation Accuracy: 0.2988
Epoch 6/30, Validation Accuracy: 0.3120
Epoch 7/30, Validation Accuracy: 0.2969
Epoch 8/30, Validation Accuracy: 0.3091
Epoch 9/30, Validation Accuracy: 0.3052
Epoch 10/30, Validation Accuracy: 0.3262
Epoch 11/30, Validation Accuracy: 0.3286
Epoch 12/30, Validation Accuracy: 0.3325
Epoch 13/30, Validation Accuracy: 0.3159
Epoch 14/30, Validation Accuracy: 0.3306
Epoch 15/30, Validation Accuracy: 0.3291
Epoch 16/30, Validation Accuracy: 0.3267
Epoch 17/30, Validation Accuracy: 0.3252
Epoch 18/30, Validation Accuracy: 0.3174
Epoch 19/30, Validation Accuracy: 0.3242
Epoch 20/30, Validation Accuracy: 0.3262
Epoch 21/30, Validation Accuracy: 0.3291
Epoch 22/30, Validation Accuracy: 0.3271
Epoch 23/30, Validation Accuracy: 0.3306
Epoch 24/30, Validation Accuracy: 0.3247
Epoch 25/30, Validation A

In [30]:
model.eval()
with torch.no_grad():
    val_outputs = []
    val_labels = []

    for val_batch in val_data_loader:
        # Move data to device
        for key in val_batch:
            val_batch[key] = val_batch[key].to(device)

        # Forward pass
        val_output = model(val_batch['glove_embedding'], val_batch['party_affiliation'],
                           val_batch['credibility'], val_batch['sentiment'])
        val_outputs.append(val_output)
        val_labels.append(val_batch['label'])

    val_outputs = torch.cat(val_outputs, dim=0)
    val_labels = torch.cat(val_labels, dim=0)

    # Calculate validation accuracy
    _, predicted = torch.max(val_outputs, 1)
    val_accuracy = accuracy_score(val_labels.cpu().numpy(), predicted.cpu().numpy())
    print(val_accuracy)

0.32421875


# Testing

In [41]:
test = pd.read_csv('test.tsv', delimiter='\t', header = None)

test.rename({0: 'id', 1: 'label', 2: 'statement', 3: 'subject', 4: 'speaker', 5: 'job-title',
           6: 'state_info', 7: 'party_affiliation', 8: 'barely_true_counts', 9: 'false_counts',
           10: 'half_true_counts', 11: 'mostly_true_counts', 12: 'pants_on_fire_counts', 13: 'context'
          }, axis = 1, inplace = True)

mapping = {'false': 0, 'half-true': 1, 'mostly-true': 2, 'true': 3, 'barely-true': 4,
       'pants-fire': 5}

test['label'] = test['label'].replace(mapping)


test['party_affiliation'] = test['party_affiliation'].apply(clean_party)
#test['subject'] = label_encoder.fit_transform(test['subject'])
test['credibility'] = test['speaker'].apply(impute_cred)
test['sentiment'] = test['statement'].apply(get_sentiment)

test['statement'] = test['statement'].str.lower()
test['statement_cleaned'] = test['statement'].apply(remove_stopwords)
test['glove'] = test['statement_cleaned'].apply(lambda x: sentence_embedding(x, glove_model))


In [47]:
test_dataset = CustomDataset(test)

test_data_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [52]:
model.eval()
with torch.no_grad():
    test_outputs = []
    test_labels = []

    for test_batch in test_data_loader:
        # Move data to device
        for key in test_batch:
            test_batch[key] = test_batch[key].to(device)

        # Forward pass
        test_output = model(test_batch['glove_embedding'], test_batch['party_affiliation'],
                           test_batch['credibility'], test_batch['sentiment'])
        test_outputs.append(test_output)
        test_labels.append(test_batch['label'])

    test_outputs = torch.cat(test_outputs, dim=0)
    test_labels = torch.cat(test_labels, dim=0)

    # Calculate validation accuracy
    _, predicted = torch.max(test_outputs, 1)
    test_accuracy = accuracy_score(test_labels.cpu().numpy(), predicted.cpu().numpy())
    print(test_accuracy)

0.287292817679558
