In [82]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from transformers import BertModel, BertTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import spacy
from textblob import TextBlob


In [15]:
df = pd.read_csv('train.tsv', delimiter='\t', header = None)

df.rename({0: 'id', 1: 'label', 2: 'statement', 3: 'subject', 4: 'speaker', 5: 'job-title',
           6: 'state_info', 7: 'party_affiliation', 8: 'barely_true_counts', 9: 'false_counts',
           10: 'half_true_counts', 11: 'mostly_true_counts', 12: 'pants_on_fire_counts', 13: 'context'
          }, axis = 1, inplace = True)

mapping = {'false': 0, 'half-true': 1, 'mostly-true': 2, 'true': 3, 'barely-true': 4,
       'pants-fire': 5}

df['label'] = df['label'].replace(mapping)

In [16]:
def clean_party(val):
    
    left = {'democrat', 'green', 'democratic-farmer-labor', 'ocean-state-tea-party-action'}
    right = {'republican', 'libertarian', 'tea-party-member', 'Moderate',
       'constitution-party'}
    
    center_none = {'none', 'organization', 'independent',
       'columnist', 'activist', 'talk-show-host',
       'newsmaker', 'journalist', 'labor-leader', 'state-official',
       'business-leader', 'education-official', 'tea-party-member', np.NaN,
       'liberal-party-canada', 'government-body', 'Moderate',
       }
    
    if val in left:
        return 0
    elif val in center_none:
        return 1
    else:
        return 2

df['party_affiliation'] = df['party_affiliation'].apply(clean_party)

In [17]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['subject'] = label_encoder.fit_transform(df['subject'])

In [83]:
def get_sentiment(text):
    blob = TextBlob(text)
    return (blob.sentiment.polarity + 1) / 2

df['sentiment'] = df['statement'].apply(get_sentiment)

In [18]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print('cuda')
else:
    device = torch.device("cpu")

cuda


# Model

In [85]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

class ClaimDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        statement = self.data['statement'].iloc[idx]
        political_affiliation = self.data['party_affiliation'].iloc[idx]
        subject_encoded = self.data['subject_encoded'].iloc[idx]
        label = self.data['label'].iloc[idx]
        sentiment = self.data['sentiment'].iloc[idx]

        # Tokenize and encode text input with attention mask
        inputs = self.tokenizer(
            statement,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_statement': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'input_political_affiliation': torch.tensor(political_affiliation, dtype=torch.float),
            'input_subject': torch.tensor(subject_encoded, dtype=torch.float),
            'input_sentiment': torch.tensor(sentiment, dtype=torch.float),
            'label': torch.tensor(label, dtype=torch.long)
            
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128  # Adjust as needed
claim_dataset = ClaimDataset(df, tokenizer, max_length)

# Split the dataset into train, validation, and test sets
train_size = int(0.8 * len(claim_dataset))
val_size = (len(claim_dataset) - train_size) // 2
test_size = len(claim_dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(claim_dataset, [train_size, val_size, test_size])

# Create DataLoader instances
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [86]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Define the model
class ClaimClassifier(nn.Module):
    def __init__(self, hidden_size=768, num_classes=6):
        super(ClaimClassifier, self).__init__()
        self.bert_model = bert_model
        self.fc_statement = nn.Linear(hidden_size, num_classes)
        self.fc_political_affiliation = nn.Linear(1, 1)
        self.fc_sentiment = nn.Linear(1, 1)
        self.fc_subject = nn.Linear(1, 1)
        self.fc_final = nn.Linear(9, num_classes)

    def forward(self, input_statement, attention_mask, input_political_affiliation, input_subject, input_sentiment):
        # Tokenize and encode inputs
        pooled_output = self.bert_model(input_statement, attention_mask=attention_mask).pooler_output
        
        # Apply linear layers to numerical features
        fc_statement_output = self.fc_statement(pooled_output)
        fc_political_affiliation_output = self.fc_political_affiliation(input_political_affiliation.view(-1, 1))
        fc_subject_output = self.fc_subject(input_subject.view(-1, 1))
        fc_sentiment_output = self.fc_subject(input_sentiment.view(-1, 1))

        # Concatenate or add numerical features
        merged_inputs = torch.cat([fc_statement_output, fc_political_affiliation_output, 
                                   fc_subject_output, fc_sentiment_output], dim=1)

        # Apply final linear layer
        output = self.fc_final(merged_inputs)

        return output

In [87]:
# Create an instance of the model
model = ClaimClassifier().to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [88]:
# Training loop
epochs = 3
batch_size = 32

for epoch in range(epochs):
    
    model.train()
    total_loss = 0.0
    
    for batch in train_loader:
        input_statement = batch['input_statement'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        input_political_affiliation = batch['input_political_affiliation'].to(device)
        input_subject = batch['input_subject'].to(device)
        input_sentiment = batch['input_sentiment'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        output = model(input_statement, attention_mask, input_political_affiliation, 
                       input_subject, input_sentiment)

        loss = criterion(output, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    # Calculate average training loss
    average_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    correct_predictions = 0

    with torch.no_grad():
        for val_batch in val_loader:
            val_input_statement = val_batch['input_statement'].to(device)
            val_attention_mask = batch['attention_mask'].to(device)
            val_input_political_affiliation = val_batch['input_political_affiliation'].to(device)
            val_input_subject = val_batch['input_subject'].to(device)
            val_input_sentiment = batch['input_sentiment'].to(device)
            val_labels = val_batch['label'].to(device)

            val_outputs = model(val_input_statement, val_attention_mask, val_input_political_affiliation, 
                                val_input_subject, val_input_sentiment)
            val_loss += criterion(val_outputs, val_labels).item()

            # Calculate the number of correct predictions
            _, predicted = torch.max(val_outputs, 1)
            correct_predictions += (predicted == val_labels).sum().item()

    # Calculate average validation loss
    average_val_loss = val_loss / len(val_loader)

    # Calculate accuracy
    accuracy = correct_predictions / len(val_dataset)

    print(f'Epoch {epoch+1}/{epochs}, Training Loss: {average_train_loss:.4f}, Validation Loss: {average_val_loss:.4f}, Accuracy: {accuracy:.4f}')

Epoch 1/3, Training Loss: 132.1845, Validation Loss: 21.2329, Accuracy: 0.2295
Epoch 2/3, Training Loss: 10.2368, Validation Loss: 3.4089, Accuracy: 0.1729
Epoch 3/3, Training Loss: 2.0646, Validation Loss: 1.9041, Accuracy: 0.2070


In [90]:
# Test the model
correct_predictions = 0
with torch.no_grad():
        for val_batch in test_loader:
            val_input_statement = val_batch['input_statement'].to(device)
            val_attention_mask = batch['attention_mask'].to(device)
            val_input_political_affiliation = val_batch['input_political_affiliation'].to(device)
            val_input_subject = val_batch['input_subject'].to(device)
            val_input_sentiment = batch['input_sentiment'].to(device)
            val_labels = val_batch['label'].to(device)

            val_outputs = model(val_input_statement, val_attention_mask, val_input_political_affiliation, 
                                val_input_subject, val_input_sentiment)
            val_loss += criterion(val_outputs, val_labels).item()

            # Calculate the number of correct predictions
            _, predicted = torch.max(val_outputs, 1)
            correct_predictions += (predicted == val_labels).sum().item()

        # Calculate average validation loss
        average_val_loss = val_loss / len(val_loader)

        # Calculate accuracy
        accuracy = correct_predictions / len(val_dataset)

In [91]:
accuracy

0.2177734375

# Evaluation on train dataset

In [92]:
test = pd.read_csv('test.tsv', delimiter='\t', header = None)

test.rename({0: 'id', 1: 'label', 2: 'statement', 3: 'subject', 4: 'speaker', 5: 'job-title',
           6: 'state_info', 7: 'party_affiliation', 8: 'barely_true_counts', 9: 'false_counts',
           10: 'half_true_counts', 11: 'mostly_true_counts', 12: 'pants_on_fire_counts', 13: 'context'
          }, axis = 1, inplace = True)

mapping = {'false': 0, 'half-true': 1, 'mostly-true': 2, 'true': 3, 'barely-true': 4,
       'pants-fire': 5}

test['label'] = test['label'].replace(mapping)


def clean_party(val):
    
    left = {'democrat', 'green', 'democratic-farmer-labor', 'ocean-state-tea-party-action'}
    right = {'republican', 'libertarian', 'tea-party-member', 'Moderate',
       'constitution-party'}
    
    center_none = {'none', 'organization', 'independent',
       'columnist', 'activist', 'talk-show-host',
       'newsmaker', 'journalist', 'labor-leader', 'state-official',
       'business-leader', 'education-official', 'tea-party-member', np.NaN,
       'liberal-party-canada', 'government-body', 'Moderate',
       }
    
    if val in left:
        return 0
    elif val in center_none:
        return 1
    else:
        return 2

test['party_affiliation'] = test['party_affiliation'].apply(clean_party)
test['subject'] = label_encoder.fit_transform(test['subject'])
df['sentiment'] = df['statement'].apply(get_sentiment)

In [93]:
full_test_dataset = ClaimDataset(test, tokenizer)  
full_test_loader = DataLoader(full_test_dataset, batch_size=batch_size, shuffle=False)

In [94]:
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_statement = batch['input_statement'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        input_political_affiliation = batch['input_political_affiliation'].to(device)
        input_subject = batch['input_subject'].to(device)
        input_sentiment = batch['input_sentiment'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = model(input_statement, attention_mask, input_political_affiliation, 
                        input_subject, input_sentiment)

        # Convert logits to predictions
        _, predictions = torch.max(outputs, dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate Metrics
accuracy = accuracy_score(all_labels, all_predictions)
report = classification_report(all_labels, all_predictions)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.21484375
Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.57      0.33       213
           1       0.22      0.39      0.28       217
           2       0.32      0.04      0.06       196
           3       0.08      0.05      0.06       153
           4       0.00      0.00      0.00       166
           5       0.00      0.00      0.00        79

    accuracy                           0.21      1024
   macro avg       0.14      0.17      0.12      1024
weighted avg       0.17      0.21      0.15      1024



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
