In [9]:
import pandas as pd

# Load the dataset
file_path = "cepe_dataset.txt"  # Replace with the actual file path
try:
    data = pd.read_csv(
        file_path,
        delimiter="\t",  # Assuming tab-separated values
        header=None,  # No predefined headers
        names=[
            'claim_label', 'topic_sentence', 'evidence_label',
            'claim_sentence', 'evidence_candidate_sentence', 'article_id', 'full_label'],
        skipinitialspace=True
    )
except Exception as e:
    print(f"Error loading file: {e}")
    exit()

# Define helper functions
def clean_claim_label(label):
    """
    Validate and map claim labels.
    'C' -> 1 (claim), 'O' -> 0 (non-claim)
    """
    return 1 if label == 'C' else 0 if label == 'O' else None

def clean_evidence_label(label):
    """
    Validate and map evidence labels.
    'E' -> 1 (evidence), 'O' -> 0 (non-evidence)
    """
    return 1 if label == 'E' else 0 if label == 'O' else None

def validate_full_label(label):
    """
    Validate the format of full_label (e.g., 'C-index', 'E-B-index', 'E-I-index', 'O').
    """
    if label == 'O':
        return label
    if isinstance(label, str) and all(
        part.startswith(('C-', 'E-B-', 'E-I-')) or part == 'O'
        for part in label.split('|')
    ):
        return label
    return None

def validate_article_id(article_id):
    """
    Validate article_id format as 'number_number' (e.g., '1_3').
    """
    return isinstance(article_id, str) and bool(pd.Series([article_id]).str.match(r'^\d+_\d+$').iloc[0])

# Clean and validate data
try:
    data['claim_label'] = data['claim_label'].apply(clean_claim_label)
    data['evidence_label'] = data['evidence_label'].apply(clean_evidence_label)
    data['full_label'] = data['full_label'].apply(validate_full_label)

    valid_rows = (
        data['claim_label'].notna() &
        data['evidence_label'].notna() &
        data['full_label'].notna() &
        data['topic_sentence'].apply(lambda x: isinstance(x, str) and x.strip() != "") &
        data['claim_sentence'].apply(lambda x: isinstance(x, str) and x.strip() != "") &
        data['evidence_candidate_sentence'].apply(lambda x: isinstance(x, str) and x.strip() != "") &
        data['article_id'].apply(validate_article_id)
    )

    # Separate valid and invalid rows
    cleaned_data = data[valid_rows]
    discarded_data = data[~valid_rows]

    # Save processed data
    cleaned_data.to_csv("cleaned_cepe_data.csv", index=False)
    discarded_data.to_csv("discarded_cepe_data.csv", index=False)

    # Summarize results
    print("Processed Dataset Summary:")
    print(f"Total Rows: {data.shape[0]}")
    print(f"Valid Rows: {cleaned_data.shape[0]}")
    print(f"Discarded Rows: {discarded_data.shape[0]}")
    print("\nSample of Cleaned Data:")
    print(cleaned_data.head())
    print("\nSample of Discarded Data:")
    print(discarded_data.head())

    print("\nFiles Saved:")
    print("- Cleaned data: cleaned_cepe_data.csv")
    print("- Discarded data: discarded_cepe_data.csv")

except Exception as e:
    print(f"Error during preprocessing: {e}")


Processed Dataset Summary:
Total Rows: 57398
Valid Rows: 57391
Discarded Rows: 7

Sample of Cleaned Data:
   claim_label                          topic_sentence  evidence_label  \
0            0  Should we abandon the one-child policy               0   
1            0  Should we abandon the one-child policy               0   
2            0  Should we abandon the one-child policy               0   
3            0  Should we abandon the one-child policy               0   
4            0  Should we abandon the one-child policy               0   

                                      claim_sentence  \
0  A 2009 study at the University of Ulster found...   
1  A 2009 study at the University of Ulster found...   
2  A 2009 study at the University of Ulster found...   
3  A 2009 study at the University of Ulster found...   
4  A 2009 study at the University of Ulster found...   

                         evidence_candidate_sentence article_id full_label  
0  Low fertility which increases th

In [10]:
import pandas as pd

# Load the cleaned dataset
file_path = 'cleaned_cepe_data.csv'  # Replace with the correct file path
data = pd.read_csv(file_path)

# Add a new column 'combined_label' which concatenates 'claim_label' and 'evidence_label'
try:
    data['combined_label'] = data['claim_label'].astype(str) + "_" + data['evidence_label'].astype(str)

    # Save the updated dataset
    output_file = 'updated_cleaned_cepe_data.csv'  # Update the file name/path as needed
    data.to_csv(output_file, index=False)

    # Display a summary and first few rows of the updated dataset
    print("Updated dataset with 'combined_label' column added.")
    print(data.head())

    print(f"\nUpdated dataset saved as: {output_file}")

except Exception as e:
    print(f"Error while adding the combined_label column: {e}")


Updated dataset with 'combined_label' column added.
   claim_label                          topic_sentence  evidence_label  \
0            0  Should we abandon the one-child policy               0   
1            0  Should we abandon the one-child policy               0   
2            0  Should we abandon the one-child policy               0   
3            0  Should we abandon the one-child policy               0   
4            0  Should we abandon the one-child policy               0   

                                      claim_sentence  \
0  A 2009 study at the University of Ulster found...   
1  A 2009 study at the University of Ulster found...   
2  A 2009 study at the University of Ulster found...   
3  A 2009 study at the University of Ulster found...   
4  A 2009 study at the University of Ulster found...   

                         evidence_candidate_sentence article_id full_label  \
0  Low fertility which increases the need for sex...        1_1          O   
1  The Uni

In [11]:
import pandas as pd

# Load the updated dataset
file_path = 'updated_cleaned_cepe_data.csv'  # Replace with the correct file path
data = pd.read_csv(file_path)

# Define the label encoding mapping
label_encoding = {
    '0_0': 0,
    '0_1': 1,
    '1_0': 2,
    '1_1': 3
}

# Add a new column 'encoded_label' with encoded values
try:
    data['encoded_label'] = data['combined_label'].map(label_encoding)
    
    # Save the updated dataset with encoded labels
    output_file = 'encoded_cleaned_cepe_data.csv'  # Update the file name/path as needed
    data.to_csv(output_file, index=False)
    
    # Display a summary and first few rows of the updated dataset
    print("Updated dataset with 'encoded_label' column added.")
    print(data[['combined_label', 'encoded_label']].head())

    print(f"\nUpdated dataset saved as: {output_file}")

except Exception as e:
    print(f"Error while encoding the combined_label column: {e}")

Updated dataset with 'encoded_label' column added.
  combined_label  encoded_label
0            0_0              0
1            0_0              0
2            0_0              0
3            0_0              0
4            0_0              0

Updated dataset saved as: encoded_cleaned_cepe_data.csv


In [16]:
# -*- coding: utf-8 -*-
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report

# Load and Clean Dataset
file_path = "encoded_cleaned_cepe_data.csv"  # Path to your cleaned dataset with encoded labels
try:
    data = pd.read_csv(file_path)
    print(f"Loaded dataset with {len(data)} rows.")
except Exception as e:
    print(f"Error loading file: {e}")
    exit()

# Dataset Class
class CEPEDataset(Dataset):
    def __init__(self, data, seq_len, patch_size, vocab_size):
        self.data = data
        self.seq_len = seq_len
        self.patch_size = patch_size
        self.vocab_size = vocab_size

    def text_to_numeric(self, text):
        """Convert text to numeric values (character-level mapping)."""
        return [ord(char) % self.vocab_size for char in text]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Tokenize claims, evidence, and topics
        claim_tokens = self.text_to_numeric(row['claim_sentence'])
        evidence_tokens = self.text_to_numeric(row['evidence_candidate_sentence'])
        topic_tokens = self.text_to_numeric(row['topic_sentence'])

        # Pad to sequence length
        padded_length = ((self.seq_len + self.patch_size - 1) // self.patch_size) * self.patch_size
        claim_tokens = claim_tokens[:self.seq_len] + [0] * (padded_length - len(claim_tokens))
        evidence_tokens = evidence_tokens[:self.seq_len] + [0] * (padded_length - len(evidence_tokens))
        topic_tokens = topic_tokens[:self.seq_len] + [0] * (padded_length - len(topic_tokens))

        # Extract encoded label
        encoded_label = torch.tensor(row['encoded_label'], dtype=torch.long)

        return {
            'claim_input_ids': torch.tensor(claim_tokens, dtype=torch.long),
            'evidence_input_ids': torch.tensor(evidence_tokens, dtype=torch.long),
            'topic_input_ids': torch.tensor(topic_tokens, dtype=torch.long),
            'encoded_label': encoded_label
        }

# Collate Function
def cepe_collate_fn(batch):
    claim_input_ids = pad_sequence([item['claim_input_ids'] for item in batch], batch_first=True, padding_value=0)
    evidence_input_ids = pad_sequence([item['evidence_input_ids'] for item in batch], batch_first=True, padding_value=0)
    topic_input_ids = pad_sequence([item['topic_input_ids'] for item in batch], batch_first=True, padding_value=0)
    encoded_labels = torch.tensor([item['encoded_label'] for item in batch], dtype=torch.long)

    return {
        'claim_input_ids': claim_input_ids,
        'evidence_input_ids': evidence_input_ids,
        'topic_input_ids': topic_input_ids,
        'encoded_label': encoded_labels
    }

# DataLoader
seq_len = 128
patch_size = 8
vocab_size = 30522

# Dataset and DataLoader
dataset = CEPEDataset(data, seq_len=seq_len, patch_size=patch_size, vocab_size=vocab_size)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=cepe_collate_fn)

# Model
class TextVisionTransformerCEPE(nn.Module):
    def __init__(self, seq_len, patch_size, emb_size, vocab_size, depth, num_heads, ff_dim):
        super(TextVisionTransformerCEPE, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=emb_size, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True),
            num_layers=depth
        )
        self.projection = nn.Linear(emb_size * 3, emb_size)  # Project concatenated embeddings back to emb_size
        self.classifier = nn.Linear(emb_size, 4)  # Multi-class classifier (4 classes for encoded_label)

    def forward(self, claim_input_ids, evidence_input_ids, topic_input_ids):
        # Embed claims, evidence, and topics
        claim_emb = self.embedding(claim_input_ids).mean(dim=1)
        evidence_emb = self.embedding(evidence_input_ids).mean(dim=1)
        topic_emb = self.embedding(topic_input_ids).mean(dim=1)

        # Concatenate embeddings and project back to emb_size
        combined = torch.cat((claim_emb, evidence_emb, topic_emb), dim=1)  # [batch_size, emb_size * 3]
        combined = self.projection(combined)  # [batch_size, emb_size]

        # Pass through the transformer encoder
        encoded = self.encoder(combined.unsqueeze(1))  # [batch_size, 1, emb_size]
        encoded = encoded.squeeze(1)  # [batch_size, emb_size]

        # Classify the encoded representation
        logits = self.classifier(encoded)  # [batch_size, 4]
        return logits

# Instantiate Model
model = TextVisionTransformerCEPE(
    seq_len=seq_len, patch_size=patch_size, emb_size=128, vocab_size=vocab_size,
    depth=6, num_heads=8, ff_dim=512
)

# Training and Evaluation
def train_model(model, data_loader, optimizer, loss_fn, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in data_loader:
            claim_input_ids = batch['claim_input_ids']
            evidence_input_ids = batch['evidence_input_ids']
            topic_input_ids = batch['topic_input_ids']
            labels = batch['encoded_label']  # Use the new encoded_label

            optimizer.zero_grad()
            outputs = model(claim_input_ids, evidence_input_ids, topic_input_ids)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(data_loader):.4f}")

# Optimizer and Loss Function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()  # For multi-class classification

# Train the Model
train_model(model, data_loader, optimizer, loss_fn, epochs=3)

ModuleNotFoundError: No module named 'torch'

In [None]:
def evaluate_model_cepe(model, data_loader, device):
    """
    Evaluate the CEPE model on the given dataset.
    Args:
        model: Trained CEPE model.
        data_loader: DataLoader for evaluation data.
        device: Device to run the evaluation on (CPU or GPU).
    """
    model.eval()  # Set model to evaluation mode
    all_claim_preds, all_claim_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            # Move inputs and labels to the appropriate device
            claim_input_ids = batch['claim_input_ids'].to(device)
            evidence_input_ids = batch['evidence_input_ids'].to(device)
            topic_input_ids = batch['topic_input_ids'].to(device)
            claim_labels = batch['claim_label'].to(device)

            # Forward pass
            logits = model(claim_input_ids, evidence_input_ids, topic_input_ids)

            # Predictions
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            # Append predictions and labels
            all_claim_preds.extend(preds)
            all_claim_labels.extend(claim_labels.cpu().numpy())

    # Print classification report
    print("Claim Classification Report:")
    print(classification_report(all_claim_labels, all_claim_preds, target_names=["Non-Match", "Match"]))


In [None]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Evaluate the model
evaluate_model_cepe(model, data_loader, device)

Claim Classification Report:
              precision    recall  f1-score   support

   Non-Match       0.95      1.00      0.98      3016
       Match       0.00      0.00      0.00       149

    accuracy                           0.95      3165
   macro avg       0.48      0.50      0.49      3165
weighted avg       0.91      0.95      0.93      3165



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
