In [1]:
import pandas as pd

# Read the three tab-separated splits. The files carry no header row, so
# pandas labels the columns with integers starting at 0.
data_train, data_valid, data_test = (
    pd.read_csv(split_path, sep="\t", header=None)
    for split_path in ("train.tsv", "valid.tsv", "test.tsv")
)

In [None]:
# Preview the first five rows of the training split.
data_train.head(5)

In [None]:
# Preview the first five rows of the test split.
data_test.head(5)

In [None]:
# Preview the first five rows of the validation split.
data_valid.head(5)

In [9]:
def data_preprocessing(dataset):
    """Reduce a raw split to a binary ``label`` and a combined ``sentence``.

    ``label`` is 1 when column 1 is "true" or "mostly-true" and 0 otherwise.
    ``sentence`` is the metadata columns 3, 4, 5, 6, 7 and 13 followed by the
    text of column 2, all joined with single spaces. A metadata cell that is
    missing (NaN/None) or equal to 0 is written as the literal ``'None'``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with integer column labels 0-13 (as produced by reading a file
        with ``header=None``). The frame passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame that keeps the input's index, with columns 0-13 removed
        and ``label`` / ``sentence`` added, after a final ``dropna()``.

    Raises
    ------
    KeyError
        If any of the expected integer-labelled columns is absent, e.g. when
        the function is applied to an already-preprocessed frame.
    """
    # Work on a copy so the caller's frame does not silently gain a 'label'
    # column as a side effect of this call.
    dataset = dataset.copy()

    # Creating new 'label' column based on column 1
    dataset['label'] = dataset[1].isin(["true", "mostly-true"]).astype("int64")

    # Debug: print current columns
    print("Columns before dropping:", dataset.columns.tolist())

    # Drop unwanted columns by label (not by position)
    dataset = dataset.drop(columns=[0, 1, 8, 9, 10, 11, 12])

    # Process metadata columns. Empty cells read from a file arrive as NaN,
    # not 0, so both are treated as "no value". Casting to object first lets
    # the 'None' placeholder be written into columns of any dtype.
    meta = dataset[[3, 4, 5, 6, 7, 13]].astype(object)
    meta = meta.mask(meta.isna() | meta.eq(0), 'None')

    # Create 'sentence' by combining metadata with text from column 2.
    # Iterating row values (instead of .loc[i, ...]) keeps this independent
    # of the index, so frames without a 0..n-1 index work as well.
    dataset["sentence"] = [
        " ".join(map(str, (*fields, text)))
        for fields, text in zip(meta.itertuples(index=False, name=None), dataset[2])
    ]

    # Now drop the original text and metadata columns
    dataset = dataset.drop(columns=[2, 3, 4, 5, 6, 7, 13])

    # Drop any remaining rows with null values
    dataset = dataset.dropna()

    return dataset

In [None]:
# Replace each raw split with its preprocessed (label + sentence) version.
# NOTE: the names are rebound in place, so this cell cannot be run twice in a
# row -- data_preprocessing reads column 1, which is gone after the first run.
# Re-run the loading cell first if this cell needs to be executed again.
data_train = data_preprocessing(data_train)
data_valid = data_preprocessing(data_valid)
data_test = data_preprocessing(data_test)


In [None]:
# Display the training split as returned by data_preprocessing.
data_train

In [None]:
# Display the validation split as returned by data_preprocessing.
data_valid

In [None]:
# Display the test split as returned by data_preprocessing.
data_test

In [None]:
# First five rows of the preprocessed training split.
data_train.head(5)

In [None]:
# Number of rows per binary label (0 / 1) in the training split.
data_train["label"].value_counts()

In [None]:
# Number of rows per binary label (0 / 1) in the validation split.
data_valid["label"].value_counts()

In [None]:
# Number of rows per binary label (0 / 1) in the test split.
data_test['label'].value_counts()


In [None]:
import matplotlib.pyplot as plt

# Length of every training sentence. len() on a string counts characters,
# so these are character counts rather than token counts.
sent_len = [len(sent) for sent in data_train["sentence"]]

# Box plot of the full length distribution, outliers included.
fig = plt.figure(figsize=(10, 7))
plt.boxplot(sent_len)
plt.show()

# Keep only lengths of at most 500 (excluding the outliers), then show the
# remaining distribution as a 5-bin histogram.
sent_len = list(filter(lambda length: length <= 500, sent_len))
fig2 = plt.figure(figsize=(10, 7))
plt.hist(sent_len, 5)
plt.show()

In [None]:
%pip install transformers 

In [None]:
%pip install --upgrade torch torchvision

In [34]:
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
from transformers import RobertaForSequenceClassification
from transformers import RobertaTokenizer

In [None]:
%pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117

In [39]:
import torch

In [None]:

import torch
# Report which accelerator backends this PyTorch installation can use.
print(torch.cuda.is_available())   # Should be False on Mac
print(torch.backends.mps.is_available())  # True if using an M1/M2 Mac with proper PyTorch support

In [None]:
# Choose the accelerator once so both models end up on the same device, and
# fall back to the CPU when the MPS backend is unavailable instead of failing
# on a hard-coded "mps".
model_device = "mps" if torch.backends.mps.is_available() else "cpu"

# BERT base model with an uncased vocab and a 2-way classification head.
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,                # number of output labels - 0,1 (binary classification)
    output_attentions=False,     # model doesn't return attention weights
    output_hidden_states=False,  # model doesn't return hidden states
)
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
bert_model = bert_model.to(model_device)

# RoBERTa base model with the same 2-way classification head.
roberta_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,                # number of output labels - 0,1 (binary classification)
    output_attentions=False,     # model doesn't return attention weights
    output_hidden_states=False,  # model doesn't return hidden states
)
# NOTE(review): do_lower_case is kept exactly as before; roberta-base is
# presumably a case-sensitive checkpoint -- confirm this flag is intended.
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base", do_lower_case=True)

# Move RoBERTa as well; previously bert_model was moved a second time at this
# point and roberta_model was never placed on the device.
roberta_model = roberta_model.to(model_device)
print("base models loaded ")

In [None]:
# Use Apple's MPS backend when PyTorch reports it as available, otherwise
# stay on the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Both classifiers must live on the chosen device; moving only BERT would
# leave RoBERTa on a different device from the one recorded in `device`.
bert_model = bert_model.to(device)
roberta_model = roberta_model.to(device)
device

In [None]:
# Take the training sentence stored under index label 0 and show how each
# tokenizer splits it and which vocabulary ids the pieces map to.
sample_sentence = data_train["sentence"][0]
print(' Original: ', sample_sentence)

# BERT: sentence -> tokens -> token ids
bert_sample_tokens = bert_tokenizer.tokenize(sample_sentence)
print('Tokenized BERT: ', bert_sample_tokens)
print('Token IDs BERT: ', bert_tokenizer.convert_tokens_to_ids(bert_sample_tokens))

# RoBERTa: sentence -> tokens -> token ids
roberta_sample_tokens = roberta_tokenizer.tokenize(sample_sentence)
print('Tokenized RoBERT: ', roberta_sample_tokens)
print('Token IDs RoBERTa: ', roberta_tokenizer.convert_tokens_to_ids(roberta_sample_tokens))

In [58]:
# Pull the training sentences and their binary labels out of the frame as
# plain arrays (notebook-level globals).
sentences = data_train["sentence"].values 
labels = data_train["label"].values

In [60]:
import torch

In [None]:
def bert_robert_tokenization(dataset, max_length=256, bert_tok=None, roberta_tok=None):
    """Tokenize every sentence of a split with both BERT and RoBERTa.

    Each sentence is encoded with special tokens added, truncated to
    ``max_length`` and padded up to ``max_length``, so every row of the
    returned id/mask tensors has the same width.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``sentence`` column (text) and a ``label`` column.
    max_length : int, default 256
        Fixed sequence length passed to both tokenizers.
    bert_tok, roberta_tok : callable, optional
        Tokenizers to use. When omitted, the notebook-level
        ``bert_tokenizer`` / ``roberta_tokenizer`` are used.

    Returns
    -------
    dict
        ``{"bert": {"input_ids", "attention_mask"},
           "roberta": {"input_ids", "attention_mask"},
           "labels": tensor, "sentence_ids": tensor}``.
        ``sentence_ids`` numbers the sentences 0..n-1 in row order.
        NOTE(review): the previous version had no return statement, so this
        layout is a new contract -- confirm it suits the downstream cells.
    """
    bert_tok = bert_tokenizer if bert_tok is None else bert_tok
    roberta_tok = roberta_tokenizer if roberta_tok is None else roberta_tok

    # str() guards against non-string cells, as the per-sentence loop did.
    sentences = [str(sent) for sent in dataset["sentence"].values]
    labels = torch.tensor(dataset["label"].values)

    # Identical settings for both tokenizers. padding="max_length" replaces
    # the deprecated pad_to_max_length=True flag.
    encode_kwargs = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    # Encoding the whole list in one call yields (n_sentences, max_length)
    # tensors directly, so no per-sentence torch.cat is needed.
    bert_encoded = bert_tok(sentences, **encode_kwargs)
    roberta_encoded = roberta_tok(sentences, **encode_kwargs)

    return {
        "bert": {
            "input_ids": bert_encoded["input_ids"],
            "attention_mask": bert_encoded["attention_mask"],
        },
        "roberta": {
            "input_ids": roberta_encoded["input_ids"],
            "attention_mask": roberta_encoded["attention_mask"],
        },
        "labels": labels,
        "sentence_ids": torch.arange(len(sentences)),
    }



    

In [None]:
from torch.utils.data import TensorDataset , random_split 
torch.manual_seed(0)

token_dict_train = bert_robert
