In [None]:
from google.colab import drive
from google.colab.files import download

# Mount Google Drive
drive.mount('/content/drive')

# Base directory for input files and output images
#base_path = '/content/drive/My Drive/GPT Project/'

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset


In [None]:
# Load the CSV file
try:
    df = pd.read_csv(
        '/content/data.csv',  # Update this path to your file location
        on_bad_lines='skip',
        escapechar="\\",
        encoding='utf-8',
        engine='python'
    )
    print("CSV file loaded successfully!")
except Exception as e:
    print(f"Error loading CSV file: {e}")

CSV file loaded successfully!


In [None]:
# Drop unnecessary columns
df = df.drop(columns=['source_url', 'url', 'content_original', 'authors'], errors='ignore')

# Combine 'title' and 'content' into a single input field
df['text'] = df['title'] + " " + df['content']


In [None]:
df

Unnamed: 0,topic,source,bias,title,date,content,bias_text,ID,text,bias_label
0,politics,Politico,0,Can the Impeachment Hearings Actually Change A...,2019-11-13,"This happens for different reasons , but a key...",left,bpp2Cqpvyi2ER5Lr,Can the Impeachment Hearings Actually Change A...,1
1,china,Bloomberg,1,Trump’s Tariff Barrage Pushes China Fight to P...,2018-07-11,LISTEN TO ARTICLE 5:37 SHARE THIS ARTICLE Shar...,center,PnVV1v7F0D8bhd5X,Trump’s Tariff Barrage Pushes China Fight to P...,0
2,terrorism,Washington Times,2,Top general: Failed Foley rescue was ‘highest-...,2014-09-16,The Pentagon ’ s top general said Tuesday that...,right,pZlutCuZNm832qqK,Top general: Failed Foley rescue was ‘highest-...,3
3,politics,Salon,0,America’s post-midterm inferno: Tea Party garb...,2014-11-12,A full week after the alleged “ shellacking “ ...,left,aMGkvFlpLyQrX6iu,America’s post-midterm inferno: Tea Party garb...,1
4,politics,CNN (Web News),0,E-mails among Christie appointees suggest poli...,2014-01-08,Story highlights Gergen : `` It feeds into thi...,left,Ludq48gCzDiKtnmX,E-mails among Christie appointees suggest poli...,1
...,...,...,...,...,...,...,...,...,...,...
37549,media_bias,Fox News,2,Summit meeting: Can there be a 'reset' between...,2016-11-22,If Donald Trump can sit down with Mitt Romney ...,right,aDXaucBITOvKX6Sl,Summit meeting: Can there be a 'reset' between...,3
37550,education,Vox,0,All of West Virginia’s teachers have been on s...,2018-03-03,Thousands of public school teachers across Wes...,left,78Akx2nSokxbCMd0,All of West Virginia’s teachers have been on s...,1
37551,elections,The Hill,1,Kirsten Gillibrand officially announces White ...,2019-03-17,Sen. Kirsten Gillibrand Kirsten GillibrandTo w...,center,vvAgpKy0PL1h8990,Kirsten Gillibrand officially announces White ...,0
37552,education,Reason,2,"Why We Need School Choice: ""Obama Administrati...",2017-01-23,"Welcome to National School Choice Week , an an...",right,aWlri1iRio9Q6Rfe,"Why We Need School Choice: ""Obama Administrati...",3


In [None]:
# Encode labels for bias ('left', 'center', 'right')
label_encoder_bias = LabelEncoder()
df['bias_label'] = label_encoder_bias.fit_transform(df['bias_text'])

# Display label mappings for bias
print("Bias label mappings:", dict(enumerate(label_encoder_bias.classes_)))

Bias label mappings: {0: 'center', 1: 'left', 2: 'n4BSawmUfSId5MY0', 3: 'right'}


In [None]:
# Check unique values in the bias_text column
print(df['bias_text'].unique())


['left' 'center' 'right' 'n4BSawmUfSId5MY0']


In [None]:
# Drop entries where bias_text is 'n4BSawmUfSId5MY0'
df = df[df['bias_text'] != 'n4BSawmUfSId5MY0']

In [None]:
print(df['bias_text'].unique())

['left' 'center' 'right']


In [None]:
# Encode labels for bias ('left', 'center', 'right')
label_encoder_bias = LabelEncoder()
df['bias_label'] = label_encoder_bias.fit_transform(df['bias_text'])

# Display label mappings for bias
print("Bias label mappings:", dict(enumerate(label_encoder_bias.classes_)))

Bias label mappings: {0: 'center', 1: 'left', 2: 'right'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bias_label'] = label_encoder_bias.fit_transform(df['bias_text'])


In [None]:
# Split the dataset into training, validation, and test sets
train_texts, test_texts, train_bias_labels, test_bias_labels = train_test_split(
    df['text'], df['bias_label'], test_size=0.2, random_state=42
)
val_texts, test_texts, val_bias_labels, test_bias_labels = train_test_split(
    test_texts, test_bias_labels, test_size=0.5, random_state=42
)

This dataset class splits long texts into overlapping chunks during training and handles them during inference.

In [None]:
class BiasDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512, overlap=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.overlap = overlap

    def tokenize_and_split(self, text):
        # Tokenize without truncation to get full input
        tokens = self.tokenizer(text, truncation=False, padding=False)["input_ids"]
        # Split into chunks with specified overlap
        chunks = [tokens[i:i + self.max_length] for i in range(0, len(tokens), self.max_length - self.overlap)]
        return chunks

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        chunks = self.tokenize_and_split(text)

        # Use the first chunk for training; handle multiple chunks during inference
        encoding = self.tokenizer(
            self.tokenizer.decode(chunks[0], skip_special_tokens=True),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Initialize tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-large")

# Create datasets
train_dataset = BiasDataset(train_texts, train_bias_labels, tokenizer, max_length=512, overlap=256)
val_dataset = BiasDataset(val_texts, val_bias_labels, tokenizer, max_length=512, overlap=256)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Training Arguements

In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/NLP Project/deberta_bias_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="/content/drive/My Drive/NLP Project/deberta_logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [None]:
model = DebertaV2ForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-large", num_labels=len(label_encoder_bias.classes_)
)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train the model

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the trained model and tokenizer
trainer.save_model("/content/drive/My Drive/NLP Project/deberta_bias_model")
tokenizer.save_pretrained("/content/drive/My Drive/NLP Project/deberta_bias_model")


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7416,1.034929,0.427963,0.307978,0.680365,0.427963
2,0.2993,0.38679,0.892144,0.892064,0.892322,0.892144
3,0.1807,0.439776,0.89747,0.897938,0.899,0.89747


('/content/drive/My Drive/NLP Project/deberta_bias_model/tokenizer_config.json',
 '/content/drive/My Drive/NLP Project/deberta_bias_model/special_tokens_map.json',
 '/content/drive/My Drive/NLP Project/deberta_bias_model/spm.model',
 '/content/drive/My Drive/NLP Project/deberta_bias_model/added_tokens.json')

In [None]:
def predict_with_aggregation(text, model, tokenizer, max_length=512, overlap=256):
    model.eval()
    tokens = tokenizer(text, truncation=False, padding=False)["input_ids"]
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length - overlap)]
    predictions = []

    for chunk in chunks:
        inputs = tokenizer.decode(chunk, skip_special_tokens=True)
        inputs = tokenizer(
            inputs, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length
        )
        inputs = {key: val.to(model.device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predictions.append(torch.softmax(logits, dim=-1).cpu().numpy())

    # Aggregate probabilities across chunks
    aggregated_probs = np.mean(predictions, axis=0)
    predicted_class = np.argmax(aggregated_probs)
    return predicted_class, aggregated_probs


In [None]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.43977606296539307, 'eval_accuracy': 0.8974700399467377, 'eval_f1': 0.8979377702543823, 'eval_precision': 0.8989999193948969, 'eval_recall': 0.8974700399467377, 'eval_runtime': 200.5306, 'eval_samples_per_second': 18.725, 'eval_steps_per_second': 1.172, 'epoch': 3.0}


In [35]:
# Create a dataset object for the test data
test_dataset = BiasDataset(test_texts, test_bias_labels, tokenizer, max_length=4096)

# Evaluate the model using the test dataset
print("Evaluating Bias Classification Model...")
#bias_metrics = trainer.evaluate(test_dataset)
print("Bias Classification Metrics:", metrics)

# Highlight key metrics
print("\nBias Classification Metrics (Highlighted):")
print(f"F1 Score: \033[92m{metrics['eval_f1']:.4f}\033[0m")  # Green color for F1
print(f"Accuracy: \033[94m{metrics['eval_accuracy']:.4f}\033[0m")  # Blue color for Accuracy
print(f"Precision: \033[93m{metrics['eval_precision']:.4f}\033[0m")  # Yellow for Precision
print(f"Recall: \033[91m{metrics['eval_recall']:.4f}\033[0m")  # Red for Recall


Evaluating Bias Classification Model...
Bias Classification Metrics: {'eval_loss': 0.43977606296539307, 'eval_accuracy': 0.8974700399467377, 'eval_f1': 0.8979377702543823, 'eval_precision': 0.8989999193948969, 'eval_recall': 0.8974700399467377, 'eval_runtime': 200.5306, 'eval_samples_per_second': 18.725, 'eval_steps_per_second': 1.172, 'epoch': 3.0}

Bias Classification Metrics (Highlighted):
F1 Score: [92m0.8979[0m
Accuracy: [94m0.8975[0m
Precision: [93m0.8990[0m
Recall: [91m0.8975[0m


In [33]:
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer

# Define paths to the saved models
bias_model_path = "/content/drive/My Drive/NLP Project/deberta_bias_model"


# Load the models and tokenizer
bias_model = DebertaV2ForSequenceClassification.from_pretrained(bias_model_path)
tokenizer = DebertaV2Tokenizer.from_pretrained(bias_model_path)  # Use the same tokenizer


In [34]:
# Example texts
example_texts = [
    "4 ways Trumps mass deportation plans could hurt your finances President-elect Donald Trump has made tougher immigration enforcement a key campaign promise in each of his White House bids. If he follows through on his pledge for mass deportations and tighter immigration policies, it could create a financial burden for many Americans..."
]

# Tokenize and predict
for text in example_texts:
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Bias Prediction
    with torch.no_grad():
        bias_outputs = bias_model(**inputs)
        bias_predictions = torch.argmax(bias_outputs.logits, dim=-1).item()
    predicted_bias = label_encoder_bias.inverse_transform([bias_predictions])[0]

    # Display predictions
    print(f"Text: {text[:200]}...")  # Truncate long text for display
    print(f"Predicted Bias: {predicted_bias}")
    print("-" * 20)


Text: 4 ways Trumps mass deportation plans could hurt your finances President-elect Donald Trump has made tougher immigration enforcement a key campaign promise in each of his White House bids. If he follow...
Predicted Bias: right
--------------------


In [36]:
example_texts = ["US News Joe Rogan explains why liberal media hemorrhaging audiences Youre not accurate youre delusional And people are speaking with their subscriptions and theyre speaking with their purchasing of the Washington Post and their purchasing of the New York Times The conversation kicked off when Rogan 57 brought up Washington Post owner Jeff Bezos divisive October opinion piece The hard truth Americans dont trust the news media in which the billionaire Amazon founder declined to continue the newspapers legacy of endorsing a candidate for president The Washington Post planned to endorse Kamala Harris before Bezos stepped in claiming he move cost the newspaper thousands of subscribers but Rogan theorized it would have lost much more if it stuck to its progressive endorsement Essentially saying that you have to take divergent viewpoints you have to take a bunch of different perspectives we cant just be this leftwing echo chamber and its the reason why the business is faltering he noted The New York Times is suffering from the same leftleaning affliction Rogan argued pointing to a recent factcheck the newspaper published earlier this week on Robert F Kennedy Jrs claim that a popular breakfast cereal contains several artificial ingredients in the United States that are not used in other countriespolitical endorsements create a perception of bias The Gray Lady claimed Kennedy was wrong but admitted the ingredient lists for the USs version of Froot Loops contain a multitude of manmade chemicals that Canadas cereal does not The fact check is so dumb because the fact check says its not correct they have the same ingredients except for these harmful chemicals Rogan seethed adding that the ingredient list clearly included fng dangerous chemicals that are banned in Canada that were trying to get rid of in America So theyre literally saying he was wrong but he was right The popular podcaster theorized the Times went after RFK Jr so ruthlessly because of his close ties to Presidentelect Donald Trump and his nomination to serve as the next health secretary as well as his vocal aversion to vaccines Thats the New York fng Times he continued This is what the New York Times is doing so of course youre gonna hemorrhage subscribers of course Youre crazy youre saying something thats nuts and also What is your motivation Are you trying to eliminate because you lost so much credibility are you trying to kill it all Are you secretly working for the Chinese Like what are you doing Rogan lamented"]

# Tokenize and predict
for text in example_texts:
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Bias Prediction
    with torch.no_grad():
        bias_outputs = bias_model(**inputs)
        bias_predictions = torch.argmax(bias_outputs.logits, dim=-1).item()
    predicted_bias = label_encoder_bias.inverse_transform([bias_predictions])[0]

    # Display predictions
    print(f"Text: {text[:200]}...")  # Truncate long text for display
    print(f"Predicted Bias: {predicted_bias}")
    print("-" * 20)

Text: US News Joe Rogan explains why liberal media hemorrhaging audiences Youre not accurate youre delusional And people are speaking with their subscriptions and theyre speaking with their purchasing of th...
Predicted Bias: right
--------------------


In [37]:
example_texts = ["Harris pins the blame on Trump for Supreme Court overturning Roe v Wade Vice President Kamala Harris on Monday pinned blame squarely on Donald Trump for the reversal of Roe v Wade saying the former president is proud that women are silently suffering without a guaranteed right to abortion The previous president expressed his intentions quite clearly And fast forward to just recently says hes proud of what he did Harris told CNNs Laura Coates during an exclusive interview in Wisconsin where the vice president was launching a national tour in support of reproductive rights She said the likely Republican nominee relishes his role in new restrictive abortion laws that have emerged in the aftermath of the high court ruling By inference he is proud that women have been deprived of fundamental freedoms to make decisions about their own body by inference proud that doctors are being penalized and criminalized for providing health care proud that women are silently suffering because they dont have access to the health care they need Harris added So lets understand that the stakes are so very high It was a preview of a central reelection argument in a rematch between Trump and President Joe Biden a contest that will be shadowed by the former presidents multiple courtroom trials including one related to Trumps role in events that led to the January 6 2021 riot at the Capitol The vice president rejected outright Trumps assertions that his legal issues amount to political persecution undertaken by the Biden administration What hes saying is not factual period she said And that would not be new for him would it And Harris said it was imperative Americans remain watchful for signs of democratic erosion in the leadup to this years contest"]

# Tokenize and predict
for text in example_texts:
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Bias Prediction
    with torch.no_grad():
        bias_outputs = bias_model(**inputs)
        bias_predictions = torch.argmax(bias_outputs.logits, dim=-1).item()
    predicted_bias = label_encoder_bias.inverse_transform([bias_predictions])[0]

    # Display predictions
    print(f"Text: {text[:200]}...")  # Truncate long text for display
    print(f"Predicted Bias: {predicted_bias}")
    print("-" * 20)

Text: Harris pins the blame on Trump for Supreme Court overturning Roe v Wade Vice President Kamala Harris on Monday pinned blame squarely on Donald Trump for the reversal of Roe v Wade saying the former pr...
Predicted Bias: left
--------------------


In [40]:
example_texts=["Ruben Amorim said he has inherited a difficult position at Manchester United and described his new side as a massive club but not a massive team. Having racked up 20 top-flight titles, United are English football's most successful club domestically. But their impressive history has not been mirrored by results on the pitch over the last decade. Amorim has arrived at Old Trafford with United, who have not won the title since 2013, in the bottom half of the table. They head into Saturday's game six points behind opponents Nottingham Forest, who were playing in the Championship as recently as 2022. We are a massive club but we are not a massive team and we know it so it is no problem to say it. Our players have to understand that it is a very difficult position. We are not one of the best teams in the league and we have to say and think that clearly but our past, our club, is maybe the best one in the league. So here we have a problem and we have to focus on the little things and little details. Amorim has had little time on the training pitch since taking over from Erik ten Hag because of a packed fixture list. The 39-year-old has been careful to play down expectations while the squad gets to grips with a new tactical set-up. However, he insists one non-negotiable is the amount of effort put in by his players and warned them they had to run like mad dogs if they want to be successful. I think the way we sprint back, the way we sprint forward, the way we fight, we have to be very clear with the team. This is the first point we have to address then the tactical and technical aspects come later. What I see is that they are making an effort and there is a lot of room to improve. Changing that thing you can sometimes say is simple because it is just running but it is something in the head of the players. If you want to win we have to do it. Even with the best starting XI on the planet without running they will not win anything, that is very clear. If we want to win the Premier League we have to run like mad dogs. If not, we are not going to."]

# Tokenize and predict
for text in example_texts:
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Bias Prediction
    with torch.no_grad():
        bias_outputs = bias_model(**inputs)
        bias_predictions = torch.argmax(bias_outputs.logits, dim=-1).item()
    predicted_bias = label_encoder_bias.inverse_transform([bias_predictions])[0]

    # Display predictions
    print(f"Text: {text[:200]}...")  # Truncate long text for display
    print(f"Predicted Bias: {predicted_bias}")
    print("-" * 20)

Text: Ruben Amorim said he has inherited a difficult position at Manchester United and described his new side as a massive club but not a massive team. Having racked up 20 top-flight titles, United are Engl...
Predicted Bias: center
--------------------
