In [None]:
# Step 1: Install required libraries :contentReference[oaicite:0]{index=0}:contentReference[oaicite:1]{index=1}
!pip install transformers datasets scikit-learn shap umap-learn


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [None]:
# Step 2: Get model_data_2.csv into the Colab working directory
from google.colab import files, drive

# Option A (default): upload from the local machine through the browser picker.
uploaded = files.upload()  # choose model_data_2.csv when prompted

# Option B: work from Google Drive instead -- uncomment both lines below.
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/path/to/your/folder


Saving model_data_2.csv to model_data_2.csv


In [None]:
# Step 3: Imports and device setup
import pandas as pd
import torch
import pickle
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from google.colab import files as colab_files

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)


Using device: cuda


In [None]:
# Step 4: Load the raw CSV
df = pd.read_csv("model_data_2.csv")

# If your file has no usable 'text' column yet -- for example, you want to
# fold the title/source columns into it -- combine them here:
# df['text'] = df['title'].fillna('') + ' ' + df['source'].fillna('') + ' ' + df['text'].fillna('')

# Fail early, with a readable message, when a required column is absent.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"model_data_2.csv is missing required column(s): {sorted(missing_columns)}"
    )

# No Hugging Face Dataset is built here: the labels are still raw strings at
# this point, and `dataset` is created further down once they are encoded.


In [None]:
import pandas as pd
from datasets import Dataset

# Reload the CSV from disk and inspect its schema before any cleaning.
df = pd.read_csv("model_data_2.csv")

print(df.columns)   # column names
print(df.head(5))   # first five rows


Index(['title', 'text', 'source', 'label', 'complete', 'cleaned_text'], dtype='object')
                                               title  \
0  Tiny implantable device short-circuits hunger ...   
1  Scientists report CRISPR restores effectivenes...   
2  Probiotics could help millions of patients suf...   
3  Yes Please to Yogurt and Cheese: The New Impro...   
4  Johns Hopkins team identifies promising diagno...   

                                                text  \
0  MADISON, Wis. -- More than 700 million adults ...   
1  Wilmington, DE, December 17, 2018 - The CRISPR...   
2  About 3 million people in the US are diagnosed...   
3  Newswise — Thousands of people can take heart ...   
4  Researchers at Johns Hopkins Medicine have ide...   

                     source label  \
0   https://web.archive.org  Fake   
1   https://web.archive.org  Fake   
2   https://web.archive.org  Fake   
3  https://www.newswise.com  Fake   
4   https://web.archive.org  Fake   

               

In [None]:
# Choose the column that is fed to BERT. The frame has both 'text' (raw
# paragraphs) and 'cleaned_text' (lowercased, punctuation-stripped); this
# notebook uses 'cleaned_text'. Missing entries become empty strings and every
# value is coerced to str so the tokenizer only ever receives strings.
df["cleaned_text"] = df["cleaned_text"].fillna("").astype(str)

# Keep just the model inputs and expose the chosen column under the name
# "text", which is the key the tokenizer function reads.
hf_df = df.loc[:, ["cleaned_text", "label"]].rename(columns={"cleaned_text": "text"})

In [None]:
# Label preparation: drop unlabeled rows, normalise the raw label spellings to
# four named classes, encode those as integer ids, then build the HF dataset.

# Raw label spelling -> canonical class name. Keys are matched verbatim against
# the CSV values (including the misspelled "Realiable"), so do not "correct"
# them here. Labels that have no entry are kept unchanged.
RAW_TO_CLASS = {
    "SUPPORTS":        "Reliable",
    "true":            "Reliable",
    "Realiable":       "Reliable",
    "false":           "Fake",
    "REFUTES":         "Fake",
    "mixture":         "Unreliable",
    "Not Sure":        "Unreliable",
    "mostly-false":    "Unreliable",
    "mostly-true":     "Reliable",
    "unknown":         "NOT ENOUGH INFO",
}

# Canonical class name -> integer id used as the training target.
CLASS_TO_ID = {
    "Unreliable":       0,
    "Fake":             1,
    "Reliable":         2,
    "NOT ENOUGH INFO":  3,
}

# 1. Drop rows without a label. Rebinding to a fresh copy (instead of
#    dropna(inplace=True)) avoids mutating a frame shown by earlier cells.
df = df.dropna(subset=["label"]).copy()

# 2. Encode the labels. The dtype guard makes the cell safe to re-run: once the
#    column already holds integer ids there is nothing left to map.
if not pd.api.types.is_integer_dtype(df["label"]):
    class_names = df["label"].map(RAW_TO_CLASS).fillna(df["label"])
    label_ids = class_names.map(CLASS_TO_ID)

    # A label outside both dictionaries would turn into NaN and make the
    # integer cast below fail with an unhelpful message; report it by name.
    unmapped = class_names[label_ids.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Labels with no integer mapping: {sorted(map(str, unmapped))}"
        )

    df["label"] = label_ids.astype(int)

# 3. Build the HF dataset from the model inputs only. preserve_index=False keeps
#    the pandas index (no longer contiguous after dropna) out of the dataset.
hf_df = df[["cleaned_text", "label"]].rename(columns={"cleaned_text": "text"})
dataset = Dataset.from_pandas(hf_df, preserve_index=False)

In [None]:
# Class balance: number of rows for each encoded label id.
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,3343
2,2605
0,1794
3,1185


In [None]:
# Number of rows whose label is missing after encoding; expected to be 0.
df["label"].isna().sum()

np.int64(0)

In [None]:
# Distinct label ids present in the frame, in order of first appearance.
df["label"].unique()

array([1, 0, 2, 3])

In [None]:
# Sanity check of the label encoding.

# 1) The class-name -> id mapping the encoded labels are expected to follow
mapping = {
    "Unreliable":      0,
    "Fake":            1,
    "Reliable":        2,
    "NOT ENOUGH INFO": 3
}
print("Expected mapping:", mapping)

# 2) Only the ids 0-3 should be present in df
unique_ids = sorted(df["label"].unique())
print("Unique label values after recoding:", unique_ids)

# 3) Number of examples per class, ordered by id
print("\nCounts per label:")
print(df["label"].value_counts().sort_index())

# 4) (Optional) Peek at the first three rows of each class
for class_name, class_id in mapping.items():
    print(f"\nSample rows for class '{class_name}' ({class_id}):")
    rows_of_class = df[df["label"] == class_id]
    display(rows_of_class.head(3))


Expected mapping: {'Unreliable': 0, 'Fake': 1, 'Reliable': 2, 'NOT ENOUGH INFO': 3}
Unique label values after recoding: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)]

Counts per label:
label
0    1794
1    3343
2    2605
3    1185
Name: count, dtype: int64

Sample rows for class 'Unreliable' (0):


Unnamed: 0,title,text,source,label,complete,cleaned_text
2237,The USA is nothing more than a corporation,"Recently, the Natural News Editors wrote a cla...",naturalnewsblogs.com,0,The USA is nothing more than a corporationRece...,the usa is nothing more than a corporationrece...
2238,A simple approach to begin a healthier lifesty...,This is a continuation from a post that I publ...,naturalnewsblogs.com,0,A simple approach to begin a healthier lifesty...,a simple approach to begin a healthier lifesty...
2239,Seven Foods to Reduce the Risk of Lung Cancer,Lung cancer is one of the most common forms of...,naturalnewsblogs.com,0,Seven Foods to Reduce the Risk of Lung CancerL...,seven foods to reduce the risk of lung cancerl...



Sample rows for class 'Fake' (1):


Unnamed: 0,title,text,source,label,complete,cleaned_text
0,Tiny implantable device short-circuits hunger ...,"MADISON, Wis. -- More than 700 million adults ...",https://web.archive.org,1,Tiny implantable device short-circuits hunger ...,tiny implantable device shortcircuits hunger p...
1,Scientists report CRISPR restores effectivenes...,"Wilmington, DE, December 17, 2018 - The CRISPR...",https://web.archive.org,1,Scientists report CRISPR restores effectivenes...,scientists report crispr restores effectivenes...
2,Probiotics could help millions of patients suf...,About 3 million people in the US are diagnosed...,https://web.archive.org,1,Probiotics could help millions of patients suf...,probiotics could help millions of patients suf...



Sample rows for class 'Reliable' (2):


Unnamed: 0,title,text,source,label,complete,cleaned_text
3237,Illinois General Assembly - Ballotpedia,More Democrats were represented in the table a...,https://ballotpedia.org/Illinois_General_Assembly,2,Illinois General Assembly - BallotpediaMore De...,illinois general assembly ballotpediamore dem...
3239,HyperWar: Antiaircraft Action Summary--Suicide...,There were more suicide attempts present Janua...,http://www.ibiblio.org/hyperwar/USN/rep/Kamika...,2,HyperWar: Antiaircraft Action Summary--Suicide...,hyperwar antiaircraft action summarysuicide at...
3243,HyperWar: Antiaircraft Action Summary--Suicide...,suicide attempts in October were 43 while thos...,http://www.ibiblio.org/hyperwar/USN/rep/Kamika...,2,HyperWar: Antiaircraft Action Summary--Suicide...,hyperwar antiaircraft action summarysuicide at...



Sample rows for class 'NOT ENOUGH INFO' (3):


Unnamed: 0,title,text,source,label,complete,cleaned_text
3241,Illinois General Assembly - Ballotpedia,"In the Illinois Legislature, the democratic pa...",https://ballotpedia.org/Illinois_General_Assembly,3,Illinois General Assembly - BallotpediaIn the ...,illinois general assembly ballotpediain the i...
3242,Power to the Poor: Black-Brown Coalition and t...,The rediscovery of poverty in SC the arrests w...,https://books.google.co.uk/books?id=5RRS9aGqyM...,3,Power to the Poor: Black-Brown Coalition and t...,power to the poor blackbrown coalition and the...
3249,Citizen-General: Jacob Dolson Cox and the Civi...,Division and Army Commander,https://books.google.co.uk/books?id=sHjkAgAAQB...,3,Citizen-General: Jacob Dolson Cox and the Civi...,citizengeneral jacob dolson cox and the civil ...


In [None]:
# Re-check: count of missing labels; expected to be 0.
df["label"].isna().sum()

np.int64(0)

In [None]:
# Re-check: distinct label ids, in order of first appearance.
df["label"].unique()

array([1, 0, 2, 3])

In [None]:

# Tokenization. `dataset.map(..., batched=True)` hands tokenize_fn a batch, so
# `examples["text"]` is a list of strings rather than a single string.
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_fn(examples):
    """Tokenize one batch of examples.

    Args:
        examples: batch mapping with a "text" entry holding the input strings.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 512 tokens.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encoding_options)


# Tokenize the whole dataset, then rename the target column to "labels".
tokenized_ds = dataset.map(tokenize_fn, batched=True).rename_column("label", "labels")

Map:   0%|          | 0/8927 [00:00<?, ? examples/s]

In [None]:
# Step 6: Split into train/test (80% / 20%)
# The seed is fixed so the split is identical on every run. Without it,
# re-running this cell after training would shuffle examples the model was
# trained on into the evaluation set, and metrics would not be comparable
# between runs.
split    = tokenized_ds.train_test_split(test_size=0.2, seed=42)
train_ds = split["train"]
eval_ds  = split["test"]


In [None]:
# Step 7: Build the classifier and move it to the selected device
# The classification head gets one output per distinct label id in the frame.
num_labels = df["label"].nunique()
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
import os

# Switch off the Weights & Biases integration for this session.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
from transformers import TrainingArguments, Trainer

# Step 8: Fine-tune the classifier
training_args = TrainingArguments(
    output_dir="./bert_misinformation",
    do_train=True,             # enable training
    do_eval=True,              # enable evaluation
    # `eval_steps` only takes effect once an evaluation strategy is selected.
    # The default strategy is "no", which is why the earlier run logged
    # training loss only and never evaluated on eval_ds during training.
    eval_strategy="steps",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_steps=500,         # log every 500 steps
    eval_steps=500,            # evaluate every 500 steps
    save_steps=500,            # checkpoint every 500 steps
    save_total_limit=1,        # keep only the most recent checkpoint on disk
    seed=42,                   # pin the training seed explicitly
    report_to="none",          # supported replacement for the deprecated WANDB_DISABLED variable
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
    # in favour of `processing_class=` -- confirm against the installed version.
    tokenizer=tokenizer,
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
500,0.7139
1000,0.5151
1500,0.4492
2000,0.4028
2500,0.3256
3000,0.2696
3500,0.22


TrainOutput(global_step=3572, training_loss=0.4087547543067548, metrics={'train_runtime': 3030.5686, 'train_samples_per_second': 9.425, 'train_steps_per_second': 1.179, 'total_flos': 7515639142662144.0, 'train_loss': 0.4087547543067548, 'epoch': 4.0})

In [None]:
# Step 9: Save weights as a .pkl and download
# Copy every tensor to the CPU before pickling. Tensors pickled while they live
# on the GPU record that device, so the file could not be unpickled on a
# machine without CUDA. load_state_dict() copies the values onto whatever
# device the receiving model uses, so nothing is lost by saving from the CPU.
state_dict = {
    name: tensor.detach().cpu()
    for name, tensor in model.state_dict().items()
}
with open("model_weights.pkl", "wb") as f:
    pickle.dump(state_dict, f)

# Trigger a browser download of the saved file.
colab_files.download("model_weights.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pickle
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

# Step 10: Reload the saved weights and evaluate on the held-out split.

# Class names indexed by label id (0..3).
class_names = ["Unreliable", "Fake", "Reliable", "NOT ENOUGH INFO"]

# 1) Re-instantiate the model architecture
checkpoint = "bert-base-uncased"
num_labels = len(class_names)  # 4; or df['label'].nunique()
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels
)

# 2) Load the pickled state_dict
# SECURITY: pickle.load can execute arbitrary code from the file. Only load a
# model_weights.pkl that this notebook produced itself.
with open("model_weights.pkl", "rb") as f:
    state_dict = pickle.load(f)
model.load_state_dict(state_dict)
model.to(device)

# 3) Wrap in a Trainer for easy batching (evaluation only, no training)
eval_args = TrainingArguments(
    output_dir="eval",
    per_device_eval_batch_size=8,
    do_train=False,
    do_eval=True,
    report_to="none",   # no experiment-tracking integrations for a pure evaluation run
)

trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=eval_ds,     # your 20% split
    tokenizer=tokenizer,
)

# 4) One inference pass yields both the metrics and the raw predictions.
#    Calling evaluate() and then predict() would run the eval set twice;
#    metric_key_prefix="eval" keeps the "eval_*" metric names.
preds_output = trainer.predict(eval_ds, metric_key_prefix="eval")
metrics = preds_output.metrics
print("🔍 Eval metrics:", metrics)

# 5) Turn logits into class predictions and compute a classification report
logits = preds_output.predictions        # shape (N, num_labels)
y_pred = np.argmax(logits, axis=-1)
y_true = preds_output.label_ids

print("\n📊 Accuracy:", accuracy_score(y_true, y_pred))
print("\n📝 Classification Report:\n")
print(classification_report(
    y_true,
    y_pred,
    # Explicit label ids keep the names aligned with the ids even if some
    # class happens to be absent from the evaluation split.
    labels=list(range(num_labels)),
    target_names=class_names
))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


🔍 Eval metrics: {'eval_loss': 0.7536430954933167, 'eval_model_preparation_time': 0.0028, 'eval_runtime': 51.0028, 'eval_samples_per_second': 35.018, 'eval_steps_per_second': 4.392}

📊 Accuracy: 0.8213885778275476

📝 Classification Report:

                 precision    recall  f1-score   support

     Unreliable       0.93      0.86      0.89       375
           Fake       0.86      0.83      0.84       679
       Reliable       0.76      0.79      0.77       511
NOT ENOUGH INFO       0.70      0.81      0.75       221

       accuracy                           0.82      1786
      macro avg       0.81      0.82      0.82      1786
   weighted avg       0.83      0.82      0.82      1786

