In [25]:
import torch
import pandas as pd
from transformers import AutoTokenizer, GPT2ForSequenceClassification

!pip install datasets
from datasets import Dataset




In [26]:
# Report whether CUDA is usable and, when it is, the name of device 0.
has_gpu = torch.cuda.is_available()
print("GPU available:", has_gpu)

if has_gpu:
    device_name = torch.cuda.get_device_name(0)
else:
    device_name = "No GPU found"
print("Device name:", device_name)

GPU available: True
Device name: NVIDIA A100-SXM4-40GB


In [27]:
# Mount Google Drive at /content/drive; the CSV inputs and the model save
# directory used later in this notebook all live under that mount point.
# The `google.colab` module is imported here, right before its only use.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Read the WELFake CSV from the mounted Drive and discard every row that has a
# missing value in any column.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/1684 project/WELFake_Dataset.csv'
).dropna()

# Wrap the cleaned frame as a Hugging Face Dataset so it can be mapped/split.
dataset = Dataset.from_pandas(df)

# Load the GPT-2 tokenizer and make its pad token the end-of-sequence token,
# so that the padding="max_length" tokenization below has a token to pad with.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the "text" entry of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping that has a "text" entry (this function is passed to
            `Dataset.map(..., batched=True)` elsewhere in the notebook).

    Returns:
        Whatever the tokenizer returns for those texts when called with
        padding="max_length" and truncation=True.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize in batches and expose the target column under the name "labels".
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
)
# Have the dataset hand back torch tensors when indexed.
tokenized_dataset.set_format("torch")



Map:   0%|          | 0/71537 [00:00<?, ? examples/s]

In [35]:
# Fixed seed so the 80/10/10 partition is identical on every run. Without it
# each execution reshuffles the rows, so the training set and the reported
# metrics would refer to different examples after every kernel restart.
SPLIT_SEED = 42

# First cut: 80% for training, 20% held out.
train_temp_split = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_temp_split["train"]
temp_dataset = train_temp_split["test"]

# Second cut: halve the held-out 20% into a test set and an evaluation set.
# Note the naming: the "train" half of this second split is the test set.
test_eval_split = temp_dataset.train_test_split(test_size=0.5, seed=SPLIT_SEED)
test_dataset = test_eval_split["train"]
eval_dataset = test_eval_split["test"]

In [36]:
print(train_dataset[0])

{'Unnamed: 0': tensor(16786), 'title': 'Defense Board: White House Blocked Navy From S. China Sea Warship Passages', 'text': 'Washington Free Beacon October 26, 2016 \nSenior White House officials blocked the Navy from conducting needed freedom of navigation operations in the South China Sea amid growing concerns that China is militarizing newly reclaimed islands, according to the Pentagon’s Defense Policy Board. \nA working paper produced in September 2015 by John Hamre, the policy board chairman, called for an immediate resumption of Navy warship passages to prevent China from taking over the strategic Southeast Asian waterway. \nThe internal document was disclosed Monday by WikiLeaks as part of its latest batch of hacked emails from the account of John Podesta, campaign chairman for Democratic presidential nominee Hillary Clinton. The Obama administration has accused “Russia’s senior-most officials” of hacking and leaking the emails posted to WikiLeaks and other sites in order to in

In [37]:
# Use the GPU when one is present; otherwise fall back to the CPU instead of
# failing on a hard-coded "cuda" device (the GPU check earlier in the notebook
# already anticipates a runtime without a GPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# GPT-2 with a 2-way classification head on top.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)

# Tell the model which token id is padding, matching the tokenizer's setting;
# the sequence-classification head uses it to locate the last non-pad token.
model.config.pad_token_id = tokenizer.pad_token_id

# Last expression of the cell: moves the model and displays its structure.
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [43]:
from transformers import Trainer, TrainingArguments

# Training configuration: a single epoch, batches of 32 for both training and
# evaluation, with evaluation and checkpointing once per epoch. The best
# checkpoint is reloaded when training finishes.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Schedule and batch sizes.
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Per-epoch evaluation and saving.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Bind the model, the configuration and the train/eval splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)



In [44]:
# Fine-tune `model` with the settings held by `trainer`. As the last
# expression of the cell, the returned value is displayed below it.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0066,0.002364


TrainOutput(global_step=1789, training_loss=0.013862152883232207, metrics={'train_runtime': 2616.1427, 'train_samples_per_second': 21.875, 'train_steps_per_second': 0.684, 'total_flos': 2.9907503479259136e+16, 'train_loss': 0.013862152883232207, 'epoch': 1.0})

In [101]:
# Persist the fine-tuned model and its tokenizer side by side in one Drive
# directory, so both can later be reloaded from the same path.
model_dir = "/content/drive/MyDrive/Colab Notebooks/1684 project/save"
trainer.save_model(model_dir)
# Last expression of the cell: its return value is displayed as the cell output.
tokenizer.save_pretrained(model_dir)

('/content/drive/MyDrive/Colab Notebooks/1684 project/save/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/1684 project/save/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/1684 project/save/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/1684 project/save/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/1684 project/save/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/1684 project/save/tokenizer.json')

In [46]:
# Run the trained model over the held-out test split.
predictions = trainer.predict(test_dataset)

# Reference labels, and the predicted class taken as the index of the larger
# value along the last axis of the prediction array.
labels = predictions.label_ids
preds = predictions.predictions.argmax(axis=-1)

In [47]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Fraction of examples whose predicted class equals the reference label.
accuracy = accuracy_score(y_true=labels, y_pred=preds)

# Binary-averaged precision / recall / F1; the fourth returned item (support)
# is not used.
prf_scores = precision_recall_fscore_support(
    y_true=labels,
    y_pred=preds,
    average='binary',
)
precision, recall, f1 = prf_scores[:3]

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.9988817444786133
Precision: 0.9994497936726272
Recall: 0.998351195383347
F1 Score: 0.9989001924663184


In [93]:
# Read the two source files of the second evaluation corpus.
fake = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/1684 project/Fake.csv')
true = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/1684 project/True.csv')

# Label convention used here: 1 for rows from Fake.csv, 0 for rows from True.csv.
fake['label'] = 1
true['label'] = 0

# Stack both frames under a fresh 0..N-1 index, then shuffle every row with a
# fixed seed.
combined_frame = pd.concat([fake, true], ignore_index=True)
shuffled_frame = combined_frame.sample(frac=1, random_state=42)

# Replace the text of the row whose index label is 0 with a hand-written
# probe sentence. `.at` addresses by label, not position, so this is the first
# Fake.csv row wherever the shuffle placed it.
shuffled_frame.at[0, 'text'] = "Trump was killed in 2020"

# Convert to a Hugging Face Dataset (tokenization happens in a later cell).
# NOTE: this rebinds `test_dataset`, replacing the earlier held-out split.
test_dataset = Dataset.from_pandas(shuffled_frame)
print(len(test_dataset))


44898


In [94]:
# Tokenize the new test set in batches with the same function used for the
# training data, then have it return torch tensors when indexed.
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset.set_format(type="torch")

Map:   0%|          | 0/44898 [00:00<?, ? examples/s]

In [None]:
# Placeholder cell: emits a single empty line and has no other effect.
print()

In [95]:
# Score the Fake/True test set with the fine-tuned model.
predicto = trainer.predict(test_dataset)

# Reference labels and predicted class (index of the larger value on the last axis).
labels = predicto.label_ids
preds = predicto.predictions.argmax(axis=-1)

# `accuracy_score` comes from the sklearn import made in an earlier cell.
accuracy = accuracy_score(y_true=labels, y_pred=preds)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9998663637578511


In [96]:
# Display the full prediction result (predictions, label ids and metrics) as
# the cell output.
predicto

PredictionOutput(predictions=array([[ -6.848428  ,   5.570065  ],
       [ 12.097086  ,  -0.6287981 ],
       [ 15.074913  ,   0.84104717],
       ...,
       [ 16.527786  ,   1.4619098 ],
       [-11.343193  ,   1.456846  ],
       [-10.032998  ,   3.0558374 ]], dtype=float32), label_ids=array([1, 0, 0, ..., 0, 1, 1]), metrics={'test_loss': 0.0006099226884543896, 'test_runtime': 638.158, 'test_samples_per_second': 70.356, 'test_steps_per_second': 2.2})

In [99]:
# Print the position and text of every example whose predicted class differs
# from its label.
for x, (predicted, expected) in enumerate(zip(preds, labels)):
    if predicted == expected:
        continue
    print(x)
    print(test_dataset[x]['text'])

9443
Trump was killed in 2020
9517
STREET ART HAS APPEARED on the streets surrounding ABC News  Good Morning America studios in New York City. The posters feature Hillary Clinton with a smiling George Stephanopoulos, the ABC News anchor under fire for failing to disclose $75,000 in donations to the Clinton Foundation, along with the words  Pay Pal  and  Donate. They were posted near ABC News studios on  Peter Jennings Way  in Manhattan.
21395
This is not a fluke. After years of towing the union line, life-long Democrat union members are switching their allegiance to Trump. A recent AFL-CIO poll found that Trump has more support than Hillary and Bernie Sanders combined.
22891
 
32350
This is not a fluke. After years of towing the union line, life-long Democrat union members are switching their allegiance to Trump. A recent AFL-CIO poll found that Trump has more support than Hillary and Bernie Sanders combined.
37752
He also dodges the question of sanctuary cities:
