Package Installations

In [1]:
!pip install kaggle



In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 53.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 37.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [3]:
# Replace the preinstalled torchtext with the pinned 0.8.0 release.
# NOTE(review): torchtext is not imported anywhere in the visible notebook - confirm this cell is still needed.
!pip install -U torchtext==0.8.0

Collecting torchtext==0.8.0
  Downloading torchtext-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 4.7 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.11.0
    Uninstalling torchtext-0.11.0:
      Successfully uninstalled torchtext-0.11.0
Successfully installed torchtext-0.8.0


Imports

In [4]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import random
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [5]:
import nltk
# Fetch the NLTK data packages by name: the punkt tokenizer models, WordNet,
# the averaged-perceptron POS tagger and the stopword lists.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with one shared value.
seed_val = 500
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed_val)

Kaggle Dataset

In [7]:
# Create the kaggle directory and read the uploaded kaggle.json file
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [8]:
!chmod 600 /root/.kaggle/kaggle.json

In [9]:
# Download the fake-and-real-news dataset archive from Kaggle using the CLI
# (requires the kaggle.json credentials copied in the earlier cell).
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Downloading fake-and-real-news-dataset.zip to /content
 98% 40.0M/41.0M [00:00<00:00, 223MB/s]
100% 41.0M/41.0M [00:00<00:00, 209MB/s]


In [10]:
# Unzip folder in Colab content folder
!unzip /content/fake-and-real-news-dataset.zip

Archive:  /content/fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [11]:
# Read the fake and real article files and tag every row with its class as a string label.
df1 = pd.read_csv("Fake.csv").assign(y="False")
df2 = pd.read_csv("True.csv").assign(y="True")

In [12]:
# Stack the fake rows on top of the real rows and renumber the index from zero.
frames = [df1, df2]
corpus = pd.concat(objs=frames, axis=0, ignore_index=True)

In [13]:
# The article title is the only text fed to the model; expose it as column "x".
corpus = corpus.assign(x=corpus["title"])

In [14]:
# Fit the encoder on the string labels once, then replace them with their integer codes.
# The fitted `y_encoder` is reused for the other datasets.
y_encoder = LabelEncoder().fit(corpus['y'])
corpus['y'] = y_encoder.transform(corpus['y'])

In [15]:
# Keep only the input text and label. Take an explicit copy so later in-place
# operations act on an independent frame rather than on a slice of the original.
corpus = corpus[["x", "y"]].copy()

In [16]:
# Remove repeated (title, label) rows. Reassigning instead of using inplace=True
# avoids mutating a frame that was derived by column selection.
corpus = corpus.drop_duplicates()
corpus

Unnamed: 0,x,y
0,Donald Trump Sends Out Embarrassing New Year’...,0
1,Drunk Bragging Trump Staffer Started Russian ...,0
2,Sheriff David Clarke Becomes An Internet Joke...,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,0
4,Pope Francis Just Called Out Donald Trump Dur...,0
...,...,...
44892,North Korea shipments to Syria chemical arms a...,1
44894,LexisNexis withdrew two products from Chinese ...,1
44895,Minsk cultural hub becomes haven from authorities,1
44896,Vatican upbeat on possibility of Pope Francis ...,1


**Reddit Dataset**

In [17]:
# Load the Reddit news posts file and display it.
reddit_df = pd.read_csv(filepath_or_buffer='/content/news_posts.csv')
reddit_df

Unnamed: 0.1,Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL
0,0,People Are Accusing Robinhood Of Stealing From...,,l7afyx,181017,4408,https://www.buzzfeednews.com/article/clarissaj...
1,1,US Military Could Lose Space Force Trademark t...,,gyzw2p,129257,2844,https://www.cbr.com/us-military-lose-space-for...
2,2,White House threatens to fire anyone who tries...,,jrskag,126468,4141,https://americanindependent.com/white-house-th...
3,3,Meta's threat to close down Facebook and Insta...,,so0ree,126258,3786,https://www.cityam.com/metas-threat-to-close-d...
4,4,Don't eat or inject yourself with disinfectant...,,g6zci5,125437,7024,https://www.cnn.com/world/live-news/coronaviru...
...,...,...,...,...,...,...,...
4898,987,Colorado pizza delivery driver saves mans life...,,3b37gf,24221,898,http://www.postindependent.com/news/16943384-1...
4899,988,"Utah Woman Donates Over 1,200 Handmade Toys to...",,90pbng,24216,888,https://www.insideedition.com/utah-woman-donat...
4900,989,Year of the Tiger marks increase in tiger popu...,,sinyqg,24197,205,https://democratic-europe.eu/2022/02/01/%ef%bf...
4901,990,Florida cops deliver dresses made by a 99-year...,,8q8njr,24164,183,http://www.miamiherald.com/news/state/florida/...


In [18]:
# Use the post title as the model input text.
reddit_df = reddit_df.assign(x=reddit_df['Title'])

In [19]:
# Every Reddit post is labelled "True". Size the label list from the frame itself
# rather than a hard-coded row count, so a different CSV does not raise a length mismatch.
reddit_df['y'] = ["True"] * len(reddit_df)
# Map the string label to its integer code with the encoder fitted on the Kaggle corpus.
reddit_df['y'] = y_encoder.transform(reddit_df['y'])

In [20]:
# Keep only the input/label columns and drop repeated rows. Chaining on the selection
# (instead of an in-place call on it) avoids pandas' SettingWithCopyWarning.
reddit_df = reddit_df[["x", "y"]].drop_duplicates()
reddit_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,x,y
0,People Are Accusing Robinhood Of Stealing From...,1
1,US Military Could Lose Space Force Trademark t...,1
2,White House threatens to fire anyone who tries...,1
3,Meta's threat to close down Facebook and Insta...,1
4,Don't eat or inject yourself with disinfectant...,1
...,...,...
4898,Colorado pizza delivery driver saves mans life...,1
4899,"Utah Woman Donates Over 1,200 Handmade Toys to...",1
4900,Year of the Tiger marks increase in tiger popu...,1
4901,Florida cops deliver dresses made by a 99-year...,1


Twitter Dataset

In [21]:
# Read the fake and real tweet files and tag every row with its class as a string label.
twitter_df1 = pd.read_csv("/content/fake_news_tweets.csv").assign(y="False")
twitter_df2 = pd.read_csv("/content/real_news_tweets.csv").assign(y="True")

In [22]:
# Use the tweet text as the model input in both frames.
for tweet_frame in (twitter_df1, twitter_df2):
    tweet_frame['x'] = tweet_frame['Text']

In [23]:
# Stack the fake tweets on top of the real tweets and renumber the index from zero.
twitter_df = pd.concat(objs=(twitter_df1, twitter_df2), axis=0, ignore_index=True)

In [24]:
# Convert the string labels to integer codes with the encoder fitted on the Kaggle corpus.
twitter_df = twitter_df.assign(y=y_encoder.transform(twitter_df['y']))

In [25]:
# Keep only the input/label columns and drop repeated rows. Chaining on the selection
# (instead of an in-place call on it) avoids pandas' SettingWithCopyWarning.
twitter_df = twitter_df[["x", "y"]].drop_duplicates()
twitter_df

Unnamed: 0,x,y
0,Grandpa Surprisingly Willing To Talk About Man...,0
1,Nation Attempts To Fall Asleep By Doing Little...,0
2,"When it comes to pip blips, Megan Fox takes th...",0
3,Longtime Sleepytime Tea Addict Has To Use 6 Ba...,0
4,The Onion: Now on another part of your phone. ...,0
...,...,...
9995,The soldiers on the front line of the Ukraine-...,1
9996,What's Putin's next move?,1
9997,"Virgin Hyperloop, a futuristic train service e...",1
9998,Metaverse app allows kids into virtual strip c...,1


Data Preparation for Training

In [26]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Display the accelerator name. Only query CUDA when it is present, because
# torch.cuda.get_device_name raises on a machine without a GPU.
torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu"

'Tesla T4'

In [27]:
# Pull the training text and labels out of the frame as NumPy arrays.
text = corpus["x"].to_numpy()
labels = corpus["y"].to_numpy()

In [28]:
# Pull the Reddit text and labels out of the frame as NumPy arrays.
reddit_text = reddit_df["x"].to_numpy()
reddit_labels = reddit_df["y"].to_numpy()

In [29]:
# Pull the Twitter text and labels out of the frame as NumPy arrays.
twitter_text = twitter_df["x"].to_numpy()
twitter_labels = twitter_df["y"].to_numpy()

In [30]:
# Load the tokenizer that belongs to the "google/canine-c" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google/canine-c",
)

Downloading:   0%|          | 0.00/892 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657 [00:00<?, ?B/s]

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [31]:
# Complete tokenization
def _encode_all(texts):
    """Encode every entry of `texts` with the shared tokenizer, special tokens included."""
    return [tokenizer.encode(sample, add_special_tokens=True) for sample in texts]


inputIds = _encode_all(text)
redditInputIds = _encode_all(reddit_text)
twitterInputIds = _encode_all(twitter_text)

In [32]:
# Truncating / padding: bring every id sequence to exactly MAX_LEN entries,
# cutting or zero-filling at the end of the sequence.
MAX_LEN = 512
_pad_options = dict(maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
inputIdsTrunc = pad_sequences(inputIds, **_pad_options)
redditInputIdsTrunc = pad_sequences(redditInputIds, **_pad_options)
twitterInputIdsTrunc = pad_sequences(twitterInputIds, **_pad_options)

In [33]:
# Attention masks: 1 where the id is positive, 0 where it is the padding value 0.
# Stored as nested lists of Python ints.
attentionMasks = (np.asarray(inputIdsTrunc) > 0).astype(int).tolist()
redditAttentionMasks = (np.asarray(redditInputIdsTrunc) > 0).astype(int).tolist()
twitterAttentionMasks = (np.asarray(twitterInputIdsTrunc) > 0).astype(int).tolist()

In [34]:
# Data-type conversion: copy ids, labels and masks of each dataset into torch tensors.
trainInputs, redditInputs, twitterInputs = (
    torch.tensor(ids) for ids in (inputIdsTrunc, redditInputIdsTrunc, twitterInputIdsTrunc)
)

trainLabels, redditLabels, twitterLabels = (
    torch.tensor(targets) for targets in (labels, reddit_labels, twitter_labels)
)

trainMasks, redditMasks, twitterMasks = (
    torch.tensor(mask) for mask in (attentionMasks, redditAttentionMasks, twitterAttentionMasks)
)

In [35]:
batch_size = 4
# Train DataLoader: batches are drawn in random order.
train_data = TensorDataset(trainInputs, trainMasks, trainLabels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Test DataLoader - Reddit: evaluation gains nothing from shuffling, so iterate in
# dataset order to keep prediction order repeatable between runs.
reddit_data = TensorDataset(redditInputs, redditMasks, redditLabels)
reddit_sampler = SequentialSampler(reddit_data)
reddit_dataloader = DataLoader(reddit_data, sampler=reddit_sampler, batch_size=batch_size)
# Test DataLoader - Twitter: same sequential ordering as the Reddit loader.
twitter_data = TensorDataset(twitterInputs, twitterMasks, twitterLabels)
twitter_sampler = SequentialSampler(twitter_data)
twitter_dataloader = DataLoader(twitter_data, sampler=twitter_sampler, batch_size=batch_size)

Model Initialization

In [36]:
# Load the CANINE-c checkpoint with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained("google/canine-c", num_labels = 2)
# Move the model to the device chosen earlier (GPU when available, CPU otherwise)
# instead of unconditionally calling .cuda(), which fails without a GPU.
model.to(device)

Downloading:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of CanineForSequenceClassification were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CanineForSequenceClassification(
  (canine): CanineModel(
    (char_embeddings): CanineEmbeddings(
      (HashBucketCodepointEmbedder_0): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_1): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_2): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_3): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_4): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_5): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_6): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_7): Embedding(16384, 96)
      (char_position_embeddings): Embedding(16384, 768)
      (token_type_embeddings): Embedding(16, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (initial_char_encoder): CanineEncoder(
      (layer): ModuleList(
        (0): CanineLayer(
          (attention): CanineAttention(
            (self): CanineSelfAttention(
            

In [37]:
# Optimizer: AdamW over all model parameters.
optimizer = AdamW(
    params=model.parameters(),
    lr=2e-5,
    eps=1e-8,
)



In [38]:
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps, with no warm-up phase.
epochs = 2
train_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=train_steps,
)
scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x7fd27b84c490>

Training

In [39]:
# Fine-tune for `epochs` passes over the training loader and record the mean
# batch loss of every epoch in `loss_values`.
loss_values = []
num_batches = len(train_dataloader)
for epoch_idx in range(epochs):
    print("")
    print('Epoch: {}'.format(epoch_idx + 1))
    print('Training...')

    model.train()
    total_loss = 0  # running sum of the batch losses of this epoch

    # Every batch is a triple of tensors: (input ids, attention masks, labels).
    for batch in tqdm(train_dataloader):
        batch_input_ids, batch_input_mask, batch_labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()
        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / num_batches
    loss_values.append(avg_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_loss))

print("")
print("Training complete!")


Epoch: 1
Training...


  0%|          | 0/9683 [00:00<?, ?it/s]


  Average training loss: 0.02

Epoch: 2
Training...


  0%|          | 0/9683 [00:00<?, ?it/s]


  Average training loss: 0.01

Training complete!


Prediction on Reddit Dataset

In [40]:
# Run the model in eval mode over the Reddit loader, collecting per-batch
# model scores and the matching gold labels as NumPy arrays.
predictions = []
true_labels = []
model.eval()
for batch in reddit_dataloader:
    batchInputIds, batchInput_mask, batchLabels = (t.to(device) for t in batch)
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(batchInputIds, token_type_ids=None, attention_mask=batchInput_mask)

    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(batchLabels.to('cpu').numpy())

In [41]:
# Join the per-batch arrays, then take the index of the highest score per example.
finalPredictions_reddit = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
finalTrueLabels_reddit = list(np.concatenate(true_labels, axis=0))

Prediction on Twitter Dataset

In [42]:
# Run the model in eval mode over the Twitter loader, collecting per-batch
# model scores and the matching gold labels as NumPy arrays.
predictions = []
true_labels = []
model.eval()
for batch in twitter_dataloader:
    batchInputIds, batchInput_mask, batchLabels = (t.to(device) for t in batch)
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(batchInputIds, token_type_ids=None, attention_mask=batchInput_mask)

    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(batchLabels.to('cpu').numpy())

In [43]:
# Join the per-batch arrays, then take the index of the highest score per example.
finalPredictions_twitter = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
finalTrueLabels_twitter = list(np.concatenate(true_labels, axis=0))

Results - Reddit

In [44]:
# Accuracy on Reddit; scikit-learn metrics take (y_true, y_pred) in that order.
accuracy_score(finalTrueLabels_reddit, finalPredictions_reddit)

0.7942076279828676

In [45]:
# F1 on Reddit; scikit-learn metrics take (y_true, y_pred) in that order.
f1_score(finalTrueLabels_reddit, finalPredictions_reddit)

0.8853018074343526

Results - Twitter

In [46]:
# Accuracy on Twitter; scikit-learn metrics take (y_true, y_pred) in that order.
accuracy_score(finalTrueLabels_twitter, finalPredictions_twitter)

0.8963777055063362

In [47]:
# F1 on Twitter; scikit-learn metrics take (y_true, y_pred) in that order.
f1_score(finalTrueLabels_twitter, finalPredictions_twitter)

0.9018065887353879

Results - Twitter + Reddit

In [48]:
# Accuracy over Reddit and Twitter together; gold labels first, predictions second,
# both concatenated in the same (Reddit, Twitter) order.
accuracy_score(
    list(finalTrueLabels_reddit) + list(finalTrueLabels_twitter),
    list(finalPredictions_reddit) + list(finalPredictions_twitter),
)

0.8601302460202604

In [49]:
# F1 over Reddit and Twitter together; gold labels first, predictions second,
# both concatenated in the same (Reddit, Twitter) order.
f1_score(
    list(finalTrueLabels_reddit) + list(finalTrueLabels_twitter),
    list(finalPredictions_reddit) + list(finalPredictions_twitter),
)

0.8938320426209699

In [50]:
# Persist the fine-tuned model twice: the parameter dictionary alone, and the
# whole model object.
torch.save(obj=model.state_dict(), f="canine_weights.pth")
torch.save(obj=model, f="canine.pth")