Package Installations

In [1]:
!pip install kaggle



In [2]:
!pip install transformers



In [3]:
!pip install -U torchtext==0.8.0



In [4]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import random
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import transformers
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [5]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
seed_val = 500
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

Kaggle Dataset

In [7]:
# Create the kaggle directory and read the uploaded kaggle.json file
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [8]:
!chmod 600 /root/.kaggle/kaggle.json

In [9]:
# Download dataset
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Downloading fake-and-real-news-dataset.zip to /content
 71% 29.0M/41.0M [00:00<00:00, 89.7MB/s]
100% 41.0M/41.0M [00:00<00:00, 94.6MB/s]


In [10]:
# Unzip folder in Colab content folder
!unzip /content/fake-and-real-news-dataset.zip

Archive:  /content/fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [11]:
df1 = pd.read_csv("Fake.csv")
df2 = pd.read_csv("True.csv")
df1["y"] = ["False"]*len(df1)
df2["y"] = ["True"]*len(df2)

In [12]:
frames = [df1, df2]
corpus = pd.concat(frames, ignore_index=True)

In [13]:
corpus["x"] = corpus["title"]

In [14]:
y_encoder = LabelEncoder()
corpus['y'] = y_encoder.fit_transform(corpus['y'])

In [15]:
corpus = corpus[["x", "y"]]

In [16]:
corpus.drop_duplicates(inplace=True)
corpus

Unnamed: 0,x,y
0,Donald Trump Sends Out Embarrassing New Year’...,0
1,Drunk Bragging Trump Staffer Started Russian ...,0
2,Sheriff David Clarke Becomes An Internet Joke...,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,0
4,Pope Francis Just Called Out Donald Trump Dur...,0
...,...,...
44892,North Korea shipments to Syria chemical arms a...,1
44894,LexisNexis withdrew two products from Chinese ...,1
44895,Minsk cultural hub becomes haven from authorities,1
44896,Vatican upbeat on possibility of Pope Francis ...,1


**Reddit Dataset**

In [17]:
reddit_df = pd.read_csv('/content/news_posts.csv')
reddit_df

Unnamed: 0.1,Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL
0,0,People Are Accusing Robinhood Of Stealing From...,,l7afyx,181017,4408,https://www.buzzfeednews.com/article/clarissaj...
1,1,US Military Could Lose Space Force Trademark t...,,gyzw2p,129257,2844,https://www.cbr.com/us-military-lose-space-for...
2,2,White House threatens to fire anyone who tries...,,jrskag,126468,4141,https://americanindependent.com/white-house-th...
3,3,Meta's threat to close down Facebook and Insta...,,so0ree,126258,3786,https://www.cityam.com/metas-threat-to-close-d...
4,4,Don't eat or inject yourself with disinfectant...,,g6zci5,125437,7024,https://www.cnn.com/world/live-news/coronaviru...
...,...,...,...,...,...,...,...
4898,987,Colorado pizza delivery driver saves mans life...,,3b37gf,24221,898,http://www.postindependent.com/news/16943384-1...
4899,988,"Utah Woman Donates Over 1,200 Handmade Toys to...",,90pbng,24216,888,https://www.insideedition.com/utah-woman-donat...
4900,989,Year of the Tiger marks increase in tiger popu...,,sinyqg,24197,205,https://democratic-europe.eu/2022/02/01/%ef%bf...
4901,990,Florida cops deliver dresses made by a 99-year...,,8q8njr,24164,183,http://www.miamiherald.com/news/state/florida/...


In [18]:
reddit_df['x'] = reddit_df['Title']

In [19]:
reddit_df['y'] = ["True"]*4903
reddit_df['y'] = y_encoder.transform(reddit_df['y'])

In [20]:
reddit_df = reddit_df[["x", "y"]]
reddit_df.drop_duplicates(inplace=True)
reddit_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,x,y
0,People Are Accusing Robinhood Of Stealing From...,1
1,US Military Could Lose Space Force Trademark t...,1
2,White House threatens to fire anyone who tries...,1
3,Meta's threat to close down Facebook and Insta...,1
4,Don't eat or inject yourself with disinfectant...,1
...,...,...
4898,Colorado pizza delivery driver saves mans life...,1
4899,"Utah Woman Donates Over 1,200 Handmade Toys to...",1
4900,Year of the Tiger marks increase in tiger popu...,1
4901,Florida cops deliver dresses made by a 99-year...,1


**DistilBERT**

In [21]:
pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.6-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 5.4 MB/s 
Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 1.2 MB/s 
[?25hCollecting wandb>=0.10.32
  Downloading wandb-0.12.14-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 36.9 MB/s 
Collecting streamlit
  Downloading streamlit-1.8.1-py2.py3-none-any.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 29.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 40.0 MB/s 
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.3 MB/s 
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.10-py2.py3-none-any.whl (144 kB)
[K     |█████

In [22]:
from simpletransformers.classification import MultiLabelClassificationModel


# Create a ClassificationModel
model = MultiLabelClassificationModel('electra', 'google/electra-small-discriminator', num_labels=2)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForMultiLabelSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForMultiLabelSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [23]:
# Create a TransformerModel with modified attributes
from simpletransformers.model import TransformerModel
model = TransformerModel('electra', 'google/electra-small-discriminator', num_labels=2, args={'learning_rate':1e-5, 'num_train_epochs': 3, 'reprocess_input_data': True, 'overwrite_output_dir': True})

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [24]:
train = corpus
test = reddit_df

In [25]:
model.train_model(train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/38729 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/4842 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/4842 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/4842 [00:00<?, ?it/s]

(14526, 0.029993916531781364)

In [26]:
from sklearn.metrics import f1_score, accuracy_score


def f1_multiclass(labels, preds):
    return f1_score(labels, preds, average='micro')
    
result, model_outputs, wrong_predictions = model.eval_model(test, f1=f1_multiclass, acc=accuracy_score)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/4903 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/613 [00:00<?, ?it/s]



In [27]:
print(result)
# print(model_outputs)
# print(wrong_predictions)

{'mcc': 0.0, 'tp': 4048, 'tn': 0, 'fp': 0, 'fn': 855, 'auroc': nan, 'auprc': 1.0, 'f1': 0.825616969202529, 'acc': 0.825616969202529, 'eval_loss': 1.6084279439626277}
