In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
!pip install kaggle



In [4]:
# Create the kaggle directory and read the uploaded kaggle.json file
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [5]:
!chmod 600 /root/.kaggle/kaggle.json

In [6]:
# Download dataset
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Downloading fake-and-real-news-dataset.zip to /content
 63% 26.0M/41.0M [00:00<00:00, 270MB/s]
100% 41.0M/41.0M [00:00<00:00, 293MB/s]


In [7]:
# Unzip folder in Colab content folder
!unzip /content/fake-and-real-news-dataset.zip

Archive:  /content/fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [8]:
!ls

fake-and-real-news-dataset.zip	Fake.csv  kaggle.json  sample_data  True.csv


In [9]:
df1 = pd.read_csv("Fake.csv")
df2 = pd.read_csv("True.csv")
df1["y"] = ["False"]*len(df1)
df2["y"] = ["True"]*len(df2)

In [10]:
frames = [df1, df2]
corpus = pd.concat(frames, ignore_index=True)

In [11]:
corpus

Unnamed: 0,title,text,subject,date,y
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",False
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",False
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",False
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",False
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",False
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",True
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",True
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",True
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",True


In [12]:
corpus.describe()

Unnamed: 0,title,text,subject,date,y
count,44898,44898.0,44898,44898,44898
unique,38729,38646.0,8,2397,2
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017",False
freq,14,627.0,11272,182,23481


In [13]:
corpus["x"] = corpus["title"]

In [14]:
corpus

Unnamed: 0,title,text,subject,date,y,x
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",False,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",False,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",False,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",False,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",False,Pope Francis Just Called Out Donald Trump Dur...
...,...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",True,'Fully committed' NATO backs new U.S. approach...
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",True,LexisNexis withdrew two products from Chinese ...
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",True,Minsk cultural hub becomes haven from authorities
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",True,Vatican upbeat on possibility of Pope Francis ...


In [15]:
#Drop additional columns
corpus.drop(columns=["title","text", "subject", "date"], inplace=True)

#Drop blank rows
corpus['x'].dropna(inplace = True)

#Convert text to lowercase
corpus['x'] = [text.lower() for text in corpus['x']]

#Tokenization
corpus['x'] = [word_tokenize(text) for text in corpus['x']]

#WordNetLemmatizer
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [16]:
corpus

Unnamed: 0,y,x
0,False,"[donald, trump, sends, out, embarrassing, new,..."
1,False,"[drunk, bragging, trump, staffer, started, rus..."
2,False,"[sheriff, david, clarke, becomes, an, internet..."
3,False,"[trump, is, so, obsessed, he, even, has, obama..."
4,False,"[pope, francis, just, called, out, donald, tru..."
...,...,...
44893,True,"['fully, committed, ', nato, backs, new, u.s.,..."
44894,True,"[lexisnexis, withdrew, two, products, from, ch..."
44895,True,"[minsk, cultural, hub, becomes, haven, from, a..."
44896,True,"[vatican, upbeat, on, possibility, of, pope, f..."


In [17]:
membersProcessed = 0
for idx, text in enumerate(corpus['x']):
  finalWords = []
  word_net_lemmatizer = WordNetLemmatizer()
  set_stop = set(stopwords.words('english'))
  iterate = pos_tag(text)
  [finalWords.append(word_net_lemmatizer.lemmatize(word, tag_map[tag[0]])) for word, tag in iterate if word not in set_stop and word.isalpha()]
  corpus.loc[idx, 'finalText'] = str(finalWords)
  membersProcessed+=1
  print('Progress: {}/{} members processed'.format(membersProcessed, len(corpus)))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Progress: 39899/44898 members processed
Progress: 39900/44898 members processed
Progress: 39901/44898 members processed
Progress: 39902/44898 members processed
Progress: 39903/44898 members processed
Progress: 39904/44898 members processed
Progress: 39905/44898 members processed
Progress: 39906/44898 members processed
Progress: 39907/44898 members processed
Progress: 39908/44898 members processed
Progress: 39909/44898 members processed
Progress: 39910/44898 members processed
Progress: 39911/44898 members processed
Progress: 39912/44898 members processed
Progress: 39913/44898 members processed
Progress: 39914/44898 members processed
Progress: 39915/44898 members processed
Progress: 39916/44898 members processed
Progress: 39917/44898 members processed
Progress: 39918/44898 members processed
Progress: 39919/44898 members processed
Progress: 39920/44898 members processed
Progress: 39921/44898 members processed
Progress: 39922

In [18]:
y_encoder = LabelEncoder()
corpus['y'] = y_encoder.fit_transform(corpus['y'])

In [19]:
corpus

Unnamed: 0,y,x,finalText
0,0,"[donald, trump, sends, out, embarrassing, new,...","['donald', 'trump', 'send', 'embarrass', 'new'..."
1,0,"[drunk, bragging, trump, staffer, started, rus...","['drunk', 'brag', 'trump', 'staffer', 'start',..."
2,0,"[sheriff, david, clarke, becomes, an, internet...","['sheriff', 'david', 'clarke', 'become', 'inte..."
3,0,"[trump, is, so, obsessed, he, even, has, obama...","['trump', 'obsessed', 'even', 'obama', 'name',..."
4,0,"[pope, francis, just, called, out, donald, tru...","['pope', 'francis', 'call', 'donald', 'trump',..."
...,...,...,...
44893,1,"['fully, committed, ', nato, backs, new, u.s.,...","['commit', 'nato', 'back', 'new', 'approach', ..."
44894,1,"[lexisnexis, withdrew, two, products, from, ch...","['lexisnexis', 'withdraw', 'two', 'product', '..."
44895,1,"[minsk, cultural, hub, becomes, haven, from, a...","['minsk', 'cultural', 'hub', 'becomes', 'autho..."
44896,1,"[vatican, upbeat, on, possibility, of, pope, f...","['vatican', 'upbeat', 'possibility', 'pope', '..."


In [20]:
from sklearn.model_selection import KFold

In [21]:
kf = KFold(n_splits=10)

In [22]:
X = corpus['finalText']
y = corpus['y']

In [23]:
# acc_ls = []
# for i, (train_index, test_index) in enumerate(kf.split(X)):
#   X_train, X_test = X[train_index], X[test_index]
#   y_train, y_test = y[train_index], y[test_index]
#   TFIDF_vect = TfidfVectorizer(max_features=5000)
#   x_train_tfidf = TFIDF_vect.fit_transform(X_train)
#   x_test_tfidf = TFIDF_vect.transform(X_test)
#   Naive = naive_bayes.MultinomialNB()
#   Naive.fit(x_train_tfidf,y_train)
#   predictions_NB = Naive.predict(x_test_tfidf)
#   acc = accuracy_score(predictions_NB, y_test)
#   acc_ls.append(acc)
#   print("Naive Bayes Accuracy Score " + str(i) + " -> ",acc*100)
# print("Mean Accuracy : ", sum(acc_ls)*100/len(acc_ls))

In [24]:
# acc_ls = []
# for i, (train_index, test_index) in enumerate(kf.split(X)):
#   X_train, X_test = X[train_index], X[test_index]
#   y_train, y_test = y[train_index], y[test_index]
#   TFIDF_vect = TfidfVectorizer(max_features=5000)
#   x_train_tfidf = TFIDF_vect.fit_transform(X_train)
#   x_test_tfidf = TFIDF_vect.transform(X_test)
#   SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
#   SVM.fit(x_train_tfidf, y_train)
#   predictions_SVM = SVM.predict(x_test_tfidf)
#   acc = accuracy_score(predictions_SVM, y_test)
#   acc_ls.append(acc)
#   print("SVM Accuracy Score " + str(i) + " -> ",acc*100)
# print("Mean Accuracy : ", sum(acc_ls)*100/len(acc_ls))

Prelim Results

In [25]:
# x_train, x_test, y_train, y_test = model_selection.train_test_split(corpus['finalText'], corpus['y'], test_size=0.3, shuffle=True)

In [26]:
# TFIDF_vect = TfidfVectorizer(max_features=5000)
# x_train_tfidf = TFIDF_vect.fit_transform(x_train)
# x_test_tfidf = TFIDF_vect.transform(x_test)

In [27]:
# #Naive Bayes Classifier Algorithm
# Naive = naive_bayes.MultinomialNB()
# Naive.fit(x_train_tfidf,y_train)
# # predict the labels on validation dataset
# predictions_NB = Naive.predict(x_test_tfidf)
# # Use accuracy_score function to get the accuracy
# print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

In [28]:
# #SVM
# SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# SVM.fit(x_train_tfidf, y_train)

# predictions_SVM = SVM.predict(x_test_tfidf)

# print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

**TextCNN**

In [29]:
import pandas as pd
import numpy as np
import random

from sklearn import model_selection

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


In [30]:
!pip install -U torchtext==0.8.0

Collecting torchtext==0.8.0
  Downloading torchtext-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 6.4 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.11.0
    Uninstalling torchtext-0.11.0:
      Successfully uninstalled torchtext-0.11.0
Successfully installed torchtext-0.8.0


In [31]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 55.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [32]:
np.random.seed(500)

In [33]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [34]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(corpus['finalText'], corpus['y'], test_size=0.3, shuffle=True)

In [35]:
embed_size = 300 # how big is each word vector
max_features = 120000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 750 # max number of words in a question to use
batch_size = 512 # how many samples to process at once
n_splits = 10 # Number of K-fold Splits

In [36]:
#Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_train))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# Pad the sentences 
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test= pad_sequences(x_test, maxlen=maxlen)

In [37]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!ls -lat

--2022-04-14 00:00:25--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-04-14 00:00:25--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-04-14 00:00:26--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

In [38]:
def load_glove(word_index,embed_size):
    EMBEDDING_FILE = '../content/glove.6B.'+str(embed_size)+'d.txt'
    def get_coefs(word,*arr): 
      return word, np.asarray(arr, dtype='float32')[:embed_size]

    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.005838499,0.48782197
    embed_size_2 = all_embs.shape[1]
    nb_words = min(max_features, len(word_index)+1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size_2))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            embedding_vector = embeddings_index.get(word.capitalize())
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

embedding_matrix = load_glove(tokenizer.word_index,embed_size)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [39]:
np.shape(embedding_matrix)

(13855, 300)

In [40]:
class TextCNN_Classifier(nn.Module):
    
    def __init__(self):
        super(TextCNN_Classifier, self).__init__()
        #kernel size will be filter _size * embedding size
        # we will have 5 filter covering these many words at a time
        self.filter_1 = 1 
        self.filter_2 = 2
        self.filter_3 = 3
        self.filter_4 = 4
        self.filter_5 = 5
        num_filters = 15 # no of output channels
        number_classes = 10
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.conv_1 = nn.Conv2d(1,num_filters,(self.filter_1, embed_size))
        self.conv_2 = nn.Conv2d(1,num_filters,(self.filter_2, embed_size))
        self.conv_3 = nn.Conv2d(1,num_filters,(self.filter_3, embed_size))
        self.conv_4 = nn.Conv2d(1,num_filters,(self.filter_4, embed_size))
        self.conv_5 = nn.Conv2d(1,num_filters,(self.filter_5, embed_size))
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(5*num_filters, number_classes) # number of layers is 5

    def forward(self, x):
        x = self.embedding(x)  
        x =  x.unsqueeze(1)
        x1 =  F.max_pool1d( F.relu(self.conv_1(x)).squeeze(3) , F.relu(self.conv_1(x)).squeeze(3).size(2)).squeeze(2)
        x2 =  F.max_pool1d( F.relu(self.conv_2(x)).squeeze(3) , F.relu(self.conv_2(x)).squeeze(3).size(2)).squeeze(2)
        x3 =  F.max_pool1d( F.relu(self.conv_3(x)).squeeze(3) , F.relu(self.conv_3(x)).squeeze(3).size(2)).squeeze(2)
        x4 =  F.max_pool1d( F.relu(self.conv_4(x)).squeeze(3) , F.relu(self.conv_4(x)).squeeze(3).size(2)).squeeze(2)
        x5 =  F.max_pool1d( F.relu(self.conv_5(x)).squeeze(3) , F.relu(self.conv_5(x)).squeeze(3).size(2)).squeeze(2)
        x = torch.cat((x1,x2,x3,x4,x5),1)
        x = self.dropout(x)
        x = self.fc1(x) 
        return x

In [41]:
n_epochs = 4
model = TextCNN_Classifier()
loss_fn = nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
device = torch.device("cpu")
model.to(device)

# Load train and test in CUDA Memory
x_train_2 = torch.tensor((x_train), dtype=torch.long)
y_train_2 = torch.tensor(y_train.values, dtype=torch.long)
x_cv = torch.tensor(x_test, dtype=torch.long)
y_cv = torch.tensor(y_test, dtype=torch.long)

# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train_2, y_train_2)
valid = torch.utils.data.TensorDataset(x_cv, y_cv)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

train_loss = []
valid_loss = []

for epoch in range(n_epochs):
    # Set model to train configuration
    model.train()
    avg_loss = 0.  
    for i, (x, y) in enumerate(train_loader):
        # Predict/Forward Pass
        y_pred = model(x)
        # Compute loss
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
    
    # Set model to validation configuration -Doesn't get trained here
    model.eval()        

    val_preds = np.zeros((len(x_cv),10))
    #y_pred_2 =[]
    y_true =[]
    preds = []
    labels = []
    for i, (x, y) in enumerate(valid_loader):
      y_pred_2=torch.argmax(model(x), dim = 1).tolist()
      y_pred_2 = map(int, y_pred_2)
      preds.extend(list(y_pred_2))
      labels.extend(y.tolist())
    
    val_accuracy = (np.sum(np.array(preds) == np.array(labels)))/len(preds)
    f1_scorE_sklearn = f1_score(preds,labels,average="weighted")
    print("f1 score is: ")
    print(f1_scorE_sklearn)
    print('Epoch {}/{} \t val_acc={:.4f} \t f1_score={:.4f}'.format(
                epoch + 1, n_epochs, val_accuracy, f1_scorE_sklearn))


f1 score is: 
0.8511995391514724
Epoch 1/4 	 val_acc=0.8509 	 f1_score=0.8512
f1 score is: 
0.8900003078978742
Epoch 2/4 	 val_acc=0.8900 	 f1_score=0.8900
f1 score is: 
0.9100687186795319
Epoch 3/4 	 val_acc=0.9101 	 f1_score=0.9101
f1 score is: 
0.9223095081111624
Epoch 4/4 	 val_acc=0.9223 	 f1_score=0.9223
