In [1]:
import numpy as np
import pandas as pd

In [2]:
real_news = pd.read_csv('D:/kaggle_fake_news/news/True.csv')
fake_news = pd.read_csv('D:/kaggle_fake_news/news/Fake.csv')

In [3]:
real_news['Fake'] = 0
fake_news['Fake']  = 1

In [4]:
df = pd.concat([real_news,fake_news])

In [5]:
df.head()

Unnamed: 0,title,text,subject,date,Fake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [6]:
df.tail()

Unnamed: 0,title,text,subject,date,Fake
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",1
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",1
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",1
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",1
23480,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",1


In [7]:
df.to_csv('Dataset.csv')

In [8]:
#creating folds

from sklearn import model_selection

df = pd.read_csv('D:/kaggle_fake_news/news/Dataset.csv')
#creating new column called kfold
df ['kfold'] = -1

#next step is to randomize rows of the data

df = df.sample(frac=1).reset_index(drop = True)

#fetch labels or targetvalues

y = df.Fake.values

#intiating the Kfold class from model_selection

kf = model_selection.StratifiedKFold(n_splits = 5)

#fill the new kfold column

for f, (t_,v_) in enumerate(kf.split(X=df,y=y)):
    df.loc[v_,'kfold'] = f

#saving new csv with kfold column

df.to_csv('dataset_folds.csv', index = False)


In [9]:
df = pd.read_csv('D:/kaggle_fake_news/news/dataset_folds.csv')

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,Fake,kfold
0,13869,WEDDING CRASHERS: Hillary Tries To Explain Why...,Who goes to a wedding and doesn t bring a gift...,politics,"May 20, 2016",1,0
1,11945,At least 65 media workers killed doing their j...,BERLIN (Reuters) - At least 65 media workers a...,worldnews,"December 19, 2017",0,0
2,12832,"Erdogan, Putin say U.S. decision on Jerusalem ...",ANKARA (Reuters) - Turkish President Tayyip Er...,worldnews,"December 7, 2017",0,0
3,13429,HERE’S HOW HILLARY’S VP PICK Has Just Proven H...,Hillary s VP pick is proving himself to be a a...,politics,"Jul 26, 2016",1,0
4,6749,Fed turns to Trump agenda with rate hike nearl...,WASHINGTON (Reuters) - The Federal Reserve ina...,politicsNews,"December 12, 2016",0,0


In [11]:
df.drop(columns=['title', 'subject','date'], inplace = True)

In [12]:
#we create a simple dataset class Dataset class returns one sample of the training or validation data. 

import torch

class NEWSData:
    def __init__(self,texts,targets):
        self.texts = texts
        self.target = targets
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self,item):
        #for any given item, which is an int,  
        #return review and targets as torch tensor  
        #item is the index of the item in concern
        text = self.texts[item,:]
        target = self.target[item]
        
        return{
            'text': torch.tensor(text,dtype=torch.long),
            'target':torch.tensor(target,dtype = torch.float)            
        }

        
        

In [13]:
#we can create lstm class which consists of our LSTM  model. 

import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, embedding_matrix):
        super(LSTM,self).__init__()
        #number of words = number of rows in embedding matrix
        num_words = embedding_matrix.shape[0]
        
        #dimension of embedding matrix is num of columns in matrix
        
        embed_dim = embedding_matrix.shape[1]
        
        #we define an input embedding layer
        
        self.embedding = nn.Embedding(
            num_embeddings=num_words,
            embedding_dim= embed_dim
        )
        
        #embedding matrix is used as weights of the embedding layer
        
        self.embedding.weight = nn.Parameter(
            torch.tensor(
            embedding_matrix, dtype = torch.float32)
        )
        
        #we dont want to train the pretrained embedding
        
        self.embedding.weight.requires_grad = False
        
        #a simple bidirectional LSTM with hidden size of 128
        
        
        self.lstm = nn.LSTM(
            embed_dim,128, bidirectional=True, batch_first=True,
        )
        
        #output layer which is a linear layer we have only one output
        
        self.out = nn.Linear(512,1)
    
    def forward(self,x):
        #passing data through embedding layer
        #input is tokens
        
        x = self.embedding(x)
        
        #moving embedding output to lstm
        
        x,_ = self.lstm(x)
        
        #apply mean and max pooling on lstm output
        
        avg_pool = torch.mean(x,1)
        max_pool, _  = torch.max(x,1)
        
        #concat mean and max pool 
        #this is why the size is 512
        #128 for each direction
        #avg_pool = 256 and max_pool = 256
        
        out = torch.cat((avg_pool,max_pool),1)
        
        #pass through the output layer and return the output
        
        out = self.out(out)
        
        #return linear output
        
        return out


In [14]:
#Now, we create  our training and evaluation functions. 


In [15]:
def train(data_loader,model,optimizer,device):
    #set model to training mode
    print('setting model to training mode')
    model.train()
    for data in data_loader:
        texts = data['text']
        targets = data['target']
        
        #move the data to device
        
        texts = texts.to(device, dtype = torch.long)
        targets  = targets.to(device, dtype = torch.float32)
        
        #clear the gradients
        optimizer.zero_grad()
        
        #make predictions from model
        
        predictions = model(texts)
        
        #calc loss
        
        loss = nn.BCEWithLogitsLoss()(
        predictions,targets.view(-1,1))
        
        #compute grad wrt all parameters of model that are trainable
        
        loss.backward()
        
        #single optimization step
        
        optimizer.step()
    
def evaluate(data_loader,model,device):
    #initialize list to store prediction and targets
    
    final_predictions = []
    final_targets = []
    
    #put model in eval mode
    
    model.eval()
    #disable gradient cal
    with torch.no_grad():
        for data in data_loader:
            texts = data['text']
            targets = data['target']
            
            
            texts = texts.to(device, dtype = torch.long)
            
            targets = targets.to(device)
            
            #make predictions
            
            predictions = model(texts)
            
            #move prediction and target to list
            
            predictions = predictions.cpu().numpy().tolist()
            targets = data['target'].cpu().numpy().tolist()
            final_predictions.extend(predictions)
            final_targets.extend(targets)
    
    return final_predictions,final_targets
        

In [16]:
#These functions will help us in training multiple folds. 
import io
import tensorflow as tf
from sklearn import metrics

In [17]:
def load_vectors(fname):
    fin = io.open(fname,'r',errors = 'ignore')
    
    data = {}
    for line in fin:
        values = line.split(' ')
        word = values[0] ## The first entry is the word
        coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
        data[word] = coefs
    return data
    



In [18]:
def create_embedding_matrix(word_index, embedding_dict):
    #intialize matrix with zeros
    
    embedding_matrix = np.zeros((len(word_index)+1,300))
    
    #loop over all the words
    
    for word, i in word_index.items():
        #if word is found in pretrained embedding update the matrix
        #if not found the vector is zero
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]
    #return embedding matrix
    return embedding_matrix

In [19]:

MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 10



def run(df,fold):
    train_df = df[df.kfold != fold].reset_index(drop = True)
    valid_df = df[df.kfold == fold].reset_index(drop = True)
    
    print('Fitting tokenizer')
    
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.text.values.tolist())
    
    xtrain = tokenizer.texts_to_sequences(train_df.text.values)
    xtest = tokenizer.texts_to_sequences(valid_df.text.values)
    
    
    #zero padding the training seq given the max length this padding is done on left side
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain, maxlen=MAX_LEN)
    
    xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest, maxlen=MAX_LEN)
    
    #initialize dataset class for training
    
    train_dataset = NEWSData(
        texts = xtrain,
        targets = train_df.Fake.values
    )
    
    #create torh data loader for training 
    
    train_data_loader = torch.utils.data.DataLoader(
                    train_dataset,
                    batch_size=TRAIN_BATCH_SIZE,
                    num_workers = 0
                    )
    
    #initialize dataset class for validation
    
    valid_dataset = NEWSData(
        texts = xtest,
        targets = valid_df.Fake.values
    )
    
    #create torh data loader for validation 
    
    valid_data_loader = torch.utils.data.DataLoader(
                    valid_dataset,
                    batch_size=VALID_BATCH_SIZE,
                    num_workers = 0
                    )
    
    print('Loading Embedding')
    
    embedding_dict = load_vectors('D:/kaggle_fake_news/glove.6B/glove.6B.300d.txt')
    embedding_matrix = create_embedding_matrix(
            
        tokenizer.word_index,embedding_dict)
    
    #create torch device
    
    device = torch.device('cuda')
    
    #fetching our lstm model
    
    model = LSTM(embedding_matrix)
    
    #send model to device
    model.to(device)
    
    #initialize adam optimizer
    optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)
    
    print('training model')
    
    
    #set best accuracy to zero
    
    best_accuracy = 0
    
    #set early stopping counter to zero
    
    early_stopping_counter = 0
    
    for epoch in range(EPOCHS):
        train(train_data_loader,model,optimizer,device)
        
        outputs, targets = evaluate(valid_data_loader,model,device)
        
        outputs = np.array(outputs) >= 0.5
        #calc accuracy
        
        accuracy = metrics.accuracy_score(targets,outputs)
        
        print(f'FOLD:{fold},EPOCH:{epoch},Accuracy Score = {accuracy}')
        
        #simple early stopping
        
        if accuracy>best_accuracy:
            torch.save(model.state_dict(),"model.bin")
            best_accuracy = accuracy
        else:
            early_stopping_counter +=1
        if early_stopping_counter >2:
            break
            
        
    
    

In [20]:
run(df,fold = 0)

Fitting tokenizer
Loading Embedding
training model
setting model to training mode
FOLD:0,EPOCH:0,Accuracy Score = 0.983630289532294
setting model to training mode
FOLD:0,EPOCH:1,Accuracy Score = 0.9902004454342984
setting model to training mode
FOLD:0,EPOCH:2,Accuracy Score = 0.9906458797327394
setting model to training mode
FOLD:0,EPOCH:3,Accuracy Score = 0.9925389755011136
setting model to training mode
FOLD:0,EPOCH:4,Accuracy Score = 0.9927616926503341
setting model to training mode
FOLD:0,EPOCH:5,Accuracy Score = 0.9933184855233853
setting model to training mode
FOLD:0,EPOCH:6,Accuracy Score = 0.9934298440979955
setting model to training mode
FOLD:0,EPOCH:7,Accuracy Score = 0.9920935412026726
setting model to training mode
FOLD:0,EPOCH:8,Accuracy Score = 0.9898663697104677
setting model to training mode
FOLD:0,EPOCH:9,Accuracy Score = 0.994097995545657


In [21]:
run(df,fold = 1)

Fitting tokenizer
Loading Embedding
training model
setting model to training mode
FOLD:1,EPOCH:0,Accuracy Score = 0.9822939866369711
setting model to training mode
FOLD:1,EPOCH:1,Accuracy Score = 0.9893095768374165
setting model to training mode
FOLD:1,EPOCH:2,Accuracy Score = 0.9908685968819599
setting model to training mode
FOLD:1,EPOCH:3,Accuracy Score = 0.9926503340757238
setting model to training mode
FOLD:1,EPOCH:4,Accuracy Score = 0.9919821826280624
setting model to training mode
FOLD:1,EPOCH:5,Accuracy Score = 0.9929844097995546
setting model to training mode
FOLD:1,EPOCH:6,Accuracy Score = 0.9934298440979955
setting model to training mode
FOLD:1,EPOCH:7,Accuracy Score = 0.9934298440979955
setting model to training mode
FOLD:1,EPOCH:8,Accuracy Score = 0.9914253897550112


In [22]:
run(df,fold = 2)

Fitting tokenizer
Loading Embedding
training model
setting model to training mode
FOLD:2,EPOCH:0,Accuracy Score = 0.9850779510022272
setting model to training mode
FOLD:2,EPOCH:1,Accuracy Score = 0.9907572383073496
setting model to training mode
FOLD:2,EPOCH:2,Accuracy Score = 0.9927616926503341
setting model to training mode
FOLD:2,EPOCH:3,Accuracy Score = 0.9910913140311804
setting model to training mode
FOLD:2,EPOCH:4,Accuracy Score = 0.9929844097995546
setting model to training mode
FOLD:2,EPOCH:5,Accuracy Score = 0.9927616926503341
setting model to training mode
FOLD:2,EPOCH:6,Accuracy Score = 0.9929844097995546


In [23]:
run(df,fold = 3)

Fitting tokenizer
Loading Embedding
training model
setting model to training mode
FOLD:3,EPOCH:0,Accuracy Score = 0.9842966922819913
setting model to training mode
FOLD:3,EPOCH:1,Accuracy Score = 0.9913130638155697
setting model to training mode
FOLD:3,EPOCH:2,Accuracy Score = 0.9915358057690166
setting model to training mode
FOLD:3,EPOCH:3,Accuracy Score = 0.9919812896759105
setting model to training mode
FOLD:3,EPOCH:4,Accuracy Score = 0.9930949994431452
setting model to training mode
FOLD:3,EPOCH:5,Accuracy Score = 0.9914244347922931
setting model to training mode
FOLD:3,EPOCH:6,Accuracy Score = 0.9929836284664216
setting model to training mode
FOLD:3,EPOCH:7,Accuracy Score = 0.9925381445595278


In [24]:
run(df,fold = 4)

Fitting tokenizer
Loading Embedding
training model
setting model to training mode
FOLD:4,EPOCH:0,Accuracy Score = 0.9824033856776924
setting model to training mode
FOLD:4,EPOCH:1,Accuracy Score = 0.9879719345138657
setting model to training mode
FOLD:4,EPOCH:2,Accuracy Score = 0.991869918699187
setting model to training mode
FOLD:4,EPOCH:3,Accuracy Score = 0.9922040316293574
setting model to training mode
FOLD:4,EPOCH:4,Accuracy Score = 0.9912016928388462
setting model to training mode
FOLD:4,EPOCH:5,Accuracy Score = 0.9904220960017819
setting model to training mode
FOLD:4,EPOCH:6,Accuracy Score = 0.993540483350039
setting model to training mode
FOLD:4,EPOCH:7,Accuracy Score = 0.9928722574896982
