In [1]:
# ---- imports ---------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
# fixed: the module is `sklearn.model_selection` (was `sklearn.mode_selection`)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from sklearn.decomposition import PCA
import tensorflow_hub as hub
import pycaret
from pycaret.classification import *
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import plot_confusion_matrix
#from googletrans import Translator

# ---- global matplotlib styling ----------------------------------------------
# fixed: the settings dictionary is `plt.rcParams`; every `plt.rcParama[...]`
# line raised AttributeError, so none of these settings were ever applied.
plt.style.use('ggplot')
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 14,
    'axes.labelsize': 12,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 12,
    'image.cmap': 'jet',
    'image.interpolation': 'none',
    'figure.figsize': (10, 10),
    'axes.grid': False,
    'lines.linewidth': 2,
    'lines.markersize': 8,
})

# Colour palette (xkcd names). Fixed three malformed entries:
# 'xkcd:pale range' -> 'xkcd:pale orange', 'xkcd: goldenrod' -> 'xkcd:goldenrod',
# 'xkcd"cadet blue' -> 'xkcd:cadet blue'.
colors = ['xkcd:pale orange', 'xkcd:sea blue', 'xkcd:pale red', 'xkcd:sage green',
          'xkcd:terra cotta', 'xkcd:dull purple', 'xkcd:teal', 'xkcd:goldenrod',
          'xkcd:cadet blue', 'xkcd:scarlet']

# Rounded, semi-transparent box style filled with the first palette colour.
bbox_props = dict(boxstyle="round,pad=0.3", fc=colors[0], alpha=.5)

Matplotlib is building the font cache; this may take a moment.


ModuleNotFoundError: No module named 'torch'

In [None]:
# Tag every row of each frame with its class name.
for frame, tag in ((true_data, 'True'), (fake_data, 'Fake')):
    frame['Target'] = [tag] * len(frame)

In [None]:
# Stack both classes, shuffle the rows and renumber them 0..n-1.
# - `pd.concat` replaces `DataFrame.append`, which no longer exists in pandas 2.
# - the stray `.reset(frac=1)` call (not a DataFrame method) is removed.
# - `reset_index(drop=True)` replaces `reset_index().drop(columns=['index'])`.
# - the shuffle is seeded so the position-based split further down is reproducible.
data = (
    pd.concat([true_data, fake_data])
    .sample(frac=1, random_state=2018)
    .reset_index(drop=True)
)

In [None]:
# Pie chart of the class balance.
# NOTE(review): `label_size` is not defined in the visible cells; it is presumably
# the per-class row counts in the order (Fake, True) - confirm against its source.
plt.pie(
    label_size,
    explode=[0.1, 0.1],
    colors=['firebrick', 'navy'],
    startangle=90,  # fixed: keyword was misspelled `stratangle`
    shadow=True,
    labels=['Fake', 'True'],
    autopct='%1.1f%%',
)

In [None]:
# Binary target: 1 for 'Fake', 0 otherwise.
# An explicit integer cast is used because `pd.get_dummies` yields bool columns on
# pandas 2.x (uint8 before), and the labels are later turned into class indices.
data['label'] = (data['Target'] == 'Fake').astype(int)

In [None]:
# 70 % train / 30 % held out (split again into validation and test below),
# stratified on the class so both parts keep the same Fake/True ratio.
# fixed: the closing brackets were misplaced, putting every argument inside `data[...]`.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    data['title'],
    data['label'],
    random_state=2018,
    test_size=0.3,
    stratify=data['Target'],
)

In [None]:
# Halve the held-out part into validation and test sets, stratified on the labels.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=2018,
)

In [None]:
# Matching tokenizer and pre-trained encoder, both from the same checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
bert = AutoModel.from_pretrained('bert-base-uncased')

In [None]:
# Word count of every training title (used to pick a sensible max sequence length).
# fixed: `for in train _text` was a syntax error.
seq_len = [len(title.split()) for title in train_text]

pd.Series(seq_len).hist(bins=40, color='firebrick')
plt.xlabel('Number of Words')
plt.ylabel('Number of texts')

In [None]:
# Every title is padded / truncated to this many tokens.
MAX_LENGTH = 15
# The original (misspelled) name is kept as an alias for backward compatibility.
MAX_LENGHT = MAX_LENGTH


def encode_titles(texts, max_length=MAX_LENGTH):
    """Tokenize a pandas Series of titles with the notebook's `tokenizer`.

    Parameters
    ----------
    texts : object with a ``.tolist()`` method (the split Series above)
        Raw title strings.
    max_length : int
        Length every sequence is padded or truncated to.

    Returns
    -------
    The tokenizer's batch encoding; the cells below read its
    ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_length,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation=True,
    )


# tokenize and encode sequences in the training, validation and test sets
tokens_train = encode_titles(train_text)
tokens_val = encode_titles(val_text)
tokens_test = encode_titles(test_text)

In [None]:
## convert lists to tensors
# Fixes relative to the previous version of this cell:
# - `train_serq` was a typo for `train_seq` (the name the DataLoader cell uses);
# - `train_mask` was built from the *validation* tokens;
# - `val_y` was built from the *test* labels;
# - `train_y`, `val_seq`, `val_mask` and all `test_*` tensors were never created.
# Labels are cast to int64 because they are used as class indices by the loss.

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist(), dtype=torch.long)

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist(), dtype=torch.long)

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist(), dtype=torch.long)

In [None]:
# fixed: `DataLOader` -> `DataLoader`
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#define a batch size
batch_size = 32

# ---- training set -----------------------------------------------------------
# Expects `train_seq`, `train_mask`, `train_y` tensors of equal length.
train_data = TensorDataset(train_seq, train_mask, train_y)

# random order during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# ---- validation set ---------------------------------------------------------
# fixed: `val_data` was never created (the "wrap tensors" step built a sampler
# instead). Expects `val_seq`, `val_mask`, `val_y` tensors of equal length.
val_data = TensorDataset(val_seq, val_mask, val_y)

# fixed order during validation
val_sampler = SequentialSampler(val_data)

# dataloader for validation set
# fixed: was assigned to `val_datloader`, while `evaluate()` reads `val_dataloader`.
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:
# Freeze every BERT weight so no gradients are computed for the encoder.
for frozen_param in bert.parameters():
    frozen_param.requires_grad_(False)

In [None]:
class BERT_Arch(nn.Module):
    """Two-class classification head on top of a BERT-style encoder.

    The encoder's ``'pooler_output'`` is fed through
    Linear(768 -> 512) -> ReLU -> Dropout(0.1) -> Linear(512 -> 2) -> LogSoftmax,
    so the module returns per-class log-probabilities.

    Parameters
    ----------
    bert : nn.Module
        Encoder called as ``bert(sent_id, attention_mask=mask)`` whose result
        exposes a ``'pooler_output'`` entry with 768 features per sample.
    """

    def __init__(self, bert):

        super(BERT_Arch, self).__init__()

        self.bert = bert

        # dropout layer
        # fixed: was stored as `self.dropouty` while forward() uses `self.dropout`
        self.dropout = nn.Dropout(0.1)

        # relu activation function (fixed: `nn.ReLu` does not exist)
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768, 512)

        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(512, 2)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    # define the forward pass
    def forward(self, sent_id, mask):
        """Return log-probabilities with one row per input sequence.

        Parameters
        ----------
        sent_id : tensor of token ids passed to the encoder
        mask : tensor passed to the encoder as ``attention_mask``
        """

        # pass the inputs to the encoder and keep its pooled output
        # fixed: `sent_id. attention_mask=mask` (dot instead of comma) was a syntax error
        cls_hs = self.bert(sent_id, attention_mask=mask)['pooler_output']

        x = self.fc1(cls_hs)

        x = self.relu(x)

        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # apply log-softmax activation
        x = self.softmax(x)

        return x

In [None]:
# `torch.optim.AdamW` replaces the deprecated `transformers.AdamW`.
from torch.optim import AdamW

# Instantiate the classifier around the pre-trained encoder.
# fixed: `model` was used below (and in every later cell) without ever being created.
model = BERT_Arch(bert)

# define the optimizer
# fixed: the call used the undefined name `Adam`.
optimizer = AdamW(model.parameters(), lr=1e-5)      # learning rate

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# compute the class weights
# fixed: `classes` and `y` are passed by keyword; current scikit-learn rejects
# them as positional arguments.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

print("Class Weights:",class_weights)

In [None]:
# class weights as a float tensor, as required by the loss
weights = torch.tensor(class_weights, dtype=torch.float)

# define the loss function
# Negative log-likelihood pairs with the LogSoftmax output of the model.
# fixed: `nn.NNLLoss` -> `nn.NLLLoss`
cross_entropy = nn.NLLLoss(weight=weights)

# number of training epochs
epochs = 10

In [None]:
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``model``, ``train_dataloader``, ``cross_entropy``
    and ``optimizer``.

    Fixes: the whole optimisation step used to be nested inside the
    "every 50 batches" progress check, and the averaging/return statements sat
    inside the batch loop, so the function tried to return after the first batch.

    Returns
    -------
    tuple
        ``(avg_loss, total_preds)`` - the mean batch loss, and the model
        outputs of every batch concatenated along axis 0 as a NumPy array.
    """

    model.train()

    total_loss = 0

    # empty list to save model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))

        # unpack the batch (no device transfer is done here)
        sent_id, mask, labels = batch

        # clear previously calculated gradients
        model.zero_grad()

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # detach from the graph and convert to NumPy
        preds = preds.detach().cpu().numpy()

        # append the model predictions
        total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # predictions are a list of (size of batch, no. of classes) arrays;
    # join them into one (number of samples, no. of classes) array.
    total_preds = np.concatenate(total_preds, axis=0)

    # returns the loss and predictions
    return avg_loss, total_preds

def evaluate():
    """Score the model on every batch of ``val_dataloader`` without training.

    Uses the notebook-level ``model``, ``val_dataloader`` and ``cross_entropy``.

    Fix: the averaging/return statements used to sit inside the batch loop, so
    only the first validation batch was ever evaluated.

    Returns
    -------
    tuple
        ``(avg_loss, total_preds)`` - the mean batch loss, and the model
        outputs of every batch concatenated along axis 0 as a NumPy array.
    """

    print("\nEvaluating...")

    # deactivates dropout layers
    model.eval()

    total_loss = 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

        # progress update every 50 batches.
        if step % 50 == 0 and not step == 0:

            # report progress
            print(' Batch{:>5,} of {:>5,}.'.format(step, len(val_dataloader)))

        # unpack the batch (no device transfer is done here)
        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()

            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # join the per-batch outputs into (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Lowest validation loss seen so far; the weights are saved whenever it improves.
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
    
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, _ = train()
    
    #evaluate model
    valid_loss, _ = evaluate()
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
        
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    # fixed: this line reports the validation loss but was labelled "Training Loss"
    print(f'\nValidation Loss: {valid_loss:.3f}')

In [None]:
# Restore the checkpoint written by the training loop.
path = 'saved_weights.pt'
best_state = torch.load(path)
model.load_state_dict(best_state)

In [None]:
# Predict on the test set with dropout disabled and autograd off.
model.eval()
with torch.no_grad():
    # fixed: `preds + model(...)` discarded the model output; it must be assigned
    preds = model(test_seq, test_mask)
    preds = preds.detach().cpu().numpy()

# most likely class per sample
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))

In [None]:
# `confusion_matrix` was never imported in this notebook.
from sklearn.metrics import confusion_matrix

# scikit-learn expects (y_true, y_pred); the arguments were swapped, which
# transposed the matrix (rows are true classes, columns are predictions).
confusion_matrix(test_y, preds)

In [None]:
# TF-Hub handle of the sentence encoder used for the second approach.
USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(USE_MODULE_URL)

In [None]:
# One embedding row per title, in the same row order as `data`.
all_titles = data['title'].tolist()
data_matrix = embed(all_titles)

In [None]:
# Position-based 80 / 20 split.
# fixed: `.loc[a:b]` is label-based and *inclusive* on both ends, so the row at the
# boundary landed in both train and test; `.iloc` slices are half-open.
split_at = int(len(data) * 0.8)
train_data = data.iloc[:split_at]
test_data = data.iloc[split_at:]

In [None]:
# Fit a 3-component PCA on the embeddings of the training rows only, then project them.
train_matrix = data_matrix[0:len(train_data)]
pca = PCA(n_components=3)
pca_data = pca.fit(train_matrix)
pca_train = pca.transform(train_matrix)

In [None]:
# Tabular view of the projected training rows together with their class name.
pca_columns = {
    'First Component': pca_train[:, 0],
    'Second Component': pca_train[:, 1],
    'Third Component': pca_train[:, 2],
    'Target': train_data.Target,
}
pca_3_data = pd.DataFrame(pca_columns)

In [None]:
# Pairwise scatter plots of the three principal components, coloured by class.
# fixed: the first panel referenced the undefined name `pca_3_3_data`.
# The three copy-pasted panels are drawn by one loop over the component pairs.
component_pairs = [
    ('First Component', 'Second Component'),
    ('First Component', 'Third Component'),
    ('Second Component', 'Third Component'),
]

fig, axes = plt.subplots(1, 3, figsize=(20, 10))
for ax, (x_col, y_col) in zip(axes, component_pairs):
    sns.scatterplot(x=x_col, y=y_col, hue='Target', data=pca_3_data, s=2, ax=ax)
    ax.grid(True)

In [None]:
#pca_3_data['subject']=train_data.subjectenc.astype(float)

In [None]:
# Initialise the pycaret experiment on the PCA features with 'Target' as the label.
setup(
    data=pca_3_data,
    target='Target',
)

In [None]:
# Train and rank the candidate classifiers; keep the best one.
# fixed: the pycaret function is `compare_models`, and the result was stored as
# `best_moedl` although the later cells read `best_model`.
best_model = compare_models()

In [None]:
# Integer-encode the class names of the held-out rows.
le = LabelEncoder()
y_true = le.fit_transform(test_data['Target'])

In [None]:
# fixed: `pca_test` and `y_pred` were used but never computed.
# The held-out rows are the last `len(test_data)` rows of `data`, so the same
# slice of the embedding matrix is projected with the PCA fitted on the training rows.
pca_test = pca.transform(data_matrix[len(data) - len(test_data):])

# NOTE(review): assumes the estimator returned by pycaret accepts the raw PCA
# components and predicts the same integer encoding as `le` - confirm.
y_pred = best_model.predict(pca_test)

# fixed: scikit-learn expects (y_true, y_pred); the arguments were swapped,
# which exchanges precision and recall in the report.
print(classification_report(y_true, y_pred))

In [None]:
# Confusion matrix of the selected model on the projected held-out rows.
# NOTE: `plot_confusion_matrix` was removed in scikit-learn 1.2; on newer versions
# `ConfusionMatrixDisplay.from_estimator` is the replacement.
plot_confusion_matrix(
    best_model,
    pca_test,
    y_true,
    cmap='plasma',
)