In [None]:
!pip install transformers
!pip install wandb

In [4]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModel
from typing import List
from torch.utils.data import DataLoader
from tqdm import tqdm

In [5]:
# Path of the CSV file that is read into `news_df` further down.
DATA_PATH = 'data/WELFake_Dataset.csv'
# Batch size handed to both DataLoaders.
BATCH_SIZE = 1
# Learning rate given to the SGD optimizer inside train().
LR = 1e-4
# Number of passes train() makes over the training DataLoader.
EPOCH = 10
# Pick the GPU when PyTorch reports one as available, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
# Load the raw CSV and clean it in one chain: exact duplicate rows are removed
# first, then every row that still contains a missing value.
news_df = (
    pd.read_csv(DATA_PATH)
    .drop_duplicates()
    .dropna(axis=0)
)

# Hold out 20% of the cleaned rows for evaluation; the fixed seed keeps the
# split identical across runs.
train_df, test_df = train_test_split(
    news_df,
    test_size=0.2,
    random_state=42,
)

In [7]:
class myDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves ``(text, label)`` pairs from a DataFrame.

    The frame passed in must provide a ``label`` and a ``text`` column.
    """

    def __init__(self, df: pd.DataFrame):
        # A re-indexed copy of the whole frame; its row count backs __len__.
        self.df = df.reset_index(drop=True)
        # The two columns are taken from the frame as passed in. Items are
        # fetched by position, so the caller's index labels are irrelevant.
        self.label = df['label']
        self.text = df['text']

    def __len__(self) -> int:
        """Return the number of rows in the stored frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair found at position ``idx``."""
        sample_text = self.text.iloc[idx]
        sample_label = self.label.iloc[idx]
        return sample_text, sample_label

In [13]:
def mean_pooling(model_output, attention_mask):
    """Average each sequence's token embeddings, ignoring masked positions.

    ``model_output[0]`` is read as the per-token embeddings. ``attention_mask``
    is broadcast over the embedding dimension so that positions whose mask is
    0 contribute neither to the sum nor to the token count.
    """
    hidden_states = model_output[0]
    # Stretch the mask to the embeddings' shape so it can weight them element-wise.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    # A small floor on the count avoids dividing by zero for a fully masked row.
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_count


MODEL_NAME = "Q93WnX4FUHx2mJ/e5-multi-base-sbert"


class myModel(nn.Module):
    """Binary text classifier: pretrained encoder -> mean pooling -> linear -> sigmoid.

    ``forward`` takes raw strings, tokenizes them itself and returns one
    probability per input, shaped ``[batch, 1]``.
    """

    def __init__(self):
        super().__init__()
        # Tokenizer and encoder are loaded from the same checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.model = AutoModel.from_pretrained(MODEL_NAME)
        # Size the head from the encoder's config so a different checkpoint
        # does not silently mismatch; 768 stays as the fallback width.
        hidden_size = getattr(self.model.config, 'hidden_size', 768)
        self.linear1 = nn.Linear(hidden_size, 1)  # single output: P(label == 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: List[str]) -> torch.Tensor:
        """Return the predicted probability for every string in ``x``.

        Args:
            x: batch of raw texts; each is truncated to at most 512 tokens.

        Returns:
            Tensor of shape ``[batch, 1]`` with values in ``[0, 1]``.
        """
        tokenized = self.tokenizer(
            x, max_length=512, truncation=True, padding=True, return_tensors='pt'
        )
        # The tokenizer builds CPU tensors. Move them next to the weights,
        # otherwise the encoder raises a device-mismatch error once the module
        # has been moved to the GPU.
        tokenized = tokenized.to(self.linear1.weight.device)
        encoder_output = self.model(**tokenized)  # [0]: token embeddings
        # Masked mean over the token axis -> [batch, hidden]
        sentence_embedding = mean_pooling(encoder_output, tokenized['attention_mask'])
        logits = self.linear1(sentence_embedding)  # [batch, 1]
        return self.sigmoid(logits)


model = myModel().to(device)

In [14]:
# Wrap both splits so they can be indexed as (text, label) pairs.
train_dataset = myDataset(train_df)
test_dataset = myDataset(test_df)

# Training batches are reshuffled on every pass over the loader.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
# NOTE: `test_dataset` is rebound to the evaluation DataLoader here (order is
# kept fixed); test() iterates the loader under this name.
test_dataset = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [15]:
import wandb
from sklearn.metrics import accuracy_score

# Pass the run name and the hyperparameters to wandb.init() so they are stored
# with the run. Assigning a plain dict to `wandb.config` only rebinds the
# module attribute and records nothing on the run itself.
wandb.init(
    project="news_classification_baseline",
    name='baseline',
    config={
        "learning_rate": LR,
        "epochs": EPOCH,
        "batch_size": BATCH_SIZE,
    },
)

VBox(children=(Label(value='0.001 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.124419…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888886984852, max=1.0…

True

In [16]:
def train():
    """Fine-tune the global ``model`` on ``train_dataloader`` for ``EPOCH`` epochs.

    Optimizes binary cross-entropy on the model's sigmoid output with plain
    SGD at learning rate ``LR``. The running mean loss and the running
    accuracy of the current epoch are sent to wandb on every second batch.
    """
    criterion = nn.BCELoss()  # takes probabilities in [0, 1] and float targets
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)

    for _ in range(EPOCH):
        # Make sure every sub-module is in training mode: test() switches the
        # model to eval mode, and nothing else would switch it back.
        model.train()
        epoch_loss = 0.0
        correct = 0
        seen = 0
        for i, batch in enumerate(tqdm(train_dataloader)):
            text, label = batch

            optimizer.zero_grad()

            output = model(text)
            # Targets must be float, shaped like the output, and live on the
            # same device as the output.
            target = label.unsqueeze(1).float().to(output.device)

            loss = criterion(output, target)
            epoch_loss += loss.item()

            loss.backward()
            optimizer.step()

            # Threshold at 0.5 on the tensor's own device. This builds a new
            # tensor, so the model output is never modified in place, and it
            # also works when the output lives on the GPU.
            pred = (output.detach() >= 0.5).float()
            # Count samples rather than averaging batch means, so unequal
            # batch sizes do not skew the accuracy.
            correct += (pred == target).sum().item()
            seen += target.numel()

            if i % 2 == 0:
                wandb.log({"train_loss": epoch_loss / (i + 1),
                           "train_acc": correct / seen})
    

In [18]:
def test():
    """Evaluate the global ``model`` on the test loader and log accuracy to wandb.

    The loader is read from the global name ``test_dataset``. The running
    accuracy is logged as ``test_acc`` on every second batch and the overall
    accuracy as ``final_accuracy`` once the loop is done.

    Raises:
        ValueError: if the loader yields no batches.
    """
    ###### TODO: F1-score is not logged yet (accuracy only) ######
    correct = 0
    seen = 0
    model.eval()
    with torch.no_grad():
        for i, batch in enumerate(tqdm(test_dataset)):
            text, label = batch

            output = model(text)
            # Compare on the output's device so this also works on the GPU.
            target = label.unsqueeze(1).float().to(output.device)

            pred = (output >= 0.5).float()
            # Count samples rather than averaging batch means, so a smaller
            # last batch does not skew the result.
            correct += (pred == target).sum().item()
            seen += target.numel()

            if i % 2 == 0:
                wandb.log({"test_acc": correct / seen})

    # An empty loader used to fail with an unbound loop variable; fail with a
    # message that names the actual problem instead.
    if seen == 0:
        raise ValueError("test_dataset yielded no batches; nothing to evaluate")
    wandb.log({"final_accuracy": correct / seen})
    ##########################################################

In [17]:
train()

  0%|          | 80/57229 [05:18<63:15:09,  3.98s/it]


KeyboardInterrupt: 

In [19]:
test()

  0%|          | 21/14308 [00:29<5:38:48,  1.42s/it]


KeyboardInterrupt: 