In [105]:
!pip install transformers
!pip install wandb

Collecting wandb
  Downloading wandb-0.15.12-py3-none-any.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 22.3 MB/s eta 0:00:00
Collecting docker-pycreds>=0.4.0
  Using cached docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting appdirs>=1.4.3
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting protobuf!=4.21.0,<5,>=3.19.0
  Downloading protobuf-4.24.4-cp37-cp37m-win_amd64.whl (430 kB)
     ------------------------------------- 430.0/430.0 kB 26.2 MB/s eta 0:00:00
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.34.0-py2.py3-none-any.whl (243 kB)
     ------------------------------------- 243.9/243.9 kB 15.6 MB/s eta 0:00:00
Collecting Click!=8.0.0,>=7.1
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
     ---------------------------------------- 97.9/97.9 kB ? eta 0:00:00
Collecting pathtools
  Using cached pathtools-0.1.2-py3-none-any.whl
Collecting GitPython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.40-py3-none-any.whl (190

In [43]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModel
from typing import List
from torch.utils.data import DataLoader
from tqdm import tqdm

In [83]:
# --- Configuration: everything a reader might want to tune lives here. ---
DATA_PATH = 'data/WELFake_Dataset.csv'  # CSV file read by the data-loading cell
BATCH_SIZE = 1                          # samples per DataLoader batch
LR = 1e-4                               # learning rate passed to the optimizer
EPOCH = 10                              # number of passes over the training data
SEED = 42                               # seed for torch's global RNG

# Seed torch so DataLoader shuffling and freshly initialised layers are
# reproducible after "Restart Kernel -> Run All".
torch.manual_seed(SEED)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [84]:
# Load the raw CSV and pre-process it in one chain:
#   1. drop exact duplicate rows,
#   2. drop every row that contains at least one missing value.
news_df = (
    pd.read_csv(DATA_PATH)
    .drop_duplicates()
    .dropna(axis=0)
)

# Hold out 20% of the cleaned rows for testing; the fixed random_state keeps
# the split identical between runs.
train_df, test_df = train_test_split(news_df, random_state=42, test_size=0.2)

In [90]:
class myDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields ``(text, label)`` pairs from a DataFrame.

    Args:
        df: frame containing at least a ``'text'`` and a ``'label'`` column.
            Its index may be arbitrary (e.g. shuffled by a train/test split);
            a positionally re-indexed copy is stored.

    Raises:
        KeyError: if ``df`` lacks the ``'text'`` or ``'label'`` column.
    """

    def __init__(self, df: pd.DataFrame):
        # Reset the index once and take both columns from that same frame so
        # self.df, self.text and self.label all share the 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.label = self.df['label']
        self.text = self.df['text']

    def __len__(self) -> int:
        """Return the number of rows (samples) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.text.iloc[idx], self.label.iloc[idx]

In [91]:
# Mean pooling: average the token embeddings of each sequence, counting only
# the positions whose attention-mask entry is non-zero.
def mean_pooling(model_output, attention_mask):
    """Return the attention-masked mean of the token embeddings.

    Args:
        model_output: indexable whose first element holds the token embeddings.
        attention_mask: mask tensor; it is given a trailing axis and expanded
            to the embeddings' size before use.

    Returns:
        The embeddings summed over dimension 1 (with masked positions zeroed),
        divided by the per-sequence mask sum (clamped to at least 1e-9 so an
        all-zero mask does not divide by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_counts


# Identifier of the pretrained encoder handed to the `from_pretrained` loaders.
MODEL_NAME = "Q93WnX4FUHx2mJ/e5-multi-base-sbert"


class myModel(nn.Module):
    """Binary text classifier: pretrained encoder -> masked mean pooling -> linear head.

    ``forward`` returns one raw score (logit) per input text; no sigmoid is
    applied inside the model.
    """

    def __init__(self):
        super().__init__()
        # Load the tokenizer and encoder weights for MODEL_NAME.
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.model = AutoModel.from_pretrained(MODEL_NAME)
        # One output unit for the 0/1 label. The input width is read from the
        # encoder config instead of hard-coding 768.
        self.linear1 = nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, x: List[str]) -> torch.Tensor:
        """Encode a batch of texts and return their classification logits.

        Args:
            x: the texts to classify (a single string is treated as a batch
               of one).

        Returns:
            Tensor of shape ``(len(x), 1)`` with one raw logit per text,
            located on the same device as the model's parameters.
        """
        texts = [x] if isinstance(x, str) else list(x)
        bert_tokenized = self.tokenizer(
            texts, max_length=512, truncation=True, padding=True, return_tensors='pt'
        )
        # The tokenizer builds its tensors on the CPU; move them to wherever
        # the parameters live, otherwise the forward pass fails on a GPU.
        target_device = self.linear1.weight.device
        bert_tokenized = {
            key: tensor.to(target_device) for key, tensor in bert_tokenized.items()
        }
        bert_output = self.model(**bert_tokenized)
        sentence_embedding = mean_pooling(bert_output, bert_tokenized['attention_mask'])
        return self.linear1(sentence_embedding)


model = myModel().to(device)

In [92]:
# Wrap each split in a Dataset and batch it; only the training split is shuffled.
train_dataset = myDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataloader = DataLoader(myDataset(test_df), batch_size=BATCH_SIZE, shuffle=False)
# Backward-compatible alias: this cell used to bind the test DataLoader to the
# name `test_dataset`; keep that binding so existing uses keep working.
# Prefer `test_dataloader` in new code.
test_dataset = test_dataloader

In [None]:
import wandb
from sklearn.metrics import accuracy_score
# Start a Weights & Biases run under the "news_classification_baseline" project;
# train() relies on this run existing (it sets wandb.run.name and calls wandb.log).
wandb.init(project="news_classification_baseline")

In [115]:
def train():
    """Fine-tune the global ``model`` on ``train_dataloader`` and log metrics to wandb.

    Runs ``EPOCH`` passes over the training loader with plain SGD (learning
    rate ``LR``). Every second step the running mean loss and running mean
    batch accuracy of the current epoch are logged as ``train_loss`` and
    ``train_acc``.

    Requires an active wandb run (``wandb.init`` must have been called).
    Returns nothing.
    """
    # The model returns raw logits, so use the loss that applies the sigmoid
    # internally. nn.BCELoss expects probabilities in [0, 1] and rejects
    # unbounded logits.
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)

    # Record hyper-parameters on the active run. Assigning a dict to
    # `wandb.config` would only rebind the module attribute and never reach
    # the run; `allow_val_change` lets this cell be re-run with new values.
    wandb.config.update(
        {
            "learning_rate": LR,
            "epochs": EPOCH,
            "batch_size": BATCH_SIZE,
        },
        allow_val_change=True,
    )
    wandb.run.name = 'baseline'
    wandb.run.save()

    # Make sure dropout and similar layers are in training mode; a pretrained
    # loader may hand the encoder back in eval mode.
    model.train()

    for epoch in range(EPOCH):
        epoch_loss = 0.0
        epoch_acc = 0.0
        for i, batch in enumerate(tqdm(train_dataloader)):
            text, label = batch

            optimizer.zero_grad()

            output = model(text)

            # Labels come off the DataLoader on the CPU; move them to the
            # logits' device and match the (batch, 1) float shape.
            target = label.to(output.device).unsqueeze(1).float()

            loss = criterion(output, target)
            epoch_loss += loss.item()

            loss.backward()
            optimizer.step()

            # sigmoid(logit) >= 0.5 is equivalent to logit >= 0, so threshold
            # the logits directly. Staying in torch avoids `.numpy()` on a
            # CUDA tensor.
            pred = (output.detach() >= 0).float()
            epoch_acc += (pred == target).float().mean().item()

            if i % 2 == 0:
                wandb.log({"train_loss": epoch_loss / (i + 1),
                           "train_acc": epoch_acc / (i + 1)})
    

In [None]:
# Launch training: runs EPOCH passes over train_dataloader and logs to wandb.
train()