In [1]:
!pip install transformers
!pip install wandb
!pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
     -------------------------------------- 409.6/409.6 kB 8.5 MB/s eta 0:00:00
Collecting colorlog
  Using cached colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
     ------------------------------------- 226.8/226.8 kB 14.4 MB/s eta 0:00:00
Collecting sqlalchemy>=1.3.0
  Downloading SQLAlchemy-2.0.23-cp37-cp37m-win_amd64.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 14.5 MB/s eta 0:00:00
Collecting Mako
  Using cached Mako-1.2.4-py3-none-any.whl (78 kB)
Collecting typing-extensions>=4
  Downloading typing_extensions-4.7.1-py3-none-any.whl (33 kB)
Collecting greenlet!=0.4.17
  Downloading greenlet-3.0.1-cp37-cp37m-win_amd64.whl (287 kB)
     ------------------------------------- 287.7/287.7 kB 17.3 MB/s eta 0:00:00
Installing collected packages: typing-extensions, greenlet, colorlog, sqlalchemy, Mako, alembic, optun

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytorch-lightning 1.9.5 requires lightning-utilities>=0.6.0.post0, which is not installed.
pytorch-lightning 1.9.5 requires torchmetrics>=0.7.0, which is not installed.


In [43]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModel
from typing import List
from torch.utils.data import DataLoader
from tqdm import tqdm

In [83]:
# Location of the WELFake CSV that is read with pd.read_csv (relative path).
DATA_PATH = 'data/WELFake_Dataset.csv'
# Number of samples per batch for both DataLoaders.
BATCH_SIZE = 1
# Learning rate handed to the SGD optimizer in train().
LR = 1e-4
# Number of passes over the training DataLoader in train().
EPOCH = 10
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [84]:
# Pre-processing: read the raw CSV, then discard exact duplicate rows and
# every row that contains at least one missing value.
news_df = (
    pd.read_csv(DATA_PATH)
    .drop_duplicates()
    .dropna(axis=0)
)

# Hold out 20% of the cleaned rows for evaluation; the fixed random_state
# keeps the split identical from run to run.
train_df, test_df = train_test_split(news_df, random_state=42, test_size=0.2)

In [90]:
class myDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves ``(text, label)`` pairs from a DataFrame.

    Args:
        df: Frame providing a ``'text'`` column and a ``'label'`` column.
            Its index labels are irrelevant because items are fetched by
            position.
    """

    def __init__(self, df: pd.DataFrame):
        # Keep the two columns that __getitem__ serves; they are read with
        # .iloc, so the (possibly non-contiguous) original index is harmless.
        self.text = df['text']
        self.label = df['label']
        # A copy with a fresh 0..n-1 index, used for the length.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """Return the number of rows (samples)."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        sample_text = self.text.iloc[idx]
        sample_label = self.label.iloc[idx]
        return sample_text, sample_label

In [91]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast to the
            embedding shape and used as the averaging weight.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask sum
        (clamped to at least 1e-9 so an all-zero mask cannot divide by zero).
    """
    hidden_states = model_output[0]
    # Expand the mask to the embedding shape so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / token_counts


# Hugging Face Hub identifier of the pretrained encoder and its tokenizer.
MODEL_NAME = "Q93WnX4FUHx2mJ/e5-multi-base-sbert"


class myModel(nn.Module):
    """Pretrained sentence encoder followed by a single-output linear head.

    The forward pass tokenizes raw strings, encodes them, mean-pools the token
    embeddings and projects the pooled vector to one value per input text.
    The returned value is the raw output of the linear layer (a logit); no
    sigmoid is applied here.
    """

    def __init__(self):
        super().__init__()
        # Load tokenizer and encoder weights from the Hub.
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.model = AutoModel.from_pretrained(MODEL_NAME)
        # Size the head from the encoder config instead of hard-coding 768 so
        # that changing MODEL_NAME to a different-width encoder keeps working.
        self.linear1 = nn.Linear(self.model.config.hidden_size, 1)  # one output: class 0 vs 1

    def forward(self, x: List[str]) -> torch.Tensor:
        """Return one logit per input text.

        Args:
            x: A batch of raw texts. A single ``str`` is treated as a batch of
                one; any other sequence (e.g. the tuple a DataLoader yields)
                is converted to a list.

        Returns:
            Tensor of shape ``(len(x), 1)`` located on the model's device.
        """
        texts = [x] if isinstance(x, str) else list(x)
        bert_tokenized = self.tokenizer(
            texts, max_length=512, truncation=True, padding=True, return_tensors='pt'
        )
        # The tokenizer always produces CPU tensors; move them next to the
        # weights, otherwise the encoder call fails once the model is on CUDA.
        bert_tokenized = bert_tokenized.to(self.linear1.weight.device)
        bert_output = self.model(**bert_tokenized)
        sentence_embedding = mean_pooling(bert_output, bert_tokenized['attention_mask'])
        return self.linear1(sentence_embedding)
# Build the classifier once and move its parameters to the selected device.
model = myModel().to(device)

In [92]:
# Wrap the two splits as datasets and batch them. Only the training loader
# shuffles; evaluation order is kept fixed.
train_dataset = myDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataloader = DataLoader(myDataset(test_df), batch_size=BATCH_SIZE, shuffle=False)
# Backward-compatible alias: this notebook has always bound the *DataLoader*
# (not the Dataset) to the name `test_dataset`, so keep that binding for any
# cell that iterates it. Prefer `test_dataloader` in new code.
test_dataset = test_dataloader

In [None]:
import wandb
from sklearn.metrics import accuracy_score

In [115]:
def train():
    """Train the module-level ``model`` on ``train_dataloader`` and log to W&B.

    Reads ``LR``, ``EPOCH`` and ``BATCH_SIZE`` from module scope, starts a
    Weights & Biases run, and optimizes with SGD. Every second batch the
    running mean of the loss and accuracy for the current epoch is logged as
    ``train_loss`` / ``train_acc``.

    Returns:
        None.
    """
    # Pass name and config to init() directly: assigning `wandb.config = {...}`
    # only rebinds the module attribute, and `run.save()` is deprecated.
    wandb.init(
        project="news_classification_baseline",
        name=f'baseline_optuna_{LR}-{EPOCH}-{BATCH_SIZE}',
        config={
            "learning_rate": LR,
            "epochs": EPOCH,
            "batch_size": BATCH_SIZE,
        },
    )

    # The model returns the raw output of a linear layer (no sigmoid), so the
    # loss must be the logit-based variant; plain BCELoss requires inputs
    # already in [0, 1].
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)

    # Make sure dropout etc. are in training mode even if an evaluation ran before.
    model.train()

    for epoch in range(EPOCH):
        epoch_loss = 0.0
        epoch_acc = 0.0
        for i, batch in enumerate(tqdm(train_dataloader)):
            text, label = batch

            optimizer.zero_grad()

            output = model(text)

            # Labels come from the DataLoader on the CPU; put them on the same
            # device as the logits before computing the loss.
            target = label.unsqueeze(1).float().to(output.device)
            loss = criterion(output, target)
            epoch_loss += loss.item()

            loss.backward()
            optimizer.step()

            # Threshold the predicted probability at 0.5. `.cpu()` is needed
            # because `.numpy()` is not available on CUDA tensors.
            pred = (torch.sigmoid(output.detach()) >= 0.5).long().view(-1).cpu().numpy()
            epoch_acc += accuracy_score(label.view(-1).cpu().numpy(), pred)

            if i % 2 == 0:
                wandb.log({"train_loss": epoch_loss / (i + 1),
                           "train_acc": epoch_acc / (i + 1)})
    

In [None]:
def test():
    """Evaluate the module-level ``model`` and report accuracy and F1-score.

    Iterates the evaluation DataLoader (bound to the name ``test_dataset`` in
    this notebook), thresholds the sigmoid of the model's logits at 0.5 and
    compares against the labels. When a W&B run is active, ``test_acc`` and
    ``test_f1`` are logged to it.

    Returns:
        float: Accuracy over all evaluated samples.
    """
    # Local import: only accuracy_score is imported at notebook level.
    from sklearn.metrics import f1_score

    was_training = model.training
    model.eval()
    all_labels, all_preds = [], []
    try:
        with torch.no_grad():
            for text, label in tqdm(test_dataset):
                logits = model(text)
                batch_pred = (torch.sigmoid(logits) >= 0.5).long().view(-1).cpu()
                all_preds.extend(batch_pred.tolist())
                all_labels.extend(label.view(-1).long().tolist())
    finally:
        # Restore the previous mode so a later train() call is unaffected.
        model.train(was_training)

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    # wandb.log needs an active run; skip logging when none was started.
    if wandb.run is not None:
        wandb.log({"test_acc": acc, "test_f1": f1})

    return acc

In [None]:
# Run the training loop defined above with the module-level LR and EPOCH.
train()

In [None]:
# Run the evaluation step defined above.
test()

In [None]:
import optuna

def objective(trial):
    """Optuna objective: train with sampled hyperparameters, return test accuracy.

    Args:
        trial: The Optuna trial used to sample ``learning_rate`` and ``epoch``.

    Returns:
        Whatever ``test()`` returns for the freshly trained model.
    """
    # train() and test() take no arguments and read LR, EPOCH and model from
    # module scope, so the sampled values have to be written there. Plain
    # local assignments would be invisible to them and every trial would
    # silently reuse the same settings and the same model.
    global LR, EPOCH, model

    # Sample hyperparameters. The learning-rate range spans four orders of
    # magnitude, so sample it on a log scale rather than uniformly.
    LR = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    EPOCH = trial.suggest_int('epoch', 3, 10)

    # Re-initialize the model for this trial and train it from scratch.
    model = myModel().to(device)
    try:
        train()
        acc = test()
    finally:
        # Close this trial's W&B run so the next trial starts a separate one.
        wandb.finish()

    return acc

In [None]:
# Create the study and run the hyperparameter search over `objective`.
study = optuna.create_study(direction='maximize')  # use 'maximize' to maximise the objective, 'minimize' to minimise it
study.optimize(objective, n_trials=100)  # n_trials is the number of hyperparameter combinations to try

In [None]:
# Hyperparameter values of the study's best trial.
best_params = study.best_params
# Objective value (the value returned by `objective`) of that best trial.
best_accuracy = study.best_value