In [None]:
!pip install transformers
!pip install wandb
!pip install optuna

In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModel
from typing import List
from torch.utils.data import DataLoader
from tqdm import tqdm

In [2]:
DATA_PATH = 'data/WELFake_Dataset.csv'
BATCH_SIZE = 1
LR = 1e-4
EPOCH = 10
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
news_df = pd.read_csv(DATA_PATH)

#pre-processing
# dropping the duplicate values
news_df.drop_duplicates(inplace=True)
news_df.dropna(inplace=True, axis=0)

train_df , test_df = train_test_split(news_df, test_size=0.2 ,random_state=42)

In [4]:
class myDataset(torch.utils.data.Dataset):
    def __init__(self,df:pd.DataFrame):
        self.df = df.reset_index(drop=True)
        self.label = df['label']
        self.text = df['text']
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        return self.text.iloc[idx], self.label.iloc[idx]

In [5]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


MODEL_NAME = "Q93WnX4FUHx2mJ/e5-multi-base-sbert"
class myModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Load model directly
        self.tokenizer = AutoTokenizer.from_pretrained(f'{MODEL_NAME}')
        self.model = AutoModel.from_pretrained(f'{MODEL_NAME}')
        self.linear1 = nn.Linear(768,1) #only one class, 0 or 1
    
    def forward(self,x:List[str] ) -> torch.Tensor:
        bert_tokenized = self.tokenizer(x,max_length=512,truncation=True,padding=True, return_tensors='pt')
        bert_output = self.model(**bert_tokenized)
        sentence_embedding = mean_pooling(bert_output, bert_tokenized['attention_mask'])
        output = self.linear1(sentence_embedding)
        return output
model = myModel().to(device)

In [6]:
train_dataset = myDataset(train_df)
test_dataset = myDataset(test_df)
train_dataloader = DataLoader(train_dataset,batch_size=BATCH_SIZE, shuffle=True)
test_dataset = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [7]:
import wandb
from sklearn.metrics import accuracy_score

In [8]:
def train():
    wandb.init(project="news_classification_baseline")
    wandb.config = {
    "learning_rate": LR,
    "epochs": EPOCH,
    "batch_size": BATCH_SIZE
    }
    wandb.run.name = f'baseline_optuna_{LR}-{EPOCH}-{BATCH_SIZE}'
    wandb.run.save()

    criterion = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    
    for epoch in range(EPOCH):
        epoch_loss = 0.0
        epoch_acc = 0.0
        for i, batch in enumerate(tqdm(train_dataloader)):
            text, label = batch
            
            optimizer.zero_grad()
            
            output = model(text)
            
            loss = criterion(output, label.unsqueeze(1).float() )
            epoch_loss+=loss.item()
            
            loss.backward()
            
            optimizer.step()
            
            pred = output.detach().numpy()
            pred[pred<0.5] = 0
            pred[pred!=0] = 1
            
            epoch_acc+= accuracy_score(label.unsqueeze(1), pred)
            
            if(i%2==0):
                wandb.log( {"train_loss": epoch_loss/(i+1),
                            "train_acc": epoch_acc/(i+1)} )
    

In [9]:
def test():
    ###### TODO: logging test accuracy & F1-score ############
    acc = None
    return acc
    ##########################################################

In [None]:
train()

In [None]:
test()

In [10]:
import optuna

def objective(trial):
    # 하이퍼파라미터 추출
    LR = trial.suggest_float('learning_rate', 1e-5, 1e-1)
    EPOCH = trial.suggest_int('epoch', 3, 10)
    
    # 모델 초기화 및 학습
    model = myModel()
    train()
    return 1

In [11]:
study = optuna.create_study(direction='maximize')  # 최대화하려면 'maximize' 사용, 최소화하려면 'minimize'
study.optimize(objective, n_trials=100)  # n_trials는 시도할 하이퍼파라미터 조합 횟수

[I 2023-11-07 20:06:42,452] A new study created in memory with name: no-name-b3729530-1192-4c3a-9ce2-cfa0841f9dd8
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnaye971012[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 3/57229 [00:08<47:36:04,  2.99s/it]
[W 2023-11-07 20:07:14,435] Trial 0 failed with parameters: {'learning_rate': 0.0725219861478422, 'epoch': 9} because of the following error: RuntimeError('all elements of input should be between 0 and 1').
Traceback (most recent call last):
  File "c:\Users\naye0\anaconda3\envs\pytorch\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\naye0\AppData\Local\Temp\ipykernel_34744\174817979.py", line 10, in objective
    train()
  File "C:\Users\naye0\AppData\Local\Temp\ipykernel_34744\2173910692.py", line 24, in train
    loss = criterion(output, label.unsqueeze(1).float() )
  File "c:\Users\naye0\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "c:\Users\naye0\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\loss.py", line 613, in forward
    return F.binary_cross_

RuntimeError: all elements of input should be between 0 and 1

In [None]:
best_params = study.best_params
best_accuracy = study.best_value