In [53]:
import torch

# DataFrame / CSV column names
CLEAN_TEXT = 'clean_text'
TOKEN_IDS = 'token_ids'
EMBEDDING = 'embedding'
TARGET = 'target'
ATTENTION_MASK = 'attention_mask'

# Training hyper-parameters
MAX_EPOCH = 7
BATCH_SIZE = 64
LEARNING_RATE = 2e-6
DROPOUT_RATE = 0.3

# Classifier head dimensions
INPUT_DIM = 768
HIDDEN_DIM = 256  # 256 ~ 512

# Tokenizer / encoder settings
MAX_LENGTH = 128
# NOTE(review): the cells below load "distilbert-base-uncased" through their own
# MODEL_NAME; this constant is not referenced there — confirm which one is intended.
BERT_MODEL_NAME = "bert-base-uncased"

# File paths
TRAIN_PICKLE_PATH = "twitter_disaster/data/output/train_bert_embeddings.pkl"
TEST_PICKLE_PATH = "twitter_disaster/data/output/test_bert_embeddings.pkl"
MODEL_SAVE_PATH = "twitter_disaster/models/classifier.pth"
SUBMISSION_SAVE_PATH = 'twitter_disaster/data/output/submission.csv'

# Compute device: use the MPS backend when torch reports it available, else CPU
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"

In [3]:
import os
import numpy as np
import glob
import pandas as pd
from pathlib import Path
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import demoji
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset, DataLoader

In [27]:
# Read only the columns the pipeline uses, with dtypes pinned up front.
train_df = pd.read_csv(
    'twitter_disaster/data/input/train.csv',
    usecols=['text', 'target'],
    dtype={'text': str, 'target': np.int64},
)
# The test file is read with usecols=['text', 'id'], so the dtype map must name
# 'id' (the previous 'target' entry referred to a column that is not loaded).
test_df = pd.read_csv(
    'twitter_disaster/data/input/test.csv',
    usecols=['text', 'id'],
    dtype={'text': str, 'id': np.int64},
)

In [20]:
# Class balance of the training labels as loaded from the CSV
train_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [14]:
# Preview the first rows of the test frame
test_df.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [15]:
# Fetch the NLTK resources used by the cleaning step (silently).
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource, quiet=True)

# English stop words as a set, consulted by cleaning_txt for membership tests.
stop_words = set(stopwords.words('english'))


def preprocess(df):
    """Return a copy of ``df`` with a ``clean_text`` column derived from ``text``.

    Rows whose ``target`` is missing are dropped when a ``target`` column is
    present. The input frame is never modified; a new frame is returned.

    Args:
        df: DataFrame with a ``text`` column and, optionally, a ``target`` column.

    Returns:
        A new DataFrame containing the kept rows plus ``clean_text``.
    """
    kept = df.dropna(subset=["target"]) if "target" in df.columns else df
    # Missing texts become "" so cleaning_txt always receives a string.
    cleaned = kept["text"].fillna("").apply(cleaning_txt)
    # assign() builds a new frame, avoiding in-place writes on a possible slice.
    return kept.assign(clean_text=cleaned)

def cleaning_txt(text: str):
    """Normalize one tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, newline -> space, ``&amp;`` -> ``and``, strip
    URLs, strip @mentions, strip digits, strip non-word/non-space characters,
    collapse runs of 3+ identical characters to 2, apply ``demoji.replace``,
    then drop stop words and lemmatize each remaining token.

    Args:
        text: Raw tweet text. Non-string input (e.g. NaN / None) yields "".

    Returns:
        The cleaned text; may be the empty string.
    """
    if not isinstance(text, str):
        return ""

    lemma = WordNetLemmatizer()
    text = text.lower()
    text = re.sub(r"\n", " ", text)  # remove line breaks
    text = re.sub(r"&amp;", "and", text)  # convert the HTML entity
    # The dot after "www" is escaped so only real "www." prefixes are treated as
    # URLs; unescaped, it also matched ordinary text such as "awwww sad".
    text = re.sub(r'http\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub(r"@\w+", "", text)  # remove mentions
    text = re.sub(r"\d+", "", text)  # remove digits
    text = re.sub(r'[^\w\s]', '', text)  # remove special characters
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)  # squeeze 3+ repeats down to 2
    text = demoji.replace(text, "")

    # text is already lowercase here, so tokens are compared as-is.
    words = [
        lemma.lemmatize(word) for word in text.split() if word not in stop_words
    ]
    return " ".join(words)

In [28]:
# Manual label overrides: each pair is (new target value, row labels to overwrite).
# NOTE(review): presumably these are mislabeled / duplicated tweets — confirm the source.
_label_overrides = (
    (0, [
        4415, 4400, 4399, 4403, 4397, 4396, 4394, 4414, 4393, 4392,
        4404, 4407, 4420, 4412, 4408, 4391, 4405,
        6840, 6834, 6837, 6841, 6816, 6828, 6831,
        246, 270, 266, 259, 253, 251, 250, 271,
        6119, 6122, 6123, 6131, 6160, 6166, 6167, 6172, 6212, 6221, 6230, 6091, 6108,
        7435, 7460, 7464, 7466, 7469, 7475, 7489, 7495, 7500, 7525, 7552, 7572, 7591, 7599,
    ]),
    (1, [
        3913, 3914, 3936, 3921, 3941, 3937, 3938, 3136, 3133, 3930, 3933, 3924, 3917,
    ]),
)
for _new_target, indices in _label_overrides:
    train_df.loc[indices, 'target'] = _new_target

In [29]:
# Class balance of the training labels after the manual overrides
train_df['target'].value_counts()

target
0    4370
1    3243
Name: count, dtype: int64

In [25]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Checkpoint name shared by the tokenizer and the model loaded below
MODEL_NAME = "distilbert-base-uncased"
# Module-level tokenizer used by tokenize_and_convert
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
# Module-level model used by bert_embedding, moved to DEVICE.
# NOTE(review): the warning recorded under this cell reports the classifier /
# pre_classifier weights as newly initialized, so this model's logits are untrained.
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)


def tokenize_and_convert(text, max_length=None):
    """Tokenize one text into fixed-length id / mask tensors placed on DEVICE.

    Args:
        text: Text to encode. A value that is not a string, or is blank after
            stripping, is replaced by the literal "[PAD]" before tokenizing.
        max_length: Length to pad / truncate to. Defaults to the MAX_LENGTH
            config constant (128) when omitted.

    Returns:
        ``(input_ids, attention_mask)``: two ``torch.long`` tensors on DEVICE with
        the tokenizer's leading batch axis squeezed away.
    """
    if max_length is None:
        # Resolved at call time so the notebook's config constant is honoured.
        max_length = MAX_LENGTH
    if not isinstance(text, str) or not text.strip():
        text = "[PAD]"

    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].squeeze(0).to(DEVICE, dtype=torch.long)
    attention_mask = encoding["attention_mask"].squeeze(0).to(DEVICE, dtype=torch.long)

    return input_ids, attention_mask


def bert_embedding(input_ids, attention_mask):
    """Run the module-level ``model`` without gradients and return its logits.

    1-D inputs are given a leading batch axis before the forward pass; the
    result has a size-1 leading axis squeezed away and is returned as a NumPy
    array on the CPU.

    NOTE(review): the value returned is ``output.logits``, not a hidden state —
    confirm this is what downstream code expects from an "embedding".
    """
    ids = input_ids.unsqueeze(0) if input_ids.dim() == 1 else input_ids
    mask = attention_mask.unsqueeze(0) if attention_mask.dim() == 1 else attention_mask

    with torch.no_grad():
        logits = model(ids, mask).logits

    return logits.squeeze(0).cpu().numpy()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
def process_text(text):
    """Tokenize ``text`` and embed it.

    Returns:
        ``(token_ids, attention_mask, embedding)`` where the first two are the
        tokenizer tensors converted to CPU NumPy arrays and the third is
        whatever ``bert_embedding`` returns for them.
    """
    ids, mask = tokenize_and_convert(text)
    vector = bert_embedding(ids, mask)

    def to_numpy(tensor):
        return tensor.squeeze(0).cpu().numpy()

    return to_numpy(ids), to_numpy(mask), vector


def start_embedding(df, path=TRAIN_PICKLE_PATH):
    """Preprocess ``df``, tokenize every cleaned text, pickle and return the result.

    Args:
        df: Frame accepted by ``preprocess`` (needs a ``text`` column).
        path: Destination of the pickle; parent directories are created if needed.

    Returns:
        The preprocessed frame with TOKEN_IDS and ATTENTION_MASK columns added.
        Returning it makes ``frame = start_embedding(frame, ...)`` work; before,
        the function fell off the end and that assignment produced ``None``.
    """
    df = preprocess(df)
    results = [process_text(text) for text in df[CLEAN_TEXT].tolist()]
    if results:
        # The third element (the embedding) is computed by process_text but not stored.
        token_ids_list, attention_masks_list, _ = zip(*results)
    else:
        # zip(*[]) yields nothing to unpack; keep empty columns for an empty frame.
        token_ids_list, attention_masks_list = (), ()
    df[TOKEN_IDS] = list(token_ids_list)
    df[ATTENTION_MASK] = list(attention_masks_list)

    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_pickle(path)
    return df

In [33]:
# Clean, tokenize and pickle the training frame (written to TRAIN_PICKLE_PATH)
train_df = start_embedding(train_df, TRAIN_PICKLE_PATH)

In [34]:
# Clean, tokenize and pickle the test frame (written to TEST_PICKLE_PATH)
test_df = start_embedding(test_df, TEST_PICKLE_PATH)

In [37]:
# Reload the pickled training frame, keeping only the columns training reads.
# Only unpickle files produced by this notebook: unpickling untrusted data can run arbitrary code.
columns = ['attention_mask', 'token_ids', 'target']
train_df = pd.read_pickle(TRAIN_PICKLE_PATH)[columns]
train_df.head()

Unnamed: 0,attention_mask,token_ids,target
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[101, 15046, 3114, 8372, 2089, 16455, 9641, 10...",1
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...",1
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[101, 6319, 2356, 7713, 2173, 19488, 2961, 139...",1
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[101, 2111, 4374, 3748, 10273, 13982, 2344, 26...",1
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[101, 2288, 2741, 6302, 10090, 7397, 5610, 374...",1
...,...,...,...
7608,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[101, 2048, 5016, 11308, 3173, 2958, 7859, 351...",1
7609,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[101, 2491, 3748, 2543, 2662, 2130, 2642, 2112...",1
7610,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 11396, 22287, 12779, 7359, 102, 0, 0, 0,...",1
7611,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2610, 11538, 1041, 5638, 3489, 17745, 24...",1


In [38]:
# Reload the pickled test frame, keeping the model inputs plus the submission id.
# Only unpickle files produced by this notebook: unpickling untrusted data can run arbitrary code.
columns = ['attention_mask', 'token_ids', 'id']
test_df = pd.read_pickle(TEST_PICKLE_PATH)[columns]
test_df.head()

Unnamed: 0,attention_mask,token_ids,id
0,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 3047, 6659, 2482, 5823, 102, 0, 0, 0, 0,...",0
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[101, 2657, 8372, 2367, 2103, 2994, 3647, 3071...",2
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[101, 3224, 2543, 3962, 8644, 13020, 14070, 24...",3
3,"[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 16976, 7497, 21878, 3748, 10273, 102, 0,...",9
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[101, 15393, 2061, 12672, 10626, 3102, 2859, 6...",11


In [39]:
# Class balance of the reloaded training frame
train_df[TARGET].value_counts()

target
0    4370
1    3243
Name: count, dtype: int64

In [40]:
from transformers import DistilBertModel

class DisasterDataset(Dataset):
    """Dataset of (embedding row, label) pairs held as float32 tensors."""

    def __init__(self, embeddings, labels):
        # Stack the per-sample arrays row-wise before converting to a tensor.
        stacked = np.vstack(embeddings)
        self.embeddings = torch.tensor(stacked, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        # The number of samples is taken from the label tensor.
        return len(self.labels)

    def __getitem__(self, idx):
        features = self.embeddings[idx]
        target = self.labels[idx]
        return features, target


class DisasterClassifier(nn.Module):
    """DistilBERT encoder followed by a small MLP head that emits one logit.

    The constructor does not choose a device: every sub-module is created on the
    default device and the caller moves the whole model with ``.to(...)``.
    Previously only the encoder was moved to DEVICE in the constructor, leaving
    the instance split across devices until the caller moved it.
    """

    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.fc1 = nn.Linear(INPUT_DIM, HIDDEN_DIM)
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(HIDDEN_DIM)  # batch normalization on the hidden layer
        self.dropout = nn.Dropout(DROPOUT_RATE)
        self.fc2 = nn.Linear(HIDDEN_DIM, 1)

    def forward(self, input_ids, attention_mask):
        """Return one raw logit per sample, shape ``(batch, 1)``.

        Args:
            input_ids: Token-id tensor passed to the encoder.
            attention_mask: Attention-mask tensor passed to the encoder.
        """
        # Use the hidden state at sequence position 0 as the sentence vector.
        bert_out = self.bert(input_ids=input_ids,
                             attention_mask=attention_mask
                             ).last_hidden_state[:, 0, :]
        x = self.fc1(bert_out)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [81]:
def save_model(model):
    """Write ``model``'s state dict to MODEL_SAVE_PATH.

    The checkpoint is a dict with the single key ``"model_state_dict"``. The
    parent directory of MODEL_SAVE_PATH is created first; the previous code
    created a ``models`` directory in the working directory instead, which is
    not where the checkpoint is written.
    """
    target_dir = os.path.dirname(MODEL_SAVE_PATH)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    torch.save({
        "model_state_dict": model.state_dict()
    }, MODEL_SAVE_PATH)



def train_model(train_df):
    """Extract model inputs and labels from ``train_df`` and hand them to ``core``.

    The TOKEN_IDS and ATTENTION_MASK columns are each turned into one NumPy
    array; the TARGET column is passed through as its underlying values.
    """
    def column_as_array(name):
        return np.array(train_df[name].values.tolist())

    core(
        input_ids=column_as_array(TOKEN_IDS),
        attention_mask=column_as_array(ATTENTION_MASK),
        labels=train_df[TARGET].values,
    )


def core(input_ids, attention_mask, labels, pos_weight_scale=5):
    """Build the dataset, model, optimizer and loss, then run the training loop.

    Args:
        input_ids: Array-like of token ids, one row per sample.
        attention_mask: Array-like of attention masks, one row per sample.
        labels: 1-D array-like of 0/1 targets.
        pos_weight_scale: Multiplier applied to the 'balanced' class weight of
            class 1 to form the loss's ``pos_weight``. Defaults to 5, the value
            that was previously hard-coded.
    """
    input_ids = torch.tensor(input_ids, dtype=torch.int64).to(DEVICE)
    attention_mask = torch.tensor(attention_mask, dtype=torch.int64).to(DEVICE)
    # Labels get a trailing axis so they line up with the model's (batch, 1) logits.
    labels_tensor = torch.tensor(labels, dtype=torch.float32).unsqueeze(1).to(DEVICE)

    print(f'input_ids[:10]: {input_ids[:10]}')
    print(f'attention_mask[:10]: {attention_mask[:10]}')

    train_dataset = TensorDataset(input_ids, attention_mask, labels_tensor)
    model = DisasterClassifier().to(DEVICE)
    class_weights = compute_class_weight(class_weight='balanced',
                                         classes=np.array([0, 1]),
                                         y=labels)
    # Created directly on DEVICE so the loss weight lives beside the logits.
    pos_weight = torch.tensor(class_weights[1] * pos_weight_scale,
                              dtype=torch.float32,
                              device=DEVICE)
    loop(
        model=model,
        train_loader=DataLoader(
            train_dataset,
            batch_size=BATCH_SIZE,
            shuffle=True
        ),
        optimizer=optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=5e-3),
        criterion=nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    )


def loop(model: nn.Module, train_loader: DataLoader, optimizer: optim.Optimizer, criterion):
    """Train ``model`` for MAX_EPOCH epochs, then save it with ``save_model``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Yields ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Each epoch prints the mean batch loss and the first param group's learning
    rate. With an empty loader the mean is reported as NaN instead of raising
    ZeroDivisionError.
    """
    len_train_loader = len(train_loader)
    for epoch in range(MAX_EPOCH):
        model.train()
        total_loss = 0.0

        for input_ids, attention_mask, labels in train_loader:
            input_ids, attention_mask, labels = input_ids.to(DEVICE), attention_mask.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len_train_loader if len_train_loader else float("nan")
        print(f"🔹 Epoch {epoch + 1}/{MAX_EPOCH}, "
              f"Loss: {avg_loss:.4f}, LR: {optimizer.param_groups[0]['lr']:.6f}")

    save_model(model)

In [82]:
# Train on the reloaded training frame; the model is saved when the loop finishes.
# NOTE(review): the output recorded below reports LR 0.000100, while the config
# cell sets LEARNING_RATE = 2e-6 — the recorded output looks stale; re-run to confirm.
train_model(train_df)

input_ids[:10]: tensor([[  101, 15046,  3114,  ...,     0,     0,     0],
        [  101,  3224,  2543,  ...,     0,     0,     0],
        [  101,  6319,  2356,  ...,     0,     0,     0],
        ...,
        [  101, 10047,  2327,  ...,     0,     0,     0],
        [  101,  2045,  5057,  ...,     0,     0,     0],
        [  101, 10047,  4452,  ...,     0,     0,     0]], device='mps:0')
attention_mask[:10]: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='mps:0')
🔹 Epoch 1/7, Loss: 1.1974, LR: 0.000100
🔹 Epoch 2/7, Loss: 0.7976, LR: 0.000100
🔹 Epoch 3/7, Loss: 0.5258, LR: 0.000100
🔹 Epoch 4/7, Loss: 0.3196, LR: 0.000100
🔹 Epoch 5/7, Loss: 0.2501, LR: 0.000100
🔹 Epoch 6/7, Loss: 0.1784, LR: 0.000100
🔹 Epoch 7/7, Loss: 0.1546, LR: 0.000100


In [90]:
class TestDataset(Dataset):
    """Inference dataset yielding (input_ids, attention_mask) pairs from a frame."""

    def __init__(self, df):
        def column_as_int64(name):
            # Column of per-row sequences -> one NumPy array -> one int64 tensor.
            stacked = np.array(df[name].values.tolist())
            return torch.tensor(stacked, dtype=torch.int64)

        self.input_ids = column_as_int64(TOKEN_IDS)
        self.attention_mask = column_as_int64(ATTENTION_MASK)

    def __len__(self):
        # Sample count follows the token-id tensor.
        return len(self.input_ids)

    def __getitem__(self, idx):
        ids = self.input_ids[idx]
        mask = self.attention_mask[idx]
        return ids, mask


def predict(test_df, threshold=0.5):
    """Score ``test_df`` with the saved model and hand the labels to ``save_result``.

    The model's logits go through a sigmoid; a probability at or above
    ``threshold`` becomes label 1, otherwise 0. The unique labels and their
    counts are printed before saving. Nothing is returned.
    """
    predictions = []
    model = load_model()  # a DisasterClassifier restored from the checkpoint

    loader = DataLoader(
        TestDataset(test_df),
        batch_size=BATCH_SIZE,
        shuffle=False,  # keep row order so labels line up with test_df
    )

    with torch.no_grad():
        model.eval()
        for ids, mask in loader:
            logits = model(ids.to(DEVICE), mask.to(DEVICE))
            hard_labels = (torch.sigmoid(logits) >= threshold).int()
            predictions += hard_labels.cpu().numpy().flatten().tolist()

    print("Unique Predictions:", np.unique(predictions, return_counts=True))
    save_result(test_df, predictions)


def load_model() -> "DisasterClassifier":
    """Rebuild a DisasterClassifier on DEVICE from the checkpoint at MODEL_SAVE_PATH.

    Returns:
        The model with the saved ``"model_state_dict"`` loaded, in eval mode.
    """
    # map_location remaps the stored tensors onto the current DEVICE, so the
    # checkpoint still loads when the device it was saved from is unavailable.
    checkpoint = torch.load(MODEL_SAVE_PATH, map_location=DEVICE)
    model = DisasterClassifier().to(DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    return model


def save_result(test_df: pd.DataFrame, preds: list[int]):
    """Write the submission CSV (columns ``id`` and ``target``) to SUBMISSION_SAVE_PATH.

    Args:
        test_df: Frame providing the ``id`` column.
        preds: One predicted label per row of ``test_df``, in row order.
    """
    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(SUBMISSION_SAVE_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    submission_df = pd.DataFrame({"id": test_df["id"], "target": preds})
    submission_df.to_csv(SUBMISSION_SAVE_PATH, index=False)

In [95]:
# Replace any missing values in the test frame with the string 'null'
test_df = test_df.fillna('null')
# Score the test frame at a 0.5 probability cut-off; predict() also writes the submission CSV
predict(test_df, threshold=0.5)

Unique Predictions: (array([0, 1]), array([1864, 1399]))


In [96]:
# Read the written submission back and preview it
submission = pd.read_csv(SUBMISSION_SAVE_PATH)
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
