In [57]:
import numpy as np
import pandas as pd
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# 훈련셋 로드
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
train_data.head()

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [58]:
SEED = 42

In [59]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
from datasets import Dataset, ClassLabel

dataset = Dataset.from_pandas(train_data)
dataset = dataset.cast_column('target', ClassLabel(num_classes=2))
dataset = dataset.train_test_split(test_size=0.1, seed=SEED, stratify_by_column='target')

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=157)

tokenized_dataset = dataset.map(preprocess_function, batched=True) # 데이터셋 토크나이징
tokenized_dataset = tokenized_dataset.rename_column("target", "labels") # target 컬럼을 'labels'로 이름 변경

train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["test"]

print('훈련셋 크기 :', len(train_dataset))
print('검증셋 크기 :', len(val_dataset))

Casting the dataset:   0%|          | 0/7613 [00:00<?, ? examples/s]

Map:   0%|          | 0/6851 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

훈련셋 크기 : 6851
검증셋 크기 : 762


In [62]:
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred): # f1 score 계산 필요
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    
    f1 = f1_score(labels, preds)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1
    }

In [63]:
# 하이퍼파라미터

LR = 2e-5
TRAIN_BATCHSIZE = 256
EVAL_BATCHSIZE = 128
EPOCHS = 5
WEIGHT_DECAY = 0.01

In [64]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

os.environ["WANDB_DISABLED"] = "true"

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='./results/',
    run_name=MODEL_NAME,
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=EVAL_BATCHSIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    logging_steps=10,
    logging_dir='./logs',
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [65]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6397,0.55013,0.740157,0.681672
2,0.5219,0.458217,0.791339,0.739771
3,0.4019,0.433864,0.817585,0.770248
4,0.3871,0.423056,0.816273,0.771987
5,0.3594,0.423655,0.813648,0.773885




TrainOutput(global_step=70, training_loss=0.44402148042406353, metrics={'train_runtime': 253.4258, 'train_samples_per_second': 135.168, 'train_steps_per_second': 0.276, 'total_flos': 1391434192052580.0, 'train_loss': 0.44402148042406353, 'epoch': 5.0})

In [66]:
# 최종 검증셋 성능
metrics = trainer.evaluate(eval_dataset=val_dataset)
print(metrics)



{'eval_loss': 0.42365512251853943, 'eval_accuracy': 0.8136482939632546, 'eval_f1': 0.7738853503184713, 'eval_runtime': 2.1333, 'eval_samples_per_second': 357.185, 'eval_steps_per_second': 1.406, 'epoch': 5.0}


In [None]:
# 테스트셋 로드
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
test_dataset = Dataset.from_pandas(test_data)

tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True) # 데이터셋 토크나이징

tokenized_test_dataset

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'input_ids', 'attention_mask'],
    num_rows: 3263
})

In [None]:
predictions = trainer.predict(tokenized_test_dataset)
pred_logits = predictions.predictions
pred_probs = np.exp(pred_logits) / np.exp(pred_logits).sum(axis=1, keepdims=True)
pred_labels = np.argmax(pred_logits, axis=1)

# 제출 파일 생성
submission = pd.DataFrame({
    'id': test_data['id'],
    'target': pred_labels
})

submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1
