### Student Information
Name: 黃玄彰

Student ID: 110030031

GitHub ID: pigerface@gmail.com

Kaggle name:

Kaggle private scoreboard snapshot:

In [21]:
import json
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import BertTokenizer
    


def load_data(emotion_path, data_identification_path, tweets_path):
    """
    載入與處理數據的函數
    :param emotion_path: str, emotion.csv 檔案路徑
    :param data_identification_path: str, data_identification.csv 檔案路徑
    :param tweets_path: str, tweets_DM.json 檔案路徑
    :return: pd.DataFrame, pd.DataFrame, pd.DataFrame
    """
    # 讀取 emotion 和 data_identification 數據
    emotion = pd.read_csv(emotion_path)
    data_identification = pd.read_csv(data_identification_path)

    # 讀取 tweets_DM.json 並解析為 DataFrame
    with open(tweets_path, 'r') as f:
        data = [json.loads(line) for line in f]

    df = pd.DataFrame(data)
    _source = df['_source'].apply(lambda x: x['tweet'])
    df = pd.DataFrame({
        'tweet_id': _source.apply(lambda x: x['tweet_id']),
        'hashtags': _source.apply(lambda x: x['hashtags']),
        'text': _source.apply(lambda x: x['text']),
    })
    
    # 合併 data_identification
    df = df.merge(data_identification, on='tweet_id', how='left')

    return df, emotion, data_identification


def preprocess_data(df, emotion, label_encoder, tokenizer):
    """
    處理數據、分割訓練與測試集，並完成 Tokenization
    """
    # Tokenization 函數
    def tokenize_function(text):
        return tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

    # 分割數據集
    train_data = df[df['identification'] == 'train']
    test_data = df[df['identification'] == 'test']

    # 合併 emotion 與訓練集
    train_data = train_data.merge(emotion, on='tweet_id', how='left')
    train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

    # 分割訓練與驗證集
    X_train, X_val, y_train, y_val = train_test_split(
        train_data['text'], train_data['emotion'], test_size=0.2, random_state=42, stratify=train_data['emotion']
    )

    # Reset index
    y_train.reset_index(drop=True, inplace=True)
    y_val.reset_index(drop=True, inplace=True)
    
    # Label Encoding
    y_train = label_encoder.fit_transform(y_train)
    y_val = label_encoder.transform(y_val)
    
    # Tokenize 訓練與驗證集
    X_train_tokenized = X_train.apply(tokenize_function)
    X_val_tokenized = X_val.apply(tokenize_function)
    test_data_tokenized = test_data['text'].apply(tokenize_function)
    

    return train_data, test_data, X_train_tokenized, X_val_tokenized, y_train, y_val, test_data_tokenized

def predict_and_generate_submission(trainer, label_encoder, test_dataset, output_path="submission.csv"):
    """
    根據測試數據生成預測並保存為Kaggle提交格式
    :param trainer: Trainer object, 已訓練的模型
    :param test_dataset: pd.DataFrame, 測試數據
    :param output_path: str, 保存提交文件的路徑
    """

    # 使用模型進行預測
    predictions = trainer.predict(test_dataset)
    predicted_classes = predictions.predictions.argmax(axis=1)
    
    predicted_labels = label_encoder.inverse_transform(predicted_classes)

    # 將預測結果添加到測試數據中
    test_data['emotion'] = predicted_labels

    # 保存為Kaggle提交格式
    submission = test_data[['tweet_id', 'emotion']]
    # 將tweet_id改成id
    submission.rename(columns={'tweet_id': 'id'}, inplace=True)
    submission.to_csv(output_path, index=False)

    print(f"Kaggle提交文件已保存至 {output_path}！")

class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {
                'input_ids': self.encodings['input_ids'][idx],
                'attention_mask': self.encodings['attention_mask'][idx],
                'labels': torch.tensor(self.labels[idx])
            }
        return item
    
def convert_to_dicts(tokenized_texts):
    # 修改轉換方式，確保輸出正確的張量格式
    input_ids = []
    attention_masks = []
    
    for text in tokenized_texts:
        # 移除多餘的維度
        input_ids.append(text['input_ids'].squeeze(0))
        attention_masks.append(text['attention_mask'].squeeze(0))
    
    return {
        'input_ids': torch.stack(input_ids),
        'attention_mask': torch.stack(attention_masks)
    }

# 計算評估指標
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(axis=1)
    f1 = f1_score(labels, preds, average="weighted")
    precision = precision_score(labels, preds, average="weighted")
    recall = recall_score(labels, preds, average="weighted")
    return {
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }



In [22]:
from transformers import TrainingArguments, EarlyStoppingCallback
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir='./results', # All files generated during training will be stored here
#     num_train_epochs=3, # The model will be trained for 3 full epochs unless the step limit (max_steps) is reached first
#     per_device_train_batch_size=5, # Training batch size per device (GPU or CPU).
#     per_device_eval_batch_size=5, # Evaluation batch size per device (GPU or CPU).
#     warmup_steps=10, # Number of warm-up steps during which the learning rate gradually increases to its initial value
#     weight_decay=0.01, # Weight decay rate: this technique helps to avoid overfitting, penalizing large weights in the neural network
#     logging_dir='./logs', # Directory where training logs will be stored
#     max_steps=10,  # Maximum number of training steps to be performed
#     save_steps=2,  # Range of steps after which the model will be saved
#     logging_steps=2,  # Range of steps after which log information will be recorded
# )
training_args = TrainingArguments(
    output_dir="./results",                  # 儲存輸出
    evaluation_strategy="steps",            # 按步進行評估
    eval_steps=500,                         # 每 500 步進行一次評估
    save_steps=500,                         # 每 500 步保存一次模型
    logging_steps=100,                      # 每 100 步記錄一次日誌
    learning_rate=2e-5,                     # 初始學習率
    per_device_train_batch_size=128,         # 訓練批次大小
    per_device_eval_batch_size=128,          # 評估批次大小
    num_train_epochs=3,                     # 訓練 3 個週期
    warmup_steps=500,                       # 熱身步數
    weight_decay=0.01,                      # L2 正則化
    load_best_model_at_end=True,            # 載入最佳模型
    metric_for_best_model="f1",             # 使用 F1 判定最佳模型
    greater_is_better=True,                 # F1 越高越好
    fp16=True                               # 使用 16-bit 浮點數加速
)



In [23]:
import os

# if __name__ == "__main__":


# 設置數據路徑
# data_base_path = '/kaggle/input/dm-2024-isa-5810-lab-2-homework/'
data_base_path = './data/'
emotion_path = data_base_path + 'emotion.csv'
data_identification_path = data_base_path + 'data_identification.csv'
tweets_path = data_base_path + 'tweets_DM.json'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
label_encoder = LabelEncoder()

# 載入與處理數據
df, emotion, data_identification = load_data(emotion_path, data_identification_path, tweets_path)
train_data, test_data, X_train, X_val, y_train, y_val, test_data_tokenized = preprocess_data(df, emotion, label_encoder, tokenizer)

print("數據載入與處理完成！")
print(f"訓練集樣本數：{len(X_train)}，驗證集樣本數：{len(X_val)}")

# Convert to lists of dictionaries
train_encodings = convert_to_dicts(X_train)
val_encodings = convert_to_dicts(X_val)
test_encodings = convert_to_dicts(test_data_tokenized)

assert len(train_encodings['input_ids']) == len(y_train), "Encodings and labels must have the same length."
assert len(val_encodings['input_ids']) == len(y_val), "Encodings and labels must have the same length."
print("Data preprocessing completed successfully.")

# Create three dataset objects using the SentimentDataset
train_dataset = TweetDataset(train_encodings, y_train)
val_dataset = TweetDataset(val_encodings, y_val)
test_dataset = TweetDataset(test_encodings, labels=[0] * len(test_encodings['input_ids']))

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(device)
print(f"Using device: {device}")

trainer = Trainer(
    model=model,                          # 模型
    args=training_args,                   # 訓練參數
    train_dataset=train_dataset,          # 訓練資料
    eval_dataset=val_dataset,             # 驗證資料
    compute_metrics=compute_metrics,       # 評估指標
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)
    ]
)
# Start training
trainer.train()

# Evaluate the Model
results = trainer.evaluate(test_dataset)

print("Evaluation Results:")
print(f"  - Loss: {results['eval_loss']:.4f}")
print(f"  - Runtime: {results['eval_runtime']:.2f} seconds")
print(f"  - Samples per Second: {results['eval_samples_per_second']:.2f}")
print(f"  - Steps per Second: {results['eval_steps_per_second']:.2f}")
print(f"  - Epoch: {results['epoch']:.4f}")

# Save the model and tokenizer in the current folder
model_save_path = "./BERT_param2"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

submission_path = "./BERT_param2/submission.csv"
predict_and_generate_submission(trainer, label_encoder, test_dataset, output_path=submission_path)

數據載入與處理完成！
訓練集樣本數：1159345，驗證集樣本數：289837
Data preprocessing completed successfully.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


Step,Training Loss,Validation Loss,F1,Precision,Recall
500,1.4094,1.337915,0.476073,0.541557,0.524036
1000,1.2016,1.181912,0.555444,0.597985,0.57906
1500,1.1429,1.118451,0.580705,0.6128,0.600372
2000,1.114,1.082297,0.59478,0.624844,0.6078
2500,1.0845,1.058882,0.598449,0.638351,0.617095
3000,1.044,1.049764,0.607919,0.625044,0.621211
3500,1.0565,1.034557,0.614092,0.630521,0.624517


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Results:
  - Loss: 4.1442
  - Runtime: 250.57 seconds
  - Samples per Second: 1644.13
  - Steps per Second: 12.85
  - Epoch: 0.3864


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['emotion'] = predicted_labels
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission.rename(columns={'tweet_id': 'id'}, inplace=True)


Kaggle提交文件已保存至 ./BERT_param2/submission.csv！
