# Preparation

In [1]:
from google.colab import files
uploaded = files.upload()

Saving test.jsonl to test.jsonl
Saving train.jsonl to train.jsonl
Saving val.jsonl to val.jsonl


In [2]:
pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


# Task1

In [3]:
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, TrainerCallback
from transformers import DataCollatorWithPadding
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import evaluate
import json

In [34]:
# Step 1: Load data
def load_json(filename):
    with open(filename, 'r') as f:
        return [json.loads(line) for line in f]

train_data = load_json("/content/train.jsonl")
val_data = load_json("/content/val.jsonl")
test_data = load_json("/content/test.jsonl")

In [35]:
# Step 2: Extract text and labels
train_texts = [item["postText"][0] for item in train_data]
train_labels = [item["tags"][0] for item in train_data]
val_texts = [item["postText"][0] for item in val_data]
val_labels = [item["tags"][0] for item in val_data]
test_texts = [item["postText"][0] for item in test_data]

In [36]:
# Step 3: Label Encoding
le = LabelEncoder()
train_labels_encoded = le.fit_transform(train_labels)
val_labels_encoded = le.transform(val_labels)

In [37]:
# Step 4: Convert to Dataset
train_df = pd.DataFrame({"text": train_texts, "labels": train_labels_encoded})
val_df = pd.DataFrame({"text": val_texts, "labels": val_labels_encoded})
test_df = pd.DataFrame({"text": test_texts})
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

In [38]:
# Step 5: Tokenization
checkpoint = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(example):
    return tokenizer(example["text"], truncation=True)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)



Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [39]:
# Step 6: Custom Model
class CustomDeberta(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.base = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.classifier = nn.Linear(self.base.config.hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]  # [CLS] token
        logits = self.classifier(self.relu(self.dropout(pooled)))
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return SequenceClassifierOutput(logits=logits, loss=loss)

In [40]:
# Step 7: Model Init
num_labels = len(le.classes_)
model = CustomDeberta(checkpoint, num_labels)

In [41]:
# Step 8: Metrics
f1 = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return f1.compute(predictions=preds, references=labels, average="macro")

In [45]:
# Step 9: Trainer Setup
args = TrainingArguments(
    output_dir="./deberta_output",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    load_best_model_at_end=False,
    report_to="none" # Add this line to disable wandb logging
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [43]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [46]:
# Step 10: Train
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.4155,0.898442,0.671675
2,0.2307,1.215585,0.686372
3,0.2399,1.316756,0.702619
4,0.1505,1.584138,0.710218
5,0.0937,1.690086,0.700017
6,0.0589,1.901974,0.699468
7,0.0307,1.915575,0.712659
8,0.0204,1.942925,0.70996


TrainOutput(global_step=2136, training_loss=0.15503117207730754, metrics={'train_runtime': 168.4924, 'train_samples_per_second': 151.936, 'train_steps_per_second': 12.677, 'total_flos': 0.0, 'train_loss': 0.15503117207730754, 'epoch': 8.0})

In [47]:
# Step 11: Predict
preds = trainer.predict(test_ds)
pred_ids = np.argmax(preds.predictions, axis=1)
pred_labels = le.inverse_transform(pred_ids)

In [48]:
# Step 12: Output submission
submission = pd.DataFrame({
    "id": [item["id"] for item in test_data],
    "spoilerType": pred_labels
})
submission.to_csv("task1_output.csv", index=False)

# Task 2

In [17]:
import json
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [18]:
# read jsonl data
def load_jsonl(path):
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]

train_data = load_jsonl("train.jsonl")
val_data = load_jsonl("val.jsonl")
test_data = load_jsonl("test.jsonl")

In [19]:
# postText + targetParagraphs
def preprocess(data):
    inputs = []
    targets = []
    for item in data:
        post = item['postText'][0]
        para = " ".join(item['targetParagraphs'])[:1024]  # 截断太长的输入
        text_input = f"generate spoiler: {post} </s> {para}"
        inputs.append(text_input)
        targets.append(item['spoiler'][0])  # 只取第一个
    return pd.DataFrame({'input': inputs, 'target': targets})

train_df = preprocess(train_data)
val_df = preprocess(val_data)

In [20]:
model_name = "t5-large"  # OR t5-base / t5-small
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [21]:
def tokenize_function(example):
    model_inputs = tokenizer(
        example["input"], max_length=512, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        example["target"], max_length=64, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_val = val_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [22]:
training_args = TrainingArguments(
    output_dir="./t5_spoiler",
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.3019,0.301759
2,0.133,0.298576
3,0.1891,0.339188


TrainOutput(global_step=4800, training_loss=0.24354363907050963, metrics={'train_runtime': 1711.3632, 'train_samples_per_second': 5.61, 'train_steps_per_second': 2.805, 'total_flos': 2.07844540416e+16, 'train_loss': 0.24354363907050963, 'epoch': 3.0})

In [26]:
def preprocess_test(data):
    ids = []
    inputs = []
    for item in data:
        ids.append(item['id'])
        post = item['postText'][0]
        para = " ".join(item['targetParagraphs'])[:1024]
        text_input = f"generate spoiler: {post} </s> {para}"
        inputs.append(text_input)
    return ids, inputs

test_ids, test_inputs = preprocess_test(test_data)

batch_size = 6
results = []

model.eval()
for i in range(0, len(test_inputs), batch_size):
    batch = test_inputs[i:i+batch_size]
    encodings = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**encodings, max_new_tokens=64)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    results.extend(decoded)

In [29]:
df_submit = pd.DataFrame({"id": test_ids, "spoiler": results})
df_submit.to_csv("task2_output.csv", index=False)