# 📝 ModalX v2 - Content BERT Training

**Model:** Fine-tuned DistilBERT for Content Quality

**Task:** Multi-task learning for argument, vocabulary, structure scoring

In [None]:
!pip install -q torch transformers datasets scikit-learn tqdm

In [None]:
# Mount Google Drive at /content/drive; the trained checkpoint is written
# beneath this mount point at the end of the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Device: {device}')

In [None]:
class ContentBERT(nn.Module):
    """DistilBERT encoder feeding three task-specific heads.

    The first-token hidden state of the encoder is passed through a
    Linear + Tanh pooler, and the pooled vector is shared by:

    * ``argument_head``  -- one sigmoid-squashed value per input,
    * ``vocab_head``     -- three raw (unnormalised) logits per input,
    * ``structure_head`` -- one sigmoid-squashed value per input.

    Args:
        hidden: Width of the encoder's hidden state, used as the input size
            of the pooler and of every head.
        dropout: Dropout probability applied inside each head.
    """

    def __init__(self, hidden=768, dropout=0.3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pooler = nn.Sequential(nn.Linear(hidden, hidden), nn.Tanh())

        # Heads are built in the original order (argument, vocab, structure)
        # so parameter registration and random initialisation are unchanged.
        self.argument_head = self._build_head(hidden, 256, 1, dropout, squash=True)
        self.vocab_head = self._build_head(hidden, 128, 3, dropout, squash=False)
        self.structure_head = self._build_head(hidden, 256, 1, dropout, squash=True)

    @staticmethod
    def _build_head(in_dim, width, out_dim, dropout, squash):
        """Return Linear -> ReLU -> Dropout -> Linear, plus Sigmoid if ``squash``."""
        layers = [
            nn.Linear(in_dim, width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(width, out_dim),
        ]
        if squash:
            layers.append(nn.Sigmoid())
        return nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask):
        """Encode the inputs and return a dict of per-task outputs.

        Returns:
            Dict with keys ``'argument'``, ``'vocab'`` and ``'structure'``
            holding the output of the corresponding head.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 along the sequence axis is used as the sentence summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        pooled = self.pooler(first_token)

        scores = {}
        scores['argument'] = self.argument_head(pooled)
        scores['vocab'] = self.vocab_head(pooled)
        scores['structure'] = self.structure_head(pooled)
        return scores

# Build the network first, then move its weights onto the selected device
# (nn.Module.to returns the module itself).
model = ContentBERT()
model = model.to(device)
print(f'Parameters: {sum(weight.numel() for weight in model.parameters()):,}')

In [None]:
# Sample training data. Each tuple is (text, argument, vocab, structure), in
# the order ContentDataset.__getitem__ unpacks it:
#   - argument / structure: float targets (values here lie in [0, 1]) that are
#     regressed with MSE against the sigmoid heads;
#   - vocab: integer class index for the 3-way vocab head (only 0 and 2
#     appear in these samples).
SAMPLES = [
    ('Therefore, our methodology demonstrates significant impact.', 0.9, 2, 0.8),
    ('Um, so like, we did some stuff.', 0.2, 0, 0.3),
    ('The results indicate a 40% improvement in efficiency.', 0.85, 2, 0.9),
    ('It was good I think maybe.', 0.1, 0, 0.2),
]

class ContentDataset(Dataset):
    """Map-style dataset that tokenizes scored text samples on access.

    Args:
        samples: Sequence of ``(text, argument, vocab, structure)`` tuples.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors that carry a leading batch dimension of size 1.
        max_length: Length every sequence is padded/truncated to. Defaults to
            128, the value that was previously hard-coded.
    """

    def __init__(self, samples, tokenizer, max_length=128):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the tokenized text and its three labels as a dict of tensors."""
        text, arg, vocab, struct = self.samples[idx]
        enc = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            # squeeze(0) drops only the batch dim added by return_tensors='pt';
            # a bare squeeze() would also collapse a length-1 sequence.
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            # Explicit dtypes keep batches collatable even when a score is
            # written as an int (e.g. 1 instead of 1.0).
            'argument': torch.tensor(arg, dtype=torch.float32),
            'vocab': torch.tensor(vocab, dtype=torch.long),
            'structure': torch.tensor(struct, dtype=torch.float32),
        }

# Tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Repeat the sample list 50 times to get enough rows for a few batches.
dataset = ContentDataset(samples=SAMPLES * 50, tokenizer=tokenizer)
loader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)

In [None]:
import os

# Joint fine-tuning of the encoder and all three heads. The two score heads
# are regressed with MSE, the vocab head is trained with cross-entropy, and
# the three losses are summed with equal weight.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
mse = nn.MSELoss()
ce = nn.CrossEntropyLoss()

for epoch in range(10):
    model.train()
    total_loss = 0
    for batch in tqdm(loader):
        optimizer.zero_grad()
        out = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device)
        )
        # squeeze(-1) removes only the trailing size-1 output dim, so a batch
        # holding a single sample keeps shape (1,) and still matches its
        # target instead of collapsing to a 0-dim tensor.
        loss = mse(out['argument'].squeeze(-1), batch['argument'].float().to(device))
        loss = loss + ce(out['vocab'], batch['vocab'].to(device))
        loss = loss + mse(out['structure'].squeeze(-1), batch['structure'].float().to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}: Loss = {total_loss/len(loader):.4f}')

# torch.save does not create missing parent directories; make sure the target
# folder exists so the trained weights are not lost after training finishes.
checkpoint_path = '/content/drive/MyDrive/modalx_v2/content_bert.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)