# Set up Environment

This notebook was originally run on Google Colab.

In [None]:
# Report the NVIDIA GPU(s) attached to this runtime, if any.
!nvidia-smi

In [None]:
%%capture
!pip install kaggle
!pip install transformers
!pip install datasets
!pip install emoji
!pip install accelerate
!pip install nlpaug

In [None]:
import json
import os
from pathlib import Path

# Read the Kaggle credentials from the environment rather than pasting them
# into the notebook, so they are never saved or shared together with it.
usr_name = os.environ.get('KAGGLE_USERNAME', '')
key_ = os.environ.get('KAGGLE_KEY', '')

api_token = {"username": usr_name, "key": key_}

# Equivalent of `mkdir ~/.kaggle`, but safe to re-run when the directory exists.
kaggle_dir = Path.home() / '.kaggle'
kaggle_dir.mkdir(parents=True, exist_ok=True)
token_path = kaggle_dir / 'kaggle.json'

if usr_name and key_:
    with open(token_path, 'w') as file:
        json.dump(api_token, file)
    # Same effect as `chmod 600`: only the current user may read the token.
    token_path.chmod(0o600)
elif not token_path.exists():
    # Fail here with a clear message instead of writing an empty token that
    # only surfaces later as an authentication error in the download cell.
    raise RuntimeError(
        'Kaggle credentials not found: set the KAGGLE_USERNAME and KAGGLE_KEY '
        f'environment variables or create {token_path} manually.'
    )
# Otherwise a token file already exists and is left untouched.

In [None]:
# Download the competition archive with the Kaggle CLI (requires the API token
# written by the previous cell). The next cell expects the zip under /content.
!kaggle competitions download -c congressionaltweetcompetitionspring2022

In [None]:
!unzip /content/congressionaltweetcompetitionspring2022.zip

# Data Split

Change the paths below to match your environment.

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Labelled competition data; read by make_train_dev_split() below.
train_valid_path = '/content/congressional_tweet_training_data.csv'
# NOTE(review): not referenced elsewhere in the visible notebook — presumably the
# unlabelled competition test set from the same archive; confirm.
test_path = '/content/congressional_tweet_test_data.csv'
# Outputs of make_train_dev_split(): the train part and the held-out dev part.
train_path = '/content/congressional_tweet_train_data.csv'
valid_path = '/content/congressional_tweet_val_data.csv'
# Output of make_train_subset(): a small sample drawn from the train part.
subset_path = '/content/congressional_tweet_train_subset_data.csv'
# NOTE(review): not referenced elsewhere in the visible notebook — presumably an
# output directory for training artefacts; confirm.
training_output_path = '/content/training_output'

In [None]:
def make_train_dev_split():
  """Split the labelled CSV into a train file and a dev file.

  Reads ``train_valid_path``, holds out 5% of the rows as the dev set using a
  fixed random seed (456), and writes the two parts to ``train_path`` and
  ``valid_path`` without the index column.
  """
  full_frame = pd.read_csv(train_valid_path)
  train_part, dev_part = train_test_split(full_frame, test_size=0.05, random_state=456)

  for frame, destination in ((train_part, train_path), (dev_part, valid_path)):
    frame.to_csv(destination, index=False)

def make_train_subset():
  """Write a 500-row sample of the train split to ``subset_path``.

  The sample is the "test" side of a seeded (456) ``train_test_split`` over the
  rows of ``train_path``; the other side of the split is discarded.
  """
  train_frame = pd.read_csv(train_path)
  subset = train_test_split(train_frame, test_size=500, random_state=456)[1]
  subset.to_csv(subset_path, index=False)

# Order matters: make_train_subset() samples from the train file that
# make_train_dev_split() writes.
make_train_dev_split()
make_train_subset()

# Modeling and Training Script

Dataset

In [None]:
import os
from tqdm.auto import tqdm
import pandas as pd
import re
import torch
from torch.utils.data import Dataset


class TweetDataset(Dataset):
    """Tweet texts and party labels read from a CSV file.

    The CSV must provide a ``full_text`` column and a ``party_id`` column whose
    values are keys of ``party2id`` ('D' or 'R'). Texts are cleaned once at
    construction time; tokenisation is deferred to ``collate_fn`` so that every
    batch is padded only to its own longest sequence.

    Args:
        data_path: Path to the CSV file. Backslash separators are converted to
            the separator of the current OS.
        tokenizer: Callable with the Hugging Face tokenizer call signature
            (list of strings -> mapping with 'input_ids' and 'attention_mask').
    """

    # Compiled once instead of on every row.
    _URL_PATTERN = re.compile(r"http\S+")

    def __init__(self, data_path, tokenizer):
        self.tokenizer = tokenizer
        self.party2id = {'D': 0, 'R': 1}
        data = self._read_data(data_path)
        self.input_strs, self.label_ids = self._convert_to_samples(data)
        self.n_samples = len(self.input_strs)

    def _read_data(self, path):
        """Load the CSV and keep only the first row for each distinct ``full_text``."""
        # Accept Windows-style paths on any OS.
        path = os.path.join(*path.split('\\'))
        data = pd.read_csv(path)
        data = data.drop_duplicates(['full_text'])

        return data

    def _convert_to_samples(self, data):
        """Clean every tweet and map its party label to an integer id.

        Returns:
            (texts, label_ids): the cleaned strings and a parallel list of
            0-dim ``torch.long`` tensors.

        Raises:
            KeyError: if a ``party_id`` value is not a key of ``party2id``.
        """
        input_strs = data['full_text'].tolist()
        labels = data['party_id'].tolist()
        input_strs_out, label_ids = [], []

        # zip() has no length, so pass the total explicitly for a real progress bar.
        for input_str, label in tqdm(zip(input_strs, labels), total=len(input_strs)):
            # Strip the bytes-literal residue (b'...' / b"...") and all apostrophes.
            # NOTE(review): these replacements match anywhere in the string, so
            # e.g. "Bob's" becomes "Bos". Kept as-is so inputs stay consistent
            # with previously trained checkpoints — confirm before changing.
            input_str = input_str.replace('\"b', '').replace('b\'', '').replace('b\"', '').replace('\'', '')
            input_str = self._URL_PATTERN.sub("", input_str)
            label_id = self.party2id[label]

            input_strs_out.append(input_str)
            label_ids.append(torch.tensor(label_id, dtype=torch.long))

        return input_strs_out, label_ids

    def __len__(self):
        return self.n_samples

    def __getitem__(self, item):
        return self.input_strs[item], self.label_ids[item]

    def collate_fn(self, batch):
        """Tokenise a list of ``(text, label_id)`` samples into padded tensors.

        Returns:
            (input_ids, attention_mask, label_ids) where ``label_ids`` is a 1-D
            ``torch.long`` tensor with one entry per sample.
        """
        input_strs, label_ids = zip(*batch)

        tokenizer_out = self.tokenizer(list(input_strs),
                                       truncation=True,
                                       padding=True,
                                       return_attention_mask=True,
                                       return_tensors='pt')
        input_ids = tokenizer_out['input_ids']
        attention_mask = tokenizer_out['attention_mask']
        # The labels are already 0-dim tensors: stack them rather than
        # rebuilding a tensor element by element with torch.tensor().
        label_ids = torch.stack(label_ids)

        return input_ids, attention_mask, label_ids



Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoModel
from transformers import logging as t_logging
t_logging.set_verbosity_error()

class Model(nn.Module):
    """Binary tweet classifier: pretrained encoder, mean pooling, linear head.

    The encoder starts frozen (``requires_grad=False`` on all of its
    parameters, ``self.frozen = True``); code that later unfreezes it is
    expected to flip ``frozen`` itself.

    Args:
        pretrained_encoder: Module called as
            ``encoder(input_ids=..., attention_mask=..., return_dict=False)``
            whose first output is the per-token hidden states.
        mask_padding: When True (default) padding positions are excluded from
            the mean pooling, so a sample's logits do not depend on how much
            padding its batch needed. Pass False to reproduce the earlier
            behaviour (plain mean over all positions), e.g. for checkpoints
            trained with it.
    """

    def __init__(self, pretrained_encoder, mask_padding=True):
        super().__init__()
        self.encoder = pretrained_encoder
        self.mask_padding = mask_padding

        # Size the head from the encoder config when it has one; 768 is the
        # previously hard-coded width and stays the fallback.
        encoder_config = getattr(pretrained_encoder, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)
        self.linear = nn.Linear(hidden_size, 2)

        self.loss = nn.CrossEntropyLoss()

        self.frozen = True
        for param in self.encoder.parameters():
            param.requires_grad = False

    def loss_fn(self, pred, target):
        """Cross-entropy between logits ``pred`` and class ids ``target``."""
        return self.loss(pred, target)

    def forward(self, batch, return_type='loss'):
        """Run the classifier on one batch.

        Args:
            batch: ``(input_ids, attention_mask, target_ids)``.
            return_type: 'loss' for the scalar cross-entropy loss, 'logits'
                for the raw two-class scores.

        Raises:
            ValueError: if ``return_type`` is neither 'loss' nor 'logits'.
        """
        if return_type not in ('loss', 'logits'):
            raise ValueError(
                f"return_type must be 'loss' or 'logits', got {return_type!r}")

        input_ids, attention_mask, target_ids = batch

        # Index instead of unpacking two values so encoders that return only
        # the hidden states (no pooled output) work as well.
        hidden = self.encoder(input_ids=input_ids,
                              attention_mask=attention_mask,
                              return_dict=False)[0]

        if self.mask_padding:
            # Average over real tokens only; clamp guards an all-padding row.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        else:
            pooled = torch.mean(hidden, dim=1)
        logits = self.linear(pooled)

        if return_type == 'loss':
            return self.loss_fn(logits, target_ids)
        return logits

Trainer

In [None]:
import random
import numpy as np
import json
import os
from tqdm.auto import tqdm

from sklearn.metrics import accuracy_score, classification_report

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

from accelerate import Accelerator

class Trainer:
    """Training / validation loop for a classifier with a freezable encoder.

    The model must accept ``model(batch, return_type=...)`` with 'loss' and
    'logits', and expose ``loss_fn``, ``encoder`` and a boolean ``frozen``.

    Args:
        model: The module to train.
        accelerator: Only ``is_local_main_process`` is read (to decide whether
            the training progress bar is shown).
        device: Device the model and every batch are moved to.
        train_dataset: Dataset providing ``collate_fn`` for the train loader.
        val_dataset: Dataset for validation; falls back to ``train_dataset``.
        ckpt_path: Optional state-dict file to load before training.
    """

    def __init__(self, model, accelerator, device, train_dataset, val_dataset=None, ckpt_path=None):
        self.model = model
        self.accelerator = accelerator
        self.device = device
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset if val_dataset is not None else train_dataset

        bsz_train = 32
        bsz_val = 128

        self._set_seed()
        self._init_model(ckpt_path)
        self._get_dataloaders(bsz_train, bsz_val)
        self._get_optimizer()

    def _set_seed(self):
        """Seed torch, random and numpy with one fixed value."""
        self.seed = 69420
        torch.manual_seed(self.seed)
        random.seed(self.seed)
        np.random.seed(self.seed)

    def _init_model(self, model_path):
        """Optionally load a checkpoint, then move the model to ``self.device``."""
        if model_path:
            # Accept Windows-style paths on any OS.
            path = os.path.join(*model_path.split('\\'))
            # map_location lets a checkpoint saved on GPU load on a CPU-only host.
            self.model.load_state_dict(torch.load(path, map_location=self.device))

        self.model.to(self.device)

    def _get_dataloaders(self, bsz_train, bsz_val):
        """Build the shuffled train loader and the ordered validation loader."""
        self.train_loader = DataLoader(self.train_dataset,
                                       batch_size=bsz_train,
                                       collate_fn=self.train_dataset.collate_fn,
                                       shuffle=True,
                                       drop_last=True)
        self.val_loader = DataLoader(self.val_dataset,
                                     batch_size=bsz_val,
                                     collate_fn=self.val_dataset.collate_fn,
                                     shuffle=False,
                                     drop_last=False)

    def _get_optimizer(self):
        """Create AdamW with weight decay on every parameter except biases."""
        model_params = list(self.model.named_parameters())
        no_decay = ['bias']
        optimized_params = [
            {
                'params': [p for n, p in model_params if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01
            },
            {
                'params': [p for n, p in model_params if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }
        ]
        self.optimizer = AdamW(optimized_params, lr=0.001)

    def _unfreeze_encoder(self, epoch):
        """Make the encoder trainable and drop the learning rate for fine-tuning."""
        print(f'Unfreeze encoder at epoch {epoch}')
        self.model.frozen = False
        for g in self.optimizer.param_groups:
            g['lr'] = 0.00002
        for param in self.model.encoder.parameters():
            param.requires_grad = True

    def run_train(self, n_epochs):
        """Train for ``n_epochs`` epochs, validating after each one.

        The model is checkpointed to 'model_ckpt.pt' whenever the validation
        loss improves. While the encoder is still frozen it is unfrozen as soon
        as an epoch fails to improve the validation loss, or after epoch 2
        regardless. (The epoch rule used to sit in an ``elif`` where it could
        never decide anything.)
        """
        best_loss = self.run_validation()

        for epoch in range(n_epochs):
            pbar = tqdm(self.train_loader, disable=not self.accelerator.is_local_main_process)
            self.model.train()
            self.model.zero_grad(set_to_none=True)

            for batch in pbar:
                batch = tuple(item.to(self.device) for item in batch)
                batch_loss = self._training_step(batch)

                self.optimizer.step()
                self.model.zero_grad(set_to_none=True)

                pbar.set_description(f'(Training) Epoch: {epoch} Loss: {batch_loss:.4f}')

            val_loss = self.run_validation()
            improved = val_loss < best_loss

            if improved:
                print(f'New best validation loss at {val_loss:.4f}, saving checkpoint')
                best_loss = val_loss
                ckpt_path = 'model_ckpt.pt'
                torch.save(self.model.state_dict(), ckpt_path)
                print(f'New checkpoint saved at {ckpt_path}')

            if self.model.frozen and (not improved or epoch > 2):
                self._unfreeze_encoder(epoch)

    def run_validation(self):
        """Evaluate on the validation loader and print a classification report.

        Returns:
            float: the sum of the per-batch mean losses.
        """
        pbar = tqdm(enumerate(self.val_loader), total=len(self.val_loader))
        self.model.eval()
        preds, labels = [], []
        val_loss = 0.0

        for i, batch in pbar:
            batch = tuple(item.to(self.device) for item in batch)
            loss, pred = self._prediction_step(batch)
            pbar.set_description('(Validating)')
            # Accumulate a plain float so the result is independent of device.
            val_loss += loss.item()

            _, _, target_ids = batch
            preds.extend(pred.tolist())
            labels.extend(target_ids.tolist())

        score = classification_report(labels, preds)

        print(score)
        print(f' Validation loss: {val_loss:.4f}')

        return val_loss

    def _training_step(self, batch):
        """Forward and backward pass for one batch; returns the detached loss."""
        loss = self.model(batch, return_type='loss')
        loss.backward()

        return loss.detach()

    @torch.no_grad()
    def _prediction_step(self, batch):
        """Return ``(loss, predicted_class_ids)`` for one batch.

        The logits are computed once and reused for the loss, instead of
        running the model a second time.
        """
        logits = self.model(batch, return_type='logits')
        _, _, target_ids = batch
        loss = self.model.loss_fn(logits, target_ids)
        preds = torch.argmax(logits, dim=1)

        return loss.detach(), preds

# Interface

Train

In [None]:
from transformers import AutoTokenizer, AutoModel
from accelerate import Accelerator
import torch

# Raw strings keep the backslashes literal: '\c' is not a valid escape sequence
# and triggers a SyntaxWarning on recent Python versions. TweetDataset converts
# the backslashes to the separator of the current OS.
# NOTE(review): the Data Split cell writes its files under /content/, not data\ —
# point these at the files that actually exist in your environment.
train_path = r'data\congressional_tweet_train_data.csv'
val_path = r'data\congressional_tweet_val_data.csv'
pretrained_model = 'vinai/bertweet-base'
ckpt_path = None  # set to a saved state-dict file to resume from it

accelerator = Accelerator()
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
train_dataset = TweetDataset(train_path, tokenizer)
val_dataset = TweetDataset(val_path, tokenizer)
pretrained_encoder = AutoModel.from_pretrained(pretrained_model)
model = Model(pretrained_encoder)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Trainer validates once before the first epoch and after every epoch.
trainer = Trainer(model, accelerator, device, train_dataset, val_dataset, ckpt_path)
trainer.run_train(25)

Inference

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import pandas as pd
import torch.nn as nn

def produce_test_prediction(dataset, dataloader, model, data_path, out_path='pred_distilroberta_temp.csv'):
    """Predict a party for every row of ``data_path`` and write the result as CSV.

    TweetDataset keeps only the first row of each distinct ``full_text``, so the
    dataloader yields one prediction per *unique* text. The predictions are
    therefore mapped back onto all CSV rows by text, not by row position, and
    duplicate rows receive the prediction of their first occurrence.

    Args:
        dataset: Object exposing ``party2id`` (label -> class id).
        dataloader: Sized iterable of ``(input_ids, attention_mask, label_ids)``
            batches in dataset order (no shuffling, no dropped batches).
        model: Module called as ``model(batch, return_type='logits')``.
        data_path: CSV the dataset was built from; backslash separators are
            converted to the separator of the current OS.
        out_path: Destination CSV: the input rows plus a ``pred`` column.

    Raises:
        ValueError: if the number of predictions differs from the number of
            distinct ``full_text`` values in the CSV.
    """
    import os  # local import: this cell does not import os itself

    # Use the model's own device instead of relying on a notebook-level global.
    run_device = next(model.parameters()).device
    pbar = tqdm(dataloader, total=len(dataloader))

    preds = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for batch in pbar:
            batch = tuple(item.to(run_device) for item in batch)
            logits = model(batch, return_type='logits')
            pbar.set_description('(Validating)')
            preds.extend(torch.argmax(logits, dim=1).tolist())

    id2party = {v: k for k, v in dataset.party2id.items()}
    party_preds = [id2party[pred] for pred in preds]

    # Normalise the path the same way TweetDataset does when it reads the file.
    csv_path = os.path.join(*data_path.split('\\'))
    data_out = pd.read_csv(csv_path)

    unique_texts = data_out['full_text'].drop_duplicates()
    if len(unique_texts) != len(party_preds):
        raise ValueError(
            f'Got {len(party_preds)} predictions for {len(unique_texts)} unique '
            f'texts in {csv_path}; the dataloader must cover the de-duplicated '
            'file exactly once, in order.')

    text_to_party = dict(zip(unique_texts, party_preds))
    data_out['pred'] = data_out['full_text'].map(text_to_party)

    data_out.to_csv(out_path, index=False)

ckpt_path = 'distilroberta90.pt'
pretrained_model = 'distilroberta-base'
# Raw string keeps the backslash literal: '\c' is not a valid escape sequence
# and triggers a SyntaxWarning on recent Python versions.
data_path = r'data\congressional_tweet_training_data.csv'

tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
dataset = TweetDataset(data_path, tokenizer)
pretrained_encoder = AutoModel.from_pretrained(pretrained_model)
model = Model(pretrained_encoder)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# map_location lets a checkpoint that was saved on GPU load on a CPU-only host.
model.load_state_dict(torch.load(ckpt_path, map_location=device))
model.eval()

# Keep dataset order and every sample so the predictions can be matched back
# to the rows of the CSV.
dataloader = DataLoader(dataset,
                        batch_size=64,
                        collate_fn=dataset.collate_fn,
                        shuffle=False,
                        drop_last=False
                        )

produce_test_prediction(dataset, dataloader, model, data_path)
    

Citation  
Parts of the code were taken from the author's previous work.