# <header>**Baseline model for Automated Essay Scoring competition**</header>

*Version 0.1.1*

In [None]:
!pip install torchview
!pip install torchinfo

# Utils

In [None]:
import random
from kaggle_secrets import UserSecretsClient
import wandb
import os
import datetime
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
from torchview import draw_graph
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import LongformerModel, LongformerTokenizer

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Read the Weights & Biases API key from the Kaggle secret store and log in,
# so the key itself never appears in the notebook.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_ha")

wandb.login(key=wandb_api)

# Seed every random number generator the notebook relies on with one value.
random_seed = 42
for seed_rng in (random.seed, np.random.seed,
                 torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(random_seed)

In [None]:
# Hugging Face checkpoint shared by the tokenizer and the encoder.
model_path = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_path)
# attention_window=128 is passed as an override while loading the pretrained encoder
# (presumably the local attention window size; confirm against the Longformer docs).
embedder = LongformerModel.from_pretrained(model_path, attention_window=128)

In [None]:
# Names of the linguistic feature columns. They are selected from both CSVs
# and fed to the model alongside the tokenized essay text.
ling_features = ['num_paragraphs', 'num_words', 'num_conjunctions',
                 'num_distinct_words', 'num_misspell',
                 'mean_word_len', 'num_sentences', 'mean_sent_len']

In [None]:
data_dir = "/kaggle/input"
# data_dir = "../../output/"

# Columns kept from each CSV: id, raw text, target score and the linguistic features.
kept_columns = ['essay_id', 'full_text', 'score'] + ling_features

train_data = pd.read_csv(os.path.join(data_dir, 'train_linguistic.csv'))
extra_data = pd.read_csv(os.path.join(data_dir, 'extra_linguistic.csv'))

# Keep only rows that actually have essay text, restricted to the needed columns.
extra_data = extra_data.loc[~extra_data['full_text'].isna(), kept_columns]
train_data = train_data.loc[~train_data['full_text'].isna(), kept_columns]

# Remove extra essays whose text already appears in the training data.
already_in_train = extra_data['full_text'].isin(train_data['full_text'])
extra_data = extra_data[~already_in_train]

print(train_data.shape, extra_data.shape)

train_data.sample(5)

Split the data into train, validation, and test sets

In [None]:
# Combine both sources, shuffle reproducibly, and renumber the rows from 0.
all_data = (
    pd.concat([train_data, extra_data], ignore_index=True)
    .sample(frac=1, random_state=random_seed)
    .reset_index(drop=True)
)

train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Stage 1: set aside the combined validation + test share; the rest is training data.
train_df, holdout_df = train_test_split(all_data,
                                        test_size=val_ratio + test_ratio,
                                        random_state=random_seed)
# Stage 2: divide the held-out rows between validation and test.
val_df, test_df = train_test_split(holdout_df,
                                   test_size=test_ratio/(val_ratio + test_ratio),
                                   random_state=random_seed)

print(train_df.shape, val_df.shape, test_df.shape)

Hyperparameters

In [None]:
# Run configuration. The whole dict is later handed to W&B as the run config,
# so everything stored here ends up in the experiment log.
hyperparameters = {
    'lr': 1e-4,
    'dropout': 0.3,
    'epochs': 10,
    'batch_size': 64,
    'train_set': {
        'total': len(train_df),
        'ratio': train_ratio,
    },
    'val_set': {
        'total': len(val_df),
        'ratio': val_ratio,
    },
    'test_set': {
        'total': len(test_df),
        # Logged for parity with the train/val entries.
        'ratio': test_ratio,
    },
    'linguistic_features': ling_features,
    # NOTE(review): 'accelator' is a misspelling of 'accelerator'. The key is left
    # unchanged because it is part of the logged config; rename it deliberately.
    'accelator': str(device)
}

In [None]:
# Tokenize the training essays, padding every sequence to the longest one.
# truncation=True caps sequences at the tokenizer's maximum model length, so an
# unusually long essay cannot produce inputs longer than the encoder supports.
# Essays within that limit are tokenized exactly as before.
train_tokenized = tokenizer(train_df['full_text'].tolist(),
                            padding=True,
                            truncation=True,
                            return_tensors="np")

# The padded training length becomes the fixed length used for the val/test sets.
hyperparameters['max_seq_len'] = train_tokenized['input_ids'].shape[1]

train_df['input_ids'] = train_tokenized['input_ids'].tolist()
train_df['attention_mask'] = train_tokenized['attention_mask'].tolist()


# Sanity check: token-list length of one randomly sampled row.
print(len(train_df.sample(1).iloc[0]['input_ids']))

In [None]:
# Encode the validation essays to exactly the sequence length fixed by the
# training set: shorter texts are padded, longer ones truncated.
val_texts = val_df['full_text'].tolist()
val_tokenized = tokenizer(
    val_texts,
    padding='max_length',
    truncation=True,
    max_length=hyperparameters['max_seq_len'],
    return_tensors="np",
)

# Store the encodings as per-row Python lists next to the text.
for column in ('input_ids', 'attention_mask'):
    val_df[column] = val_tokenized[column].tolist()


# Sanity check: token-list length of one randomly sampled row.
print(len(val_df.sample(1).iloc[0]['input_ids']))

In [None]:
# Encode the test essays with the same fixed length as training/validation.
tokenizer_options = {
    'max_length': hyperparameters['max_seq_len'],
    'padding': 'max_length',
    'truncation': True,
    'return_tensors': "np",
}
test_tokenized = tokenizer(test_df['full_text'].tolist(), **tokenizer_options)

# Keep token ids and attention masks as list-valued DataFrame columns.
test_df['input_ids'] = test_tokenized['input_ids'].tolist()
test_df['attention_mask'] = test_tokenized['attention_mask'].tolist()

# Sanity check: token-list length of one randomly sampled row.
print(len(test_df.sample(1).iloc[0]['input_ids']))

In [None]:
class MultiFeaturesDataset(Dataset):
    """Dataset pairing tokenized essays with linguistic features and scores.

    Every item is a 4-tuple ``(token_ids, attention_mask, features, score)``.
    """

    def __init__(self, df: pd.DataFrame):
        """Cache the columns of ``df`` that items are built from.

        Reads ``input_ids``, ``attention_mask``, ``score`` and every column
        named in the module-level ``ling_features`` list.
        """
        self.token_ids = df['input_ids'].values
        self.attention_mask = df['attention_mask'].values
        self.score = df['score'].values
        # One array per linguistic feature, in ``ling_features`` order.
        self.ling_features = [df[name].values for name in ling_features]

    def __len__(self):
        """Return the number of examples (one per stored score)."""
        return len(self.score)

    def __getitem__(self, idx):
        """Build the tensors for example ``idx``.

        Returns:
            Token ids, attention mask, a float tensor holding one value per
            linguistic feature, and the score as a float tensor of shape (1,).
        """
        row_features = torch.tensor(
            [column[idx] for column in self.ling_features], dtype=torch.float)

        target = torch.tensor(self.score[idx], dtype=torch.float).reshape((1,))

        ids = torch.tensor(self.token_ids[idx])
        mask = torch.tensor(self.attention_mask[idx])
        return ids, mask, row_features, target


# Wrap the tokenized train and validation frames as PyTorch datasets.
train_dataset = MultiFeaturesDataset(train_df)
val_dataset = MultiFeaturesDataset(val_df)

In [None]:
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    train_dataset, batch_size=hyperparameters['batch_size'], shuffle=True)
# Validation keeps a fixed order. `evaluate` averages per-batch losses, so
# reshuffling would change which rows fall into the final partial batch and
# make the validation loss vary between passes over identical weights.
val_dataloader = DataLoader(
    val_dataset, batch_size=hyperparameters['batch_size'], shuffle=False)

# Smoke test: print the tensor shapes of a single training batch.
for token_ids, attention_mask, features, score in train_dataloader:
    print(token_ids.shape, attention_mask.shape, features.shape, score.shape)
    break

In [None]:
class MultiFeaturesModel(torch.nn.Module):
    """Regression head over a text encoder combined with linguistic features.

    The first-token hidden state produced by ``embedder`` is concatenated with
    a linear projection of the linguistic features, passed through two fully
    connected layers, and mapped to a single regression output per example.
    """

    def __init__(self, embedder,
                 lf_input_size, lf_hidden_size=64,
                 dropout=0.2):
        """Freeze the encoder and build the trainable layers.

        Args:
            embedder: encoder module; must expose ``parameters()``, ``pooler``,
                ``config.hidden_size`` and return an object with
                ``last_hidden_state`` when called.
            lf_input_size: number of linguistic features per example.
            lf_hidden_size: output width of the linguistic-feature projection.
            dropout: dropout probability shared by every dropout site.
        """
        super(MultiFeaturesModel, self).__init__()
        # freeze
        for param in embedder.parameters():
            param.requires_grad = False
        # unfreeze the pooler
        # NOTE(review): forward() reads last_hidden_state rather than a pooled
        # output, so these parameters probably never receive a gradient.
        # Confirm whether the pooler is meant to be used.
        for param in embedder.pooler.parameters():
            param.requires_grad = True

        self.embedder = embedder
        self.lf = torch.nn.Linear(lf_input_size, lf_hidden_size)
        self.fc1 = torch.nn.Linear(lf_hidden_size + embedder.config.hidden_size, 512)
        self.fc2 = torch.nn.Linear(512, 256)
        self.regressor = torch.nn.Linear(256, 1)
        self.dropout = torch.nn.Dropout(dropout)

    def config(self):
        """Return a nested dict describing the encoder config and layer sizes."""
        return {
            'embedder': self.embedder.config,
            'lf': {
                'input_size': self.lf.in_features,
                'hidden_size': self.lf.out_features
            },
            'fc1': {
                'input_size': self.fc1.in_features,
                'output_size': self.fc1.out_features
            },
            'fc2': {
                'input_size': self.fc2.in_features,
                'output_size': self.fc2.out_features
            },
            'regressor': {
                'input_size': self.regressor.in_features,
                'output_size': self.regressor.out_features
            }
        }

    def forward(self, token_ids, attention_mask, ling_features):
        """Predict one raw score per example.

        Args:
            token_ids: token id tensor passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            ling_features: float tensor with one row of linguistic features
                per example.

        Returns:
            Output of the final linear layer (one value per example).
        """
        # nn.Dropout is already the identity in eval mode, so the layer is
        # applied unconditionally instead of behind `if self.training` checks.
        # First-token hidden state of every sequence.
        embedded = self.embedder(token_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        embedded = self.dropout(embedded)

        ling_features = self.dropout(F.leaky_relu(self.lf(ling_features)))

        # Text representation first, projected linguistic features second.
        features = torch.cat((embedded, ling_features), dim=1)

        fc1 = self.dropout(F.leaky_relu(self.fc1(features)))
        fc2 = self.dropout(F.leaky_relu(self.fc2(fc1)))

        score = self.regressor(fc2)
        return score


# Build the model: one input per linguistic feature, a 64-unit feature
# projection, and the dropout rate taken from the hyperparameters.
model = MultiFeaturesModel(embedder, len(ling_features),
                           64,
                           hyperparameters['dropout'])

# One training batch without its last element (the score) serves as example input.
inputs = next(iter(train_dataloader))[:-1]
model_summary = summary(model, input_data=inputs)

# Display the summary; it is also written to disk when the model is saved.
model_summary

In [None]:
# Render the model's computation graph with torchview, using the same
# example batch that produced the summary.
model_graph = draw_graph(model,
                         input_data=inputs,
                         expand_nested=True,
                         depth=2)
model_graph.visual_graph

In [None]:
def train(model, optimizer, criterion, train_dataloader, logging_steps=20):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module called as ``model(token_ids, attention_mask, features)``.
        optimizer: optimizer stepped once per batch.
        criterion: callable ``criterion(output, target)`` returning the loss.
        train_dataloader: yields ``(token_ids, attention_mask, features, score)``.
        logging_steps: the running mean loss is logged to W&B every this many
            batches, and once more on the final batch.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    loss_sum = 0.0

    for step, batch in enumerate(train_dataloader, start=1):
        token_ids, attention_mask, features, score = batch

        prediction = model(token_ids.to(device),
                           attention_mask.to(device),
                           features.to(device))
        batch_loss = criterion(prediction, score.to(device))

        batch_loss.backward()
        optimizer.step()
        # Clear gradients right after the update so the next batch starts clean.
        optimizer.zero_grad()

        loss_sum += batch_loss.item()

        if step % logging_steps == 0 or step == len(train_dataloader):
            wandb.log({'train_loss_steps': loss_sum / step})

    return loss_sum / len(train_dataloader)


def evaluate(model, criterion, dataloader):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: module called as ``model(token_ids, attention_mask, features)``.
        criterion: callable ``criterion(output, target)`` returning the loss.
        dataloader: yields ``(token_ids, attention_mask, features, score)``.

    Returns:
        A tuple ``(mean_loss, scores, predictions)``. ``mean_loss`` is the sum
        of the batch losses divided by the number of batches; ``scores`` and
        ``predictions`` are CPU tensors holding every target and model output
        in iteration order.
    """
    model.eval()
    running_loss = 0.0
    score_batches = []
    prediction_batches = []

    with torch.no_grad():
        for token_ids, attention_mask, features, score in dataloader:
            output = model(token_ids.to(device),
                           attention_mask.to(device),
                           features.to(device))

            loss = criterion(output, score.to(device))

            running_loss += loss.item()
            # Keep whole batches as CPU tensors and concatenate once at the end.
            # Building a tensor from a list of per-row NumPy arrays is slow.
            score_batches.append(score.cpu())
            prediction_batches.append(output.cpu())

    mean_loss = running_loss / len(dataloader)
    return mean_loss, torch.cat(score_batches), torch.cat(prediction_batches)

In [None]:
class EarlyStopper:
    """Signal when the validation loss has stopped improving.

    The lowest loss seen so far is tracked. A loss above that minimum by more
    than ``min_delta`` counts as a bad epoch; ``patience`` bad epochs since the
    last improvement trigger the stop signal.
    """

    def __init__(self, patience=1, min_delta=0):
        """Store the settings and start with no recorded loss."""
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        """Record ``validation_loss`` and return True when training should stop."""
        # A new best loss resets the bad-epoch counter.
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        # Losses within the tolerance band leave the counter untouched.
        tolerated_up_to = self.min_validation_loss + self.min_delta
        if not (validation_loss > tolerated_up_to):
            return False

        self.counter += 1
        return self.counter >= self.patience

In [None]:
def logit_to_score(logit, min_score=1, max_score=6):
    """Turn raw model outputs into integer scores.

    Values are rounded, limited to ``[min_score, max_score]`` and returned as
    a tensor of dtype long.
    """
    return logit.round().clamp(min_score, max_score).long()


def hybrid_loss(y_pred, y_true):
    """Training criterion; currently just the mean squared error."""
    return F.mse_loss(y_pred, y_true)

In [None]:
# Loss, optimizer, learning-rate schedule and early-stopping helper.
criterion = hybrid_loss
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=hyperparameters['lr'],
    weight_decay=1e-6,
)
scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
early_stopper = EarlyStopper(min_delta=1e-4, patience=2)

# Per-epoch metric history.
train_losses = []
val_losses = []
val_kappa_scores = []

# Record the training setup alongside the other hyperparameters.
hyperparameters.update({
    'early_stopper': early_stopper.patience,
    'scheduler': vars(scheduler),
    'model': model.config(),
})

hyperparameters

In [None]:
# Release cached GPU memory, then place the model on the training device.
torch.cuda.empty_cache()
model.to(device)

wandb.init(project='deep-essay-scoring', config=hyperparameters)

num_epochs = hyperparameters['epochs']
for epoch in range(num_epochs):
    # One pass over the training data.
    train_loss = train(model, optimizer, criterion, train_dataloader)
    train_losses.append(train_loss)

    # Validation loss plus raw targets and predictions.
    val_loss, val_scores, val_predictions = evaluate(
        model, criterion, val_dataloader)

    # Quadratic weighted kappa between the true scores and the rounded,
    # clamped predictions.
    true_labels = val_scores.cpu().numpy()
    predicted_labels = logit_to_score(val_predictions).cpu().numpy()
    val_kappa = cohen_kappa_score(true_labels, predicted_labels,
                                  weights='quadratic')

    val_losses.append(val_loss)
    val_kappa_scores.append(val_kappa)

    # The scheduler lowers the learning rate when validation loss plateaus.
    scheduler.step(val_loss)

    epoch_metrics = {'train_loss': train_loss, 'val_loss': val_loss,
                     'val_kappa': val_kappa, 'epoch': epoch+1,
                     'learning_rate': optimizer.param_groups[0]['lr']}
    wandb.log(epoch_metrics)

    print(f'Epoch: {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Kappa: {val_kappa}')

#     if early_stopper.early_stop(val_losses[-1]):
#         print("Early stopping")
#         break

In [None]:
# Get current date and time
# Date stamp for the artifact names, e.g. 2024-05-17.
# The previous format string "%YY-%mm-%dd" appended literal "Y", "m" and "d"
# after each field, giving stamps such as "2024Y-05m-17d".
now = datetime.datetime.now()
now_str = now.strftime("%Y-%m-%d")

# Name the artifacts after the encoder checkpoint and the date.
embedder_name = model_path.split('/')[-1]
model_name = f'multi_features_{embedder_name}_{now_str}_model'

# Save the textual model summary next to the weights.
with open(f'{model_name}_summary.text', 'w') as f:
    f.write(str(model_summary))

# Save the model weights (state dict only).
torch.save(model.state_dict(), f'{model_name}.pth')

In [None]:
# Release cached GPU memory held by PyTorch before running test inference.
torch.cuda.empty_cache()

In [None]:
# Score the held-out test split in batches, through the same Dataset/DataLoader
# path used for training and validation, instead of one forward pass per row.
# shuffle=False keeps the predictions aligned with the row order of test_df.
test_dataset = MultiFeaturesDataset(test_df)
test_dataloader = DataLoader(
    test_dataset, batch_size=hyperparameters['batch_size'], shuffle=False)

model.eval()
test_predictions = []

with torch.no_grad():
    for token_ids, attention_mask, row_ling_features, _score in test_dataloader:
        output = model(token_ids.to(device),
                       attention_mask.to(device),
                       row_ling_features.to(device))
        # Flatten the (batch, 1) regression output into plain Python floats.
        test_predictions.extend(output.flatten().cpu().tolist())

# Convert the raw outputs to integer scores and write them out with the essay ids.
submit_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'prediction': logit_to_score(torch.tensor(test_predictions)).cpu().numpy()
})
print(submit_df.shape)
submit_df.to_csv('submission.csv', index=False)