# <header>**Baseline model for Automated Essay Scoring competition**</header>

*Version 0.1*

In [None]:
# !pip install torchview
# !pip install torchinfo

# Utils

In [1]:
import datetime
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn.functional as F
import wandb
from kaggle_secrets import UserSecretsClient
from sklearn.metrics import cohen_kappa_score
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
from torchview import draw_graph
from transformers import AutoModel, AutoTokenizer

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Read the Weights & Biases API key from the Kaggle secrets client (stored under
# the name "wandb_ha") so that it never appears in the notebook itself.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_ha")

wandb.login(key=wandb_api)


# Seed every random number generator the notebook draws from (Python `random`,
# NumPy, PyTorch CPU and CUDA) so that shuffling and weight init are repeatable.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

cpu


In [None]:
# Identifier of the pretrained checkpoint used as the text encoder. The tokenizer
# and the encoder are loaded from the same identifier so their vocabularies match.
model_path = 'allenai/longformer-base-4096'
tokenizer = AutoTokenizer.from_pretrained(model_path)
embedder = AutoModel.from_pretrained(model_path)

In [2]:
# Names of the per-essay linguistic feature columns that are kept from the CSV
# files and fed to the model alongside the text embedding.
ling_features = ['num_paragraphs', 'num_words', 'num_conjunctions',
                 'num_distinct_words', 'num_misspell',
                 'mean_word_len', 'num_sentences', 'mean_sent_len']

In [3]:
# data_dir = "/kaggle/input/deep-essay-scoring"
data_dir = "../../output/"

# Only the identifier, the essay text, the target and the linguistic features are kept.
kept_columns = ['essay_id', 'full_text', 'score'] + ling_features


def _read_essays(csv_name):
    """Read one essay CSV from `data_dir`, drop rows without text, keep `kept_columns`."""
    frame = pd.read_csv(os.path.join(data_dir, csv_name))
    return frame.dropna(subset=['full_text'])[kept_columns]


train_data = _read_essays('train_linguistic.csv')
extra_data = _read_essays('extra_linguistic.csv')

# Remove from the extra set every essay whose text already appears in the train set.
already_in_train = extra_data['full_text'].isin(train_data['full_text'])
extra_data = extra_data[~already_in_train]

print(train_data.shape, extra_data.shape)

train_data.sample(5)

(17307, 14) (13125, 14)


Unnamed: 0,essay_id,full_text,score,num_words,num_punctuations,num_nouns,num_verbs,num_adverbs,num_conjunctions,num_distinct_words,num_misspell,mean_word_len,num_sentences,mean_sent_len
12696,bb4c434,"People tend to use there cars so much, they ba...",3,616,36,113,81,43,26,211,20,3.905263,26,113.076923
4625,44e88b0,Imagine being a top scientist at NASA and Viki...,3,432,58,93,40,12,10,180,8,4.342391,22,94.954545
733,0ba78ec,The face of Mars could not be created by alien...,3,244,17,46,15,13,9,114,5,4.267857,12,101.583333
16885,f96c287,Many people belive that the face on Mars was c...,3,273,26,53,29,10,6,163,26,4.290984,13,104.461538
3334,317173f,Driverless Cars are coming soon or later? Peop...,4,649,38,114,83,29,39,168,8,4.028099,31,100.129032


Split the data into train, validation, and test sets

In [5]:
# Pool both sources, then shuffle the rows reproducibly and renumber them.
all_data = (
    pd.concat([train_data, extra_data], ignore_index=True)
    .sample(frac=1, random_state=random_seed)
    .reset_index(drop=True)
)

train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Row positions at which the shuffled frame is cut; the test set takes whatever
# remains after the train and validation slices.
n_rows = len(all_data)
train_end = int(train_ratio * n_rows)
val_end = int((train_ratio + val_ratio) * n_rows)

train_df = all_data.iloc[:train_end].reset_index(drop=True)
val_df = all_data.iloc[train_end:val_end].reset_index(drop=True)
test_df = all_data.iloc[val_end:].reset_index(drop=True)

print(train_df.shape, val_df.shape, test_df.shape)

(21302, 14) (4565, 14) (4565, 14)


Hyperparameters

In [None]:
# Everything describing this run in one dict; it is later passed to wandb as the
# run config. The three split entries share the same {'total', 'ratio'} layout.
hyperparameters = {
    'lr': 5e-5,
    'dropout': 0.25,
    'epochs': 5,
    'batch_size': 40,
    'ling_features_hidden_size': 128,
    'embedding_model': model_path,
    **{
        split_name: {'total': len(split_frame), 'ratio': split_ratio}
        for split_name, split_frame, split_ratio in (
            ('train_set', train_df, train_ratio),
            ('val_set', val_df, val_ratio),
            ('test_set', test_df, test_ratio),
        )
    },
    'linguistic_features': ling_features,
    # NOTE(review): key is spelled 'accelator' (sic); left unchanged so the logged
    # config stays comparable with earlier runs.
    'accelator': str(device),
}

In [None]:
# Tokenize the training essays, padding every sequence to the longest one.
# `truncation=True` without an explicit `max_length` caps sequences at the
# maximum input length the tokenizer's model accepts, so an unusually long essay
# cannot produce a sequence the encoder is unable to embed. Essays within that
# limit are tokenized exactly as before.
train_tokenized = tokenizer(train_df['full_text'].tolist(),
                            padding=True, truncation=True, return_tensors="np")

train_df['input_ids'] = train_tokenized['input_ids'].tolist()
train_df['attention_mask'] = train_tokenized['attention_mask'].tolist()

# The padded training length becomes the fixed length for the other splits.
hyperparameters['max_seq_len'] = train_tokenized['input_ids'].shape[1]
print(hyperparameters['max_seq_len'])

# Sanity check: a random row must have exactly `max_seq_len` token ids.
print(len(train_df.sample(1).iloc[0]['input_ids']))

In [None]:
def _tokenize_to_train_length(frame):
    """Tokenize `frame['full_text']` to exactly `hyperparameters['max_seq_len']` tokens.

    The token ids and attention mask are stored on `frame` as list-valued columns,
    the length of one randomly sampled row is printed as a sanity check, and the
    tokenizer output is returned.
    """
    encoded = tokenizer(frame['full_text'].tolist(),
                        max_length=hyperparameters['max_seq_len'], padding='max_length',
                        truncation=True, return_tensors="pt")
    frame['input_ids'] = encoded['input_ids'].tolist()
    frame['attention_mask'] = encoded['attention_mask'].tolist()
    print(len(frame.sample(1).iloc[0]['input_ids']))
    return encoded


val_tokenized = _tokenize_to_train_length(val_df)

test_tokenized = _tokenize_to_train_length(test_df)

In [None]:
class MultiFeaturesDataset(Dataset):
    """Dataset yielding token ids, attention mask, linguistic features and score.

    Args:
        df: frame with list-valued 'input_ids' and 'attention_mask' columns, a
            'score' column and one column per linguistic feature.
        feature_names: names of the linguistic feature columns to use, in order.
            Defaults to the notebook-level `ling_features` list, which keeps
            `MultiFeaturesDataset(df)` working exactly as before.
    """

    def __init__(self, df: pd.DataFrame, feature_names=None):
        if feature_names is None:
            # Backward-compatible default: the module-level feature list.
            feature_names = ling_features
        self.feature_names = list(feature_names)
        self.token_ids = df['input_ids'].values
        self.attention_mask = df['attention_mask'].values
        self.score = df['score'].values
        # One array per feature column, in the order given by `feature_names`.
        self.ling_features = [df[name].values for name in self.feature_names]

    def __len__(self):
        return len(self.score)

    def __getitem__(self, idx):
        """Return `(token_ids, attention_mask, features, score)` for row `idx`.

        `features` is a float tensor with one entry per feature column and
        `score` is a float tensor of shape (1,).
        """
        features = torch.tensor(
            [float(column[idx]) for column in self.ling_features],
            dtype=torch.float)

        score = torch.reshape(torch.tensor(
            self.score[idx], dtype=torch.float), (1,))

        return torch.tensor(self.token_ids[idx]), torch.tensor(self.attention_mask[idx]), features, score


# Wrap each split in a dataset; the row order of every frame is preserved.
train_dataset, val_dataset, test_dataset = (
    MultiFeaturesDataset(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

In [None]:
# Only the training batches are shuffled. The validation and test loaders keep
# the dataset order, so predictions collected from them line up row-for-row with
# `val_df` / `test_df` (the test predictions are later compared with, and written
# next to, `test_df['score']`).
train_dataloader = DataLoader(
    train_dataset, batch_size=hyperparameters['batch_size'], shuffle=True)
val_dataloader = DataLoader(
    val_dataset, batch_size=hyperparameters['batch_size'], shuffle=False)
test_dataloader = DataLoader(
    test_dataset, batch_size=hyperparameters['batch_size'], shuffle=False)

# Sanity check: print the tensor shapes of the first training batch.
for token_ids, attention_mask, features, score in train_dataloader:
    print(token_ids.shape, attention_mask.shape, features.shape, score.shape)
    break

In [None]:
class MultiFeaturesModel(torch.nn.Module):
    """Score regressor combining a frozen text encoder with linguistic features.

    The first-token hidden state of `embedder` is concatenated with a hidden
    representation of the linguistic features, and a single linear layer maps
    the result to one score per essay.

    Args:
        embedder: encoder module; must expose `config.hidden_size` and return an
            object with a `last_hidden_state` attribute. Its parameters are frozen.
        ling_features_input_size: number of linguistic features per essay.
        ling_features_hidden_size: width of the linguistic-feature hidden layer.
        dropout: dropout probability applied to both branches during training.
    """

    def __init__(self, embedder,
                 ling_features_input_size, ling_features_hidden_size=64,
                 dropout=0.2):
        super().__init__()
        # Freeze the encoder: only the two linear layers below are trained.
        for param in embedder.parameters():
            param.requires_grad = False
        self.embedder = embedder
        # Kept so that forward() applies the configured rate to the text
        # embedding; the functional default of 0.5 used to be applied instead.
        self.dropout_p = dropout
        self.ling_features = torch.nn.Sequential(
            torch.nn.Linear(ling_features_input_size,
                            ling_features_hidden_size),
            torch.nn.Dropout(dropout)
        )
        self.regressor = torch.nn.Linear(
            ling_features_hidden_size + embedder.config.hidden_size, 1)

    def forward(self, token_ids, attention_mask, ling_features):
        """Return one predicted score per row, as a tensor with a trailing dimension of 1."""
        # Hidden state of the first token serves as the essay embedding.
        embedded = self.embedder(
            token_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        embedded = F.dropout(embedded, p=self.dropout_p,
                             training=self.training)
        # `self.ling_features` already ends in Dropout(dropout); the additional
        # functional dropout that used to follow the ReLU has been removed so
        # this branch is regularised once, at the configured rate.
        ling_hidden = F.relu(self.ling_features(ling_features))
        features = torch.cat((embedded, ling_hidden), dim=1)
        return self.regressor(features)


# Build the model and place it on `device` explicitly: the training, evaluation
# and inference loops move every batch to `device`, so the parameters must live
# there too, without depending on any other cell having moved the model.
model = MultiFeaturesModel(embedder, len(ling_features),
                           hyperparameters['ling_features_hidden_size'],
                           hyperparameters['dropout']).to(device)

# One real training batch (without the target score) serves as example input
# for the summary; it is moved to the same device as the model.
inputs = [tensor.to(device) for tensor in next(iter(train_dataloader))[:-1]]
model_summary = summary(model, input_data=inputs)

model_summary

In [None]:
# Render the model's computation graph for the example batch in `inputs`;
# the cell's last expression displays the resulting graph object.
model_graph = draw_graph(model,
                         input_data=inputs,
                         expand_nested=True,
                         depth=2)
model_graph.visual_graph

In [None]:
def train(model, optimizer, criterion, train_dataloader, logging_steps=20):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module called as `model(token_ids, attention_mask, features)`.
        optimizer: optimizer over the model's parameters.
        criterion: loss called as `criterion(output, score)`.
        train_dataloader: iterable of `(token_ids, attention_mask, features,
            score)` batches that also supports `len()`.
        logging_steps: the running mean loss and current learning rate are
            logged to wandb every this many batches, and after the last batch.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0.0
    num_batches = len(train_dataloader)

    for step, (token_ids, attention_mask, features, score) in enumerate(train_dataloader, start=1):
        # Clear gradients BEFORE the forward/backward pass so that anything left
        # over from an earlier, interrupted run in the same kernel cannot leak
        # into this update.
        optimizer.zero_grad()

        output = model(token_ids.to(device), attention_mask.to(
            device), features.to(device))
        loss = criterion(output, score.to(device)).float()

        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if step % logging_steps == 0 or step == num_batches:
            wandb.log({'train_loss_steps': running_loss / step,
                       'learning_rate': optimizer.param_groups[0]['lr']})

    return running_loss / num_batches


def evaluate(model, criterion, dataloader):
    """Evaluate `model` on `dataloader` without tracking gradients.

    Returns a tuple `(mean_loss, scores, predictions)`: the sum of the batch
    losses divided by the number of batches, and two NumPy arrays holding one
    row per sample with the true scores and the model outputs, in the order the
    loader produced them.
    """
    model.eval()
    total_loss = 0.0
    score_batches, prediction_batches = [], []

    with torch.no_grad():
        for token_ids, attention_mask, features, score in dataloader:
            output = model(
                token_ids.to(device),
                attention_mask.to(device),
                features.to(device),
            )
            total_loss += criterion(output, score.to(device)).item()
            score_batches.append(score.cpu().numpy())
            prediction_batches.append(output.cpu().numpy())

    mean_loss = total_loss / len(dataloader)
    # Flatten the per-batch arrays into one row per sample.
    all_scores = np.array([row for batch in score_batches for row in batch])
    predictions = np.array(
        [row for batch in prediction_batches for row in batch])
    return mean_loss, all_scores, predictions

In [None]:
class EarlyStopper:
    """Signal when the validation loss has stopped improving.

    `patience` is the number of non-improving checks tolerated before stopping,
    and `min_delta` is how far above the best loss seen so far a value must be
    to count as non-improving.
    """

    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        """Record `validation_loss` and return True once patience is exhausted."""
        # A new best loss resets the count of bad checks.
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss, self.counter = validation_loss, 0
            return False

        # Values within `min_delta` of the best leave the counter untouched.
        tolerance_limit = self.min_validation_loss + self.min_delta
        if not (validation_loss > tolerance_limit):
            return False

        self.counter += 1
        return self.counter >= self.patience

In [None]:
# Mean-squared-error objective: the model outputs one real-valued score per essay.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=hyperparameters['lr'])
# The scheduler is stepped with the validation loss once per epoch ('min' mode)
# and multiplies the learning rate by `factor` when that loss stops improving.
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=1, factor=0.1)
early_stopper = EarlyStopper(patience=3, min_delta=1e-3)

# Per-epoch history, appended to by the training loop.
train_losses, val_losses, val_kappa_scores = [], [], []

# Add the early-stopping settings and the model's sub-modules to the run config.
# NOTE(review): `early_stopper.__dict__` is the live attribute dict, not a copy,
# so later changes to the stopper's state are visible through this entry.
hyperparameters['early_stopper'] = early_stopper.__dict__
hyperparameters['model'] = dict(model.__dict__['_modules'])

hyperparameters

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before training.
torch.cuda.empty_cache()

In [22]:
def logit_to_score(logit, min_score=1, max_score=6):
    """Convert raw model output(s) into score values.

    The input (a scalar or an array) is rounded with `np.round` and the result
    is limited to the closed range [`min_score`, `max_score`].
    """
    rounded = np.round(logit)
    return np.clip(rounded, a_min=min_score, a_max=max_score)

# Quick check: draw a random value between 0 and 10 and display it next to the
# score it is converted to.
random_logit = random.uniform(0, 10)
random_logit, logit_to_score(random_logit)

(1.4920849034938521, 1.0)

In [None]:
# Start a wandb run whose config is the full hyperparameter dict.
wandb.init(project='deep-essay-scoring', config=hyperparameters)

# One iteration per epoch: train, validate, record and log the metrics, then let
# the scheduler and the early stopper react to the validation loss.
for epoch in range(hyperparameters['epochs']):
    train_loss = train(model, optimizer, criterion, train_dataloader)
    val_loss, val_scores, val_predictions = evaluate(
        model, criterion, val_dataloader)

    # Quadratic-weighted kappa between the rounded/clipped predictions and the
    # true scores; `labels` fixes the full 1-6 scale even when a score is absent.
    val_kappa = cohen_kappa_score(
        logit_to_score(val_predictions), val_scores, 
        labels=[1, 2, 3, 4, 5, 6],
        weights='quadratic')

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_kappa_scores.append(val_kappa)

    # The scheduler monitors the validation loss.
    scheduler.step(val_loss)

    wandb.log({'train_loss': train_loss, 'val_loss': val_loss,
              'val_kappa': val_kappa, 'epoch': epoch+1,
               'learning_rate': optimizer.param_groups[0]['lr']})

    print(
        f'Epoch: {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Kappa: {val_kappa}')
    # Leave the loop early once the stopper reports that patience has run out.
    if early_stopper.early_stop(val_losses[-1]):
        break

In [None]:
# Get current date and time
now = datetime.datetime.now()
now_str = now.strftime("%Y-%m-%d_%H-%M")

embedder_name = model_path.split('/')[-1]
model_name = f'multi_features_{embedder_name}_model_{now_str}'

# Save the model
with open(f'{model_name}_summary.text', 'w') as f:
    f.write(str(model_summary))

torch.save(model.state_dict(), f'{model_name}.pth')

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before inference.
torch.cuda.empty_cache()

In [None]:
def inference(model, dataloader):
    """Return the model outputs for every sample in `dataloader`.

    The model is put in eval mode and run without gradient tracking. The score
    in each batch is ignored. The result is a NumPy array with one row per
    sample, in the order the loader produced them.
    """
    model.eval()
    batch_outputs = []

    with torch.no_grad():
        for token_ids, attention_mask, features, _score in dataloader:
            batch_output = model(
                token_ids.to(device),
                attention_mask.to(device),
                features.to(device),
            )
            batch_outputs.append(batch_output.cpu().numpy())

    # Flatten the per-batch arrays into one row per sample.
    return np.array([row for batch in batch_outputs for row in batch])


# Predict over the test set in dataset order. The predictions are compared with,
# and stored next to, `test_df['score']`, so they must line up row-for-row with
# `test_df`; a dedicated non-shuffling loader guarantees that regardless of how
# `test_dataloader` was configured.
ordered_test_dataloader = DataLoader(
    test_dataset, batch_size=hyperparameters['batch_size'], shuffle=False)

# Flatten to one value per essay so the result fits a single DataFrame column.
test_predictions = inference(model, ordered_test_dataloader).reshape(-1)

# Same metric definition as for validation: quadratic-weighted kappa over the
# full 1-6 scale, so the weights do not depend on which scores happen to occur.
test_kappa = cohen_kappa_score(
    logit_to_score(test_predictions), test_df['score'].values,
    labels=[1, 2, 3, 4, 5, 6],
    weights='quadratic')
print(f'Test Kappa: {test_kappa}')

test_df['prediction'] = test_predictions

test_df[['essay_id', 'score', 'prediction']].to_csv('test_predictions.csv')

test_df[['essay_id', 'score', 'prediction']]