# <header>**Baseline model for Automated Essay Scoring competition**</header>

*Version 0.1.1*

In [1]:
# !pip install torchview
%pip install torchinfo

Note: you may need to restart the kernel to use updated packages.


# Utils

In [2]:
import random
from kaggle_secrets import UserSecretsClient # type: ignore
import wandb # type: ignore
import os
import datetime
import json
import pandas as pd
import numpy as np
import re

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
# from torchview import draw_graph
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import LongformerModel, LongformerTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_ha")

wandb.login(key=wandb_api)

random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

cuda


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
model_path = '/kaggle/input/essay-scoring-models/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_path)
embedder = LongformerModel.from_pretrained(model_path, attention_window=128)

  return self.fget.__get__(instance, owner)()


In [4]:
data_dir = "/kaggle/input/aes-linguistic"

ling_features = []
with open(os.path.join(data_dir, 'features.txt'), 'r') as f:
    ling_features = f.read().splitlines()
    
ling_features = sorted(list(set(ling_features)))

# data_dir = "../../output/"
train_data = pd.read_csv(os.path.join(data_dir, 'train_linguistic.csv'))
extra_data = pd.read_csv(os.path.join(data_dir, 'extra_linguistic.csv'))

extra_data = extra_data[~extra_data['full_text'].isna()]
train_data = train_data[~train_data['full_text'].isna()]

train_data = train_data[['essay_id', 'full_text', 'score'] + ling_features]
extra_data = extra_data[['essay_id', 'full_text', 'score'] + ling_features]

train_data = train_data.drop_duplicates()
extra_data = extra_data.drop_duplicates()

train_data = train_data.reset_index(drop=True)
extra_data = extra_data.reset_index(drop=True)

print(train_data.shape, extra_data.shape)

train_data.sample(5)

(17307, 119) (13125, 119)


Unnamed: 0,essay_id,full_text,score,25th_percentile_num_adverbs_in_sentence,75th_percentile_num_nouns_in_paragraph,25th_percentile_mean_word_lens_in_sentence,num_proper_nouns_in_essay,max_num_verbs_in_sentence,mean_num_verbs_in_paragraph,25th_percentile_num_sentences_in_paragraph,...,max_num_pronouns_in_paragraph,min_mean_word_lens_in_sentence,min_num_adverbs_in_sentence,mean_num_sentences_in_paragraph,75th_percentile_num_pronouns_in_sentence,mean_num_misspelled_words_in_sentence,min_num_adjectives_in_sentence,mean_num_adverbs_in_sentence,75th_percentile_num_words_in_paragraph,75th_percentile_num_proper_nouns_in_sentence
12696,bb4c434,"people tend to use there cars so much, they ba...",3,0.0,25.0,3.419643,4,12,14.666667,5.0,...,16,2.0,0,5.0,3.0,20.2,0,1.433333,135.0,0.0
4625,44e88b0,imagine being a top scientist at nasa and viki...,3,0.0,33.0,3.752422,4,4,8.863636,5.0,...,7,2.85,0,5.090909,1.0,19.409091,0,0.590909,112.0,0.0
733,0ba78ec,the face of mars could not be created by alien...,3,0.75,23.0,3.920867,5,2,5.25,3.0,...,11,3.545455,0,4.5,2.0,20.083333,0,1.0,124.0,0.25
16885,f96c287,many people belive that the face on mars was c...,3,0.0,16.0,3.719048,13,5,5.785714,2.25,...,6,2.333333,0,3.0,1.0,19.214286,0,0.642857,65.0,1.75
3334,317173f,driverless cars are coming soon or later? peop...,4,0.0,24.0,3.631579,0,11,15.515152,6.0,...,27,3.0,0,6.515152,3.0,19.484848,0,0.878788,136.0,0.0


Split train, val, test set

In [5]:
all_data = pd.concat([train_data, extra_data], ignore_index=True)

# drop columns with nan values
nan_columns = all_data.columns[all_data.isna().any()].tolist()
all_data = all_data.drop(columns=nan_columns)
ling_features = list(set(ling_features).difference(nan_columns))
print(len(ling_features))

# shuffle the data
all_data = all_data.sample(frac=1, random_state=random_seed)
all_data = all_data.reset_index(drop=True)

train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

train_df, val_df = train_test_split(all_data, test_size=val_ratio + test_ratio, 
                                    random_state=random_seed)
val_df, test_df = train_test_split(val_df, test_size=test_ratio/(val_ratio + test_ratio), 
                                   random_state=random_seed)

print(train_df.shape, val_df.shape, test_df.shape)

116
(24345, 119) (3043, 119) (3044, 119)


Hyperparameters

In [6]:
hyperparameters = {
    'lr': 1e-4,
    'dropout': 0.3,
    'epochs': 15,
    'batch_size': 64,
    'train_set': {
        'total': len(train_df),
        'ratio': train_ratio,
    },
    'val_set': {
        'total': len(val_df),
        'ratio': val_ratio,
    },
    'test_set': {
        'total': len(test_df),
        'ratio': test_ratio,
    },
    'linguistic_features': ling_features,
    'accelator': str(device)
}

In [7]:
train_tokenized = tokenizer(train_df['full_text'].tolist(),
                            padding=True, 
                            return_tensors="np")

hyperparameters['max_seq_len'] = train_tokenized['input_ids'].shape[1]

train_df['input_ids'] = train_tokenized['input_ids'].tolist()
train_df['attention_mask'] = train_tokenized['attention_mask'].tolist()


print(len(train_df.sample(1).iloc[0]['input_ids']))

1929


In [8]:
val_tokenized = tokenizer(val_df['full_text'].tolist(),
                          max_length=hyperparameters['max_seq_len'],
                          padding='max_length', truncation=True, 
                          return_tensors="np")

val_df['input_ids'] = val_tokenized['input_ids'].tolist()
val_df['attention_mask'] = val_tokenized['attention_mask'].tolist()


print(len(val_df.sample(1).iloc[0]['input_ids']))

1929


In [9]:
test_tokenized = tokenizer(test_df['full_text'].tolist(),
                           max_length=hyperparameters['max_seq_len'], 
                           padding='max_length', truncation=True, 
                           return_tensors="np")

test_df['input_ids'] = test_tokenized['input_ids'].tolist()
test_df['attention_mask'] = test_tokenized['attention_mask'].tolist()

print(len(test_df.sample(1).iloc[0]['input_ids']))

1929


In [10]:
class MultiFeaturesDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.token_ids = df['input_ids'].values
        self.attention_mask = df['attention_mask'].values
        self.score = df['score'].values
        self.ling_features = []
        for feature in ling_features:
            self.ling_features.append(df[feature].values)

    def __len__(self):
        return len(self.score)

    def __getitem__(self, idx):
        features = []
        for feature in self.ling_features:
            features.append(feature[idx])

        features = torch.tensor(features, dtype=torch.float)

        score = torch.reshape(torch.tensor(
            self.score[idx], dtype=torch.float), (1,))

        return torch.tensor(self.token_ids[idx]), torch.tensor(self.attention_mask[idx]), features, score


train_dataset = MultiFeaturesDataset(train_df)
val_dataset = MultiFeaturesDataset(val_df)

In [11]:
train_dataloader = DataLoader(train_dataset, 
                              batch_size=hyperparameters['batch_size'], 
                              shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, 
                            batch_size=hyperparameters['batch_size'], 
                            shuffle=True, num_workers=4)

for token_ids, attention_mask, features, score in val_dataloader:
    print(token_ids.shape, attention_mask.shape, features.shape, score.shape)
    break

torch.Size([64, 1929]) torch.Size([64, 1929]) torch.Size([64, 116]) torch.Size([64, 1])


In [12]:
class MultiFeaturesModel(torch.nn.Module):
    def __init__(self, embedder,
                 lf_input_size, lf_hidden_size=64,
                 dropout=0.2):
        super(MultiFeaturesModel, self).__init__()
        # freeze
        for param in embedder.parameters():
            param.requires_grad = False
        # unfreeze the pooler
        for param in embedder.pooler.parameters():
            param.requires_grad = True
            
        self.embedder = embedder
        self.lf = torch.nn.Linear(lf_input_size,lf_hidden_size)
        # self.fc1 = torch.nn.Linear(lf_hidden_size + embedder.config.hidden_size, 256)
        # self.fc2 = torch.nn.Linear(256, 128)
        self.regressor = torch.nn.Linear(lf_hidden_size + embedder.config.hidden_size, 1)
        self.dropout = torch.nn.Dropout(dropout)
    
    def config(self):
        return {
            'embedder': self.embedder.config,
            'lf': {
                'input_size': self.lf.in_features,
                'hidden_size': self.lf.out_features
            },
            'regressor': {
                'input_size': self.regressor.in_features,
                'output_size': self.regressor.out_features
            }
        }

    def forward(self, token_ids, attention_mask, ling_features):
        embedded = self.embedder(token_ids, attention_mask=attention_mask, output_hidden_states=True)[1]
        if self.training:
            embedded = self.dropout(embedded)
            
        ling_features = self.lf(ling_features)
        ling_features = F.leaky_relu(ling_features)
        if self.training:
            ling_features = self.dropout(ling_features)
            
        features = torch.cat((embedded, ling_features), dim=1)

        score = self.regressor(features)
        return score


model = MultiFeaturesModel(embedder, 
                           len(ling_features), 256,
                           hyperparameters['dropout'])
# model.to(device)

inputs = next(iter(train_dataloader))[:-1]

# inputs.to(device)
model_summary = summary(model, input_data=inputs)

print(model_summary)

Input ids are automatically padded from 1929 to 2048 to be a multiple of `config.attention_window`: 128


Layer (type:depth-idx)                                       Output Shape              Param #
MultiFeaturesModel                                           [64, 1]                   --
├─LongformerModel: 1-1                                       [64, 1929, 768]           --
│    └─LongformerEmbeddings: 2-1                             [64, 2048, 768]           --
│    │    └─Embedding: 3-1                                   [64, 2048, 768]           (38,603,520)
│    │    └─Embedding: 3-2                                   [64, 2048, 768]           (3,147,264)
│    │    └─Embedding: 3-3                                   [64, 2048, 768]           (768)
│    │    └─LayerNorm: 3-4                                   [64, 2048, 768]           (1,536)
│    │    └─Dropout: 3-5                                     [64, 2048, 768]           --
│    └─LongformerEncoder: 2-2                                [64, 1929, 768]           --
│    │    └─ModuleList: 3-6                                  --     

In [13]:
# model_graph = draw_graph(model,
#                          input_data=inputs,
#                          expand_nested=True,
#                          depth=2)
# model_graph.visual_graph

In [14]:
def train(model, optimizer, criterion, train_dataloader, logging_steps=20):
    model.train()
    running_loss = 0.0

    for i, (token_ids, attention_mask, features, score) in enumerate(train_dataloader):

        output = model(token_ids.to(device),
                       attention_mask.to(device),
                       features.to(device))
    
        loss = criterion(output, score.to(device)).float()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss.item()
        
#         if i == 5:
#             break
        
        if (i + 1) % (logging_steps) == 0 or (i + 1) == len(train_dataloader):
            wandb.log({'train_loss_steps': running_loss / (i + 1),
                       'learning_rate': optimizer.param_groups[0]['lr']})

    return running_loss / len(train_dataloader)


def evaluate(model, criterion, dataloader):
    model.eval()
    running_loss = 0.0
    all_scores = []
    predictions = []

    with torch.no_grad():
        for token_ids, attention_mask, features, score in dataloader:
            output = model(token_ids.to(device),
                           attention_mask.to(device),
                           features.to(device))

            loss = criterion(output, score.to(device)).float()
            
#             print(loss)

            running_loss += loss.item()
            all_scores.extend(score.cpu().numpy())
            predictions.extend(output.cpu().numpy())
            
#             break

    return running_loss / len(dataloader), torch.tensor(all_scores), torch.tensor(predictions)

In [15]:
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [16]:
def logit_to_score(logit, min_score=1, max_score=6):
    scores = torch.clamp(torch.round(logit), min_score, max_score)
    scores = scores.long()
    return scores

In [17]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=hyperparameters['lr'],
                             weight_decay=1e-8)
scheduler = ReduceLROnPlateau(optimizer, patience=3, factor=0.5)
early_stopper = EarlyStopper(patience=5, min_delta=1e-4)

train_losses, val_losses, val_kappa_scores, val_accuracies = [], [], [], []

hyperparameters['early_stopper'] = early_stopper.__dict__['patience']
hyperparameters['scheduler'] = scheduler.__dict__
hyperparameters['model'] = model.config()

hyperparameters

{'lr': 0.0001,
 'dropout': 0.3,
 'epochs': 15,
 'batch_size': 64,
 'train_set': {'total': 24345, 'ratio': 0.8},
 'val_set': {'total': 3043, 'ratio': 0.1},
 'test_set': {'total': 3044, 'ratio': 0.1},
 'linguistic_features': ['mean_num_adjectives_in_paragraph',
  '25th_percentile_num_adverbs_in_paragraph',
  '25th_percentile_num_adverbs_in_sentence',
  '75th_percentile_num_nouns_in_paragraph',
  'num_pronouns_in_essay',
  '25th_percentile_mean_word_lens_in_sentence',
  'num_verbs_in_essay',
  'min_mean_word_lens_in_paragraph',
  'min_num_verbs_in_paragraph',
  'num_proper_nouns_in_essay',
  'max_num_verbs_in_sentence',
  'max_num_misspelled_words_in_sentence',
  '25th_percentile_mean_word_lens_in_paragraph',
  'mean_num_verbs_in_paragraph',
  '75th_percentile_num_conjunctions_in_paragraph',
  '25th_percentile_num_sentences_in_paragraph',
  '25th_percentile_num_conjunctions_in_paragraph',
  '75th_percentile_num_pronouns_in_paragraph',
  'mean_mean_word_lens_in_sentence',
  'max_num_words_

In [18]:
torch.cuda.empty_cache()
model.to(device)

wandb.init(project='deep-essay-scoring', config=hyperparameters)

print("Start training...")

for epoch in range(hyperparameters['epochs']):
    train_loss = train(model, optimizer, criterion, train_dataloader)
    train_losses.append(train_loss)

    val_loss, val_scores, val_predictions = evaluate(
        model, criterion, val_dataloader)
    val_kappa = cohen_kappa_score(val_scores.cpu().numpy(),
                                  logit_to_score(
                                      val_predictions).cpu().numpy(),
                                  weights='quadratic')

    val_accuracy = torch.sum(val_scores == logit_to_score(
        val_predictions)).float() / len(val_scores)

    val_losses.append(val_loss)
    val_kappa_scores.append(val_kappa)
    val_accuracies.append(val_accuracy)

    scheduler.step(val_loss)

    wandb.log({'train_loss': train_loss, 'val_loss': val_loss,
              'val_kappa': val_kappa, 'epoch': epoch+1})

    print(f'Epoch: {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Kappa: {val_kappa}, Val Accuracy: {val_accuracy}')

#     break

    if early_stopper.early_stop(val_losses[-1]):
        print("Early stopping")
        break

[34m[1mwandb[0m: Currently logged in as: [33mminha-lehoang[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240513_110414-dihlsquk[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mlemon-surf-61[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/minha-lehoang/deep-essay-scoring[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/minha-lehoang/deep-essay-scoring/runs/dihlsquk[0m


Start training...
Epoch: 1, Train Loss: 14.843551349139275, Val Loss: 0.631370035931468, Val Kappa: 0.7439040182345746, Val Accuracy: 0.5356556177139282


  return running_loss / len(dataloader), torch.tensor(all_scores), torch.tensor(predictions)


Epoch: 2, Train Loss: 2.546282996186434, Val Loss: 0.4590562911083301, Val Kappa: 0.7659899749801204, Val Accuracy: 0.5839632153511047
Epoch: 3, Train Loss: 1.0678675015141645, Val Loss: 0.44059813891847927, Val Kappa: 0.7722874486508226, Val Accuracy: 0.5879066586494446
Epoch: 4, Train Loss: 0.6795372549160885, Val Loss: 0.4133504976828893, Val Kappa: 0.7811931997411804, Val Accuracy: 0.5898783802986145
Epoch: 5, Train Loss: 0.5694288711535336, Val Loss: 0.4055094936241706, Val Kappa: 0.7750359365709547, Val Accuracy: 0.590207040309906
Epoch: 6, Train Loss: 0.5133324607307204, Val Loss: 0.3916832556327184, Val Kappa: 0.7846470733448693, Val Accuracy: 0.5994085073471069
Epoch: 7, Train Loss: 0.4862338567343284, Val Loss: 0.3998178746551275, Val Kappa: 0.7862185107170478, Val Accuracy: 0.5990798473358154
Epoch: 8, Train Loss: 0.45995994397192175, Val Loss: 0.3964427023505171, Val Kappa: 0.7817607183242348, Val Accuracy: 0.5964508652687073
Epoch: 9, Train Loss: 0.44998757552912855, Val L

In [19]:
embedder_name = model_path.split('/')[-1]
num_epochs = hyperparameters['epochs']
model_name = f'multi_features-{embedder_name}-{num_epochs}_epochs'

# Save the model
torch.save(model.state_dict(), f'{model_name}.pth')

# Save the model summary
with open(f'{model_name}-summary.txt', 'w') as f:
    f.write(str(model_summary))

# save the embedder 
embedder.save_pretrained(f'{embedder_name}-{num_epochs}_epochs')

In [20]:
torch.cuda.empty_cache()

In [21]:
model.eval()
test_predictions = []

with torch.no_grad():
    for i, row in test_df.iterrows():
        token_ids = torch.tensor(row['input_ids']).unsqueeze(0)
        attention_mask = torch.tensor(row['attention_mask']).unsqueeze(0)
        row_ling_features = torch.tensor(row[ling_features].tolist()).unsqueeze(0)

        output = model(token_ids.to(device), 
                       attention_mask.to(device), 
                       row_ling_features.to(device))
        test_predictions.append(output.item())
        
submit_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'prediction': logit_to_score(torch.tensor(test_predictions)).cpu().numpy()
})
print(submit_df.shape)
submit_df.to_csv('submission.csv', index=False)

(3044, 2)
