# **Neural Network Model for Automated Essay Scoring 2.0 Kaggle Competition**

*Version 0.2*

## Utils

In [1]:
import sys
sys.path.append("/kaggle/input/automated-essay-scoring")

import random
import os
import datetime
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AutoModel

from modules.data import tokenize_text, LSCDataset, collate_fn, pair_encoding, pad_sequence
from modules.model import LSCModel
from modules.training import train, EarlyStopper
from modules.evaluate import evaluate, logit_to_score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

from kaggle_secrets import UserSecretsClient # type: ignore
import wandb # type: ignore

user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_ha")

wandb.login(key=wandb_api)

cuda


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Read data and preprocess

In [2]:
data_dir = '/kaggle/input/aes-linguistic'
# data_dir = "../output/"

with open(os.path.join(data_dir, 'features.txt'), 'r') as f:
    FEATURES = f.read().splitlines()
FEATURES = sorted(list(set(FEATURES)))

train_data = pd.read_csv(os.path.join(data_dir, 'train_linguistic.csv'))
extra_data = pd.read_csv(os.path.join(data_dir, 'extra_linguistic.csv'))

train_data = train_data.dropna(how='any')
extra_data = extra_data.dropna(how='any')

train_data = train_data[['essay_id', 'full_text', 'sentence', 'score'] + FEATURES]
extra_data = extra_data[['essay_id', 'full_text', 'sentence', 'score'] + FEATURES]

# group by essay_id, sentence -> list, everything else -> first
agg_dict = {col: 'first' for col in train_data.columns if col not in ['essay_id', 'sentence']}
agg_dict['sentence'] = lambda x: list(x)

train_data = train_data.groupby('essay_id').agg(agg_dict) 
extra_data = extra_data.groupby('essay_id').agg(agg_dict)

train_data = train_data[['full_text', 'sentence', 'score'] + sorted(FEATURES)]
extra_data = extra_data[['full_text', 'sentence', 'score'] + sorted(FEATURES)]

train_data = train_data.reset_index()
extra_data = extra_data.reset_index()

train_data.head()

Unnamed: 0,essay_id,full_text,sentence,score,25th_percentile_mean_word_lens_in_paragraph,25th_percentile_mean_word_lens_in_sentence,25th_percentile_num_adjectives_in_paragraph,25th_percentile_num_adjectives_in_sentence,25th_percentile_num_adverbs_in_paragraph,25th_percentile_num_adverbs_in_sentence,...,num_conjunctions_in_essay,num_misspelled_words_in_essay,num_nouns_in_essay,num_paragraphs,num_pronouns_in_essay,num_proper_nouns_in_essay,num_stop_words_in_essay,num_unique_words_in_essay,num_verbs_in_essay,num_words_in_essay
0,000d118,many people have car where they live. the thin...,"[many people have car where they live., the th...",3,54.826177,3.626506,38.0,2.0,15.0,1.0,...,50,546,107,1,53,20,287,389,65,546
1,000fe60,i am a scientist at nasa that is discussing th...,[i am a scientist at nasa that is discussing t...,3,10.81641,3.1875,2.0,0.0,2.0,0.0,...,26,373,54,5,53,8,233,318,44,373
2,001ab80,people always wish they had the same technolog...,[people always wish they had the same technolo...,4,17.818487,3.896552,10.0,1.0,8.0,1.0,...,35,607,114,4,49,0,344,533,70,607
3,001bdc0,"we all heard about venus, the planet without a...","[we all heard about venus, the planet without ...",4,15.264912,4.033333,2.0,0.0,4.0,0.0,...,35,510,118,5,32,14,233,461,57,510
4,002ba53,"dear, state senator this is a letter to argue ...","[dear, state senator, this is a letter to argu...",3,14.339735,3.99,6.0,1.0,1.0,0.0,...,18,419,90,6,22,6,205,334,35,419


## Split data into train, validation and test sets

In [3]:
all_data = pd.concat([train_data, extra_data], ignore_index=True)

# shuffle the data
all_data = all_data.sample(frac=1, random_state=random_seed)
all_data = all_data.reset_index(drop=True)

all_data = all_data[['essay_id', 'full_text', 'sentence', 'score'] + sorted(FEATURES)]

train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

train_df, val_df = train_test_split(all_data, test_size=val_ratio + test_ratio, 
                                    random_state=random_seed)
val_df, test_df = train_test_split(val_df, test_size=test_ratio/(val_ratio + test_ratio), 
                                   random_state=random_seed)

print(train_df.shape, val_df.shape, test_df.shape)

(21301, 142) (4565, 142) (4565, 142)


## Hyperparameters

In [4]:
hyperparameters = {
    'lr': 5e-6,
    'dropout': 0.3,
    'epochs': 3,
    'batch_size': 6,
    'train_set': {
        'total': len(train_df),
        'ratio': train_ratio,
    },
    'val_set': {
        'total': len(val_df),
        'ratio': val_ratio,
    },
    'test_set': {
        'total': len(test_df),
        'ratio': test_ratio,
    },
    'linguistic_features': FEATURES,
    'accelator': str(device)
}

## Embedding Models

In [5]:
sentence_model = "thenlper/gte-base"
sentence_tokenizer = AutoTokenizer.from_pretrained(sentence_model)
sentence_encoder = AutoModel.from_pretrained(sentence_model)

# essay_model = "/kaggle/input/essay-scoring-models/longformer-base-4096"
essay_model = "allenai/longformer-base-4096"
essay_tokenizer = AutoTokenizer.from_pretrained(essay_model)
essay_encoder = AutoModel.from_pretrained(essay_model)

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


### Essay Tokenize

In [6]:
train_tokenized, hyperparameters['max_seq_len'] = tokenize_text(
    train_df['full_text'].tolist(), essay_tokenizer)

train_df['essay_input_ids'] = train_tokenized['input_ids'].tolist()
train_df['essay_attention_mask'] = train_tokenized['attention_mask'].tolist()

print(len(train_df.sample(1).iloc[0]['essay_input_ids']))

1929


In [7]:
val_tokenized = tokenize_text(val_df['full_text'].tolist(), essay_tokenizer, 
                              False, hyperparameters['max_seq_len'])

val_df['essay_input_ids'] = val_tokenized['input_ids'].tolist()
val_df['essay_attention_mask'] = val_tokenized['attention_mask'].tolist()

print(len(val_df.sample(1).iloc[0]['essay_input_ids']))

test_tokenized = tokenize_text(test_df['full_text'].tolist(), essay_tokenizer,
                               False, hyperparameters['max_seq_len'])

test_df['essay_input_ids'] = test_tokenized['input_ids'].tolist()
test_df['essay_attention_mask'] = test_tokenized['attention_mask'].tolist()

print(len(test_df.sample(1).iloc[0]['essay_input_ids']))

1929
1929


In [8]:
hyperparameters['max_sentence_length'] = 256

## Dataset and DataLoader

In [9]:
train_dataset = LSCDataset(train_df, sentence_tokenizer, FEATURES,
                            hyperparameters['max_sentence_length'])

val_dataset = LSCDataset(val_df, sentence_tokenizer, FEATURES,
                            hyperparameters['max_sentence_length'])

train_dataloader = DataLoader(train_dataset, batch_size=hyperparameters['batch_size'],
                                shuffle=True, collate_fn=collate_fn)

val_dataloader = DataLoader(val_dataset, batch_size=hyperparameters['batch_size'],
                            shuffle=False, collate_fn=collate_fn)

for features, essay_input_ids, essay_attention_mask, sent_input_ids, sent_attention_mask, score in train_dataloader:
    print(features.shape)
    print(essay_input_ids.shape)
    print(essay_attention_mask.shape)
    print(sent_input_ids.shape)
    print(sent_attention_mask.shape)
    print(score.shape)
    break

torch.Size([6, 138])
torch.Size([6, 1929])
torch.Size([6, 1929])
torch.Size([6, 31, 256])
torch.Size([6, 31, 256])
torch.Size([6, 1])


## Model

In [10]:
model = LSCModel(essay_encoder, sentence_encoder,
                 len(FEATURES), 128, dropout=hyperparameters['dropout'])

model

LSCModel(
  (linguistic_module): LinguisticModule(
    (lf): Linear(in_features=138, out_features=128, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (semantic_module): SemanticModule(
    (essay_encoder): LongformerModel(
      (embeddings): LongformerEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (position_embeddings): Embedding(4098, 768, padding_idx=1)
      )
      (encoder): LongformerEncoder(
        (layer): ModuleList(
          (0-11): 12 x LongformerLayer(
            (attention): LongformerAttention(
              (self): LongformerSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_f

In [11]:
inputs = next(iter(train_dataloader))[:-1]
with torch.no_grad():
    model_summary = summary(model, input_data=inputs)
print(model_summary)

Input ids are automatically padded from 1929 to 2048 to be a multiple of `config.attention_window`: 512


Layer (type:depth-idx)                                            Output Shape              Param #
LSCModel                                                          [6, 1]                    --
├─LinguisticModule: 1-1                                           [6, 128]                  --
│    └─Linear: 2-1                                                [6, 128]                  17,792
│    └─Dropout: 2-2                                               [6, 128]                  --
├─SemanticModule: 1-2                                             [6, 768]                  --
│    └─LongformerModel: 2-3                                       [6, 768]                  --
│    │    └─LongformerEmbeddings: 3-1                             [6, 2048, 768]            (41,753,088)
│    │    └─LongformerEncoder: 3-2                                [6, 1929, 768]            (106,315,776)
│    │    └─LongformerPooler: 3-3                                 [6, 768]                  590,592
│    └─Dropout:

## Training and Evaluation

In [12]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=hyperparameters['lr'],
                             weight_decay=1e-6)
scheduler = ReduceLROnPlateau(optimizer, patience=3, factor=0.1)
early_stopper = EarlyStopper(patience=3, min_delta=1e-4)

train_losses, val_losses, val_kappa_scores, val_accuracies = [], [], [], []

hyperparameters['early_stopper'] = early_stopper.__dict__['patience']
hyperparameters['scheduler'] = scheduler.__dict__
# hyperparameters['model'] = model.config()
hyperparameters['sentence_encoder'] = sentence_model
hyperparameters['essay_encoder'] = essay_model

hyperparameters

{'lr': 5e-06,
 'dropout': 0.3,
 'epochs': 3,
 'batch_size': 6,
 'train_set': {'total': 21301, 'ratio': 0.7},
 'val_set': {'total': 4565, 'ratio': 0.15},
 'test_set': {'total': 4565, 'ratio': 0.15},
 'linguistic_features': ['25th_percentile_mean_word_lens_in_paragraph',
  '25th_percentile_mean_word_lens_in_sentence',
  '25th_percentile_num_adjectives_in_paragraph',
  '25th_percentile_num_adjectives_in_sentence',
  '25th_percentile_num_adverbs_in_paragraph',
  '25th_percentile_num_adverbs_in_sentence',
  '25th_percentile_num_conjunctions_in_paragraph',
  '25th_percentile_num_conjunctions_in_sentence',
  '25th_percentile_num_misspelled_words_in_paragraph',
  '25th_percentile_num_misspelled_words_in_sentence',
  '25th_percentile_num_nouns_in_paragraph',
  '25th_percentile_num_nouns_in_sentence',
  '25th_percentile_num_pronouns_in_paragraph',
  '25th_percentile_num_pronouns_in_sentence',
  '25th_percentile_num_proper_nouns_in_paragraph',
  '25th_percentile_num_proper_nouns_in_sentence',
  '

In [13]:
torch.cuda.empty_cache()
model.to(device)

wandb.init(project='deep-essay-scoring',
           config=hyperparameters)  # type: ignore

print("Start training...")

for epoch in range(hyperparameters['epochs']):
    train_loss = train(model, optimizer, criterion, train_dataloader, 
                       device=device, is_log=True, logging_steps=160)
    train_losses.append(train_loss)

    val_loss, val_scores, val_predictions = evaluate(
        model, criterion, val_dataloader, device=device)
    val_kappa = cohen_kappa_score(val_scores.cpu().numpy(),
                                  logit_to_score(
                                      val_predictions).cpu().numpy(),
                                  weights='quadratic')

    val_accuracy = torch.sum(val_scores == logit_to_score(
        val_predictions)).float() / len(val_scores)

    val_losses.append(val_loss)
    val_kappa_scores.append(val_kappa)
    val_accuracies.append(val_accuracy)

    scheduler.step(val_loss)

    wandb.log({'train_loss': train_loss, 'val_loss': val_loss,
               'val_accuracy': val_accuracy, 'val_kappa': val_kappa,
               'epoch': epoch+1})

    print(f'Epoch: {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Kappa: {val_kappa}, Val Accuracy: {val_accuracy}')

#     break

    if early_stopper.early_stop(val_losses[-1]):
        print("Early stopping")
        break

[34m[1mwandb[0m: Currently logged in as: [33mminha-lehoang[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240518_025346-dhlg7752[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33molive-meadow-73[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/minha-lehoang/deep-essay-scoring[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/minha-lehoang/deep-essay-scoring/runs/dhlg7752[0m


Start training...


  return running_loss / len(dataloader), torch.tensor(all_scores), torch.tensor(predictions)


Epoch: 1, Train Loss: 5.498975431204044, Val Loss: 0.7156023032493332, Val Kappa: 0.680425461465907, Val Accuracy: 0.44906899333000183
Epoch: 2, Train Loss: 2.5730858781377353, Val Loss: 0.635135122554061, Val Kappa: 0.6893377541526474, Val Accuracy: 0.4611172080039978
Epoch: 3, Train Loss: 1.3953719171746555, Val Loss: 0.4813456409188592, Val Kappa: 0.7555793295485468, Val Accuracy: 0.5465498566627502


In [None]:
torch.cuda.empty_cache()

In [14]:
num_epochs = hyperparameters['epochs']
model_name = f"lsc_{num_epochs}_epochs.pth"

# save the model
torch.save(model.state_dict(), model_name)

# save the model summary
with open(f'model_summary_{num_epochs}_epochs.txt', 'w') as f:
    f.write(str(model_summary))

In [15]:
torch.cuda.empty_cache()

model.eval()
test_predictions = []

for i, row in test_df.iterrows():
    with torch.no_grad():
        essay_input_ids = torch.tensor(row['essay_input_ids']).unsqueeze(0)
        essay_attention_mask = torch.tensor(
            row['essay_attention_mask']).unsqueeze(0)
        features = torch.tensor([row[feature]
                                for feature in FEATURES]).unsqueeze(0)
        sentences = row['sentence']

        pair_encodings = pair_encoding(sentences, sentence_tokenizer,  hyperparameters['max_sentence_length'])

        sent_input_ids = pad_sequence([torch.cat([pair_encoding['input_ids'] for pair_encoding in pair_encodings])],
                                      batch_first=True, padding_value=1)
        sent_attention_mask = pad_sequence([torch.cat([pair_encoding['attention_mask'] for pair_encoding in pair_encodings])],
                                           batch_first=True, padding_value=0)

        output = model(features.to(device),
                       essay_input_ids.to(device),
                       essay_attention_mask.to(device),
                       sent_input_ids.to(device),
                       sent_attention_mask.to(device))

        test_predictions.append(output.item())

submit_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'score': logit_to_score(torch.tensor(test_predictions)).cpu().detach().numpy()
})
print(submit_df.shape)
submit_df.to_csv('submission.csv', index=False)

TypeError: string indices must be integers