# **Neural Network Model for Automated Essay Scoring 2.0 Kaggle Competition**

*Version 0.2*

## Utils

In [1]:
from distutils.dir_util import copy_tree

copy_tree('/kaggle/input/spellchecker', '/kaggle/working/')

!gzip '/kaggle/working/spellchecker/resources/en.json'

%pip install '/kaggle/working/pyspellchecker-0.8.1-py3-none-any.whl'

Processing ./pyspellchecker-0.8.1-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys
sys.path.append("/kaggle/input/automated-essay-scoring")

import random
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, AutoModel

from modules.linguistic_features import get_features
from modules.data import tokenize_text, LSCDataset, collate_fn, pair_encoding, pad_sequence
from modules.evaluate import evaluate, logit_to_score

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Seed every random number generator used in this notebook with one value.
random_seed = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(random_seed)

cuda


## Read data and preprocess

In [3]:
# Competition test set.
input_dir = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2"
test_data = pd.read_csv(os.path.join(input_dir, 'test.csv'))

print(test_data.shape)

# Names of the linguistic features, one per line; de-duplicated and sorted so
# the feature order is deterministic.
data_dir = "/kaggle/input/aes-linguistic"
with open(os.path.join(data_dir, 'features.txt'), 'r') as f:
    feature_lines = f.read().splitlines()

FEATURES = sorted(set(feature_lines))

(3, 2)


In [4]:
# get_features returns a pair; only its first element is used here.
# NOTE(review): presumably that frame holds one row per sentence — confirm
# against modules.linguistic_features.
sentence_rows, _ = get_features(test_data, False)

# Collapse to one row per essay: every column keeps its first value within the
# essay, except 'sentence', whose values are gathered into a list.
collapsed_cols = [col for col in sentence_rows.columns
                  if col not in ('essay_id', 'sentence')]
agg_dict = dict.fromkeys(collapsed_cols, 'first')
agg_dict['sentence'] = lambda group: list(group)

test_df = sentence_rows.groupby('essay_id').agg(agg_dict).reset_index()

# Keep the identifier, the raw text, the sentence list and the feature columns.
test_df = test_df[['essay_id', 'full_text', 'sentence', *sorted(FEATURES)]]

test_df

Unnamed: 0,essay_id,full_text,sentence,25th_percentile_mean_word_lens_in_paragraph,25th_percentile_mean_word_lens_in_sentence,25th_percentile_num_adjectives_in_paragraph,25th_percentile_num_adjectives_in_sentence,25th_percentile_num_adverbs_in_paragraph,25th_percentile_num_adverbs_in_sentence,25th_percentile_num_conjunctions_in_paragraph,...,num_conjunctions_in_essay,num_misspelled_words_in_essay,num_nouns_in_essay,num_paragraphs,num_pronouns_in_essay,num_proper_nouns_in_essay,num_stop_words_in_essay,num_unique_words_in_essay,num_verbs_in_essay,num_words_in_essay
0,000d118,many people have car where they live. the thin...,"[many people have car where they live., the th...",54.826177,3.626506,38.0,2.0,15.0,1.0,50.0,...,50,546,107,1,53,20,287,389,65,546
1,000fe60,i am a scientist at nasa that is discussing th...,[i am a scientist at nasa that is discussing t...,10.81641,3.1875,2.0,0.0,2.0,0.0,7.0,...,26,373,54,5,53,8,233,318,44,373
2,001ab80,people always wish they had the same technolog...,[people always wish they had the same technolo...,17.818487,3.896552,10.0,1.0,8.0,1.0,7.0,...,35,607,114,4,49,0,344,533,70,607


## Hyperparameters

In [5]:
# Run configuration. In the cells visible in this notebook only 'dropout'
# (model construction) and 'max_seq_len' (essay tokenization) are read; the
# remaining entries are not used by the inference code shown here.
hyperparameters = dict(
    lr=5e-6,
    dropout=0.3,
    epochs=3,
    batch_size=6,
    linguistic_features=FEATURES,
    accelator=str(device),
    max_seq_len=1929,
)

## Embedding Models

In [6]:
# Local checkpoints are loaded from the attached dataset.
# Hub identifiers of the same models: "thenlper/gte-base" and
# "allenai/longformer-base-4096".
sentence_model = "/kaggle/input/essay-scoring-models/gte-base"
essay_model = "/kaggle/input/essay-scoring-models/longformer-base-4096"

# Sentence-level tokenizer / encoder pair.
sentence_tokenizer = AutoTokenizer.from_pretrained(sentence_model)
sentence_encoder = AutoModel.from_pretrained(sentence_model)

# Essay-level tokenizer / encoder pair.
essay_tokenizer = AutoTokenizer.from_pretrained(essay_model)
essay_encoder = AutoModel.from_pretrained(essay_model)

  return self.fget.__get__(instance, owner)()


### Essay Tokenization

In [7]:
# Tokenize every full essay with the essay tokenizer, using the configured
# maximum sequence length.
essay_texts = test_df['full_text'].tolist()
test_tokenized = tokenize_text(essay_texts, essay_tokenizer,
                               False, hyperparameters['max_seq_len'])

# Store the encodings next to each essay as plain Python lists.
test_df['essay_input_ids'] = test_tokenized['input_ids'].tolist()
test_df['essay_attention_mask'] = test_tokenized['attention_mask'].tolist()

# Sanity check: token-id length of one randomly sampled essay.
sampled_row = test_df.sample(1).iloc[0]
print(len(sampled_row['essay_input_ids']))

1929


In [8]:
# Length limit handed to pair_encoding in the inference cell.
hyperparameters.update(max_sentence_length=256)

## Model

In [9]:
class LinguisticModule(nn.Module):
    """Project hand-crafted linguistic features into a hidden representation.

    The module applies a single linear layer, a leaky ReLU and dropout.

    Args:
        input_size: Number of linguistic features per example.
        hidden_size: Width of the projected representation.
        dropout: Dropout probability applied after the activation. Defaults to
            0.2, the value that used to be hard-coded, so existing callers and
            checkpoints are unaffected.
    """

    def __init__(self, input_size, hidden_size=64, dropout=0.2):
        super().__init__()
        # The attribute name `lf` is kept so saved state dicts still load.
        self.lf = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, ling_features):
        """Return dropout(leaky_relu(linear(ling_features)))."""
        projected = self.lf(ling_features)
        activated = F.leaky_relu(projected)
        return self.dropout(activated)

class SemanticModule(nn.Module):
    """Wrap the essay encoder and return its second output after dropout.

    All encoder parameters are frozen except those of `essay_encoder.pooler`,
    which stay trainable.

    Args:
        essay_encoder: Encoder module exposing a `pooler` sub-module and
            accepting `(token_ids, attention_mask=...)`.
        dropout: Dropout probability applied to the encoder output.
    """

    def __init__(self, essay_encoder, dropout=0.1):
        super().__init__()
        # Freeze everything first, then re-enable gradients for the pooler.
        essay_encoder.requires_grad_(False)
        essay_encoder.pooler.requires_grad_(True)

        self.essay_encoder = essay_encoder
        self.dropout = nn.Dropout(dropout)

    def forward(self, token_ids, attention_mask):
        """Encode the essay tokens and return element 1 of the encoder output."""
        encoded = self.essay_encoder(token_ids, attention_mask=attention_mask)
        # Index 1 is presumably the pooled output of the encoder — TODO confirm.
        return self.dropout(encoded[1])
    
class CoherenceModule(nn.Module):
    """Wrap the sentence encoder and return its second output after dropout.

    All encoder parameters are frozen except those of
    `sentence_encoder.pooler`, which stay trainable.

    Args:
        sentence_encoder: Encoder module exposing a `pooler` sub-module and
            accepting `(input_ids, attention_mask=...)`.
        dropout: Dropout probability applied to the encoder output.
    """

    def __init__(self, sentence_encoder, dropout=0.1):
        super().__init__()
        # Freeze everything first, then re-enable gradients for the pooler.
        sentence_encoder.requires_grad_(False)
        sentence_encoder.pooler.requires_grad_(True)

        self.sentence_encoder = sentence_encoder
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Encode the inputs and return element 1 of the encoder output."""
        encoded = self.sentence_encoder(input_ids, attention_mask=attention_mask)
        # Index 1 is presumably the pooled output of the encoder — TODO confirm.
        return self.dropout(encoded[1])


class LSCModel(nn.Module):
    """Score an essay from linguistic, semantic and coherence representations.

    The three sub-module outputs are concatenated, passed through a leaky ReLU
    and dropout, and mapped to a single value by a linear regressor.

    Args:
        essay_encoder: Encoder handed to SemanticModule; its
            `config.hidden_size` contributes to the regressor input width.
        sentence_encoder: Encoder handed to CoherenceModule; its
            `config.hidden_size` contributes to the regressor input width.
        input_lf_size: Number of linguistic features per essay.
        hidden_lf_size: Output width of the linguistic module.
        dropout: Dropout probability for the semantic and coherence modules
            and for the fused representation.
    """

    def __init__(self, essay_encoder, sentence_encoder,
                 input_lf_size, hidden_lf_size=64, dropout=0.1):
        super().__init__()
        self.linguistic_module = LinguisticModule(input_lf_size, hidden_lf_size)
        self.semantic_module = SemanticModule(essay_encoder, dropout=dropout)
        self.coherence_module = CoherenceModule(sentence_encoder, dropout=dropout)

        # Width of the concatenation built in forward().
        fused_size = (hidden_lf_size
                      + essay_encoder.config.hidden_size
                      + sentence_encoder.config.hidden_size)
        self.regressor = nn.Linear(fused_size, 1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, ling_features, essay_token_ids, essay_attention_mask,
                sentence_token_ids, sentence_attention_mask):
        """Return the regressor output for a batch of essays.

        NOTE(review): the reshaping below treats dimension 1 of
        `sentence_token_ids` as the per-essay item axis and the last dimension
        as the token axis — confirm against the data pipeline.
        """
        ling_outputs = self.linguistic_module(ling_features)
        sem_outputs = self.semantic_module(essay_token_ids, essay_attention_mask)

        # Merge the leading dimensions so every sequence is encoded separately.
        items_per_essay = sentence_token_ids.size(1)
        flat_ids = sentence_token_ids.view(-1, sentence_token_ids.shape[-1])
        flat_mask = sentence_attention_mask.view(-1, sentence_attention_mask.shape[-1])
        coh_outputs = self.coherence_module(flat_ids, flat_mask)

        # Restore the per-essay grouping and keep the element-wise maximum
        # across dimension 1.
        grouped = coh_outputs.view(-1, items_per_essay, coh_outputs.size(-1))
        coh_infor = torch.max(grouped, dim=1).values

        fused = torch.cat([ling_outputs, sem_outputs, coh_infor], dim=-1)
        fused = self.dropout(F.leaky_relu(fused))

        return self.regressor(fused)

In [10]:
checkpoint_path = "/kaggle/input/essay-scoring-models/checkpoints/v0.2/winter-wood-74/lsc_7_epochs.pth"

# Build the network with a 128-wide linguistic projection and the configured
# dropout, then move it to the selected device.
model = LSCModel(essay_encoder, sentence_encoder,
                 input_lf_size=len(FEATURES),
                 hidden_lf_size=128,
                 dropout=hyperparameters['dropout'])
model.to(device)

# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
state_dict = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(state_dict)

<All keys matched successfully>

## Inference

In [11]:
torch.cuda.empty_cache()

model.eval()
test_predictions = []

# Gradients are never needed here, so a single no_grad context wraps the loop.
with torch.no_grad():
    for _, row in test_df.iterrows():
        # Add a leading batch dimension of 1 to every per-essay input.
        essay_input_ids = torch.tensor(row['essay_input_ids']).unsqueeze(0)
        essay_attention_mask = torch.tensor(
            row['essay_attention_mask']).unsqueeze(0)

        # Explicit float32: without it the dtype is inferred from the values,
        # and an all-integer feature row would yield an int64 tensor that the
        # linear layer of the linguistic module cannot consume.
        features = torch.tensor([row[feature] for feature in FEATURES],
                                dtype=torch.float32).unsqueeze(0)

        pair_encodings = pair_encoding(row['sentence'], sentence_tokenizer,
                                       hyperparameters['max_sentence_length'])

        # The loop variable is named `encoding` so it does not shadow the
        # imported `pair_encoding` helper. pad_sequence receives a single
        # stacked tensor, so it only adds the batch dimension here.
        sent_input_ids = pad_sequence(
            [torch.cat([encoding['input_ids'] for encoding in pair_encodings])],
            batch_first=True, padding_value=1)
        sent_attention_mask = pad_sequence(
            [torch.cat([encoding['attention_mask'] for encoding in pair_encodings])],
            batch_first=True, padding_value=0)

        output = model(features.to(device),
                       essay_input_ids.to(device),
                       essay_attention_mask.to(device),
                       sent_input_ids.to(device),
                       sent_attention_mask.to(device))

        test_predictions.append(output.item())

# Convert the raw model outputs to scores and write the submission file.
submit_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'score': logit_to_score(torch.tensor(test_predictions)).cpu().detach().numpy()
})
print(submit_df.shape)
submit_df.to_csv('submission.csv', index=False)

submit_df

Input ids are automatically padded from 1929 to 2048 to be a multiple of `config.attention_window`: 512


(3, 2)


Unnamed: 0,essay_id,score
0,000d118,2
1,000fe60,2
2,001ab80,3


In [12]:
import shutil


def clean_working_dir(working_dir='/kaggle/working', keep=('submission.csv',)):
    """Delete everything in `working_dir` except the entries named in `keep`.

    `os.listdir` returns bare names, so each one is joined with `working_dir`
    before it is inspected or removed. The cleanup therefore does not depend
    on the current working directory.

    Args:
        working_dir: Directory to clean. A missing directory is a no-op.
        keep: Entry names (not paths) that must be left in place.
    """
    if not os.path.isdir(working_dir):
        return

    for name in os.listdir(working_dir):
        if name in keep:
            continue
        path = os.path.join(working_dir, name)
        # rmtree refuses symlinks, so links (including broken ones) are
        # unlinked like regular files.
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            os.remove(path)


clean_working_dir()