# **Neural Network Model for Automated Essay Scoring 2.0 Kaggle Competition**

*Version 0.2*

## Utils

In [46]:
import random
import os
import datetime
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
from torchview import draw_graph
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Give every random number generator used in this notebook the same seed.
random_seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(random_seed)

cpu


## Read data and preprocess

In [40]:
data_dir = '../../../output/'

# features.txt holds one feature name per line; keep each name once, sorted.
with open(os.path.join(data_dir, 'features.txt'), 'r') as f:
    FEATURES = sorted(set(f.read().splitlines()))

# Column layouts shared by both data sets.
essay_columns = ['full_text', 'sentence', 'score'] + FEATURES
keyed_columns = ['essay_id'] + essay_columns

train_data = pd.read_csv(os.path.join(data_dir, 'train_linguistic.csv'))
extra_data = pd.read_csv(os.path.join(data_dir, 'extra_linguistic.csv'))
train_data, extra_data = train_data[keyed_columns], extra_data[keyed_columns]

# One output row per essay_id: the sentence rows are collected into a list,
# every other column keeps the value of the group's first row.
agg_dict = {
    column: 'first'
    for column in train_data.columns
    if column != 'essay_id' and column != 'sentence'
}
agg_dict['sentence'] = lambda rows: list(rows)

# Aggregate, restore the column order, and turn essay_id back into a column.
train_data = train_data.groupby('essay_id').agg(agg_dict)[essay_columns].reset_index()
extra_data = extra_data.groupby('essay_id').agg(agg_dict)[essay_columns].reset_index()

train_data.head()

Unnamed: 0,essay_id,full_text,sentence,score,25th_percentile_mean_word_lens_in_paragraph,25th_percentile_mean_word_lens_in_sentence,25th_percentile_num_adjectives_in_paragraph,25th_percentile_num_adjectives_in_sentence,25th_percentile_num_adverbs_in_paragraph,25th_percentile_num_adverbs_in_sentence,...,num_adjectives_in_essay,num_adverbs_in_essay,num_conjunctions_in_essay,num_misspelled_words_in_essay,num_nouns_in_essay,num_paragraphs,num_pronouns_in_essay,num_proper_nouns_in_essay,num_verbs_in_essay,num_words_in_essay
0,000d118,many people have car where they live. the thin...,"[many people have car where they live., the th...",3,54.826177,3.626506,38.0,2.0,15.0,1.0,...,38,15,50,546,107,1,53,20,65,546
1,000fe60,i am a scientist at nasa that is discussing th...,[i am a scientist at nasa that is discussing t...,3,10.81641,3.1875,2.0,0.0,2.0,0.0,...,12,20,26,373,54,5,53,8,44,373
2,001ab80,people always wish they had the same technolog...,[people always wish they had the same technolo...,4,17.818487,3.896552,10.0,1.0,8.0,1.0,...,43,45,35,607,114,4,49,0,70,607
3,001bdc0,"we all heard about venus, the planet without a...","[we all heard about venus, the planet without ...",4,15.264912,4.033333,2.0,0.0,4.0,0.0,...,33,19,35,510,118,5,32,14,57,510
4,002ba53,"dear, state senator this is a letter to argue ...","[dear, state senator, this is a letter to argu...",3,14.339735,3.99,6.0,1.0,1.0,0.0,...,42,9,18,419,90,6,22,6,35,419


## Split data into train, validation and test sets

In [42]:
# Pool both data sets, shuffle the rows reproducibly and renumber the index.
all_data = (
    pd.concat([train_data, extra_data], ignore_index=True)
    .sample(frac=1, random_state=random_seed)
    .reset_index(drop=True)
)
all_data = all_data[['essay_id', 'full_text', 'sentence', 'score'] + sorted(FEATURES)]

train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

# First carve off the combined validation+test share, then split that share
# again into the validation and test sets.
holdout_ratio = val_ratio + test_ratio
train_df, holdout_df = train_test_split(all_data, test_size=holdout_ratio,
                                        random_state=random_seed)
val_df, test_df = train_test_split(holdout_df, test_size=test_ratio / holdout_ratio,
                                   random_state=random_seed)

print(*(part.shape for part in (train_df, val_df, test_df)))

(24344, 120) (3043, 120) (3044, 120)


## Hyperparameters

In [44]:
def _split_summary(frame, ratio):
    """Describe one data split by its row count and its requested ratio."""
    return {'total': len(frame), 'ratio': ratio}


# Run configuration; also records the data split sizes, the feature list
# and the device the notebook is running on.
hyperparameters = dict(
    lr=1e-4,
    dropout=0.3,
    epochs=15,
    batch_size=32,
    train_set=_split_summary(train_df, train_ratio),
    val_set=_split_summary(val_df, val_ratio),
    test_set=_split_summary(test_df, test_ratio),
    linguistic_features=FEATURES,
    accelator=str(device),
)

## Embedding Models

In [48]:
def _load_pretrained(checkpoint):
    """Return the (tokenizer, encoder) pair for a pretrained checkpoint name."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = AutoModel.from_pretrained(checkpoint)
    return tokenizer, encoder


# Encoder applied to sentence pairs.
sentence_model = 'roberta-base'
sentence_tokenizer, sentence_encoder = _load_pretrained(sentence_model)

# Encoder applied to whole essays.
essay_model = "allenai/longformer-base-4096"
essay_tokenizer, essay_encoder = _load_pretrained(essay_model)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


### Essay Tokenization

In [49]:
# Tokenize every training essay, padding each one to the longest essay in the set.
train_texts = train_df['full_text'].tolist()
train_essay_tokenized = essay_tokenizer(train_texts, padding=True, return_tensors='pt')

# The padded width of the training set is recorded as the essay length.
hyperparameters['max_essay_length'] = train_essay_tokenized['input_ids'].shape[1]

# Store the token ids and attention masks next to the essays as plain lists.
for column, key in (('essay_input_ids', 'input_ids'),
                    ('essay_attention_mask', 'attention_mask')):
    train_df[column] = train_essay_tokenized[key].tolist()

print(hyperparameters['max_essay_length'])

1929


In [50]:
def _tokenize_to_train_length(texts):
    """Tokenize essays to exactly hyperparameters['max_essay_length'] tokens.

    Shorter essays are padded and longer ones are truncated. Without
    truncation=True, padding='max_length' leaves longer essays at their full
    length, so the rows would have different lengths and could not be
    returned as one tensor.

    Args:
        texts: list of essay strings.

    Returns:
        The tokenizer output with 'input_ids' and 'attention_mask' tensors.
    """
    return essay_tokenizer(texts,
                           padding='max_length',
                           truncation=True,
                           max_length=hyperparameters['max_essay_length'],
                           return_tensors='pt')


val_essay_tokenized = _tokenize_to_train_length(val_df['full_text'].tolist())

val_df['essay_input_ids'] = val_essay_tokenized['input_ids'].tolist()
val_df['essay_attention_mask'] = val_essay_tokenized['attention_mask'].tolist()

test_essay_tokenized = _tokenize_to_train_length(test_df['full_text'].tolist())

test_df['essay_input_ids'] = test_essay_tokenized['input_ids'].tolist()
test_df['essay_attention_mask'] = test_essay_tokenized['attention_mask'].tolist()

In [52]:
# Token length used for every tokenized sentence pair; passed to the datasets
# below as max_sent_length.
hyperparameters['max_sentence_length'] = 512

## Dataset and DataLoader

In [59]:
from torch.nn.utils.rnn import pad_sequence


class MultiFeaturesCoherenceDataset(Dataset):
    """Essay-level dataset combining three views of each essay.

    Each item provides the pre-tokenized essay, the linguistic feature
    vector, the tokenized consecutive sentence pairs and the score.

    Args:
        df: frame with the columns 'essay_input_ids', 'essay_attention_mask',
            'score', 'sentence' and one column per name in FEATURES.
        max_sent_length: token length every sentence pair is padded or
            truncated to.
    """

    def __init__(self, df: pd.DataFrame, max_sent_length=512):
        self.essay_token_ids = df['essay_input_ids'].values
        self.essay_attention_masks = df['essay_attention_mask'].values
        self.score = df['score'].values
        # One value array per linguistic feature, in FEATURES order.
        self.ling_features = [df[name].values for name in FEATURES]
        self.sentence = df['sentence'].values

        self.max_sent_length = max_sent_length

    def __len__(self):
        """Number of essays in the dataset."""
        return len(self.score)

    def __getitem__(self, idx):
        """Return (essay_input_ids, essay_attention_mask, features,
        pair_encodings, score) for the essay at position idx.

        pair_encodings is a list with one tokenizer output per pair of
        consecutive sentences; it is empty when the essay has fewer than
        two sentences.
        """
        # Tokenize each sentence together with the sentence that follows it.
        sentences = self.sentence[idx]
        pair_encodings = [
            sentence_tokenizer(
                first, second,
                max_length=self.max_sent_length,
                padding='max_length',
                truncation=True,
                return_token_type_ids=True,
                return_attention_mask=True,
                return_tensors='pt')
            for first, second in zip(sentences[:-1], sentences[1:])
        ]

        # Linguistic feature vector of this essay.
        features = torch.tensor(
            [column[idx] for column in self.ling_features], dtype=torch.float)

        # Essay token ids and attention mask were computed ahead of time.
        essay_input_ids = torch.tensor(self.essay_token_ids[idx], dtype=torch.long)
        essay_attention_mask = torch.tensor(self.essay_attention_masks[idx], dtype=torch.long)

        score = torch.tensor(self.score[idx], dtype=torch.float32)

        return essay_input_ids, essay_attention_mask, features, pair_encodings, score


# Build one dataset per split; all share the same sentence-pair token length.
train_dataset, val_dataset, test_dataset = (
    MultiFeaturesCoherenceDataset(split_df, hyperparameters['max_sentence_length'])
    for split_df in (train_df, val_df, test_df)
)

In [70]:
def collate_fn(batch):
    """Merge dataset items into batch tensors.

    Essays have different numbers of sentence pairs, so the pair tensors are
    padded along the pair dimension up to the largest count in the batch.
    Padded pair slots are filled with the tokenizer's pad token id, an
    all-zero attention mask (so they are marked as "not attended") and
    token type id 0 (a valid embedding index).

    Args:
        batch: list of (essay_input_ids, essay_attention_mask, features,
            pair_encodings, score) tuples as returned by the dataset.

    Returns:
        Tuple (essay_input_ids, essay_attention_mask, features,
        sent_input_ids, sent_attention_mask, sent_token_type_ids, score).
        The three sent_* tensors have shape (batch, max_pairs, tokens) and
        score has shape (batch, 1).
    """
    essay_input_ids, essay_attention_mask, features, pair_encodings, score = zip(*batch)

    essay_input_ids = torch.stack(essay_input_ids, dim=0)
    essay_attention_mask = torch.stack(essay_attention_mask, dim=0)
    features = torch.stack(features, dim=0)
    score = torch.stack(score, dim=0).view(-1, 1)

    # Token width of one encoded pair, needed to build an empty (0, tokens)
    # tensor for essays without any sentence pair; 0 if no essay has a pair.
    pair_width = next((encodings[0]['input_ids'].shape[-1]
                       for encodings in pair_encodings if encodings), 0)

    def _pad_pairs(key, padding_value):
        """Stack the per-pair tensors of each essay and pad to a common pair count."""
        per_essay = []
        for encodings in pair_encodings:
            if encodings:
                per_essay.append(torch.cat([encoding[key] for encoding in encodings]))
            else:
                # torch.cat rejects an empty list, so a single-sentence essay
                # contributes zero pair rows instead.
                per_essay.append(torch.zeros((0, pair_width), dtype=torch.long))
        return pad_sequence(per_essay, batch_first=True, padding_value=padding_value)

    sent_input_ids = _pad_pairs('input_ids', sentence_tokenizer.pad_token_id)
    sent_attention_mask = _pad_pairs('attention_mask', 0)
    sent_token_type_ids = _pad_pairs('token_type_ids', 0)

    return essay_input_ids, essay_attention_mask, features, sent_input_ids, sent_attention_mask, sent_token_type_ids, score

# Options shared by all three loaders; only the training loader shuffles.
loader_options = {'batch_size': hyperparameters['batch_size'], 'collate_fn': collate_fn}
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_options)

# Sanity check: print the shape of every tensor in the first training batch.
for batch in train_dataloader:
    (essay_input_ids, essay_attention_mask, features,
     sent_input_ids, sent_attention_mask, sent_token_type_ids, score) = batch
    for tensor in batch:
        print(tensor.shape)
    break

torch.Size([32, 1929])
torch.Size([32, 1929])
torch.Size([32, 116])
torch.Size([32, 41, 512])
torch.Size([32, 41, 512])
torch.Size([32, 41, 512])
torch.Size([32, 1])


## Model

In [66]:
class MultiFeaturesCoherenceModel(torch.nn.Module):
    """Regress an essay score from three concatenated representations.

    The representations are the essay encoder's pooled output, a max-pool
    over the sentence encoder's pooled outputs of the sentence pairs, and a
    linear projection of the linguistic features.

    Args:
        essay_encoder: module exposing ``config.hidden_size`` and ``pooler``
            whose output has a ``pooler_output`` attribute. All of its
            parameters are frozen except those of the pooler.
        sentence_encoder: module exposing ``config.hidden_size`` whose output
            has a ``pooler_output`` attribute. It is left trainable.
        lf_input_dim: number of linguistic features.
        lf_hidden_dim: size of the linguistic feature projection.
        dropout: dropout probability applied to every representation.
    """

    def __init__(self, essay_encoder, sentence_encoder,
                 lf_input_dim, lf_hidden_dim,
                 dropout=0.3):
        super().__init__()

        # freeze the essay encoder, except the last layer (pooler)
        for param in essay_encoder.parameters():
            param.requires_grad = False
        for param in essay_encoder.pooler.parameters():
            param.requires_grad = True
        self.essay_encoder = essay_encoder

        # linear layer for linguistic features
        self.lf = torch.nn.Linear(lf_input_dim, lf_hidden_dim)

        # sentence encoder
        self.sentence_encoder = sentence_encoder
        # max pooling layer for sentence pairs encoding
        # (B, N, E) -> (B, E)
        self.max_pool = torch.nn.AdaptiveMaxPool1d(1)

        # dropout layer
        self.dropout = torch.nn.Dropout(dropout)
        # activation function
        self.activation = torch.nn.LeakyReLU()

        # linear layer for final prediction
        self.regressor = torch.nn.Linear(essay_encoder.config.hidden_size
                                         + sentence_encoder.config.hidden_size
                                         + lf_hidden_dim, 1)

    def forward(self, essay_input_ids, essay_attention_mask, features,
                sent_input_ids, sent_attention_mask, sent_token_type_ids):
        """Predict one score per essay.

        Args:
            essay_input_ids: (B, T) essay token ids.
            essay_attention_mask: (B, T) essay attention mask.
            features: (B, lf_input_dim) linguistic features.
            sent_input_ids: (B, N, L) token ids of N sentence-pair slots.
            sent_attention_mask: (B, N, L) attention masks; a slot whose mask
                is entirely zero is treated as padding and ignored.
            sent_token_type_ids: (B, N, L) token type ids.

        Returns:
            Tensor of shape (B, 1) with the predicted scores.
        """
        # essay encoding
        essay_outputs = self.essay_encoder(input_ids=essay_input_ids,
                                           attention_mask=essay_attention_mask)
        essay_pooler_output = self.dropout(essay_outputs.pooler_output)

        # linguistic features
        lf_output = self.lf(features)
        lf_output = self.activation(lf_output)
        lf_output = self.dropout(lf_output)

        # sentence pair encoding
        batch_size, num_pairs = sent_input_ids.shape[0], sent_input_ids.shape[1]
        if num_pairs == 0:
            # No pair slot at all: torch.stack cannot take an empty list, so
            # fall back to a zero sentence representation.
            sent_pooler_output = lf_output.new_zeros(
                batch_size, self.sentence_encoder.config.hidden_size)
        else:
            sent_pooler_outputs = []
            for i in range(num_pairs):
                sent_outputs = self.sentence_encoder(input_ids=sent_input_ids[:, i],
                                                     attention_mask=sent_attention_mask[:, i],
                                                     token_type_ids=sent_token_type_ids[:, i])
                sent_pooler_outputs.append(self.dropout(sent_outputs.pooler_output))
            stacked = torch.stack(sent_pooler_outputs, dim=1)  # (B, N, E)

            # A slot is real when at least one of its tokens is attended;
            # padded slots must not be able to win the max-pool.
            pair_is_real = sent_attention_mask.ne(0).any(dim=-1)  # (B, N)
            stacked = stacked.masked_fill(~pair_is_real.unsqueeze(-1), float('-inf'))

            # Squeeze only the pooled dimension: a bare squeeze() would also
            # drop the batch dimension when the batch holds a single essay.
            sent_pooler_output = self.max_pool(stacked.permute(0, 2, 1)).squeeze(-1)

            # Essays without any real pair would otherwise be left at -inf.
            sent_pooler_output = sent_pooler_output.masked_fill(
                ~pair_is_real.any(dim=1, keepdim=True), 0.0)
        sent_pooler_output = self.dropout(sent_pooler_output)

        # concatenate essay, sentence, and linguistic features
        score = self.regressor(torch.cat([essay_pooler_output, sent_pooler_output, lf_output], dim=1))

        return score
    
# Assemble the scoring model: one input per linguistic feature, projected
# to 128 dimensions.
model = MultiFeaturesCoherenceModel(
    essay_encoder=essay_encoder,
    sentence_encoder=sentence_encoder,
    lf_input_dim=len(FEATURES),
    lf_hidden_dim=128,
    dropout=hyperparameters['dropout'],
)

model

MultiFeaturesCoherenceModel(
  (essay_encoder): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (

In [68]:
# Take one training batch and drop its last element (the score) so that only
# the model inputs remain.
first_batch = next(iter(train_dataloader))
inputs = first_batch[:-1]

summary(model, input_data=inputs)

Input ids are automatically padded from 1929 to 2048 to be a multiple of `config.attention_window`: 512


KeyboardInterrupt: 