# <header>**Baseline model for Automated Essay Scoring competition**</header>

*Version 0.1.1*

In [11]:
# !pip install torchview
%pip install torchinfo

Note: you may need to restart the kernel to use updated packages.


# Utils

In [12]:
import random
# from kaggle_secrets import UserSecretsClient # type: ignore
# import wandb # type: ignore
import os
import datetime
import json
import pandas as pd
import numpy as np
import re

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchinfo import summary
# from torchview import draw_graph
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import LongformerModel, LongformerTokenizer

from modules.data import tokenize_text, MultiFeaturesDataset
from modules.model import MultiFeaturesModel
from modules.training import train, EarlyStopper
from modules.evaluate import evaluate, logit_to_score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# user_secrets = UserSecretsClient()
# wandb_api = user_secrets.get_secret("wandb_ha")

# wandb.login(key=wandb_api)

random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

cpu


In [13]:
# model_path = '/kaggle/input/essay-scoring-models/longformer-base-4096'
model_path = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_path)
embedder = LongformerModel.from_pretrained(model_path, attention_window=128)



In [14]:
# data_dir = "/kaggle/input/aes-linguistic"
data_dir = "../../../output/"

with open(os.path.join(data_dir, 'features.txt'), 'r') as f:
    FEATURES = f.read().splitlines()
    
FEATURES = sorted(list(set(FEATURES)))

# data_dir = "../../output/"
train_data = pd.read_csv(os.path.join(data_dir, 'train_linguistic.csv'))
extra_data = pd.read_csv(os.path.join(data_dir, 'extra_linguistic.csv'))

extra_data = extra_data[~extra_data['full_text'].isna()]
train_data = train_data[~train_data['full_text'].isna()]

train_data = train_data[['essay_id', 'full_text', 'score'] + FEATURES]
extra_data = extra_data[['essay_id', 'full_text', 'score'] + FEATURES]

train_data = train_data.drop_duplicates()
extra_data = extra_data.drop_duplicates()

train_data = train_data.reset_index(drop=True)
extra_data = extra_data.reset_index(drop=True)

print(train_data.shape, extra_data.shape)

train_data.sample(5)

(17307, 119) (13125, 119)


Unnamed: 0,essay_id,full_text,score,25th_percentile_mean_word_lens_in_paragraph,25th_percentile_mean_word_lens_in_sentence,25th_percentile_num_adjectives_in_paragraph,25th_percentile_num_adjectives_in_sentence,25th_percentile_num_adverbs_in_paragraph,25th_percentile_num_adverbs_in_sentence,25th_percentile_num_conjunctions_in_paragraph,...,num_adjectives_in_essay,num_adverbs_in_essay,num_conjunctions_in_essay,num_misspelled_words_in_essay,num_nouns_in_essay,num_paragraphs,num_pronouns_in_essay,num_proper_nouns_in_essay,num_verbs_in_essay,num_words_in_essay
12696,bb4c434,"people tend to use there cars so much, they ba...",3,20.159982,3.419643,3.5,0.0,2.0,0.0,8.0,...,36,43,48,606,117,8,66,4,82,606
4625,44e88b0,imagine being a top scientist at nasa and viki...,3,20.475649,3.752422,7.0,1.0,2.0,0.0,3.0,...,41,13,17,427,108,5,22,4,40,427
733,0ba78ec,the face of mars could not be created by alien...,3,13.52431,3.920867,5.25,0.75,2.0,0.75,4.25,...,23,12,17,241,51,3,20,5,15,241
16885,f96c287,many people belive that the face on mars was c...,3,9.158929,3.719048,4.0,0.25,1.0,0.0,2.0,...,22,9,12,269,62,5,13,13,27,269
3334,317173f,driverless cars are coming soon or later? peop...,4,22.430627,3.631579,4.0,0.0,3.0,0.0,8.0,...,35,29,60,643,116,6,86,0,83,643


Split train, val, test set

In [15]:
all_data = pd.concat([train_data, extra_data], ignore_index=True)

# shuffle the data
all_data = all_data.sample(frac=1, random_state=random_seed)
all_data = all_data.reset_index(drop=True)

train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

train_df, val_df = train_test_split(all_data, test_size=val_ratio + test_ratio, 
                                    random_state=random_seed)
val_df, test_df = train_test_split(val_df, test_size=test_ratio/(val_ratio + test_ratio), 
                                   random_state=random_seed)

print(train_df.shape, val_df.shape, test_df.shape)

(24345, 119) (3043, 119) (3044, 119)


Hyperparameters

In [16]:
hyperparameters = {
    'lr': 1e-4,
    'dropout': 0.3,
    'epochs': 15,
    'batch_size': 64,
    'train_set': {
        'total': len(train_df),
        'ratio': train_ratio,
    },
    'val_set': {
        'total': len(val_df),
        'ratio': val_ratio,
    },
    'test_set': {
        'total': len(test_df),
        'ratio': test_ratio,
    },
    'linguistic_features': FEATURES,
    'accelator': str(device)
}

In [17]:
train_tokenized, max_seq_len = tokenize_text(train_df['full_text'].tolist(), tokenizer)

hyperparameters['max_seq_len'] = max_seq_len

train_df['input_ids'] = train_tokenized['input_ids'].tolist()
train_df['attention_mask'] = train_tokenized['attention_mask'].tolist()


print(len(train_df.sample(1).iloc[0]['input_ids']))

1929


In [18]:
val_tokenized = tokenize_text(val_df['full_text'].tolist(), tokenizer, 
                              False, hyperparameters['max_seq_len'])

val_df['input_ids'] = val_tokenized['input_ids'].tolist()
val_df['attention_mask'] = val_tokenized['attention_mask'].tolist()

print(len(val_df.sample(1).iloc[0]['input_ids']))

1929


In [19]:
test_tokenized = tokenize_text(test_df['full_text'].tolist(), tokenizer,
                               False, hyperparameters['max_seq_len'])

test_df['input_ids'] = test_tokenized['input_ids'].tolist()
test_df['attention_mask'] = test_tokenized['attention_mask'].tolist()

print(len(test_df.sample(1).iloc[0]['input_ids']))

1929


In [20]:
train_dataset = MultiFeaturesDataset(train_df, FEATURES)
val_dataset = MultiFeaturesDataset(val_df, FEATURES)

train_dataloader = DataLoader(train_dataset, 
                              batch_size=hyperparameters['batch_size'], 
                              shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, 
                            batch_size=hyperparameters['batch_size'], 
                            shuffle=True, num_workers=4)

for token_ids, attention_mask, features, score in val_dataloader:
    print(token_ids.shape, attention_mask.shape, features.shape, score.shape)
    break

torch.Size([64, 1929]) torch.Size([64, 1929]) torch.Size([64, 116]) torch.Size([64, 1])


In [21]:
model = MultiFeaturesModel(embedder, 
                           len(FEATURES), 256,
                           hyperparameters['dropout'])
model.to(device)

inputs = next(iter(train_dataloader))[:-1]
inputs = [i.to(device) for i in inputs]
model_summary = summary(model, input_data=inputs)

print(model_summary)

MultiFeaturesModel(
  (embedder): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global):

In [13]:
# model_graph = draw_graph(model,
#                          input_data=inputs,
#                          expand_nested=True,
#                          depth=2)
# model_graph.visual_graph

In [17]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=hyperparameters['lr'],
                             weight_decay=1e-8)
scheduler = ReduceLROnPlateau(optimizer, patience=3, factor=0.5)
early_stopper = EarlyStopper(patience=5, min_delta=1e-4)

train_losses, val_losses, val_kappa_scores, val_accuracies = [], [], [], []

hyperparameters['early_stopper'] = early_stopper.__dict__['patience']
hyperparameters['scheduler'] = scheduler.__dict__
hyperparameters['model'] = model.config()

hyperparameters

{'lr': 0.0001,
 'dropout': 0.3,
 'epochs': 15,
 'batch_size': 64,
 'train_set': {'total': 24345, 'ratio': 0.8},
 'val_set': {'total': 3043, 'ratio': 0.1},
 'test_set': {'total': 3044, 'ratio': 0.1},
 'linguistic_features': ['mean_num_adjectives_in_paragraph',
  '25th_percentile_num_adverbs_in_paragraph',
  '25th_percentile_num_adverbs_in_sentence',
  '75th_percentile_num_nouns_in_paragraph',
  'num_pronouns_in_essay',
  '25th_percentile_mean_word_lens_in_sentence',
  'num_verbs_in_essay',
  'min_mean_word_lens_in_paragraph',
  'min_num_verbs_in_paragraph',
  'num_proper_nouns_in_essay',
  'max_num_verbs_in_sentence',
  'max_num_misspelled_words_in_sentence',
  '25th_percentile_mean_word_lens_in_paragraph',
  'mean_num_verbs_in_paragraph',
  '75th_percentile_num_conjunctions_in_paragraph',
  '25th_percentile_num_sentences_in_paragraph',
  '25th_percentile_num_conjunctions_in_paragraph',
  '75th_percentile_num_pronouns_in_paragraph',
  'mean_mean_word_lens_in_sentence',
  'max_num_words_

In [18]:
torch.cuda.empty_cache()
model.to(device)

wandb.init(project='deep-essay-scoring', config=hyperparameters) # type: ignore

print("Start training...")

for epoch in range(hyperparameters['epochs']):
    train_loss = train(model, optimizer, criterion,
                    train_dataloader, device=device)
    train_losses.append(train_loss)

    val_loss, val_scores, val_predictions = evaluate(
        model, criterion, val_dataloader, device=device)
    val_kappa = cohen_kappa_score(val_scores.cpu().numpy(),
                                  logit_to_score(
                                      val_predictions).cpu().numpy(),
                                  weights='quadratic')

    val_accuracy = torch.sum(val_scores == logit_to_score(
        val_predictions)).float() / len(val_scores)

    val_losses.append(val_loss)
    val_kappa_scores.append(val_kappa)
    val_accuracies.append(val_accuracy)

    scheduler.step(val_loss)

    wandb.log({'train_loss': train_loss, 'val_loss': val_loss, # type: ignore
              'val_kappa': val_kappa, 'epoch': epoch+1})

    print(f'Epoch: {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Kappa: {val_kappa}, Val Accuracy: {val_accuracy}')

#     break

    if early_stopper.early_stop(val_losses[-1]):
        print("Early stopping")
        break

[34m[1mwandb[0m: Currently logged in as: [33mminha-lehoang[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240513_110414-dihlsquk[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mlemon-surf-61[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/minha-lehoang/deep-essay-scoring[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/minha-lehoang/deep-essay-scoring/runs/dihlsquk[0m


Start training...
Epoch: 1, Train Loss: 14.843551349139275, Val Loss: 0.631370035931468, Val Kappa: 0.7439040182345746, Val Accuracy: 0.5356556177139282


  return running_loss / len(dataloader), torch.tensor(all_scores), torch.tensor(predictions)


Epoch: 2, Train Loss: 2.546282996186434, Val Loss: 0.4590562911083301, Val Kappa: 0.7659899749801204, Val Accuracy: 0.5839632153511047
Epoch: 3, Train Loss: 1.0678675015141645, Val Loss: 0.44059813891847927, Val Kappa: 0.7722874486508226, Val Accuracy: 0.5879066586494446
Epoch: 4, Train Loss: 0.6795372549160885, Val Loss: 0.4133504976828893, Val Kappa: 0.7811931997411804, Val Accuracy: 0.5898783802986145
Epoch: 5, Train Loss: 0.5694288711535336, Val Loss: 0.4055094936241706, Val Kappa: 0.7750359365709547, Val Accuracy: 0.590207040309906
Epoch: 6, Train Loss: 0.5133324607307204, Val Loss: 0.3916832556327184, Val Kappa: 0.7846470733448693, Val Accuracy: 0.5994085073471069
Epoch: 7, Train Loss: 0.4862338567343284, Val Loss: 0.3998178746551275, Val Kappa: 0.7862185107170478, Val Accuracy: 0.5990798473358154
Epoch: 8, Train Loss: 0.45995994397192175, Val Loss: 0.3964427023505171, Val Kappa: 0.7817607183242348, Val Accuracy: 0.5964508652687073
Epoch: 9, Train Loss: 0.44998757552912855, Val L

In [19]:
embedder_name = model_path.split('/')[-1]
num_epochs = hyperparameters['epochs']
model_name = f'multi_features-{embedder_name}-{num_epochs}_epochs'

# Save the model
torch.save(model.state_dict(), f'{model_name}.pth')

# Save the model summary
with open(f'{model_name}-summary.txt', 'w') as f:
    f.write(str(model_summary))

# save the embedder 
embedder.save_pretrained(f'{embedder_name}-{num_epochs}_epochs')

In [20]:
torch.cuda.empty_cache()

In [21]:
model.eval()
test_predictions = []

with torch.no_grad():
    for i, row in test_df.iterrows():
        token_ids = torch.tensor(row['input_ids']).unsqueeze(0)
        attention_mask = torch.tensor(row['attention_mask']).unsqueeze(0)
        row_ling_features = torch.tensor(row[FEATURES].tolist()).unsqueeze(0)

        output = model(token_ids.to(device), 
                       attention_mask.to(device), 
                       row_ling_features.to(device))
        test_predictions.append(output.item())
        
submit_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'prediction': logit_to_score(torch.tensor(test_predictions)).cpu().numpy()
})
print(submit_df.shape)
submit_df.to_csv('submission.csv', index=False)

(3044, 2)
