# **Neural Network Model for Automated Essay Scoring 2.0 Kaggle Competition**

*Version 0.2*

## Utils

In [None]:
import sys
sys.path.append("/kaggle/input/automated-essay-scoring")

import random
import os
import datetime
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AutoModel

from modules.data import tokenize_text, LSCDataset, collate_fn, pair_encoding, pad_sequence
from modules.model import LSCModel
from modules.training import train, EarlyStopper
from modules.evaluate import evaluate, logit_to_score

# Pick the compute device: the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Seed every random number generator this notebook relies on (Python, NumPy,
# PyTorch CPU and PyTorch CUDA) with the same value so runs are repeatable.
random_seed = 42
for _seed_fn in (random.seed, np.random.seed,
                 torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(random_seed)

from kaggle_secrets import UserSecretsClient # type: ignore
import wandb # type: ignore

# Read the Weights & Biases API key from the Kaggle secret named "wandb_ha".
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_ha")

# Authenticate this session with Weights & Biases using that key.
wandb.login(key=wandb_api)

## Read data and preprocess

In [None]:
# Directory that holds the feature list and the sentence-level CSV files.
data_dir = '/kaggle/input/aes-linguistic'
# data_dir = "../output/"

# Feature names are stored one per line; de-duplicate and sort them so the
# column order is stable.
with open(os.path.join(data_dir, 'features.txt'), 'r') as f:
    FEATURES = f.read().splitlines()
FEATURES = sorted(set(FEATURES))

# Per-essay aggregation rule: gather the sentences into a list and keep the
# first value of every other column.
_selected_columns = ['essay_id', 'full_text', 'sentence', 'score'] + FEATURES
agg_dict = {col: 'first' for col in _selected_columns
            if col not in ['essay_id', 'sentence']}
agg_dict['sentence'] = list


def _load_essay_table(csv_name):
    """Read one sentence-level CSV from ``data_dir`` and return one row per essay.

    Rows containing any missing value are dropped first. The remaining rows
    are restricted to the id/text/sentence/score/feature columns, grouped by
    ``essay_id`` using ``agg_dict``, reordered, and returned with ``essay_id``
    restored as a regular column.
    """
    frame = pd.read_csv(os.path.join(data_dir, csv_name))
    frame = frame.dropna(how='any')
    frame = frame[['essay_id', 'full_text', 'sentence', 'score'] + FEATURES]
    grouped = frame.groupby('essay_id').agg(agg_dict)
    grouped = grouped[['full_text', 'sentence', 'score'] + sorted(FEATURES)]
    return grouped.reset_index()


train_data = _load_essay_table('train_linguistic.csv')
extra_data = _load_essay_table('extra_linguistic.csv')

train_data.head()

## Split data into train, validation and test sets

In [None]:
# Pool the main and extra essays into a single frame before splitting.
all_data = pd.concat([train_data, extra_data], ignore_index=True)

# Shuffle once with a fixed seed so the row order is reproducible.
all_data = all_data.sample(frac=1, random_state=random_seed)
all_data = all_data.reset_index(drop=True)

all_data = all_data[['essay_id', 'full_text', 'sentence', 'score'] + sorted(FEATURES)]

train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Two-stage split: first carve off the combined validation+test portion,
# then divide that holdout between validation and test.
train_df, holdout_df = train_test_split(all_data, test_size=val_ratio + test_ratio,
                                        random_state=random_seed)
val_df, test_df = train_test_split(holdout_df,
                                   test_size=test_ratio / (val_ratio + test_ratio),
                                   random_state=random_seed)

# Detach each split from `all_data`. Later cells add tokenization columns to
# these frames; assigning columns onto a slice triggers pandas'
# SettingWithCopyWarning, whereas an explicit copy is unambiguous.
train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

print(train_df.shape, val_df.shape, test_df.shape)

## Hyperparameters

In [None]:
# Run configuration: optimisation settings, the size/share of each data split,
# the linguistic feature names and the device in use.
# NOTE(review): the 'accelator' key looks like a misspelling of 'accelerator';
# it is a runtime config key, so it is left unchanged here.
hyperparameters = {
    'lr': 1e-6,
    'dropout': 0.5,
    'epochs': 4,
    'batch_size': 4,
    **{
        split_name: {'total': len(split_df), 'ratio': split_ratio}
        for split_name, split_df, split_ratio in (
            ('train_set', train_df, train_ratio),
            ('val_set', val_df, val_ratio),
            ('test_set', test_df, test_ratio),
        )
    },
    'linguistic_features': FEATURES,
    'accelator': str(device),
}

## Embedding Models

In [None]:
# Checkpoint locations inside the Kaggle input dataset. The alternative
# identifiers left by the author are kept below for reference.
# sentence_model = "thenlper/gte-base"
# essay_model = "allenai/longformer-base-4096"
sentence_model = "/kaggle/input/essay-scoring-models/gte-base"
essay_model = "/kaggle/input/essay-scoring-models/longformer-base-4096"

# Tokenizer and encoder used for individual sentences.
sentence_tokenizer, sentence_encoder = (
    AutoTokenizer.from_pretrained(sentence_model),
    AutoModel.from_pretrained(sentence_model),
)

# Tokenizer and encoder used for whole essays.
essay_tokenizer, essay_encoder = (
    AutoTokenizer.from_pretrained(essay_model),
    AutoModel.from_pretrained(essay_model),
)

### Essay Tokenize

In [None]:
# Tokenize the training essays. The second value returned by tokenize_text is
# stored as 'max_seq_len' and passed back in when the validation and test
# essays are tokenized.
_train_texts = train_df['full_text'].tolist()
train_tokenized, _train_max_len = tokenize_text(_train_texts, essay_tokenizer)
hyperparameters['max_seq_len'] = _train_max_len

# Store the token ids and attention mask as per-row Python lists.
for _column, _key in (('essay_input_ids', 'input_ids'),
                      ('essay_attention_mask', 'attention_mask')):
    train_df[_column] = train_tokenized[_key].tolist()

# Spot-check: print the token-id length of one randomly sampled essay.
_sampled_row = train_df.sample(1).iloc[0]
print(len(_sampled_row['essay_input_ids']))

In [None]:
def _tokenize_split(split_df):
    """Tokenize ``split_df['full_text']`` and attach the result to the frame.

    Calls tokenize_text with the essay tokenizer, ``False`` and the stored
    'max_seq_len', writes the token ids and attention mask into ``split_df``
    in place, prints the token-id length of one randomly sampled row, and
    returns the tokenizer output.
    """
    tokenized = tokenize_text(split_df['full_text'].tolist(), essay_tokenizer,
                              False, hyperparameters['max_seq_len'])

    split_df['essay_input_ids'] = tokenized['input_ids'].tolist()
    split_df['essay_attention_mask'] = tokenized['attention_mask'].tolist()

    print(len(split_df.sample(1).iloc[0]['essay_input_ids']))
    return tokenized


# Validation first, then test, matching the order of the printed lengths.
val_tokenized = _tokenize_split(val_df)

test_tokenized = _tokenize_split(test_df)

In [None]:
# Maximum sentence length handed to LSCDataset and pair_encoding below.
# NOTE(review): presumably measured in tokens — confirm in modules.data.
hyperparameters['max_sentence_length'] = 300

## Dataset and DataLoader

In [None]:
# Arguments shared by both datasets and both loaders.
_dataset_args = (sentence_tokenizer, FEATURES,
                 hyperparameters['max_sentence_length'])
_loader_kwargs = {'batch_size': hyperparameters['batch_size'],
                  'collate_fn': collate_fn}

train_dataset = LSCDataset(train_df, *_dataset_args)
val_dataset = LSCDataset(val_df, *_dataset_args)

# Only the training loader shuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

# Sanity check: print the shape of every tensor in the first training batch.
for _batch in train_dataloader:
    (features, essay_input_ids, essay_attention_mask,
     sent_input_ids, sent_attention_mask, score) = _batch
    for _tensor in (features, essay_input_ids, essay_attention_mask,
                    sent_input_ids, sent_attention_mask, score):
        print(_tensor.shape)
    break

## Model

In [None]:
# Constructor arguments for the scoring model. The linguistic-feature branch
# uses the number of features for both its input and hidden size, and no extra
# hidden layers are requested.
# NOTE(review): the meaning of the 'pooler' / 'none' unfreeze options is
# defined in modules.model — confirm there.
_model_kwargs = {
    'essay_encoder': essay_encoder,
    'essay_unfreeze': 'pooler',
    'sentence_encoder': sentence_encoder,
    'sentence_unfreeze': 'none',
    'input_lf_size': len(FEATURES),
    'hidden_lf_size': len(FEATURES),
    'hidden_list': [],
    'dropout': hyperparameters['dropout'],
}
model = LSCModel(**_model_kwargs)

model

In [None]:
# Take one training batch and drop its last element (the score) so that only
# the model inputs remain.
_first_batch = next(iter(train_dataloader))
inputs = _first_batch[:-1]

# Build the torchinfo summary without tracking gradients, then display it.
with torch.no_grad():
    model_summary = summary(model, input_data=inputs)
print(model_summary)

In [None]:
# %pip install torchview
# from torchview import draw_graph

# with torch.no_grad():
#     model_graph = draw_graph(model,
#                          input_data=inputs,
#                          expand_nested=True,
#                          depth=2)
    
# model_graph.visual_graph

## Training and Evaluation

In [None]:
# Loss, optimiser, learning-rate schedule and early stopping for training.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=hyperparameters['lr'],
                             weight_decay=1e-8)
# Cut the learning rate by 10x after 2 epochs without improvement in the
# metric passed to scheduler.step().
scheduler = ReduceLROnPlateau(optimizer, patience=2, factor=0.1)
early_stopper = EarlyStopper(patience=3, min_delta=1e-4)

# Per-epoch history filled in by the training loop.
train_losses, val_losses, val_kappa_scores, val_accuracies = [], [], [], []

hyperparameters['early_stopper'] = early_stopper.__dict__['patience']
# Record a snapshot of the scheduler settings. state_dict() leaves out the
# optimizer object, whereas scheduler.__dict__ would put a live reference to
# the scheduler's internals (optimizer included) into the logged config.
hyperparameters['scheduler'] = scheduler.state_dict()
# hyperparameters['model'] = model.config()
hyperparameters['sentence_encoder'] = sentence_model
hyperparameters['essay_encoder'] = essay_model

hyperparameters

In [None]:
torch.cuda.empty_cache()
model.to(device)

# Start a Weights & Biases run that records the configuration above.
wandb.init(project='deep-essay-scoring',
           config=hyperparameters)  # type: ignore

print("Start training...")

for epoch in range(hyperparameters['epochs']):
    # One pass over the training data.
    train_loss = train(model, optimizer, criterion, train_dataloader,
                       device=device, is_log=True, logging_steps=320)
    train_losses.append(train_loss)

    # Evaluate on the validation split.
    val_loss, val_scores, val_predictions = evaluate(
        model, criterion, val_dataloader, device=device)

    # Convert the raw predictions to scores once and reuse them for both
    # metrics below.
    val_predicted_scores = logit_to_score(val_predictions)

    val_kappa = cohen_kappa_score(val_scores.cpu().numpy(),
                                  val_predicted_scores.cpu().numpy(),
                                  weights='quadratic')

    # .item() turns the 0-dim tensor into a plain Python float so the history
    # list, the log line and the wandb record all hold an ordinary number
    # instead of a tensor.
    val_accuracy = (torch.sum(val_scores == val_predicted_scores).float()
                    / len(val_scores)).item()

    val_losses.append(val_loss)
    val_kappa_scores.append(val_kappa)
    val_accuracies.append(val_accuracy)

    # The learning-rate schedule reacts to the validation loss.
    scheduler.step(val_loss)

    wandb.log({'train_loss': train_loss, 'val_loss': val_loss,
               'val_accuracy': val_accuracy, 'val_kappa': val_kappa,
               'epoch': epoch+1})

    print(f'Epoch: {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Kappa: {val_kappa}, Val Accuracy: {val_accuracy}')

    # Stop as soon as the early stopper signals on the latest validation loss.
    if early_stopper.early_stop(val_losses[-1]):
        print("Early stopping")
        break

In [None]:
# Ask PyTorch to release its cached GPU memory now that training has finished.
torch.cuda.empty_cache()

In [None]:
# File names are tagged with the configured epoch count. If early stopping
# ended training sooner, this number is higher than the epochs actually run.
num_epochs = hyperparameters['epochs']
model_name = f"lsc_{num_epochs}_epochs.pth"

# save the model
torch.save(model.state_dict(), model_name)

# save the model summary
# The torchinfo summary contains non-ASCII box-drawing characters, so write it
# as UTF-8 explicitly instead of relying on the platform's default encoding.
with open(f'model_summary_{num_epochs}_epochs.txt', 'w', encoding='utf-8') as f:
    f.write(str(model_summary))

In [None]:
torch.cuda.empty_cache()

# Switch the model to evaluation mode before predicting on the test split.
model.eval()
test_predictions = []

# Loop invariants, computed once rather than for every essay.
feature_columns = sorted(FEATURES)
max_sentence_length = hyperparameters['max_sentence_length']

# Score the test essays one at a time, without tracking gradients.
with torch.no_grad():
    for _, row in test_df.iterrows():
        # unsqueeze(0) adds the leading batch dimension for a batch of one.
        essay_input_ids = torch.tensor(row['essay_input_ids']).unsqueeze(0)
        essay_attention_mask = torch.tensor(
            row['essay_attention_mask']).unsqueeze(0)
        features = torch.tensor(
            [row[name] for name in feature_columns]).unsqueeze(0)
        sentences = row['sentence']

        pair_encodings = pair_encoding(sentences, sentence_tokenizer,
                                       max_sentence_length)

        # Concatenate the per-pair encodings of this essay and wrap them as a
        # single-item batch. The comprehension variable is named `encoded` so
        # it does not reuse the name of the imported pair_encoding function.
        sent_input_ids = pad_sequence(
            [torch.cat([encoded['input_ids'] for encoded in pair_encodings])],
            batch_first=True, padding_value=1)
        sent_attention_mask = pad_sequence(
            [torch.cat([encoded['attention_mask'] for encoded in pair_encodings])],
            batch_first=True, padding_value=0)

        output = model(features.to(device),
                       essay_input_ids.to(device),
                       essay_attention_mask.to(device),
                       sent_input_ids.to(device),
                       sent_attention_mask.to(device))

        test_predictions.append(output.item())

# Convert the raw outputs to scores and write them next to the essay ids.
submit_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'score': logit_to_score(torch.tensor(test_predictions)).cpu().detach().numpy()
})
print(submit_df.shape)
submit_df.to_csv('submission.csv', index=False)