# <header>**Baseline model for Automated Essay Scoring competition**</header>

*Version 0.1.1*

In [None]:
# !pip install torchview
# !pip install torchinfo

# Utils

In [None]:
import random
# from kaggle_secrets import UserSecretsClient
# import wandb
import os
import datetime
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import spacy
nlp = spacy.load("en_core_web_sm")

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# from torchinfo import summary
# from torchview import draw_graph
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import LongformerModel, LongformerTokenizer

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `device` is read as a global by the training / evaluation / inference cells below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# user_secrets = UserSecretsClient()
# wandb_api = user_secrets.get_secret("wandb_ha")

# wandb.login(key=wandb_api)

# Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with one shared value.
# `random_seed` is reused later for the shuffle and the train/val split.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

In [None]:
# Load the tokenizer and encoder weights from a local Kaggle dataset directory.
model_path = '/kaggle/input/essay-scoring-models/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_path)
# attention_window=128 overrides the window size stored in the checkpoint config
# (presumably to reduce memory use — TODO confirm the intent).
embedder = LongformerModel.from_pretrained(model_path, attention_window=128)

# Data loading and preprocessing

In [None]:
input_dir = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2"
extra_dir = "/kaggle/input/persaude-corpus-2/"

# data_dir = "../../output/"

# Competition data; training rows without essay text are dropped.
train_data = pd.read_csv(os.path.join(input_dir, 'train.csv')).dropna(subset=['full_text'])
test_data = pd.read_csv(os.path.join(input_dir, 'test.csv'))

# External corpus: drop rows without text, align its column names with the
# competition schema and keep only the columns shared with train.csv.
extra_raw = pd.read_csv(os.path.join(extra_dir, 'persuade_2.0_human_scores_demo_id_github.csv'))
extra_data = (
    extra_raw
    .dropna(subset=['full_text'])
    .rename(columns={'essay_id_comp': 'essay_id',
                     'holistic_essay_score': 'score'})
)
extra_data = extra_data[['essay_id', 'full_text', 'score']]

# Remove external essays whose text already appears in the competition training set.
already_in_train = extra_data['full_text'].isin(train_data['full_text'])
extra_data = extra_data[~already_in_train].reset_index(drop=True)

print(train_data.shape, test_data.shape, extra_data.shape)

# Global registry of engineered feature column names; filled by the feature helpers below.
FEATURES = set()

In [None]:
from distutils.dir_util import copy_tree

copy_tree('/kaggle/input/spellchecker', '/kaggle/working/')

!gzip '/kaggle/working/spellchecker/resources/en.json'

!pip install '/kaggle/working/pyspellchecker-0.8.1-py3-none-any.whl'

from spellchecker import SpellChecker

spell = SpellChecker()

In [None]:
def preprocess_text(text: str):
    """Normalize raw essay text before feature extraction.

    Steps, in order: lower-case, strip URLs, collapse every whitespace run
    (including newlines and tabs) to a single space, collapse repeated periods
    and repeated commas to one, then trim leading/trailing whitespace.

    Args:
        text: Raw text of an essay or paragraph.

    Returns:
        The cleaned text.
    """
    text = text.lower()
    # text = removeHTML(text)
    # Remove the whole URL token. The previous pattern ("http\w+", non-raw string)
    # stopped at the first non-word character and left "://host/path" behind.
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\s+", " ", text)  # remove extra spaces
    # x = expandContractions(x)
    text = re.sub(r"\.+", ".", text)  # remove extra periods
    text = re.sub(r",+", ",", text)  # remove extra commas
    return text.strip()  # remove leading and trailing spaces

def is_misspelled(words: list):
    """Count how many tokens in `words` the spell checker does not recognise.

    Despite its name, this returns a count, not a boolean; callers store it as
    a per-sentence "number of misspelled words" feature.

    The previous implementation returned `len(words)` for every input, because it
    took the length of a list holding one `spell.unknown(...)` result per word.

    Args:
        words: Tokens (strings) of one sentence.

    Returns:
        Number of tokens, duplicates included, reported as unknown by the
        module-level `spell` checker.
    """
    # One dictionary lookup for the whole sentence. Per the pyspellchecker docs,
    # `unknown` returns a set of the unrecognised words, lower-cased by default.
    unknown_words = spell.unknown(words)
    return sum(1 for word in words if word.lower() in unknown_words)

In [None]:
def get_paragraphs(data_df: pd.DataFrame):
    """Add a 'paragraph' column holding each essay's cleaned paragraphs.

    An essay is split on blank lines ("\n\n"), every piece is passed through
    `preprocess_text`, and pieces that are empty after cleaning are discarded.
    The frame is modified in place and also returned.

    Args:
        data_df: Frame with a 'full_text' column.

    Returns:
        The same frame with a list-valued 'paragraph' column.
    """
    def _split_and_clean(essay):
        cleaned = (preprocess_text(chunk) for chunk in essay.split("\n\n"))
        return [chunk for chunk in cleaned if chunk.strip()]

    data_df['paragraph'] = data_df['full_text'].apply(_split_and_clean)
    return data_df


def get_sentences(data_df: pd.DataFrame):
    """Add a 'sentence' column with the sentence spans of each paragraph.

    Every value of 'paragraph' is run through the module-level spaCy pipeline
    `nlp`; the 'sentencizer' component is added to that pipeline the first time
    this runs. The frame is modified in place and also returned.

    Args:
        data_df: Frame with one paragraph string per row in 'paragraph'.

    Returns:
        The same frame with a list-valued 'sentence' column.
    """
    # nlp.add_pipe('sentencizer')
    sentencizer_missing = 'sentencizer' not in nlp.pipe_names
    if sentencizer_missing:
        nlp.add_pipe('sentencizer')

    def _sentences_of(paragraph):
        spans = []
        for span in nlp(paragraph).sents:
            spans.append(span.sent)
        return spans

    data_df['sentence'] = data_df['paragraph'].apply(_sentences_of)
    return data_df


def get_tokens(data_df: pd.DataFrame):
    """Add per-sentence token attribute lists to the frame.

    For every row the tokens of 'sentence' are iterated and tokens with an
    empty `.text` are skipped. Four list-valued columns are written:
    'words' (`.text`), 'lemmas' (`.lemma_`), 'pos' (`.pos_`) and
    'is_stop' (`.is_stop`). The frame is modified in place and also returned.

    Args:
        data_df: Frame whose 'sentence' column holds iterables of token objects.

    Returns:
        The same frame with the four new columns.
    """
    column_to_attribute = {
        'words': 'text',
        'lemmas': 'lemma_',
        'pos': 'pos_',
        'is_stop': 'is_stop',
    }

    for column, attribute in column_to_attribute.items():
        # Bind `attribute` as a default so each lambda keeps its own value.
        data_df[column] = data_df['sentence'].apply(
            lambda sentence, attribute=attribute: [
                getattr(token, attribute) for token in sentence if token.text
            ])

    return data_df

In [None]:
def get_features_in_essays(data_df: pd.DataFrame, column_name: str, feature_name: str):
    """Append summary statistics of `column_name` as constant columns.

    Mean, max, min, 25th and 75th percentile are computed over every row of
    `data_df[column_name]` and broadcast to all rows as
    '<stat>_<feature_name>_in_essay'. Each new column name is registered in the
    global `FEATURES` set.

    NOTE(review): as in the original expressions, the statistics are taken over
    the whole frame, not per `essay_id` — confirm that this is intended.

    Fixes: the original called the non-existent `pd.add` and built a DataFrame
    from scalars without an index, so it could never run.

    Args:
        data_df: Input frame.
        column_name: Numeric column to summarise.
        feature_name: Name used to build the new column names.

    Returns:
        A new frame: `data_df` with the five statistic columns appended.
    """
    values = data_df[column_name]
    statistics = (
        ('mean_', values.mean()),
        ('max_', values.max()),
        ('min_', values.min()),
        ('25th_percentile_', np.percentile(values, 25)),
        ('75th_percentile_', np.percentile(values, 75)),
    )

    new_columns = {}
    for prefix, value in statistics:
        name = prefix + feature_name + '_in_essay'
        new_columns[name] = value
        FEATURES.add(name)

    # Scalars need an explicit index to be broadcast; reusing data_df's own index
    # keeps the two frames aligned row for row in the column-wise concat.
    stats_df = pd.DataFrame(new_columns, index=data_df.index)
    data_df = pd.concat([data_df, stats_df], axis=1)

    return data_df

def get_features_in_paragraphs(data_df: pd.DataFrame, column_name: str, feature_name: str):
    """Append per-essay statistics of a paragraph-level column.

    Rows are grouped by 'essay_id'; mean, max, min, 25th and 75th percentile of
    `column_name` are computed per group and written back to every row of the
    group as '<stat>_<feature_name>_in_paragraph'. Each new column name is added
    to the global `FEATURES` set.

    Args:
        data_df: Frame with 'essay_id' and `column_name`.
        column_name: Column to summarise.
        feature_name: Name used to build the new column names.

    Returns:
        A new frame: `data_df` with the five statistic columns appended.
    """
    suffix = feature_name + '_in_paragraph'
    per_essay = data_df.copy().groupby(['essay_id'])[column_name]

    aggregations = (
        ('mean_', 'mean'),
        ('max_', 'max'),
        ('min_', 'min'),
        ('25th_percentile_', lambda x: np.percentile(x, 25)),
        ('75th_percentile_', lambda x: np.percentile(x, 75)),
    )

    new_columns = {}
    for prefix, how in aggregations:
        name = prefix + suffix
        new_columns[name] = per_essay.transform(how)
        FEATURES.add(name)

    return pd.concat([data_df, pd.DataFrame(new_columns)], axis=1)

def get_features_in_sentences(data_df: pd.DataFrame, column_name: str, feature_name: str):
    """Append per-essay statistics of a sentence-level column.

    Rows are grouped by 'essay_id'; mean, max, min, 25th and 75th percentile of
    `column_name` are computed per group and written back to every row of the
    group as '<stat>_<feature_name>_in_sentence'. Each new column name is added
    to the global `FEATURES` set.

    Args:
        data_df: Frame with 'essay_id' and `column_name`.
        column_name: Column to summarise.
        feature_name: Name used to build the new column names.

    Returns:
        A new frame: `data_df` with the five statistic columns appended.
    """
    suffix = feature_name + '_in_sentence'
    per_essay = data_df.copy().groupby(['essay_id'])[column_name]

    aggregations = (
        ('mean_', 'mean'),
        ('max_', 'max'),
        ('min_', 'min'),
        ('25th_percentile_', lambda x: np.percentile(x, 25)),
        ('75th_percentile_', lambda x: np.percentile(x, 75)),
    )

    new_columns = {}
    for prefix, how in aggregations:
        name = prefix + suffix
        new_columns[name] = per_essay.transform(how)
        FEATURES.add(name)

    return pd.concat([data_df, pd.DataFrame(new_columns)], axis=1)

def get_features_multi_levels(data_df: pd.DataFrame, column_name: str, feature_name: str):
    """Derive sentence-, paragraph- and essay-level features from one column.

    1. Per-essay statistics of the sentence-level `column_name`.
    2. '<feature_name>_in_paragraph': sum of `column_name` within each
       ('essay_id', 'paragraph') group, followed by per-essay statistics of it.
    3. '<feature_name>_in_essay': sum of `column_name` within each essay; only
       this total is registered in `FEATURES` here (the statistic columns are
       registered by the helpers that create them).

    Args:
        data_df: Frame with 'essay_id', 'paragraph' and `column_name`.
        column_name: Sentence-level numeric column.
        feature_name: Name used to build the new column names.

    Returns:
        The frame with all derived columns added.
    """
    paragraph_total = feature_name + '_in_paragraph'
    essay_total = feature_name + '_in_essay'

    # Sentence level.
    data_df = get_features_in_sentences(data_df, column_name, feature_name)

    # Paragraph level: total per paragraph, then its statistics per essay.
    by_paragraph = data_df.groupby(['essay_id', 'paragraph'])[column_name]
    data_df[paragraph_total] = by_paragraph.transform('sum')
    data_df = get_features_in_paragraphs(data_df, paragraph_total, feature_name)

    # Essay level: total per essay.
    by_essay = data_df.groupby('essay_id')[column_name]
    data_df[essay_total] = by_essay.transform('sum')
    FEATURES.add(essay_total)

    return data_df

In [None]:
def get_features(data_df: pd.DataFrame):
    """Build the essay-level linguistic feature table.

    The frame is exploded to one row per paragraph and then one row per
    sentence; sentence-level measurements are aggregated back to the essay via
    `get_features_multi_levels`; finally only the id/text/(score) columns plus
    every column registered in the global `FEATURES` set are kept and duplicate
    rows are dropped.

    Notes:
        * 'num_sents_in_essay' is computed but never added to `FEATURES`, so it
          is not part of the returned frame.
        * `reset_index()` is called without `drop=True`, so the previous index
          is kept as an 'index' column.
        * POS tags are matched by substring (`tag in pos`).

    Args:
        data_df: Frame with 'essay_id' and 'full_text' (and optionally 'score').

    Returns:
        Frame with the id/text/(score) columns and the registered feature columns.
    """
    def _count_tag(tag):
        # Counter for POS labels that contain `tag` as a substring.
        return lambda tags: np.count_nonzero([tag in pos for pos in tags])

    # One row per paragraph, then one row per sentence.
    data_df = get_paragraphs(data_df).explode('paragraph')
    data_df['full_text'] = data_df['full_text'].apply(preprocess_text)
    data_df = get_sentences(data_df).explode('sentence')

    data_df = get_tokens(data_df)
    # Token lists are extracted; keep only the sentence text from here on.
    data_df['sentence'] = data_df['sentence'].apply(lambda span: span.text)

    # Paragraph count per essay.
    paragraphs_by_essay = data_df.groupby('essay_id')['paragraph']
    data_df['num_paragraphs'] = paragraphs_by_essay.transform('nunique')
    FEATURES.add('num_paragraphs')

    # Sentence count per paragraph and its per-essay statistics.
    sentences_by_paragraph = data_df.groupby(['essay_id', 'paragraph'])['sentence']
    data_df['num_sents_in_paragraph'] = sentences_by_paragraph.transform('nunique')
    data_df = get_features_in_paragraphs(
        data_df, 'num_sents_in_paragraph', 'num_sentences')

    sentences_by_essay = data_df.groupby('essay_id')['sentence']
    data_df['num_sents_in_essay'] = sentences_by_essay.transform('nunique')

    # (feature name, source column, per-sentence measurement), in processing order.
    sentence_measurements = [
        ('num_words', 'words', len),
        ('mean_word_lens', 'words',
         lambda words: np.mean([len(word) for word in words])),
        ('num_proper_nouns', 'pos', _count_tag('PROPN')),
        ('num_nouns', 'pos', _count_tag('NOUN')),
        ('num_verbs', 'pos', _count_tag('VERB')),
        ('num_adjectives', 'pos', _count_tag('ADJ')),
        ('num_adverbs', 'pos', _count_tag('ADV')),
        ('num_pronouns', 'pos', _count_tag('PRON')),
        ('num_conjunctions', 'pos', _count_tag('CONJ')),
        ('num_misspelled_words', 'lemmas', lambda lemmas: is_misspelled(lemmas)),
    ]
    for feature_name, source_column, measure in sentence_measurements:
        sentence_column = feature_name + '_in_sentence'
        data_df[sentence_column] = data_df[source_column].apply(measure)
        data_df = get_features_multi_levels(data_df, sentence_column, feature_name)

    # Keep the label only when the input has one (the test set does not).
    id_columns = ['essay_id', 'full_text']
    if 'score' in data_df.columns:
        id_columns.append('score')

    data_df = data_df[id_columns + list(FEATURES)]
    data_df = data_df.drop_duplicates().reset_index()

    return data_df

# Replace each raw frame with its feature table. The names are rebound in place,
# so this cell is meant to run once per kernel session, after the data-loading cell.
print("Getting features for train set")
train_data = get_features(train_data)

print("Getting features for test set")
test_data = get_features(test_data)

print("Getting features for extra set")
extra_data = get_features(extra_data)

print(train_data.shape, test_data.shape, extra_data.shape)

Split into train, val, and test sets

In [None]:
# Pool the competition training essays with the external essays.
all_data = pd.concat([train_data, extra_data], ignore_index=True)

print("Total numbers of feature:", len(FEATURES))

# shuffle the data
all_data = all_data.sample(frac=1, random_state=random_seed)
all_data = all_data.reset_index(drop=True)

train_ratio, val_ratio = 0.8, 0.2

train_df, val_df = train_test_split(all_data, test_size=val_ratio,
                                    random_state=random_seed)
# Later cells add tokenizer columns to these frames. Take explicit copies so those
# assignments act on independent frames rather than on selections derived from
# `all_data` (avoids pandas' SettingWithCopy behaviour).
train_df = train_df.copy()
val_df = val_df.copy()

test_df = test_data

print("Number of train, val, test samples:")
print(train_df.shape, val_df.shape, test_df.shape)

Hyperparameters

In [None]:
# Run configuration, read by later cells ('epochs', 'batch_size', 'lr', 'dropout')
# and extended by them ('max_seq_len', 'early_stopper', 'scheduler', 'model').
hyperparameters = {
    'lr': 5e-5,
    'dropout': 0.5,
    'epochs': 10,
    'batch_size': 64,
    'train_set': {
        'total': len(train_df),
    },
    'val_set': {
        'total': len(val_df),
        'ratio': val_ratio,
    },
    'test_set': {
        'total': len(test_df),
    },
    # Stores the FEATURES set object itself (not a copy).
    'linguistic_features': FEATURES,
    # NOTE(review): key is spelled 'accelator' (likely meant 'accelerator'); left
    # unchanged because the key is a runtime string.
    'accelator': str(device)
}

print(hyperparameters)

In [None]:
print("Tokenizing train set")

# Pad every essay to the longest one in the training set. truncation=True (with no
# explicit max_length) caps sequences at the tokenizer's maximum model input
# length, so a single over-long essay cannot yield a batch the encoder cannot
# embed. Essays within that limit are tokenized exactly as before.
train_tokenized = tokenizer(train_df['full_text'].tolist(),
                            padding=True,
                            truncation=True,
                            return_tensors="np")

# The padded training length becomes the fixed sequence length for val/test.
hyperparameters['max_seq_len'] = train_tokenized['input_ids'].shape[1]

train_df['input_ids'] = train_tokenized['input_ids'].tolist()
train_df['attention_mask'] = train_tokenized['attention_mask'].tolist()


# Sanity check: token length of one random row.
print(len(train_df.sample(1).iloc[0]['input_ids']))

In [None]:
print("Tokenizing val set")

# Pad/truncate validation essays to the sequence length fixed by the training set.
val_tokenized = tokenizer(
    val_df['full_text'].tolist(),
    padding='max_length',
    truncation=True,
    max_length=hyperparameters['max_seq_len'],
    return_tensors="np",
)

for encoded_column in ('input_ids', 'attention_mask'):
    val_df[encoded_column] = val_tokenized[encoded_column].tolist()


# Sanity check: token length of one random row.
print(len(val_df.sample(1).iloc[0]['input_ids']))

In [None]:
print("Tokenizing test set")

# Pad/truncate test essays to the sequence length fixed by the training set.
test_tokenized = tokenizer(
    test_df['full_text'].tolist(),
    padding='max_length',
    truncation=True,
    max_length=hyperparameters['max_seq_len'],
    return_tensors="np",
)

for encoded_column in ('input_ids', 'attention_mask'):
    test_df[encoded_column] = test_tokenized[encoded_column].tolist()

# Sanity check: token length of one random row.
print(len(test_df.sample(1).iloc[0]['input_ids']))

In [None]:
class MultiFeaturesDataset(Dataset):
    """Dataset yielding (token ids, attention mask, linguistic features, score).

    The linguistic feature columns are taken in the iteration order of the
    global `FEATURES` set at construction time.
    """

    def __init__(self, df: pd.DataFrame):
        """Cache the needed columns of `df` as arrays.

        Args:
            df: Frame with 'input_ids', 'attention_mask', 'score' and one
                column per name in `FEATURES`.
        """
        self.token_ids = df['input_ids'].values
        self.attention_mask = df['attention_mask'].values
        self.score = df['score'].values
        # One array per feature column, in FEATURES iteration order.
        self.ling_features = [df[name].values for name in FEATURES]

    def __len__(self):
        """Number of samples."""
        return len(self.score)

    def __getitem__(self, idx):
        """Return the tensors for sample `idx`.

        Returns:
            Tuple (token_ids, attention_mask, features, score); `features` is a
            float tensor with one entry per feature column and `score` is a
            float tensor of shape (1,).
        """
        ling = torch.tensor(
            [column[idx] for column in self.ling_features], dtype=torch.float)
        target = torch.tensor(self.score[idx], dtype=torch.float).reshape(1)
        tokens = torch.tensor(self.token_ids[idx])
        mask = torch.tensor(self.attention_mask[idx])

        return tokens, mask, ling, target


# Wrap the tokenized train/val frames; both need the 'score' column.
train_dataset = MultiFeaturesDataset(train_df)
val_dataset = MultiFeaturesDataset(val_df)

In [None]:
train_dataloader = DataLoader(
    train_dataset, batch_size=hyperparameters['batch_size'], shuffle=True)
# Validation metrics do not depend on sample order, so keep the loader
# deterministic instead of drawing from the RNG on every evaluation pass.
val_dataloader = DataLoader(
    val_dataset, batch_size=hyperparameters['batch_size'], shuffle=False)

# Sanity check: print the tensor shapes of the first validation batch.
for token_ids, attention_mask, features, score in val_dataloader:
    print(token_ids.shape, attention_mask.shape, features.shape, score.shape)
    break

In [None]:
class MultiFeaturesModel(torch.nn.Module):
    """Regressor over a text encoder's pooled output plus linguistic features.

    The pooled encoder output (with dropout) is concatenated with a one-layer
    projection of the linguistic features (LeakyReLU + dropout) and mapped to a
    single score by a linear layer.
    """

    def __init__(self, embedder,
                 lf_input_size, lf_hidden_size=64,
                 dropout=0.2):
        """
        Args:
            embedder: Encoder module exposing `.pooler` and `.config.hidden_size`.
                Its parameters are modified in place: all are frozen except
                those of the pooler.
            lf_input_size: Number of linguistic features.
            lf_hidden_size: Width of the linguistic-feature projection.
            dropout: Dropout probability applied to both branches.
        """
        super().__init__()
        # freeze
        for param in embedder.parameters():
            param.requires_grad = False
        # unfreeze the pooler
        for param in embedder.pooler.parameters():
            param.requires_grad = True

        self.embedder = embedder
        self.lf = torch.nn.Linear(lf_input_size, lf_hidden_size)
        self.regressor = torch.nn.Linear(lf_hidden_size + embedder.config.hidden_size, 1)
        self.dropout = torch.nn.Dropout(dropout)

    def config(self):
        """Return a dict describing the encoder config and the layer sizes."""
        return {
            'embedder': self.embedder.config,
            'lf': {
                'input_size': self.lf.in_features,
                'hidden_size': self.lf.out_features
            },
            'regressor': {
                'input_size': self.regressor.in_features,
                'output_size': self.regressor.out_features
            }
        }

    def forward(self, token_ids, attention_mask, ling_features):
        """Predict one score per sample.

        Args:
            token_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            ling_features: Float tensor with `lf_input_size` values per sample.

        Returns:
            Tensor with one predicted score per sample (last dimension 1).
        """
        # Element 1 of the encoder output is used as the pooled representation.
        # Per-layer hidden states are no longer requested: they were never read
        # and only held extra activations in memory.
        embedded = self.embedder(token_ids, attention_mask=attention_mask)[1]
        # nn.Dropout is already a no-op in eval mode, so no `self.training` guard.
        embedded = self.dropout(embedded)

        ling_features = F.leaky_relu(self.lf(ling_features))
        ling_features = self.dropout(ling_features)

        features = torch.cat((embedded, ling_features), dim=1)
        score = self.regressor(features)
        return score


# Positional args: number of linguistic features, then a 128-wide feature projection.
model = MultiFeaturesModel(embedder, 
                           len(FEATURES), 128,
                           hyperparameters['dropout'])
# model.to(device)
# Bare expression: shows the module structure as the cell output.
model

In [None]:
# model_graph = draw_graph(model,
#                          input_data=inputs,
#                          expand_nested=True,
#                          depth=2)
# model_graph.visual_graph

In [None]:
def train(model, optimizer, criterion, train_dataloader, logging_steps=20):
    """Run one training epoch and return the mean batch loss.

    Batches are moved to the module-level `device`. Gradients are cleared after
    each optimizer step.

    Args:
        model: Module called as `model(token_ids, attention_mask, features)`.
        optimizer: Optimizer over the model parameters.
        criterion: Loss taking (prediction, target).
        train_dataloader: Yields (token_ids, attention_mask, features, score).
        logging_steps: Only referenced by the commented-out logging code.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    loss_sum = 0.0

    for step, batch in enumerate(train_dataloader):
        token_ids, attention_mask, features, score = batch

        prediction = model(
            token_ids.to(device),
            attention_mask.to(device),
            features.to(device),
        )
        target = score.to(device)
        batch_loss = criterion(prediction, target).float()

        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loss_sum += batch_loss.item()

        # if (step + 1) % (logging_steps) == 0 or (step + 1) == len(train_dataloader):
        #     wandb.log({'train_loss_steps': loss_sum / (step + 1),
        #                'learning_rate': optimizer.param_groups[0]['lr']})

    return loss_sum / len(train_dataloader)


def evaluate(model, criterion, dataloader):
    """Evaluate the model on a dataloader without tracking gradients.

    Batches are moved to the module-level `device`.

    Args:
        model: Module called as `model(token_ids, attention_mask, features)`.
        criterion: Loss taking (prediction, target).
        dataloader: Yields (token_ids, attention_mask, features, score).

    Returns:
        Tuple (mean batch loss, targets, predictions). Targets and predictions
        are CPU tensors holding all batches concatenated along dimension 0.
    """
    model.eval()
    running_loss = 0.0
    score_batches = []
    prediction_batches = []

    with torch.no_grad():
        for token_ids, attention_mask, features, score in dataloader:
            output = model(token_ids.to(device),
                           attention_mask.to(device),
                           features.to(device))

            loss = criterion(output, score.to(device)).float()

            running_loss += loss.item()
            score_batches.append(score.cpu())
            prediction_batches.append(output.cpu())

    mean_loss = running_loss / len(dataloader)
    # Concatenate tensors directly: building a tensor from a list of per-sample
    # numpy arrays is slow and makes PyTorch emit a warning. Shapes and dtypes of
    # the results are unchanged.
    return mean_loss, torch.cat(score_batches), torch.cat(prediction_batches)

In [None]:
class EarlyStopper:
    """Signal a stop once validation loss has failed to improve `patience` times.

    A check counts as a failure when the loss exceeds the best loss seen so far
    by more than `min_delta`. Any new best loss resets the failure counter.
    """

    def __init__(self, patience=1, min_delta=0):
        """
        Args:
            patience: Number of counted failures that triggers the stop.
            min_delta: Tolerance above the best loss that is not counted.
        """
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        """Record `validation_loss`; return True when training should stop."""
        if validation_loss < self.min_validation_loss:
            # New best: remember it and start counting from scratch.
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        tolerated = self.min_validation_loss + self.min_delta
        if not (validation_loss > tolerated):
            # Within tolerance of the best loss: neither progress nor failure.
            return False

        self.counter += 1
        return self.counter >= self.patience

In [None]:
def logit_to_score(logit, min_score=1, max_score=6):
    """Convert raw regression outputs to integer scores.

    Values are rounded with `torch.round`, clamped to [min_score, max_score]
    and cast to int64.

    Args:
        logit: Tensor of raw model outputs.
        min_score: Lowest allowed score.
        max_score: Highest allowed score.

    Returns:
        Long tensor with the same shape as `logit`.
    """
    rounded = logit.round()
    bounded = rounded.clamp(min_score, max_score)
    return bounded.long()

In [None]:
# Scores are fitted as a regression target with mean squared error.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=hyperparameters['lr'],
                             weight_decay=1e-8)
# Halve the learning rate after 3 scheduler steps without improvement of the
# monitored value (the training loop passes the validation loss).
scheduler = ReduceLROnPlateau(optimizer, patience=3, factor=0.5)
# NOTE: the early-stopping check in the training loop is currently commented out,
# so this object is only recorded in `hyperparameters`.
early_stopper = EarlyStopper(patience=5, min_delta=1e-6)

# Per-epoch metric histories filled by the training loop.
train_losses, val_losses, val_kappa_scores, val_accuracies = [], [], [], []

hyperparameters['early_stopper'] = early_stopper.__dict__['patience']
# Stores the scheduler's live attribute dict, not a snapshot.
hyperparameters['scheduler'] = scheduler.__dict__
hyperparameters['model'] = model.config()

hyperparameters

In [None]:
torch.cuda.empty_cache()
model.to(device)

# wandb.init(project='deep-essay-scoring', config=hyperparameters)

print("Start training...")

for epoch in range(hyperparameters['epochs']):
    # --- train ---
    train_loss = train(model, optimizer, criterion, train_dataloader)
    train_losses.append(train_loss)

    # --- validate ---
    val_loss, val_scores, val_predictions = evaluate(
        model, criterion, val_dataloader)

    # Round/clamp the regression outputs once and reuse them for both metrics.
    val_pred_scores = logit_to_score(val_predictions)

    val_kappa = cohen_kappa_score(
        val_scores.cpu().numpy(),
        val_pred_scores.cpu().numpy(),
        weights='quadratic',
    )
    num_correct = torch.sum(val_scores == val_pred_scores)
    val_accuracy = num_correct.float() / len(val_scores)

    val_losses.append(val_loss)
    val_kappa_scores.append(val_kappa)
    val_accuracies.append(val_accuracy)

    # The plateau scheduler monitors the validation loss.
    scheduler.step(val_loss)

    # wandb.log({'train_loss': train_loss, 'val_loss': val_loss,
    #           'val_kappa': val_kappa, 'epoch': epoch+1})

    print(f'Epoch: {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Kappa: {val_kappa}, Val Accuracy: {val_accuracy}')

#     break

#     if early_stopper.early_stop(val_losses[-1]):
#         print("Early stopping")
#         break

In [None]:
# # Get current date and time
# now = datetime.datetime.now()
# now_str = now.strftime("%YY-%mm-%dd") 

# embedder_name = model_path.split('/')[-1]
# model_name = f'multi_features_{embedder_name}_{now_str}_model'

# # Save the model
# with open(f'{model_name}_summary.txt', 'w') as f:
#     f.write(str(model_summary))

# torch.save(model.state_dict(), f'{model_name}.pth')

In [None]:
# Release cached, unused GPU memory held by PyTorch before running inference.
torch.cuda.empty_cache()

In [None]:
model.eval()
test_predictions = []

# Fix the feature column order once. It matches the iteration order of FEATURES
# used by MultiFeaturesDataset, as long as the set is not modified in between.
feature_columns = list(FEATURES)

with torch.no_grad():
    # One essay per forward pass (batch size 1).
    for i, row in test_df.iterrows():
        token_ids = torch.tensor(row['input_ids']).unsqueeze(0)
        attention_mask = torch.tensor(row['attention_mask']).unsqueeze(0)
        # Explicit float dtype, consistent with MultiFeaturesDataset, so the
        # tensor type does not depend on how the row values happen to be typed.
        row_ling_features = torch.tensor(
            row[feature_columns].tolist(), dtype=torch.float).unsqueeze(0)

        output = model(token_ids.to(device),
                       attention_mask.to(device),
                       row_ling_features.to(device))
        test_predictions.append(output.item())

# The competition's submission file uses the header `essay_id,score`; the previous
# column name 'prediction' does not match that format.
submit_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'score': logit_to_score(torch.tensor(test_predictions)).cpu().numpy()
})
print(submit_df.shape)
submit_df.to_csv('submission.csv', index=False)