In [None]:
import random

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from string import punctuation, printable
import re

import spacy
# Shared spaCy pipeline; the feature-extraction helpers below read this
# module-level `nlp` object (get_sentences may add a 'sentencizer' pipe to it).
nlp = spacy.load("en_core_web_sm")


import sklearn

import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Seed every RNG used in this notebook (stdlib, NumPy, torch CPU and torch CUDA)
# with the same value.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

In [None]:
from distutils.dir_util import copy_tree

copy_tree('/kaggle/input/spellchecker', '/kaggle/working/')

!gzip '/kaggle/working/spellchecker/resources/en.json'

!pip install '/kaggle/working/pyspellchecker-0.8.1-py3-none-any.whl'

In [None]:
# Imported here rather than in the first cell because the package is only
# installed by the preceding cell.
from spellchecker import SpellChecker

# Module-level spell checker used by is_misspelled().
spell = SpellChecker()

In [None]:
# Location of the competition dataset; only test.csv is read in this notebook.
input_dir = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2"
test_data = pd.read_csv(os.path.join(input_dir, 'test.csv'))

print(test_data.shape)

In [None]:
# Ordered list of linguistic feature column names. The feature helpers below
# append to it as they create columns, and the model later reads the columns in
# exactly this order.
FEATURES = []

def preprocess_text(text: str):
    """Normalise a piece of essay text.

    Steps, in order: lowercase, strip URLs, collapse every whitespace run to a
    single space, collapse repeated periods and repeated commas to one, and
    trim leading/trailing whitespace.

    Args:
        text: Raw text (a whole essay or a single paragraph).

    Returns:
        The normalised string.

    NOTE(review): the URL pattern used to be the non-raw ``"http\\w+"``, which
    (a) relies on an invalid escape sequence and (b) cannot match
    ``http://...`` at all and only strips the ``https`` prefix of
    ``https://...``. Any model trained on text cleaned with the old pattern saw
    the URL remainders; confirm the training pipeline uses the same cleaning.
    """
    text = text.lower()
    # Drop every token that starts with "http" up to the next whitespace.
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r"\s+", " ", text)  # collapse whitespace runs
    text = re.sub(r"\.+", ".", text)  # collapse repeated periods
    text = re.sub(r"\,+", ",", text)  # collapse repeated commas
    return text.strip()

def is_misspelled(words: list):
    """Count the tokens in ``words`` that the spell checker does not know.

    The previous implementation built one list entry per word and returned the
    length of that list, i.e. it always returned ``len(words)``; it also handed
    a bare string to ``spell.unknown``, which expects an iterable of words.

    Args:
        words: Tokens (strings) of one sentence.

    Returns:
        Number of tokens for which ``spell.unknown([token])`` is non-empty.
        Repeated misspellings are counted once per occurrence; empty or
        whitespace-only tokens are ignored.

    NOTE(review): this changes the values of the ``num_misspelled_words``
    features. A checkpoint trained on the old (word-count) values must be
    retrained or re-validated with this definition.
    """
    return sum(
        1
        for word in words
        if word.strip() and spell.unknown([word])
    )


def get_paragraphs(data_df: pd.DataFrame):
    """Add a ``paragraph`` column holding each essay's cleaned paragraphs.

    ``full_text`` is split on blank lines (``"\\n\\n"``), every piece is passed
    through ``preprocess_text`` and pieces that are empty after cleaning are
    discarded. The column is written onto ``data_df`` itself, which is also
    returned.
    """
    def _split_and_clean(essay_text):
        cleaned = (preprocess_text(chunk) for chunk in essay_text.split("\n\n"))
        return [chunk for chunk in cleaned if chunk.strip()]

    data_df['paragraph'] = data_df['full_text'].apply(_split_and_clean)
    return data_df

def get_sentences(data_df: pd.DataFrame):
    """Add a ``sentence`` column with the sentence spans of each paragraph.

    Each value of ``paragraph`` is run through the module-level ``nlp``
    pipeline and the resulting sentences are stored as a list. A
    ``'sentencizer'`` pipe is added to ``nlp`` first unless one is already
    registered. ``data_df`` is modified in place and returned.
    """
    sentencizer_missing = 'sentencizer' not in nlp.pipe_names
    if sentencizer_missing:
        nlp.add_pipe('sentencizer')

    def _split_into_sentences(paragraph):
        return [span.sent for span in nlp(paragraph).sents]

    data_df['sentence'] = data_df['paragraph'].apply(_split_into_sentences)
    return data_df


def get_tokens(data_df: pd.DataFrame):
    """Derive per-token list columns from the ``sentence`` column.

    For every row, the tokens of ``sentence`` whose ``text`` is non-empty are
    projected onto four parallel lists: ``words`` (``text``), ``lemmas``
    (``lemma_``), ``pos`` (``pos_``) and ``is_stop`` (``is_stop``). The columns
    are written onto ``data_df``, which is returned.
    """
    # (output column, token attribute) pairs, in the order the columns are added.
    projections = (
        ('words', 'text'),
        ('lemmas', 'lemma_'),
        ('pos', 'pos_'),
        ('is_stop', 'is_stop'),
    )
    for target_column, attribute in projections:
        data_df[target_column] = data_df['sentence'].apply(
            lambda tokens, attribute=attribute: [
                getattr(token, attribute) for token in tokens if token.text
            ])

    return data_df

def get_features_in_essays(data_df: pd.DataFrame, column_name: str, feature_name: str):
    """Append summary statistics of ``column_name`` computed over all rows.

    Five columns are added, named ``<stat>_<feature_name>_in_essay`` for the
    stats ``mean``, ``max``, ``min``, ``25th_percentile`` and
    ``75th_percentile``. Each statistic is a single scalar over the whole
    column and is repeated on every row. The new column names are appended to
    the module-level ``FEATURES`` list in that order.

    The previous version built ``pd.DataFrame`` from a dict of scalars without
    an index, which raises ``ValueError``; the scalars are now broadcast over
    ``data_df.index``.

    Args:
        data_df: Frame containing ``column_name``.
        column_name: Numeric column to summarise.
        feature_name: Stem used to build the new column names.

    Returns:
        A new frame: ``data_df`` with the five statistic columns appended.
    """
    values = data_df[column_name]
    statistics = (
        ('mean_', values.mean),
        ('max_', values.max),
        ('min_', values.min),
        ('25th_percentile_', lambda: np.percentile(values, 25)),
        ('75th_percentile_', lambda: np.percentile(values, 75)),
    )

    new_columns = {}
    for prefix, compute in statistics:
        name = prefix + feature_name + '_in_essay'
        new_columns[name] = compute()
        FEATURES.append(name)

    # Passing the index makes pandas broadcast each scalar to every row.
    broadcast = pd.DataFrame(new_columns, index=data_df.index)
    data_df = pd.concat([data_df, broadcast], axis=1)

    return data_df


def get_features_in_paragraphs(data_df: pd.DataFrame, column_name: str, feature_name: str):
    """Append per-essay statistics of ``column_name`` as ``*_in_paragraph`` columns.

    Rows are grouped by ``essay_id`` and the mean, max, min, 25th and 75th
    percentile of ``column_name`` within each group are broadcast back to the
    rows. Columns are named ``<stat>_<feature_name>_in_paragraph`` and each name
    is appended to the module-level ``FEATURES`` list, in that order.

    Returns a new frame made of ``data_df`` plus the five added columns.
    """
    suffix = '_in_paragraph'
    per_essay = data_df.groupby(['essay_id'])[column_name]

    # (column-name prefix, aggregation handed to GroupBy.transform)
    aggregations = [
        ('mean_', 'mean'),
        ('max_', 'max'),
        ('min_', 'min'),
        ('25th_percentile_', lambda values: np.percentile(values, 25)),
        ('75th_percentile_', lambda values: np.percentile(values, 75)),
    ]

    stats = {}
    for prefix, how in aggregations:
        stat_name = prefix + feature_name + suffix
        stats[stat_name] = per_essay.transform(how)
        FEATURES.append(stat_name)

    return pd.concat([data_df, pd.DataFrame(stats)], axis=1)


def get_features_in_sentences(data_df: pd.DataFrame, column_name: str, feature_name: str):
    """Append per-essay statistics of ``column_name`` as ``*_in_sentence`` columns.

    Rows are grouped by ``essay_id`` and the mean, max, min, 25th and 75th
    percentile of ``column_name`` within each group are broadcast back to the
    rows. Columns are named ``<stat>_<feature_name>_in_sentence`` and each name
    is appended to the module-level ``FEATURES`` list, in that order.

    Returns a new frame made of ``data_df`` plus the five added columns.
    """
    suffix = '_in_sentence'
    per_essay = data_df.groupby(['essay_id'])[column_name]

    # (column-name prefix, aggregation handed to GroupBy.transform)
    aggregations = [
        ('mean_', 'mean'),
        ('max_', 'max'),
        ('min_', 'min'),
        ('25th_percentile_', lambda values: np.percentile(values, 25)),
        ('75th_percentile_', lambda values: np.percentile(values, 75)),
    ]

    stats = {}
    for prefix, how in aggregations:
        stat_name = prefix + feature_name + suffix
        stats[stat_name] = per_essay.transform(how)
        FEATURES.append(stat_name)

    return pd.concat([data_df, pd.DataFrame(stats)], axis=1)

def get_features_multi_levels(data_df: pd.DataFrame, column_name: str, feature_name: str):
    """Build sentence-, paragraph- and essay-level features from one column.

    1. Sentence level: per-essay statistics of ``column_name`` via
       ``get_features_in_sentences``.
    2. Paragraph level: ``column_name`` is summed per (``essay_id``,
       ``paragraph``) into ``<feature_name>_in_paragraph``, whose per-essay
       statistics are added via ``get_features_in_paragraphs``.
    3. Essay level: ``column_name`` is summed per ``essay_id`` into
       ``<feature_name>_in_essay``, which is registered in ``FEATURES``.

    Returns the frame with all of the above columns added.
    """
    paragraph_column = feature_name + '_in_paragraph'
    essay_column = feature_name + '_in_essay'

    # Sentence level.
    data_df = get_features_in_sentences(data_df, column_name, feature_name)

    # Paragraph level: total per paragraph, then statistics across the essay.
    per_paragraph = data_df.groupby(['essay_id', 'paragraph'])[column_name]
    data_df[paragraph_column] = per_paragraph.transform('sum')
    data_df = get_features_in_paragraphs(data_df, paragraph_column, feature_name)

    # Essay level: total over the whole essay.
    per_essay = data_df.groupby('essay_id')[column_name]
    data_df[essay_column] = per_essay.transform('sum')
    FEATURES.append(essay_column)

    return data_df


def get_features(data_df: pd.DataFrame):
    """Compute the linguistic feature table for a frame of essays.

    The frame is exploded to one row per paragraph and then one row per
    sentence, tokens are extracted, and count-based features are aggregated at
    sentence, paragraph and essay level. Feature column names are appended to
    the module-level ``FEATURES`` list in the order they are created; that
    order is preserved here because downstream code reads the columns in
    ``FEATURES`` order.

    Args:
        data_df: Frame with at least ``essay_id`` and ``full_text``. A
            ``score`` column is carried through when present; unlabeled data
            (e.g. a test split without ``score``) is accepted as well, whereas
            the previous version raised ``KeyError`` on it. The frame is
            modified in place by ``get_paragraphs`` (a ``paragraph`` column is
            added).

    Returns:
        A de-duplicated frame with the identifier columns followed by the
        columns listed in ``FEATURES``.
    """
    data_df = get_paragraphs(data_df).explode('paragraph')

    data_df['full_text'] = data_df['full_text'].apply(preprocess_text)

    data_df = get_sentences(data_df).explode('sentence')

    data_df = get_tokens(data_df)
    # Replace the sentence spans by their plain text once tokens are extracted.
    data_df['sentence'] = data_df['sentence'].apply(lambda x: x.text)

    # Number of distinct paragraphs per essay.
    data_df['num_paragraphs'] = data_df.groupby(
        'essay_id')['paragraph'].transform('nunique')
    FEATURES.append('num_paragraphs')

    # Number of distinct sentences per paragraph, summarised per essay.
    data_df['num_sents_in_paragraph'] = data_df.groupby(['essay_id', 'paragraph'])[
        'sentence'].transform('nunique')
    data_df = get_features_in_paragraphs(
        data_df, 'num_sents_in_paragraph', 'num_sentences')

    # Computed for reference only; not registered in FEATURES.
    data_df['num_sents_in_essay'] = data_df.groupby('essay_id')[
        'sentence'].transform('nunique')

    # Number of words per sentence.
    data_df['num_words_in_sentence'] = data_df['words'].apply(len)
    data_df = get_features_multi_levels(
        data_df, 'num_words_in_sentence', 'num_words')

    # Mean word length per sentence.
    data_df['mean_word_lens_in_sentence'] = data_df['words'].apply(
        lambda x: np.mean([len(word) for word in x]))
    data_df = get_features_multi_levels(
        data_df, 'mean_word_lens_in_sentence', 'mean_word_lens')

    # Part-of-speech counts. The tag is matched as a substring of the token's
    # pos string (so 'CONJ' also counts tags that merely contain it). The order
    # of this table fixes the order of the entries added to FEATURES.
    pos_features = (
        ('PROPN', 'num_proper_nouns'),
        ('NOUN', 'num_nouns'),
        ('VERB', 'num_verbs'),
        ('ADJ', 'num_adjectives'),
        ('ADV', 'num_adverbs'),
        ('PRON', 'num_pronouns'),
        ('CONJ', 'num_conjunctions'),
    )
    for tag, feature_name in pos_features:
        sentence_column = feature_name + '_in_sentence'
        data_df[sentence_column] = data_df['pos'].apply(
            lambda x, tag=tag: np.count_nonzero([tag in pos for pos in x]))
        data_df = get_features_multi_levels(
            data_df, sentence_column, feature_name)

    # Misspelling counts, computed on lemmas.
    data_df['num_misspelled_words_in_sentence'] = data_df['lemmas'].apply(
        lambda x: is_misspelled(x))
    data_df = get_features_multi_levels(
        data_df, 'num_misspelled_words_in_sentence', 'num_misspelled_words')

    # Keep 'score' only when the input is labeled.
    id_columns = [col for col in ('essay_id', 'full_text', 'score')
                  if col in data_df.columns]
    data_df = data_df[id_columns + FEATURES]

    data_df = data_df.drop_duplicates()

    return data_df

# Build the feature table for the test essays and drop repeated rows.
test_df = (
    get_features(test_data)
    .drop_duplicates()
)

# Keep the first occurrence of any repeated column name, then renumber rows.
unique_columns = ~test_df.columns.duplicated()
test_df = test_df.loc[:, unique_columns].copy().reset_index(drop=True)

print(test_df.shape)

In [None]:
# Local copy of the pretrained encoder; the tokenizer and the encoder weights
# are loaded from the same directory.
model_path = '/kaggle/input/essay-scoring-models/longformer-base-4096'
tokenizer = AutoTokenizer.from_pretrained(model_path)
embedder = AutoModel.from_pretrained(model_path)

In [None]:
# Run configuration. 'dropout' is read when the model is instantiated further
# down; it was missing from this dict, which made that cell fail with KeyError.
# 0.2 matches the default of MultiFeaturesModel.
hyperparameters = {
    'batch_size': 64,
    'test_set': {
        'total': len(test_df),
    },
    'linguistic_features': FEATURES,
    'accelator': str(device),
    'max_seq_len': 2048,
    'dropout': 0.2
}

In [None]:
# Tokenize every essay, padding/truncating each one to exactly 'max_seq_len'
# tokens so all rows have the same length.
test_tokenized = tokenizer(test_df['full_text'].tolist(),
                           max_length=hyperparameters['max_seq_len'], 
                           padding='max_length', truncation=True, 
                           return_tensors="np")

# Store the token ids and masks as Python lists, one list per row.
test_df['input_ids'] = test_tokenized['input_ids'].tolist()
test_df['attention_mask'] = test_tokenized['attention_mask'].tolist()

# Sanity check: length of the token id list of one randomly sampled row.
print(len(test_df.sample(1).iloc[0]['input_ids']))

In [None]:
class MultiFeaturesModel(torch.nn.Module):
    """Regressor combining an encoder's pooled output with linguistic features.

    The linguistic feature vector is projected by a linear layer followed by
    leaky ReLU, concatenated after the encoder's pooled output and mapped to a
    single value by a linear regressor. Dropout is applied to both branches
    only while the module is in training mode.
    """

    def __init__(self, embedder,
                 lf_input_size, lf_hidden_size=64,
                 dropout=0.2):
        """
        Args:
            embedder: Encoder module exposing ``parameters()``, a ``pooler``
                sub-module and ``config.hidden_size``. All of its parameters
                are frozen except those of the pooler.
            lf_input_size: Number of linguistic features per example.
            lf_hidden_size: Width of the linguistic-feature projection.
            dropout: Dropout probability.
        """
        super().__init__()

        # Freeze the whole encoder, then re-enable gradients for the pooler.
        for weight in embedder.parameters():
            weight.requires_grad = False
        for weight in embedder.pooler.parameters():
            weight.requires_grad = True

        encoder_width = embedder.config.hidden_size

        # Attribute names (and creation order) are kept: they define the
        # state_dict keys of saved checkpoints.
        self.embedder = embedder
        self.lf = torch.nn.Linear(lf_input_size, lf_hidden_size)
        self.regressor = torch.nn.Linear(lf_hidden_size + encoder_width, 1)
        self.dropout = torch.nn.Dropout(dropout)

    def config(self):
        """Return a dict describing the encoder config and the layer sizes."""
        lf_sizes = {
            'input_size': self.lf.in_features,
            'hidden_size': self.lf.out_features,
        }
        regressor_sizes = {
            'input_size': self.regressor.in_features,
            'output_size': self.regressor.out_features,
        }
        return {
            'embedder': self.embedder.config,
            'lf': lf_sizes,
            'regressor': regressor_sizes,
        }

    def forward(self, token_ids, attention_mask, ling_features):
        """Score a batch.

        Args:
            token_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            ling_features: Linguistic features, one row per example.

        Returns:
            The output of the final linear layer (one value per example).
        """
        encoder_outputs = self.embedder(
            token_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        # Element 1 of the encoder output is used as the essay representation.
        # NOTE(review): presumably the pooled output; output_hidden_states=True
        # looks unnecessary for reading element 1 - confirm before removing.
        pooled = encoder_outputs[1]
        projected = F.leaky_relu(self.lf(ling_features))

        if self.training:
            pooled = self.dropout(pooled)
            projected = self.dropout(projected)

        combined = torch.cat((pooled, projected), dim=1)
        return self.regressor(combined)


# Build the model with the same layer sizes as the saved checkpoint.
# 'dropout' is looked up with a fallback because the hyperparameters dict may
# not define it (a plain hyperparameters['dropout'] raised KeyError). Dropout
# has no weights, so the value does not affect loading the state_dict.
model = MultiFeaturesModel(embedder,
                           len(FEATURES), 128,
                           hyperparameters.get('dropout', 0.2))

# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('/kaggle/input/essay-scoring-models/hopeful-paper-52/multi_features_longformer-base-4096_2024Y-05m-10d_model.pth',
                                 map_location=device))

model.to(device)

In [None]:
def logit_to_score(logit, min_score=1, max_score=6):
    """Convert raw regression outputs to integer scores.

    Values are rounded with ``torch.round``, limited to the closed range
    ``[min_score, max_score]`` and cast to ``torch.long``.
    """
    rounded = torch.round(logit)
    bounded = torch.clamp(rounded, min_score, max_score)
    return bounded.long()

In [None]:
# Inference: score the essays one at a time with gradients disabled.
model.eval()
test_predictions = []

with torch.no_grad():
    for _, row in test_df.iterrows():
        # Explicit dtypes: ids/masks are integer tensors, and the linguistic
        # features must be float32 to match the Linear layer weights (the
        # dtype used to be inferred from the mix of Python ints and floats).
        token_ids = torch.tensor(
            row['input_ids'], dtype=torch.long).unsqueeze(0)
        attention_mask = torch.tensor(
            row['attention_mask'], dtype=torch.long).unsqueeze(0)
        row_ling_features = torch.tensor(
            row[FEATURES].tolist(), dtype=torch.float32).unsqueeze(0)

        output = model(token_ids.to(device),
                       attention_mask.to(device),
                       row_ling_features.to(device))
        test_predictions.append(output.item())

# The competition's submission file uses the columns `essay_id,score`; the
# prediction column was previously written as 'prediction'.
submit_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'score': logit_to_score(torch.tensor(test_predictions)).cpu().numpy()
})
print(submit_df.shape)
submit_df.to_csv('submission.csv', index=False)