In [1]:
import random

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from string import punctuation, printable
import re

import spacy
nlp = spacy.load("en_core_web_sm")


import sklearn

import torch
import torch.nn.functional as F
from transformers import LongformerModel, LongformerTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

cuda


In [2]:
from distutils.dir_util import copy_tree

copy_tree('/kaggle/input/spellchecker', '/kaggle/working/')

%gzip '/kaggle/working/spellchecker/resources/en.json'

%pip install '/kaggle/working/pyspellchecker-0.8.1-py3-none-any.whl'

Processing ./pyspellchecker-0.8.1-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [3]:
from spellchecker import SpellChecker

spell = SpellChecker()

In [4]:
input_dir = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2"
test_data = pd.read_csv(os.path.join(input_dir, 'test.csv'))

print(test_data.shape)

data_dir = "/kaggle/input/aes-linguistic"
with open(os.path.join(data_dir, 'features.txt'), 'r') as f:
    FEATURES = f.read().splitlines()

FEATURES = sorted(list(set(FEATURES)))

(3, 2)


['mean_num_verbs_in_paragraph',
 'mean_num_pronouns_in_paragraph',
 'num_words_in_essay',
 'mean_num_verbs_in_sentence',
 'max_num_pronouns_in_paragraph',
 'max_num_proper_nouns_in_sentence',
 '25th_percentile_num_sentences_in_paragraph',
 'min_num_adverbs_in_paragraph',
 'max_num_conjunctions_in_sentence',
 '75th_percentile_num_misspelled_words_in_sentence',
 'mean_num_adjectives_in_paragraph',
 '25th_percentile_num_proper_nouns_in_paragraph',
 'mean_word_lens_in_essay',
 '25th_percentile_num_words_in_sentence',
 '75th_percentile_mean_word_lens_in_sentence',
 'min_num_conjunctions_in_sentence',
 'max_num_words_in_sentence',
 '25th_percentile_mean_word_lens_in_sentence',
 'max_mean_word_lens_in_sentence',
 'max_num_adverbs_in_sentence',
 '25th_percentile_num_words_in_paragraph',
 '75th_percentile_num_conjunctions_in_paragraph',
 'min_num_nouns_in_sentence',
 '75th_percentile_num_nouns_in_paragraph',
 'mean_num_sentences_in_paragraph',
 '25th_percentile_num_conjunctions_in_sentence',
 '

In [5]:
def preprocess_text(text: str):
    text = text.lower()
    # text = removeHTML(text)
    text = re.sub("http\w+", '', text)  # remove urls
    text = re.sub(r"\s+", " ", text)  # remove extra spaces
#     x = expandContractions(x)
    text = re.sub(r"\.+", ".", text)  # remove extra periods
    text = re.sub(r"\,+", ",", text)  # remove extra commas
    text = text.strip()  # remove leading and trailing spaces
    return text

In [6]:
def is_misspelled(words: list):
    return len([spell.unknown(word) for word in words])

In [7]:
def get_paragraphs(data_df: pd.DataFrame):
    data_df['paragraph'] = data_df['full_text'].apply(
        lambda x: x.split("\n\n"))

    # preprocess paragraphs
    data_df['paragraph'] = data_df['paragraph'].apply(
        lambda x: [preprocess_text(para) for para in x])

    # drop empty paragraphs
    data_df['paragraph'] = data_df['paragraph'].apply(
        lambda x: [para for para in x if para.strip()])

    return data_df


def get_sentences(data_df: pd.DataFrame):
    # nlp.add_pipe('sentencizer')
    if 'sentencizer' not in nlp.pipe_names:
        nlp.add_pipe('sentencizer')
    data_df['sentence'] = data_df['paragraph'].apply(
        lambda x: [i.sent for i in nlp(x).sents])
    return data_df


def get_tokens(data_df: pd.DataFrame):
    data_df['words'] = data_df['sentence'].apply(
        lambda x: [word.text for word in x if word.text])
    data_df['lemmas'] = data_df['sentence'].apply(
        lambda x: [word.lemma_ for word in x if word.text])
    data_df['pos'] = data_df['sentence'].apply(
        lambda x: [word.pos_ for word in x if word.text])
    data_df['is_stop'] = data_df['sentence'].apply(
        lambda x: [word.is_stop for word in x if word.text])

    return data_df

In [8]:
def get_features_in_essays(data_df: pd.DataFrame, column_name: str, feature_name: str):
    group = data_df.copy()
    new_columns = {}
    new_columns['mean_' + feature_name +
                '_in_essay'] = group[column_name].mean()

    new_columns['max_' + feature_name +
                '_in_essay'] = group[column_name].max()

    new_columns['min_' + feature_name +
                '_in_essay'] = group[column_name].min()

    new_columns['25th_percentile_' + feature_name +
                '_in_essay'] = np.percentile(group[column_name], 25)

    new_columns['75th_percentile_' + feature_name +
                '_in_essay'] = np.percentile(group[column_name], 75)

    data_df = pd.add([data_df, pd.DataFrame(new_columns)], axis=1)

    return data_df

In [9]:
def get_features_in_paragraphs(data_df: pd.DataFrame, column_name: str, feature_name: str):
    new_columns = {}
    group = data_df.copy().groupby(['essay_id'])[column_name]

    new_columns['mean_' + feature_name +
                '_in_paragraph'] = group.transform('mean')

    new_columns['max_' + feature_name +
                '_in_paragraph'] = group.transform('max')

    new_columns['min_' + feature_name +
                '_in_paragraph'] = group.transform('min')

    new_columns['25th_percentile_' + feature_name +
                '_in_paragraph'] = group.transform(lambda x: np.percentile(x, 25))

    new_columns['75th_percentile_' + feature_name +
                '_in_paragraph'] = group.transform(lambda x: np.percentile(x, 75))

    data_df = pd.concat([data_df, pd.DataFrame(new_columns)], axis=1)

    return data_df

In [10]:
def get_features_in_sentences(data_df: pd.DataFrame, column_name: str, feature_name: str):
    new_columns = {}
    group = data_df.copy().groupby(['essay_id'])[column_name]

    new_columns['mean_' + feature_name +
                '_in_sentence'] = group.transform('mean')

    new_columns['max_' + feature_name +
                '_in_sentence'] = group.transform('max')

    new_columns['min_' + feature_name +
                '_in_sentence'] = group.transform('min')

    new_columns['25th_percentile_' + feature_name +
                '_in_sentence'] = group.transform(lambda x: np.percentile(x, 25))

    new_columns['75th_percentile_' + feature_name +
                '_in_sentence'] = group.transform(lambda x: np.percentile(x, 75))

    data_df = pd.concat([data_df, pd.DataFrame(new_columns)], axis=1)

    return data_df

In [11]:
def get_features_multi_levels(data_df: pd.DataFrame, column_name: str, feature_name: str):
    data_df = get_features_in_sentences(data_df, column_name, feature_name)
    data_df[feature_name + '_in_paragraph'] = data_df.groupby(
        ['essay_id', 'paragraph'])[column_name].transform('sum')
    data_df = get_features_in_paragraphs(
        data_df, feature_name + '_in_paragraph', feature_name)
    data_df[feature_name +
            '_in_essay'] = data_df.groupby('essay_id')[column_name].transform('sum')

    return data_df

In [14]:
def get_features(data_df: pd.DataFrame,  save: bool = False, path: str = None):
    data_df = get_paragraphs(data_df).explode('paragraph')

    data_df['full_text'] = data_df['full_text'].apply(preprocess_text)

    data_df = get_sentences(data_df).explode('sentence')

    data_df = get_tokens(data_df)
    data_df['sentence'] = data_df['sentence'].apply(lambda x: x.text)

    # get paragraph features
    data_df['num_paragraphs'] = data_df.groupby(
        'essay_id')['paragraph'].transform('nunique')

    # get number of sentences features
    data_df['num_sents_in_paragraph'] = data_df.groupby(['essay_id', 'paragraph'])[
        'sentence'].transform('nunique')
    data_df = get_features_in_paragraphs(
        data_df, 'num_sents_in_paragraph', 'num_sentences')
    
    data_df['num_sents_in_essay'] = data_df.groupby('essay_id')[
        'sentence'].transform('nunique')

    # get number of words features
    data_df['num_words_in_sentence'] = data_df['words'].apply(len)
    data_df = get_features_multi_levels(
        data_df, 'num_words_in_sentence', 'num_words')

    # get length of words features
    data_df['mean_word_lens_in_sentence'] = data_df['words'].apply(
        lambda x: np.mean([len(word) for word in x]))
    data_df = get_features_multi_levels(
        data_df, 'mean_word_lens_in_sentence', 'mean_word_lens')

    # get number of proper nouns features
    data_df['num_proper_nouns_in_sentence'] = data_df['pos'].apply(
        lambda x: np.count_nonzero(['PROPN' in pos for pos in x]))
    data_df = get_features_multi_levels(
        data_df, 'num_proper_nouns_in_sentence', 'num_proper_nouns')

    # get number of nouns features
    data_df['num_nouns_in_sentence'] = data_df['pos'].apply(
        lambda x: np.count_nonzero(['NOUN' in pos for pos in x]))
    data_df = get_features_multi_levels(
        data_df, 'num_nouns_in_sentence', 'num_nouns')

    # get number of verbs features
    data_df['num_verbs_in_sentence'] = data_df['pos'].apply(
        lambda x: np.count_nonzero(['VERB' in pos for pos in x]))
    data_df = get_features_multi_levels(
        data_df, 'num_verbs_in_sentence', 'num_verbs')

    # get number of adjectives features
    data_df['num_adjectives_in_sentence'] = data_df['pos'].apply(
        lambda x: np.count_nonzero(['ADJ' in pos for pos in x]))
    data_df = get_features_multi_levels(
        data_df, 'num_adjectives_in_sentence', 'num_adjectives')

    # get number of adverbs features
    data_df['num_adverbs_in_sentence'] = data_df['pos'].apply(
        lambda x: np.count_nonzero(['ADV' in pos for pos in x]))
    data_df = get_features_multi_levels(
        data_df, 'num_adverbs_in_sentence', 'num_adverbs')

    # get number of pronouns features
    data_df['num_pronouns_in_sentence'] = data_df['pos'].apply(
        lambda x: np.count_nonzero(['PRON' in pos for pos in x]))
    data_df = get_features_multi_levels(
        data_df, 'num_pronouns_in_sentence', 'num_pronouns')

    # get number of conjunctions features
    data_df['num_conjunctions_in_sentence'] = data_df['pos'].apply(
        lambda x: np.count_nonzero(['CONJ' in pos for pos in x]))
    data_df = get_features_multi_levels(
        data_df, 'num_conjunctions_in_sentence', 'num_conjunctions')

    # get number of misspelled words features
    data_df['num_misspelled_words_in_sentence'] = data_df['lemmas'].apply(
        lambda x: is_misspelled(x))
    data_df = get_features_multi_levels(
        data_df, 'num_misspelled_words_in_sentence', 'num_misspelled_words')

    data_df = data_df[['essay_id', 'full_text', 'paragraph', 'sentence'] + FEATURES]

    data_df = data_df.drop_duplicates()

#     print(data_df.shape, data_df[['essay_id', 'full_text',  'score'] + list(FEATURES)].drop_duplicates().shape)

    if save:
        data_df.to_csv(path, index=False)
        with open(os.path.join(os.path.dirname(path), 'features.txt'), 'w') as f:
            for item in FEATURES:
                f.write("%s\n" % item)

    return data_df

test_df = get_features(test_data)

test_df = test_df[['essay_id', 'full_text'] + list(FEATURES)]
test_df = test_df.drop_duplicates()

print(test_df.shape)
print(len(FEATURES))

test_df

(3, 118)
116


Unnamed: 0,essay_id,full_text,mean_num_verbs_in_paragraph,mean_num_pronouns_in_paragraph,num_words_in_essay,mean_num_verbs_in_sentence,max_num_pronouns_in_paragraph,max_num_proper_nouns_in_sentence,25th_percentile_num_sentences_in_paragraph,min_num_adverbs_in_paragraph,...,min_num_proper_nouns_in_sentence,num_misspelled_words_in_essay,mean_num_misspelled_words_in_sentence,max_num_adverbs_in_paragraph,num_nouns_in_essay,min_num_pronouns_in_paragraph,25th_percentile_num_verbs_in_paragraph,75th_percentile_num_adverbs_in_sentence,75th_percentile_num_adjectives_in_paragraph,min_num_adjectives_in_paragraph
0,000d118,many people have car where they live. the thin...,65.0,53.0,546,5.0,53,5,13.0,15,...,0,546,42.0,15,107,53,65.0,1.0,38.0,38
1,000fe60,i am a scientist at nasa that is discussing th...,9.761905,12.0,373,2.095238,15,2,3.0,0,...,0,373,17.761905,9,54,6,10.0,1.0,6.0,0
2,001ab80,people always wish they had the same technolog...,19.88,13.76,607,2.8,22,0,4.0,7,...,0,607,24.28,19,114,5,15.0,2.0,14.0,9


In [13]:
model_path = '/kaggle/input/essay-scoring-models/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_path)
embedder = LongformerModel.from_pretrained(model_path, attention_window=128)

  return self.fget.__get__(instance, owner)()


In [14]:
hyperparameters = {
    'batch_size': 64,
    'dropout': 0.3,
    'test_set': {
        'total': len(test_df),
    },
    'linguistic_features': FEATURES,
    'accelator': str(device),
    'max_seq_len': 1929
}

In [26]:
test_tokenized = tokenizer(test_df['full_text'].tolist(),
                           max_length=hyperparameters['max_seq_len'], 
                           padding='max_length', truncation=True, 
                           return_tensors="np")

test_df['input_ids'] = test_tokenized['input_ids'].tolist()
test_df['attention_mask'] = test_tokenized['attention_mask'].tolist()

print(len(test_df.sample(1).iloc[0]['input_ids']))

1929


In [19]:
class MultiFeaturesModel(torch.nn.Module):
    def __init__(self, embedder,
                 lf_input_size, lf_hidden_size=64,
                 dropout=0.2):
        super(MultiFeaturesModel, self).__init__()
        # freeze
        for param in embedder.parameters():
            param.requires_grad = False
        # unfreeze the pooler
        for param in embedder.pooler.parameters():
            param.requires_grad = True
            
        self.embedder = embedder
        self.lf = torch.nn.Linear(lf_input_size,lf_hidden_size)
        # self.fc1 = torch.nn.Linear(lf_hidden_size + embedder.config.hidden_size, 256)
        # self.fc2 = torch.nn.Linear(256, 128)
        self.regressor = torch.nn.Linear(lf_hidden_size + embedder.config.hidden_size, 1)
        self.dropout = torch.nn.Dropout(dropout)
    
    def config(self):
        return {
            'embedder': self.embedder.config,
            'lf': {
                'input_size': self.lf.in_features,
                'hidden_size': self.lf.out_features
            },
            'regressor': {
                'input_size': self.regressor.in_features,
                'output_size': self.regressor.out_features
            }
        }

    def forward(self, token_ids, attention_mask, ling_features):
        embedded = self.embedder(token_ids, attention_mask=attention_mask, output_hidden_states=True)[1]
        embedded = self.dropout(embedded)
            
        ling_features = self.lf(ling_features)
        ling_features = F.leaky_relu(ling_features)
        ling_features = self.dropout(ling_features)
            
        features = torch.cat((embedded, ling_features), dim=1)

        score = self.regressor(features)
        return score

model = MultiFeaturesModel(embedder, 
                           len(FEATURES), 256,
                           hyperparameters['dropout'])
model.to(device)


model.state_dict()['regressor.weight']

tensor([[ 0.0061, -0.0151, -0.0282,  ..., -0.0039, -0.0213,  0.0058]],
       device='cuda:0')

In [20]:
model.load_state_dict(torch.load('/kaggle/input/essay-scoring-models/checkpoints/lemon-surf-61/multi_features-longformer-base-4096-15_epochs.pth',
                                 map_location=device))

model.state_dict()['regressor.weight']

tensor([[-0.0280, -0.0165,  0.0128,  ...,  0.0006, -0.0009, -0.0120]],
       device='cuda:0')

In [17]:
def logit_to_score(logit, min_score=1, max_score=6):
    scores = torch.clamp(torch.round(logit), min_score, max_score)
    scores = scores.long()
    return scores

In [27]:
model.eval()
test_predictions = []
test_ids = []

for i, row in test_df.iterrows():
    with torch.no_grad():
        token_ids = torch.tensor(row['input_ids']).unsqueeze(0)
        attention_mask = torch.tensor(row['attention_mask']).unsqueeze(0)
        row_ling_features = torch.tensor(row[list(FEATURES)].tolist()).unsqueeze(0)

        output = model(token_ids.to(device), 
                       attention_mask.to(device), 
                       row_ling_features.to(device))
        print(output)
        test_predictions.append(output.item())
        test_ids.append(row['essay_id'])
        
submit_df = pd.DataFrame({
    'essay_id': test_ids,
    'prediction': logit_to_score(torch.tensor(test_predictions)).cpu().numpy()
})
print(submit_df.shape)
submit_df.to_csv('submission.csv', index=False)

submit_df

tensor([[-70.7109]], device='cuda:0')
tensor([[-33.3468]], device='cuda:0')
tensor([[-57.8386]], device='cuda:0')
(3, 2)


Unnamed: 0,essay_id,prediction
0,000d118,1
1,000fe60,1
2,001ab80,1
