In [1]:
import random

import os
import datetime
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from string import punctuation, printable
import re

import spacy
nlp = spacy.load("en_core_web_sm")


import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from transformers import AutoModel, AutoTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

cpu


In [2]:
from distutils.dir_util import copy_tree

copy_tree('/kaggle/input/spellchecker', '/kaggle/working/')

!gzip '/kaggle/working/spellchecker/resources/en.json'

!pip install '/kaggle/working/pyspellchecker-0.8.1-py3-none-any.whl'

  pid, fd = os.forkpty()


Processing ./pyspellchecker-0.8.1-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [3]:
from spellchecker import SpellChecker

spell = SpellChecker()

In [4]:
input_dir = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2"
train_data = pd.read_csv(os.path.join(input_dir, 'train.csv'))
test_data = pd.read_csv(os.path.join(input_dir, 'test.csv'))
extra_data = pd.read_csv('/kaggle/input/persaude-corpus-2/persuade_2.0_human_scores_demo_id_github.csv')

extra_data = extra_data.rename(columns={'essay_id_comp': 'essay_id',
                                        'holistic_essay_score': 'score'})
extra_data = extra_data[['essay_id', 'full_text', 'score']]

extra_data = extra_data[~extra_data['full_text'].isin(train_data['full_text'])]

print(train_data.shape, test_data.shape, extra_data.shape)

train_data.sample(5)

(17307, 3) (3, 2) (13125, 3)


Unnamed: 0,essay_id,full_text,score
12696,bb4c434,"People tend to use there cars so much, they ba...",3
4625,44e88b0,Imagine being a top scientist at NASA and Viki...,3
733,0ba78ec,The face of Mars could not be created by alien...,3
16885,f96c287,Many people belive that the face on Mars was c...,3
3334,317173f,Driverless Cars are coming soon or later? Peop...,4


In [5]:
def count_misspelled(words):
    words = [word for word in words if word.strip()]

    words = set(words) - {"'s", "n't"} - set(punctuation)

    return spell.unknown(words)


def get_sentences(text: str):
    text = "".join(filter(lambda x: x in printable, text)).strip()
    doc = nlp(text)
    return [sent.text for sent in doc.sents]

def preprocess_text(text: str):
    text = re.sub(r'\s+', ' ', text)
    text = "".join(filter(lambda x: x in printable, text)).strip()
    return text


def get_features(data_df: pd.DataFrame, output_dir='./output/data_features.csv', save=True):
    data_df['full_text'] = data_df['full_text'].apply(preprocess_text)

    # parse the text into paragraphs
    data_df['paragraphs'] = data_df['full_text'].apply(lambda x: x.split("\n\n"))
    data_df['num_paragraphs'] = data_df['paragraphs'].apply(len)

    # into sentences
    data_df['sentences'] = data_df['full_text'].apply(get_sentences)
    data_df['num_sentences'] = data_df['sentences'].apply(len)

    # Create lists to hold the results
    words = []
    lemmas = []
    pos = []
    is_stop_word = []

    # Process the texts in batches
    for i, doc in enumerate(nlp.pipe(data_df['full_text'], batch_size=50)):
        print(f"Processing batch {i+1}/{len(data_df)}", end="\r")

        words.append([token.text for token in doc])
        lemmas.append([token.lemma_ for token in doc])
        pos.append([token.pos_ for token in doc])
        is_stop_word.append([token.is_stop for token in doc])

    data_df['words'] = words
    data_df['lemma'] = lemmas
    data_df['pos'] = pos
    data_df['is_stop_word'] = is_stop_word

    data_df['num_words'] = data_df['words'].apply(len)

    data_df['num_conjunctions'] = data_df['pos'].apply(
        lambda x: len([pos for pos in x if pos == 'CCONJ']))

    data_df['num_distinct_words'] = data_df['lemma'].apply(
        lambda x: len(set(x)))

    data_df['num_misspell'] = data_df['lemma'].apply(count_misspelled)
    data_df['num_misspell'] = data_df['num_misspell'].apply(len)

    data_df['mean_word_len'] = data_df['lemma'].apply(lambda x: np.mean(
        [len(word) for word in x if word.strip() and word not in punctuation]))

    data_df['mean_sent_len'] = data_df['sentences'].apply(
        lambda x: np.mean([len([token.text for token in sent]) for sent in x]))

    if save:
        data_df.to_csv(output_dir, index=False)

    return data_df

train_data = get_features(train_data)
test_data = get_features(test_data)
extra_data = get_features(extra_data)

In [6]:
all_data = pd.concat([train_data, extra_data], ignore_index=True)
# shuffle the data
all_data = all_data.sample(frac=1, random_state=random_seed)
all_data = all_data.reset_index(drop=True)

train_ratio, val_ratio = 0.8, 0.2

train_df= all_data.iloc[:int(train_ratio*len(all_data))]
val_df = all_data.iloc[int(train_ratio*len(all_data)):]
test_df = test_data

print(train_df.shape, val_df.shape, test_df.shape)

(24345, 20) (6087, 20) (3, 19)


In [7]:
model_path = '/kaggle/input/essay-scoring-models/longformer-base-4096'
tokenizer = AutoTokenizer.from_pretrained(model_path)
embedder = AutoModel.from_pretrained(model_path)

  return self.fget.__get__(instance, owner)()


In [8]:
ling_features = ['num_paragraphs', 'num_words', 'num_conjunctions',
                 'num_distinct_words', 'num_misspell',
                 'mean_word_len', 'num_sentences', 'mean_sent_len']

hyperparameters = {
    'lr': 1e-5,
    'dropout': 0.25,
    'epochs': 3,
    'batch_size': 40,
    'max_seq_len': 4096,
    'ling_features_hidden_size': 128,
    'embedding_model': model_path,
    'train_set': {
        'total': len(train_df),
        'ratio': train_ratio,
    },
    'val_set': {
        'total': len(val_df),
        'ratio': val_ratio,
    },
    'test_set': {
        'total': len(test_df),
    },
    'linguistic_features': ling_features,
    'accelator': str(device)
}

In [None]:
train_tokenized = tokenizer(train_df['full_text'].tolist(),
                            max_length=hyperparameters['max_seq_len'],
                            padding='max_length', truncation=True, 
                            return_tensors="np")

train_df['input_ids'] = train_tokenized['input_ids'].tolist()
train_df['attention_mask'] = train_tokenized['attention_mask'].tolist()


print(len(train_df.sample(1).iloc[0]['input_ids']))

In [None]:
val_tokenized = tokenizer(val_df['full_text'].tolist(),
                          max_length=hyperparameters['max_seq_len'],
                          padding='max_length', truncation=True, 
                          return_tensors="np")

val_df['input_ids'] = val_tokenized['input_ids'].tolist()
val_df['attention_mask'] = val_tokenized['attention_mask'].tolist()


print(len(val_df.sample(1).iloc[0]['input_ids']))

In [9]:
test_tokenized = tokenizer(test_df['full_text'].tolist(),
                           max_length=hyperparameters['max_seq_len'], 
                           padding='max_length', truncation=True, 
                           return_tensors="np")

test_df['input_ids'] = test_tokenized['input_ids'].tolist()
test_df['attention_mask'] = test_tokenized['attention_mask'].tolist()

print(len(test_df.sample(1).iloc[0]['input_ids']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['input_ids'] = train_tokenized['input_ids'].tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['attention_mask'] = train_tokenized['attention_mask'].tolist()


2560
2560


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['input_ids'] = val_tokenized['input_ids'].tolist()


2560
2560


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['attention_mask'] = val_tokenized['attention_mask'].tolist()


In [10]:
class MultiFeaturesDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        # self.text = df['clean_text'].values
        self.token_ids = df['input_ids'].values
        self.attention_mask = df['attention_mask'].values
        self.score = df['score'].values
        self.ling_features = []
        for feature in ling_features:
            self.ling_features.append(df[feature].values)

    def __len__(self):
        return len(self.score)

    def __getitem__(self, idx):
        features = []
        for feature in self.ling_features:
            features.append(feature[idx])

        features = torch.tensor(features, dtype=torch.float)

        score = torch.reshape(torch.tensor(
            self.score[idx], dtype=torch.float), (1,))

        return torch.tensor(self.token_ids[idx]), torch.tensor(self.attention_mask[idx]), features, score


train_dataset = MultiFeaturesDataset(train_df)
val_dataset = MultiFeaturesDataset(val_df)

In [11]:
train_dataloader = DataLoader(
    train_dataset, batch_size=hyperparameters['batch_size'], shuffle=True)
val_dataloader = DataLoader(
    val_dataset, batch_size=hyperparameters['batch_size'], shuffle=True)

for token_ids, attention_mask, features, score in train_dataloader:
    print(token_ids.shape, attention_mask.shape, features.shape, score.shape)
    break

torch.Size([40, 2560]) torch.Size([40, 2560]) torch.Size([40, 11]) torch.Size([40, 1])


In [12]:
class MultiFeaturesModel(torch.nn.Module):
    def __init__(self, embedder,
                 lf_input_size, lf_hidden_size=64,
                 dropout=0.2):
        super(MultiFeaturesModel, self).__init__()
        # freeze
        for param in embedder.parameters():
            param.requires_grad = False

        self.embedder = embedder
        self.lf = torch.nn.Linear(lf_input_size,
                                  lf_hidden_size)
        self.fc = torch.nn.Linear(
            lf_hidden_size + embedder.config.hidden_size, 128)
        self.regressor = torch.nn.Linear(128, 1)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, token_ids, attention_mask, ling_features):
        embedded = self.embedder(token_ids,
                                 attention_mask=attention_mask).last_hidden_state[:, 0, :]
        if self.training:
            embedded = self.dropout(embedded)
        ling_features = F.leaky_relu(self.lf(ling_features))
        if self.training:
            ling_features = self.dropout(ling_features)
        features = torch.cat((embedded, ling_features), dim=1)
        features = F.leaky_relu(self.fc(features))
        if self.training:
            features = self.dropout(features)
        score = self.regressor(features)
        return score


model = MultiFeaturesModel(embedder, len(ling_features),
                           hyperparameters['ling_features_hidden_size'],
                           hyperparameters['dropout'])

model.load_state_dict(torch.load('/kaggle/input/essay-scoring-models/checkpoints/multi_features_longformer-base-4096_model_2024-04-29_10-26.pth',
                                 map_location=device))

In [13]:
def train(model, optimizer, criterion, train_dataloader, logging_steps=100):
    model.train()
    running_loss = 0.0

    for token_ids, attention_mask, features, score in train_dataloader:

        output = model(token_ids.to(device), 
                        attention_mask.to(device), 
                        features.to(device))
        loss = criterion(output, score.to(device)).float()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        running_loss += loss.item()

    return running_loss / len(train_dataloader)


def evaluate(model, criterion, dataloader):
    model.eval()
    running_loss = 0.0
    all_scores = []
    predictions = []

    with torch.no_grad():
        for token_ids, attention_mask, features, score in dataloader:
            output = model(token_ids.to(device), 
                           attention_mask.to(device), 
                           features.to(device))

            loss = criterion(output, score.to(device))

            running_loss += loss.item()
            all_scores.extend(score.cpu().numpy())
            predictions.extend(output.cpu().numpy())

    return running_loss / len(dataloader), np.array(all_scores), np.array(predictions)

In [14]:
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [15]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=hyperparameters['lr'])
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=1, factor=0.1)
early_stopper = EarlyStopper(patience=3, min_delta=1e-3)

train_losses, val_losses, val_kappa_scores = [], [], []

hyperparameters['early_stopper'] = early_stopper.__dict__['patience']
hyperparameters['optimizer'] = optimizer.__dict__
hyperparameters['scheduler'] = scheduler.__dict__
hyperparameters['model'] = dict(model.__dict__['_modules'])


def logit_to_score(logit, min_score=1, max_score=6):
    scores = np.clip(np.round(logit), min_score, max_score)
    scores = np.dtype(np.int32).type(scores)
    return


hyperparameters

AttributeError: 'collections.OrderedDict' object has no attribute 'parameters'

In [None]:
torch.cuda.empty_cache()

for epoch in range(hyperparameters['epochs']):
    train_loss = train(model, optimizer, criterion, train_dataloader)
    val_loss, val_scores, val_predictions = evaluate(
        model, criterion, val_dataloader)

    val_kappa = cohen_kappa_score(
        logit_to_score(val_predictions), val_scores,
        labels=[1, 2, 3, 4, 5, 6],
        weights='quadratic')

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_kappa_scores.append(val_kappa)

    scheduler.step(val_loss)

    print(f'Epoch: {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Kappa: {val_kappa}')
    
    if early_stopper.early_stop(val_losses[-1]):
        break

In [None]:
model.eval()
test_predictions = []

with torch.no_grad():
    for row in test_df.itertuples():
        token_ids = torch.tensor(row.input_ids).unsqueeze(0)
        attention_mask = torch.tensor(row.attention_mask).unsqueeze(0)
        features = torch.tensor([row[1:11]]).float()
        output = model(token_ids.to(device), 
                       attention_mask.to(device), 
                       features.to(device))
        test_predictions.append(output.cpu().numpy()[0][0])

test_kappa = cohen_kappa_score(
    logit_to_score(test_predictions), test_df['score'].values, weights='quadratic')

print(f'Test Kappa: {test_kappa}')

submit_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'prediction': logit_to_score(test_predictions)
})
submit_df.to_csv('submission.csv', index=False)