In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


In [2]:
# install the required python packages
#!pip install datasets transformers evaluate sentencepiece accelerate --quiet

#!pip install evaluate --quiet

In [3]:
# put all imports
import pandas as pd
from datasets import Dataset
import re
import numpy as np
from sklearn.model_selection import train_test_split
import tqdm.notebook as tq
import torch
from sklearn.metrics import cohen_kappa_score

In [4]:
# Add seed values to reduce randomness

import random

# Single seed value shared by every random number generator seeded below.
seed = 1024

# Python: hash seed exported through the environment, plus the stdlib RNG.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)

# NumPy's global RNG.
np.random.seed(seed)

# PyTorch generators (default and CUDA), and ask cuDNN for deterministic algorithms.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [5]:
# Contraction handling: one shared lookup table, one compiled pattern.

# English contractions mapped to their expanded forms. Defined once at module
# level so the pattern builder and the replacer always agree on the same keys.
_CONTRACTIONS_DICT = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not",
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}


def create_re():
    """Compile a regex that matches any contraction in the lookup table.

    Returns:
        A compiled pattern with one capturing group: an alternation of every
        key of the contraction table. Matching is case-sensitive and is not
        anchored to word boundaries.
    """
    # Regex alternation takes the first alternative that matches, so longer
    # keys must come before the shorter keys they start with; otherwise
    # "can't've" is consumed as "can't" and the trailing "'ve" is left behind.
    longest_first = sorted(_CONTRACTIONS_DICT, key=len, reverse=True)
    # Escape each key so it is always treated as literal text.
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in longest_first))


# Compiled once and reused; rebuilding the pattern for every essay is wasted work.
_CONTRACTIONS_RE = create_re()


# Function for expanding contractions
def expand_contractions(text):
    """Replace every contraction found in ``text`` with its expanded form.

    Args:
        text: The string to process.

    Returns:
        A new string in which each matched contraction has been replaced by
        its entry in the contraction table. Text that matches no key
        (including differently-cased variants such as "Don't") is unchanged.
    """
    def replace(match):
        return _CONTRACTIONS_DICT[match.group(0)]
    return _CONTRACTIONS_RE.sub(replace, text)

def perform_eda(df):
    """Clean an essay DataFrame in place and wrap it in a ``Dataset``.

    Steps, in order: rename ``score`` to ``label``, print the per-column count
    of missing values, expand contractions in ``full_text`` and lowercase it,
    drop ``essay_id``, and (when a ``label`` column exists) shift every label
    down by one.

    Args:
        df: DataFrame with ``essay_id`` and ``full_text`` columns and,
            optionally, ``score``. It is modified in place.

    Returns:
        ``Dataset.from_pandas(df)`` built from the modified frame.
    """
    df.rename(columns={"score": "label"}, inplace=True)

    # Report how many values are missing in each column.
    print(df.isnull().sum())

    # Contractions are expanded before lowercasing, essay by essay.
    df['full_text'] = df['full_text'].map(
        lambda essay_text: expand_contractions(essay_text).lower()
    )

    df.drop(columns=['essay_id'], inplace=True)

    # Shift labels down by one (presumably so classes start at 0 - confirm).
    if 'label' in df.columns:
        df['label'] = df['label'] - 1

    return Dataset.from_pandas(df)

In [6]:
# Hold out 25% of the training essays for evaluation. random_state pins the
# split to the notebook seed, so re-running this cell reproduces the same
# partition instead of depending on how far the global NumPy RNG has advanced.
train_df, eval_df = train_test_split(
    pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'),
    test_size=0.25,
    random_state=seed,
)

# perform_eda modifies each frame in place before converting it to a Dataset.
train_dataset = perform_eda(train_df)
eval_dataset = perform_eda(eval_df)



essay_id     0
full_text    0
label        0
dtype: int64
essay_id     0
full_text    0
label        0
dtype: int64


In [7]:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#print(device)

#if torch.cuda.is_available():
#    print('Num GPUs:', torch.cuda.device_count())
#    print('GPU Type:', torch.cuda.get_device_name(0))

In [8]:
# use gpu if available
#import torch

#device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer for the microsoft/deberta-v3-large checkpoint
# (the same checkpoint name is used for the model further down).
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [10]:
# Tokenize the input data
def preprocess_function(input):
    """Tokenize the ``full_text`` entries of a batch.

    Args:
        input: Mapping with a ``"full_text"`` key holding the essay text(s).

    Returns:
        The tokenizer output for those texts, truncated to at most 512 tokens.
        No padding is requested here.
    """
    essays = input["full_text"]
    return tokenizer(essays, max_length=512, truncation=True)

In [11]:
# Tokenized input data
# Tokenize each split in batches, then drop the raw text and the pandas index
# column that are no longer needed.
train_tokenized_dataset = (
    train_dataset
    .map(preprocess_function, batched=True)
    .remove_columns(['full_text', '__index_level_0__'])
)
eval_tokenized_dataset = (
    eval_dataset
    .map(preprocess_function, batched=True)
    .remove_columns(['full_text', '__index_level_0__'])
)

Map:   0%|          | 0/12980 [00:00<?, ? examples/s]

Map:   0%|          | 0/4327 [00:00<?, ? examples/s]

In [12]:
# Sanity check: show the columns and row count left after tokenization.
print(train_tokenized_dataset)

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12980
})


In [13]:
from transformers import DataCollatorWithPadding

# Batch collator bound to the tokenizer. preprocess_function leaves padding
# disabled, so batches are presumably padded here at collation time - confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

2024-05-11 04:54:53.078440: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-11 04:54:53.078547: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-11 04:54:53.220743: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
#import evaluate
#accuracy = evaluate.load("accuracy")

In [15]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute quadratic weighted kappa for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` over its last axis.

    Returns:
        A dict with the single key ``'qwk'`` holding the quadratic-weighted
        Cohen's kappa between the labels and the predicted classes.
    """
    logits, gold = eval_pred
    predicted = logits.argmax(-1)
    return {'qwk': cohen_kappa_score(gold, predicted, weights='quadratic')}

In [16]:
#id2label = {1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six"}
#label2id = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5, "six": 6}

In [17]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load microsoft/deberta-v3-large with a sequence-classification head that has
# six output classes (perform_eda shifts the labels down by one beforehand).
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-large", num_labels=6
    # Optional label-name mappings, currently disabled:
    #, id2label=id2label, label2id=label2id
)

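# Apparently the model uses 512 tokens by default; the disabled call below was
# meant to resize it to 1024 to match the tokenizer.
# NOTE(review): resize_token_embeddings() resizes the vocabulary dimension of the
# embedding matrix, not the sequence length - confirm before re-enabling.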

#model.resize_token_embeddings(len(tokenizer)) 

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Print the module tree of the loaded model for inspection.
print(model)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, element

In [19]:
#import os
#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"]= "2"

#os.environ['CUDA_LAUNCH_BLOCKING'] = 1

In [20]:
# Train the model

#training_args = TrainingArguments(
#    output_dir="learning_agency_lab_v1",
#    #fp16=True,
#    learning_rate=2e-5,
#    per_device_train_batch_size=16,
#    per_device_eval_batch_size=16,
#    num_train_epochs=4,
#    weight_decay=0.01,
#    evaluation_strategy="epoch",
#    metric_for_best_model='qwk',
#    save_strategy="epoch",
#    push_to_hub=False,
#    report_to='none',
#    load_best_model_at_end=True,
#    save_total_limit=1,
#    warmup_ratio=0.0,
#    lr_scheduler_type='linear',
#    optim='adamw_torch',
#    logging_first_step=True
#)

# 4 when more than one CUDA device is visible, otherwise 1.
# NOTE(review): not referenced by any other cell shown in this notebook.
NUM_CORES = 4 if torch.cuda.device_count() > 1 else 1

# Training configuration.
# - Each optimizer step accumulates 4 micro-batches of 4 examples per device
#   (16 examples per device per step).
# - Evaluation, checkpointing and logging all run every 100 steps.
# NOTE(review): metric_for_best_model / greater_is_better are set but
# load_best_model_at_end is not, so the model held by the trainer after
# training is presumably the last step rather than the best-QWK one - confirm
# this is intended.
training_args = TrainingArguments(
    output_dir='/kaggle/working/learning_agency_lab/v1',
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    report_to="none",
    push_to_hub=False,
    evaluation_strategy="steps",
    do_eval=True,
    eval_steps=100,
    save_total_limit=1,
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    lr_scheduler_type='linear',
    metric_for_best_model="qwk",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_safetensors=True
)

# Wire the model, both tokenized splits, the padding collator and the QWK
# metric into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=eval_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# A trailing `dataloader_config = DataLoaderConfiguration(...)` statement was
# dropped: the name is never imported in this notebook (NameError on a fresh
# kernel) and its result was not passed to the Trainer.


In [21]:
# Fine-tune the model. With the arguments above, evaluation (including QWK),
# checkpointing and logging happen every 100 steps.
trainer.train()

Step,Training Loss,Validation Loss,Qwk
100,1.4091,1.048621,0.645059
200,1.1065,1.016851,0.684712
300,1.0056,0.888289,0.764464
400,0.9397,0.891268,0.733333
500,0.9211,0.839466,0.794167
600,0.8396,0.854808,0.797885
700,0.8413,0.805897,0.807962
800,0.7904,0.778151,0.814138


TrainOutput(global_step=811, training_loss=0.9787113657714994, metrics={'train_runtime': 6331.3384, 'train_samples_per_second': 2.05, 'train_steps_per_second': 0.128, 'total_flos': 1.1630032876251504e+16, 'train_loss': 0.9787113657714994, 'epoch': 1.0})

In [22]:
# Save the trainer's current model to the working directory (the same location
# as TrainingArguments.output_dir, written here with a trailing slash).
trainer.save_model("/kaggle/working/learning_agency_lab/v1/")