In [1]:
# Import necessary libraries
import os
from pathlib import Path

import pandas as pd
from fastai.imports import *
from torch.utils.data import DataLoader
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.metrics import confusion_matrix

In [18]:
# Load training and test data
path_to_raw_data = Path("../data/raw")
df = pd.read_csv(path_to_raw_data/"train.csv")
eval_df = pd.read_csv(path_to_raw_data / "test.csv")

In [19]:
# Perform basic exploratory data analysis
df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,"I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there house they'll be pay more attention. they will be comfortable at home.\n\nThe hardest part of school is getting ready. you wake up go brush your teeth and go to your closet and look at your cloths. after you think you picked a outfit u go look in the mirror and youll either not like it or you look and see a stain. Then you'll have to change. with the online classes you can wear anything and st...",3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it do the best on you no matter what is happening it can change your mind. sometimes you need to wake up and look what is around you because problems are the best way to change what you want to change along time ago. A\n\nproblem is a change for you because it can make you see different and help you to understand how tings wok.\n\nFirst of all it can make you see different then the others. For example i remember that when i came to the United States i think that nothing was going to change me because i think that nothing was going to change me bec...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school policy of having a grade b average that unfair. Because many students have a C average. So that means that they cant go out for sports or other activities they want to do bad. That's like taking everything they have. What if kids want to become good at something, but now they cant because of that school policy. If they have a C average they should still be able to go out for sports or activities. A C average isn't that bad, its higher then a D average. If the school police was if you have a D average of lower they shouldn't do sports or activities....",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,"The best time in life is when you become yourself. I agree that the greatest accomplishment, is when you be yourself in a world that constantly trying to make you something else. Because you make your own choices, you become more happy, and you respect others.\n\nFirst, you make your own choices by being yourself. Becoming yourself means that you should be able to make your own choices and not be shy or afraid of what you're doing. Because you're defining yourself by doing those things that you want. Some people follow others, therefore, they don't make their own choices. People are afraid...",4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other people can change people to become better persons you can have an impact of kindess with a homeles that can change his life or with some who needed they are going to know you are a nice person if you are a nice person everywhere you go people is going to like your personality so you have to be a nice person with others like a old women triying to cross the road thats a impact of kindness when you do that you feel a greate person you can change people in the way they think by helping others treating nice other people give them some advice when you s...,2.5,3.0,3.0,3.0,2.5,2.5


In [4]:
FULL_TEXT = "full_text"
TEXT_ID = "text_id"
TARGET_LABELS = {"cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"}
GRADES = ["F", "D", "D+", "C", "C+", "B", "B+", "A", "A+"]

score_to_grade_dict = dict(zip(np.arange(1, 5.5, 0.5), GRADES))

In [5]:
# Get tokenizer used by preexisting model
model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Tokenize data
def tokenize_text(ds):
    return tokenizer(ds[FULL_TEXT])

In [7]:
ds = Dataset.from_pandas(df)
tokenize_text(ds[0])

{'input_ids': [1, 273, 428, 272, 598, 338, 1591, 292, 1101, 288, 425, 261, 10405, 306, 13521, 286, 264, 575, 263, 350, 322, 737, 267, 262, 1066, 264, 3038, 263, 333, 343, 1245, 260, 787, 364, 2141, 1530, 349, 401, 288, 343, 669, 306, 280, 436, 282, 794, 310, 1251, 260, 306, 296, 282, 1800, 288, 425, 260, 279, 9769, 465, 265, 563, 269, 646, 1041, 260, 274, 4302, 322, 424, 5063, 290, 3261, 263, 424, 264, 290, 6928, 263, 468, 288, 290, 36738, 260, 385, 274, 428, 274, 2847, 266, 5776, 3636, 424, 468, 267, 262, 4520, 263, 274, 436, 814, 298, 334, 278, 289, 274, 468, 263, 398, 266, 10997, 260, 1060, 274, 280, 436, 286, 264, 575, 260, 275, 262, 535, 2141, 274, 295, 1929, 784, 263, 992, 425, 263, 274, 13521, 389, 264, 2148, 314, 339, 264, 1929, 260, 370, 598, 1048, 413, 10796, 416, 563, 260, 306, 814, 413, 278, 416, 306, 1873, 289, 335, 306, 4302, 322, 260, 347, 598, 333, 462, 264, 4984, 397, 260, 272, 2884, 349, 333, 2374, 262, 2444, 263, 1563, 277, 343, 4178, 326, 1138, 306, 488, 1109, 264, 

In [8]:
tokenized_ds = ds.map(tokenize_text, batched=True, remove_columns=(FULL_TEXT, TEXT_ID))

                                                                 

In [9]:
tokenized_ds["cohesion"]

[3.5,
 2.5,
 3.0,
 4.5,
 2.5,
 3.5,
 3.5,
 2.5,
 3.0,
 3.0,
 3.0,
 3.5,
 3.5,
 3.5,
 4.0,
 2.0,
 3.0,
 3.0,
 3.0,
 3.0,
 2.5,
 2.0,
 3.0,
 4.0,
 4.0,
 2.0,
 3.0,
 4.5,
 3.0,
 2.5,
 4.0,
 3.0,
 3.5,
 3.5,
 2.0,
 4.0,
 3.0,
 4.0,
 4.0,
 1.0,
 4.0,
 2.5,
 2.5,
 3.5,
 4.0,
 2.5,
 4.0,
 3.0,
 3.0,
 3.0,
 2.5,
 2.5,
 2.5,
 3.0,
 3.5,
 3.5,
 3.5,
 3.0,
 3.5,
 3.0,
 4.5,
 2.0,
 2.0,
 2.5,
 3.0,
 2.0,
 3.0,
 3.5,
 2.5,
 3.5,
 2.5,
 3.5,
 3.0,
 3.0,
 2.0,
 3.0,
 3.0,
 3.5,
 3.0,
 4.0,
 3.5,
 3.0,
 2.0,
 3.0,
 4.0,
 3.0,
 2.0,
 2.5,
 2.5,
 2.0,
 3.5,
 2.0,
 2.0,
 4.0,
 2.5,
 4.0,
 3.5,
 3.5,
 4.0,
 2.0,
 2.0,
 2.5,
 3.5,
 3.0,
 5.0,
 3.5,
 2.5,
 3.5,
 3.0,
 3.5,
 2.5,
 3.5,
 3.0,
 2.0,
 2.5,
 3.5,
 3.0,
 3.5,
 2.5,
 2.5,
 3.5,
 3.0,
 3.0,
 3.0,
 3.0,
 2.0,
 3.5,
 4.0,
 2.0,
 3.0,
 4.0,
 3.5,
 2.5,
 2.5,
 3.0,
 3.5,
 3.5,
 3.0,
 2.5,
 3.0,
 3.0,
 3.0,
 3.5,
 4.5,
 2.5,
 3.5,
 3.0,
 4.0,
 3.0,
 2.0,
 3.0,
 3.5,
 2.5,
 3.5,
 3.0,
 3.5,
 3.0,
 4.0,
 3.5,
 4.0,
 4.0,
 3.0,
 3.5,
 3.0,
 4.0,
 2.5,
 2.0

In [10]:
# Create HuggingFace datasets
def get_datasets_dict(ds: Dataset):
    return ds.train_test_split(0.25, seed=42)

In [11]:
datasets_dict = get_datasets_dict(tokenized_ds)

In [12]:
# Map scores to equivalent grades
def map_score_to_grade(list_scores):
    return list(map(lambda score: score_to_grade_dict[score], list_scores))

In [13]:
%%time
map_score_to_grade(ds["cohesion"])

CPU times: user 2.58 ms, sys: 118 µs, total: 2.7 ms
Wall time: 2.77 ms


['B',
 'C',
 'C+',
 'A',
 'C',
 'B',
 'B',
 'C',
 'C+',
 'C+',
 'C+',
 'B',
 'B',
 'B',
 'B+',
 'D+',
 'C+',
 'C+',
 'C+',
 'C+',
 'C',
 'D+',
 'C+',
 'B+',
 'B+',
 'D+',
 'C+',
 'A',
 'C+',
 'C',
 'B+',
 'C+',
 'B',
 'B',
 'D+',
 'B+',
 'C+',
 'B+',
 'B+',
 'F',
 'B+',
 'C',
 'C',
 'B',
 'B+',
 'C',
 'B+',
 'C+',
 'C+',
 'C+',
 'C',
 'C',
 'C',
 'C+',
 'B',
 'B',
 'B',
 'C+',
 'B',
 'C+',
 'A',
 'D+',
 'D+',
 'C',
 'C+',
 'D+',
 'C+',
 'B',
 'C',
 'B',
 'C',
 'B',
 'C+',
 'C+',
 'D+',
 'C+',
 'C+',
 'B',
 'C+',
 'B+',
 'B',
 'C+',
 'D+',
 'C+',
 'B+',
 'C+',
 'D+',
 'C',
 'C',
 'D+',
 'B',
 'D+',
 'D+',
 'B+',
 'C',
 'B+',
 'B',
 'B',
 'B+',
 'D+',
 'D+',
 'C',
 'B',
 'C+',
 'A+',
 'B',
 'C',
 'B',
 'C+',
 'B',
 'C',
 'B',
 'C+',
 'D+',
 'C',
 'B',
 'C+',
 'B',
 'C',
 'C',
 'B',
 'C+',
 'C+',
 'C+',
 'C+',
 'D+',
 'B',
 'B+',
 'D+',
 'C+',
 'B+',
 'B',
 'C',
 'C',
 'C+',
 'B',
 'B',
 'C+',
 'C',
 'C+',
 'C+',
 'C+',
 'B',
 'A',
 'C',
 'B',
 'C+',
 'B+',
 'C+',
 'D+',
 'C+',
 'B',
 'C'

In [14]:
# Create metric evaluation function
def metric_func(*eval_predictions):
    pred_values = map_score_to_grade(tokenized_ds["cohesion"])
    true_values = map_score_to_grade(tokenized_ds["syntax"])
    return confusion_matrix(pred_values, true_values)

In [15]:
%%time
metric_func()

CPU times: user 19.4 ms, sys: 4.2 ms, total: 23.5 ms
Wall time: 23.3 ms


array([[ 33,   7,  21,  57,   0,   7,   0,   0,   0],
       [ 10,   7,   1,   8,   0,   0,   0,   0,   0],
       [ 10,   0, 370, 119, 107, 373,   0,   9,   0],
       [ 46,   3, 190, 156,   8, 131,   0,   0,   0],
       [  0,   0,  38,   3, 332, 233,   5, 179,   0],
       [  1,   0, 246,  45, 282, 440,   0,  82,   0],
       [  0,   0,   0,   0,   1,   0,  13,  10,   3],
       [  0,   0,   1,   0, 109,  66,   6, 130,   3],
       [  0,   0,   0,   0,   0,   0,   5,   0,   5]])

In [16]:
learning_rate, batch_size = 1e-5, 128
num_epochs = 4

In [34]:
help(Trainer)

Help on class Trainer in module transformers.trainer:

class Trainer(builtins.object)
 |  Trainer(model: Union[transformers.modeling_utils.PreTrainedModel, torch.nn.modules.module.Module] = None, args: transformers.training_args.TrainingArguments = None, data_collator: Optional[transformers.data.data_collator.DataCollator] = None, train_dataset: Optional[torch.utils.data.dataset.Dataset] = None, eval_dataset: Union[torch.utils.data.dataset.Dataset, Dict[str, torch.utils.data.dataset.Dataset], NoneType] = None, tokenizer: Optional[transformers.tokenization_utils_base.PreTrainedTokenizerBase] = None, model_init: Optional[Callable[[], transformers.modeling_utils.PreTrainedModel]] = None, compute_metrics: Optional[Callable[[transformers.trainer_utils.EvalPrediction], Dict]] = None, callbacks: Optional[List[transformers.trainer_callback.TrainerCallback]] = None, optimizers: Tuple[torch.optim.optimizer.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), preprocess_logits_for_metric

In [83]:
# Create a simple model for making predictions
def get_trainer():
    training_args = TrainingArguments('outputs', learning_rate=learning_rate, warmup_ratio=0.1, lr_scheduler_type='cosine',
    evaluation_strategy="epoch", per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size*2,
    num_train_epochs=num_epochs, weight_decay=0.01, report_to='none')
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(TARGET_LABELS))
    return Trainer(model, training_args, train_dataset=datasets_dict['train'], eval_dataset=datasets_dict['test'],
                   tokenizer=tokenizer)

In [84]:
trainer = get_trainer()

Downloading pytorch_model.bin: 100%|██████████| 286M/286M [00:05<00:00, 48.5MB/s] 
Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expec

In [None]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Load data into dataloaders with transformations

In [None]:
# Train and validate model

In [None]:
# Train on complete training data

In [None]:
# Make predictions on test data and create submission file