# Imports and Configuration !NB! This notebook was ran once for an example of completion

In [1]:
#https://pytorch-lightning.readthedocs.io/en/stable/advanced/transfer_learning.html and other sources from site

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from transformers import AutoModel,AutoTokenizer, AutoConfig
import torch.nn as nn
import math
import torch.nn.functional as F
import torch
import pytorch_lightning as pl
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from pytorch_lightning.callbacks.early_stopping import EarlyStopping


os.environ["TOKENIZERS_PARALLELISM"] = "false"

class CONF:
    model = "/kaggle/input/fb3models/microsoft-deberta-v3-base/model"
    tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/fb3models/microsoft-deberta-v3-base/tokenizer")
    max_token_len = 1000
    batch_size = 1
    train_path =  "/kaggle/input/feedback-prize-english-language-learning/train.csv"
    test_path =  "/kaggle/input/feedback-prize-english-language-learning/test.csv"
    num_workers =  2
    lr = 3e-4
    train_size = 0
    n_epochs = 1
    attributes = ['cohesion', 'syntax','vocabulary','phraseology','grammar','conventions']

conf = CONF()


In [2]:
train = pd.read_csv(conf.train_path)
test = pd.read_csv(conf.test_path)
train.full_text = train.full_text.str.strip().replace('\r\n', ' ')
train.full_text = train.full_text.str.replace('\n\n', ' ')
test.full_text = test.full_text.str.strip().replace('\r\n', ' ')
test.full_text = test.full_text.str.replace('\n\n', ' ')
train['text_words'] = train['full_text'].str.split(" ").apply(len)
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,text_words
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,261
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,533
2,00299B378633,"Dear, Principal If u change the school policy ...",3.0,3.5,3.0,3.0,3.0,2.5,320
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,728
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,234
...,...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5,179
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0,465
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0,257
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5,510


In [3]:
test

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [4]:
print(train.full_text[0])
print(conf.tokenizer(train.full_text[0]))

I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there house they'll be pay more attention. they will be comfortable at home. The hardest part of school is getting ready. you wake up go brush your teeth and go to your closet and look at your cloths. after you think you picked a outfit u go look in the mirror and youll either not like it or you look and see a stain. Then you'll have to change. with the online classes you can wear anything and stay home and you wont need to stress about what to wear. most students usually take showers before school. they either take it before they sleep or when they wake up. some students do both to smell good. that causes them do miss the bus and effects on there lesson time cause they come late to school. when u have online classes u wont need to miss lessons cause you can get everything set up and go take a s

# Dataset

In [5]:
class Essays_Dataset(Dataset):
    def __init__(self, data, tokenizer = conf.tokenizer, attributes = conf.attributes, max_token_len=conf.max_token_len):
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        full_text = str(item.full_text)
        attributes = torch.FloatTensor(item[self.attributes])
        tokens = self.tokenizer.encode_plus(full_text,
                                            add_special_tokens=True,
                                            return_tensors='pt',
                                            truncation=True,
                                            padding='max_length',
                                            max_length=self.max_token_len,
                                            return_attention_mask = True)
        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': attributes}


In [6]:
class Test_Dataset(Dataset):
    def __init__(self, data, tokenizer = conf.tokenizer, max_token_len=conf.max_token_len):
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        full_text = str(item.full_text)
        tokens = self.tokenizer.encode_plus(full_text,
                                            add_special_tokens=True,
                                            return_tensors='pt',
                                            truncation=True,
                                            padding='max_length',
                                            max_length=self.max_token_len,
                                            return_attention_mask = True)
        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': [0,0,0,0,0,0]}

# Model, loss, pooling, etc..

In [7]:
# borrowed from https://www.kaggle.com/code/lextoumbourou/feedback3-eda-hf-custom-trainer-sift
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [8]:
class Essays_Grader(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(conf.model)

        self.bert = AutoModel.from_pretrained(conf.model, return_dict = True)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, 6)
            
        torch.nn.init.kaiming_uniform_(self.classifier.weight)

        self.pool = MeanPooling()
        self.loss = nn.CrossEntropyLoss()
        
    def forward(self, input_ids, attention_mask = None, labels = None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs[0]
        pooled = self.pool(last_hidden_states, attention_mask)
        output = self.classifier(pooled)
        return output, labels
        
        
    def training_step(self, batch, batch_idx):
        outputs, labels = self(**batch)
        train_loss = self.loss(outputs.view(-1, 6), labels.view(-1, 6))
        self.log("train_loss", train_loss, prog_bar = True, logger=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        outputs, labels = self(**batch)
        val_loss = self.loss(outputs.view(-1, 6), labels.view(-1, 6))
        self.log("validation loss", val_loss, prog_bar = True, logger=True)
        return val_loss

    def predict_step(self, batch, batch_idx):
        outputs, labels = self(**batch)
        return outputs

    def configure_optimizers(self):
        optimizer = torch.optim.Adadelta(self.bert.parameters(), lr=conf.lr)
        return [optimizer], [torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.7)]

 # Training model using trainer

In [9]:
# trainer and fit
model = Essays_Grader()#.load_from_checkpoint("/kaggle/input/debertalarge/results/lightning_logs/version_0/checkpoints/epoch8-step17604.ckpt")
trainer = pl.Trainer(max_epochs=conf.n_epochs,accelerator='gpu', devices=2)#, callbacks=[EarlyStopping(monitor="validation loss", mode="min")])

train_loader = DataLoader(Essays_Dataset(train),batch_size = conf.batch_size, num_workers=conf.num_workers,  shuffle=False)
# valid_loader = DataLoader(Essays_Dataset(val))
conf.train_size = len(train_loader)

trainer.fit(model, train_loader)#, valid_loader)


  "num_workers>0, persistent_workers=False, and strategy=ddp_spawn"


Training: 0it [00:00, ?it/s]



In [10]:
# %reload_ext tensorboard
# %tensorboard --logdir ./lightning_logs/

# Predict with model

In [11]:
def predict_essay_grades(model, dl):
    trainer = pl.Trainer( accelerator='gpu', devices=1)
    predictions = trainer.predict(model, dl, return_predictions=True)
    flattened_predictions = np.stack([torch.Tensor(p) for batch in predictions for p in batch])
    return flattened_predictions

In [12]:
# model = Essays_Grader.load_from_checkpoint("/kaggle/input/debertalarge/results/lightning_logs/version_0/checkpoints/epoch8-step17604.ckpt")
predictions = predict_essay_grades(model, DataLoader(Test_Dataset(test)))

Predicting: 0it [00:00, ?it/s]

In [13]:
predictions

array([[ 0.12729052,  0.17398739,  0.3125157 ,  0.270515  ,  0.05762395,
         0.13382167],
       [ 0.25444916,  0.11131373,  0.07109548,  0.105574  , -0.01765854,
         0.1433475 ],
       [ 0.11680218,  0.14191961,  0.17819871,  0.2854613 ,  0.2518638 ,
         0.36729282]], dtype=float32)

In [14]:
for i,at in enumerate(conf.attributes):
    test[at] = predictions[:,i]

In [15]:
test = test.drop(columns=['full_text'])
test.to_csv("submission.csv", index=False)


In [16]:
test

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,0.127291,0.173987,0.312516,0.270515,0.057624,0.133822
1,000BAD50D026,0.254449,0.111314,0.071095,0.105574,-0.017659,0.143348
2,00367BB2546B,0.116802,0.14192,0.178199,0.285461,0.251864,0.367293
