In [None]:
import torch 
from torch import nn 
import torch.nn.functional as F
import numpy as np 
import pandas as pd 
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pytorch_lightning as pl 
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import model_selection
import transformers
from transformers import get_linear_schedule_with_warmup, AdamW

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
import os
# Explicitly opt in to parallelism inside the Hugging Face tokenizers library.
# NOTE(review): the DataLoaders created later in this notebook use num_workers=4;
# forcing this to "true" presumably disables the library's own fork safety
# fallback, so "false" may be the safer value -- confirm against the tokenizers docs.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Data Loading and Preprocessing

In [None]:
# Read the competition CSVs, keeping only the columns this notebook uses:
# id / excerpt / target / standard_error for train, id / excerpt for test.
df = pd.read_csv(
    "../input/commonlitreadabilityprize/train.csv",
    usecols=["id", "excerpt", "target", "standard_error"],
)
test_df = pd.read_csv(
    "../input/commonlitreadabilityprize/test.csv",
    usecols=["id", "excerpt"],
)
print("train shape", df.shape)

In [None]:
# Remove the outlier rows (standard_error exactly 0), then plot
# standard_error against target for the rows that remain.
has_nonzero_error = df["standard_error"].ne(0)
df = df.loc[has_nonzero_error]
plt.scatter(x=df["target"], y=df["standard_error"])


In [None]:
# Strip newline characters and turn "'s" into "s" in every excerpt.
def prep_text(text_df):
    """Clean a Series of excerpts and return the result as a NumPy array.

    Two replacements are applied in order: every newline character is deleted,
    then every "'s" becomes "s".

    NOTE(review): newlines are removed without inserting a space, so the words
    on either side of a line break end up joined together.
    """
    return (
        text_df
        .str.replace("\n", "", regex=False)
        .str.replace("\'s", r"s", regex=True)
        .values
    )
# Clean the excerpt column of both the training and the test frame in place.
for frame in (df, test_df):
    frame["excerpt"] = prep_text(frame["excerpt"])

In [None]:
# Largest whitespace-separated word count among the training excerpts; it is
# passed to the tokenizer as max_len further down.
max_words = df["excerpt"].str.split().str.len().max()
print("maximum words in instance:", max_words)

# RoBERTa test

In [None]:
# Load the RoBERTa-large tokenizer from the local input directory and look at
# what it produces for the excerpt stored under index label 0.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "../input/roberta-transformers-pytorch/roberta-large/"
)
out = tokenizer(df.loc[0, "excerpt"])
print(out)

In [None]:
# Same probe as the previous cell, plus two more views of the same excerpt:
# the ids from encode(), and those ids passed through
# build_inputs_with_special_tokens().
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "../input/roberta-transformers-pytorch/roberta-large/"
)
first_excerpt = df.loc[0, "excerpt"]
out = tokenizer(first_excerpt)
print(out)
encoded_ids = tokenizer.encode(first_excerpt)
print(encoded_ids)
print(tokenizer.build_inputs_with_special_tokens(tokenizer.encode(first_excerpt)))

# Create Folds

In [None]:
def create_folds(data, num_splits, random_state=None):
    """Shuffle ``data`` and assign each row a stratified fold id in a ``kfold`` column.

    The continuous ``target`` column is cut into equal-width bins (bin count from
    Sturges' rule) and the bin ids are used as the stratification labels.

    Args:
        data: DataFrame with a numeric ``target`` column. It is not modified;
            all work happens on a shuffled copy.
        num_splits: number of folds.
        random_state: optional seed for the row shuffle. ``None`` (the default)
            gives a different shuffle on every call.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and an integer ``kfold``
        column holding the validation fold of each row.
    """
    # Shuffle into a new frame first so the caller's frame never receives the
    # helper column (the input is frequently a filtered slice of another frame).
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled["kfold"] = -1

    # Number of bins by Sturges' rule, taking the floor of the value.
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))

    # Bin ids are kept in a separate Series instead of a temporary column.
    target_bins = pd.cut(shuffled["target"], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=num_splits)

    # Stratify on the bins, not on the raw targets. The positional indices from
    # split() are valid labels for .loc because the index was reset above.
    for fold_id, (_, valid_idx) in enumerate(kf.split(X=shuffled, y=target_bins.values)):
        shuffled.loc[valid_idx, "kfold"] = fold_id

    return shuffled

# read training data (all columns this time)
df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the frame returned by read_csv.
df = df[df['standard_error']!=0].copy()

# Re-reading the CSV throws away the cleaning done by prep_text earlier in the
# notebook. Apply it again so the training excerpts go through the same
# preprocessing as test_df and as the text max_words was measured on.
df["excerpt"] = prep_text(df["excerpt"])

# create folds
df = create_folds(df, num_splits=5)

# RoBERTa Model and Training Module

In [None]:
# Shape probe for the convolutional head defined in BertModel below: a random
# (batch=8, 205, 768) tensor goes through the same Conv1d / MaxPool1d settings
# (kernel_size=3, stride=1, padding=3, pool=3) and the size is printed after
# each stage. The last dimension of the final size is the in_features that
# BertModel.fc_conv needs.
m1 = nn.Conv1d(205, 128, kernel_size=3, stride=1, padding=3)
m2 = nn.MaxPool1d(3)
m3 = nn.Conv1d(128, 64, kernel_size=3, stride=1, padding=3)
m4 = nn.MaxPool1d(3)
m5 = nn.Conv1d(64, 1, kernel_size=3, stride=1, padding=3)
m6 = nn.MaxPool1d(3)

# Named so that it no longer shadows the builtin `input`.
dummy_hidden = torch.randn(8, 205, 768)

output = m2(m1(dummy_hidden))
print(output.size())
output = m4(m3(output))
print(output.size())
output = m6(m5(output))
print(output.size())

In [None]:
# Training configuration shared by the cells below.
BATCH_SIZE = 16
EPOCHS = 100

# unique() returns the fold ids in order of first appearance, which depends on
# the shuffle inside create_folds; sort them so folds are processed as 0, 1, 2, ...
FOLDS = np.sort(df.kfold.unique())
NUM_FOLDS = df.kfold.nunique()

# Every fold trains on all rows except its own validation split, so the number
# of optimizer steps is derived from the largest training split rather than
# from the whole frame. ceil() counts the final partial batch of each epoch.
_largest_train_split = len(df) - int(df.kfold.value_counts().min())
NUM_TRAIN_STEPS = int(np.ceil(_largest_train_split / BATCH_SIZE)) * EPOCHS
NUM_WARMUP_STEPS = 0

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss with a small constant added under the root."""

    def __init__(self):
        super().__init__()
        # Added to the MSE before the square root is taken (see forward).
        self.eps = 1e-8

    def forward(self, output, target):
        """Return sqrt(mse_loss(output, target) + eps)."""
        mean_sq = F.mse_loss(output, target)
        shifted = mean_sq + self.eps
        return shifted.sqrt()

In [None]:
class BertModel(pl.LightningModule):
    """Pretrained encoder followed by a small Conv1d head predicting two values.

    The encoder is loaded from the local ``roberta-base`` directory. Its first
    output is passed through three Conv1d -> ReLU -> MaxPool1d stages and a
    final linear layer with two outputs; the training labels built in this
    notebook are ``[target, standard_error]``.

    The head is sized for inputs padded to exactly 205 tokens (``conv1`` takes
    the sequence axis as its channel axis) and, presumably, an encoder hidden
    size of 768, which is what ``fc_conv``'s 30 input features correspond to.
    """

    def __init__(self):
        super().__init__()
        self.model = transformers.AutoModel.from_pretrained("../input/roberta-transformers-pytorch/roberta-base/")

        # Neither of these two layers is called in forward(). They remain
        # registered so the module's parameter names (and therefore its
        # state_dict keys) stay exactly as before.
        self.drop = nn.Dropout(0.3)
        self.fc = nn.Linear(768,2)  # output to 2 dimensions, targets and errors

        # Convolutional head. conv1's 205 input channels are the token
        # positions of the encoder output, so the tokenizer's max_length has to
        # be 205 for forward() to work.
        self.conv1 = nn.Conv1d(205, 128, kernel_size=3, stride=1, padding=3)
        self.conv2 = nn.Conv1d(128, 64, kernel_size=3, stride=1, padding=3)
        self.conv3 = nn.Conv1d(64, 1, kernel_size=3, stride=1, padding=3)
        self.ReLU = nn.ReLU()
        self.pool = nn.MaxPool1d(3)
        # Each conv widens the last axis by 4 and each pool divides it by 3
        # (floored): 768 -> 772 -> 257 -> 261 -> 87 -> 91 -> 30.
        self.fc_conv = nn.Linear(30,2)

    def forward(self,inputs):
        """Run the encoder and the convolutional head.

        Args:
            inputs: dict of keyword arguments for the encoder (the dataset in
                this notebook supplies ``input_ids`` and ``attention_mask``).

        Returns:
            Output of ``fc_conv``; its last axis has size 2 and, because conv3
            has a single output channel, axis 1 has size 1.

        Raises:
            ValueError: if axis 1 of the encoder output does not match the
                number of input channels ``conv1`` was built with.
        """
        out = self.model(**inputs) # output from BERT model
        # assumes out[0] is the last hidden state, (batch, seq_len, hidden) -- TODO confirm
        last_hiddens = out[0]
        if last_hiddens.size(1) != self.conv1.in_channels:
            # Conv1d would fail here anyway; this message names the real cause.
            raise ValueError(
                f"encoder output has {last_hiddens.size(1)} positions on axis 1 but the "
                f"convolutional head expects {self.conv1.in_channels}; "
                f"tokenize with max_length={self.conv1.in_channels}"
            )
        out = self.conv1(last_hiddens)
        out = self.ReLU(out)
        out = self.pool(out)
        out = self.conv2(out)
        out = self.ReLU(out)
        out = self.pool(out)
        out = self.conv3(out)
        out = self.ReLU(out)
        out = self.pool(out)
        return self.fc_conv(out)

    def configure_optimizers(self):
        """Build AdamW (no weight decay on biases / LayerNorm weights) and a linear schedule.

        Returns:
            A list with the optimizer and a list with one scheduler config.
        """
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, # original : 0.01
            {'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5) # original : 5e-5
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=NUM_WARMUP_STEPS, num_training_steps=NUM_TRAIN_STEPS)
        # NUM_TRAIN_STEPS counts optimizer steps, so the scheduler has to
        # advance once per batch. A bare scheduler is stepped by Lightning once
        # per epoch, which would leave the learning rate almost constant.
        return [optimizer],[{"scheduler": scheduler, "interval": "step"}]

    def loss_fn(self,output,target):
        """RMSE between predictions and labels, both reshaped to (-1, 2)."""
        return RMSELoss()(output.view(-1,2),target.view(-1,2))

    def training_step(self,batch,batch_idx):
        """Return the loss for one training batch (keys: "inputs", "label")."""
        inputs = batch["inputs"]
        labels = batch["label"]
        output = self(inputs)
        loss = self.loss_fn(output,labels)
        return loss

    def validation_step(self,batch,batch_idx):
        """Compute the loss for one validation batch and log it as "val_loss"."""
        inputs = batch["inputs"]
        labels = batch["label"]
        output = self(inputs)
        loss = self.loss_fn(output,labels)
        # "val_loss" is the metric the EarlyStopping callback monitors.
        self.log("val_loss",loss,prog_bar=True)

# Tokenize Dataset and Dataloader

In [None]:
class BertDataset(Dataset):
    """Dataset that tokenizes one excerpt per item and pairs it with its label row."""

    def __init__(self,texts,labels,max_len):
        """Store the inputs and load the roberta-base tokenizer from the local directory.

        Args:
            texts: indexable collection of excerpt strings.
            labels: array whose first axis lines up with ``texts``.
            max_len: length every example is padded or truncated to.
        """
        super().__init__()
        self.texts, self.max_len, self.labels = texts, max_len, labels
        self.tokenizer = transformers.AutoTokenizer.from_pretrained("../input/roberta-transformers-pytorch/roberta-base/")

    def __len__(self):
        """Number of examples, read from the first axis of ``labels``."""
        n_examples = self.labels.shape[0]
        return n_examples

    def __getitem__(self,idx):
        """Return ``{"inputs": {...}, "label": tensor}`` for example ``idx``."""
        # Collapse every run of whitespace to a single space before tokenizing.
        words = self.texts[idx].split()
        normalised = " ".join(words)
        encoded = self.tokenizer(
            normalised,
            return_tensors="pt",
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        # return_tensors="pt" adds a leading axis; [0] takes the single row.
        model_inputs = {field: encoded[field][0] for field in ("input_ids", "attention_mask")}
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.float)
        return {"inputs": model_inputs, "label": label_tensor}

# Trainer

In [None]:
# Train one model per fold: the rows of the current fold are the validation
# set, all remaining rows are the training set.
for fold in FOLDS:
    print("Fold :",fold)
    in_valid_fold = df.kfold == fold
    train_df, valid_df = df[~in_valid_fold], df[in_valid_fold]

    # Labels are two columns per row: [target, standard_error].
    train_labels = np.array([train_df.target.values, train_df.standard_error.values]).T
    valid_labels = np.array([valid_df.target.values, valid_df.standard_error.values]).T
    train_dataset = BertDataset(train_df.excerpt.values, train_labels, max_len=max_words)
    valid_dataset = BertDataset(valid_df.excerpt.values, valid_labels, max_len=max_words)

    train_dloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    valid_dloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    bert_model = BertModel()
    stop_on_plateau = EarlyStopping(monitor="val_loss", mode="min", patience=15)
    trainer = pl.Trainer(
        gpus=-1,
        max_epochs=EPOCHS,
        callbacks=[stop_on_plateau],
        checkpoint_callback=False,
    )
    trainer.fit(
        model=bert_model,
        train_dataloader=train_dloader,
        val_dataloaders=valid_dloader,
    )
    # NOTE(review): with checkpoint_callback=False this presumably stores the
    # weights as they are when fit() returns, not those of the best val_loss
    # epoch -- confirm.
    trainer.save_checkpoint(f"checkpoint_{fold}fold.ckpt")

# Check Predictions on the Training Data

In [None]:
# Sum, over all fold checkpoints, the predictions for every training row.
# Column 0 of the model output is accumulated in target_prediction and
# column 1 in error_prediction; the plotting cells divide by the fold count.
target_prediction = np.zeros(df.shape[0])
error_prediction = np.zeros(df.shape[0])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The data is identical for every fold, so the dataset and loader are built once.
# The labels here are the real [target, standard_error] pairs; they are not
# used for the predictions themselves.
check_dataset = BertDataset(df.excerpt.values,labels = (np.array([df.target.values,df.standard_error.values]).T),max_len=max_words)
check_dataloader = DataLoader(check_dataset,batch_size=BATCH_SIZE,shuffle=False,num_workers=4)

for fold in FOLDS:
    print("Fold:",fold)
    loaded_model = BertModel.load_from_checkpoint(f"./checkpoint_{fold}fold.ckpt",map_location=device)
    loaded_model.to(device)
    loaded_model.eval()
    out_target = []
    out_error = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for batch in check_dataloader:
            x = {key: value.to(device) for key, value in batch["inputs"].items()}
            # Compare against the selected device instead of requiring CUDA, so
            # the CPU fallback chosen above actually works.
            assert x["input_ids"].device.type == device.type, f"data is not in model device({loaded_model.device.type})"
            out = loaded_model(x)
            # Drop the singleton axis left by the single-channel last conv layer.
            out = torch.squeeze(out, dim=1)
            out_target.extend(out[:,0].cpu().numpy())
            out_error.extend(out[:,1].cpu().numpy())
    target_prediction += np.hstack(out_target)
    error_prediction += np.hstack(out_error)

In [None]:
# target_prediction holds the sum over all fold models; divide by the number
# of folds (not a hard-coded 5) to plot the mean prediction against the target.
plt.scatter(target_prediction/NUM_FOLDS, df['target'])

In [None]:
# Both arrays hold sums over all fold models; divide by the number of folds
# (not a hard-coded 5) to plot mean predicted target against mean predicted error.
plt.scatter(target_prediction/NUM_FOLDS, error_prediction/NUM_FOLDS)

# Load Weights and Inference

In [None]:
# Test-set loader. BertDataset needs a label array, so an all-ones placeholder
# with two columns is supplied.
placeholder_labels = np.ones([test_df.shape[0], 2])
test_dataset = BertDataset(
    test_df.excerpt.values,
    labels=placeholder_labels,
    max_len=max_words,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

In [None]:
# Sum, over all fold checkpoints, the predictions for every test row.
# Column 0 of the model output is accumulated in target_prediction and
# column 1 in error_prediction; the submission cell divides by NUM_FOLDS.
target_prediction = np.zeros(test_df.shape[0])
error_prediction = np.zeros(test_df.shape[0])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The test data is identical for every fold, so the dataset and loader are
# built once. BertDataset needs labels, so all-ones placeholders are supplied.
test_dataset = BertDataset(test_df.excerpt.values,labels = np.ones([test_df.shape[0],2]),max_len=max_words)
test_dataloader = DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=False,num_workers=4)

for fold in FOLDS:
    print("Fold:",fold)
    loaded_model = BertModel.load_from_checkpoint(f"./checkpoint_{fold}fold.ckpt",map_location=device)
    loaded_model.to(device)
    loaded_model.eval()
    out_target = []
    out_error = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for batch in test_dataloader:
            x = {key: value.to(device) for key, value in batch["inputs"].items()}
            # Compare against the selected device instead of requiring CUDA, so
            # the CPU fallback chosen above actually works.
            assert x["input_ids"].device.type == device.type, f"data is not in model device({loaded_model.device.type})"
            out = loaded_model(x)
            # Drop the singleton axis left by the single-channel last conv layer.
            out = torch.squeeze(out, dim=1)
            out_target.extend(out[:,0].cpu().numpy())
            out_error.extend(out[:,1].cpu().numpy())
    target_prediction += np.hstack(out_target)
    error_prediction += np.hstack(out_error)

In [None]:
# Average the summed fold predictions, store them on test_df, and write every
# column except the excerpt text to submission.csv.
mean_target = target_prediction / NUM_FOLDS
test_df["target"] = mean_target
sub = test_df.drop(columns=["excerpt"])
sub.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame (test_df without the excerpt column) as the cell output.
sub