In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/config.json
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/merges.txt
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/vocab.json
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/tokenizer_config.json
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/pytorch_model.bin
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/special_tokens_map.json
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/added_tokens.json
/kaggle/input/roberta-transformers-pytorch/roberta-base/config.json
/kaggle/input/roberta-transformers-pytorch/roberta-base/merges.txt
/kaggle/input/roberta-transformers-pytorch/roberta-base/vocab.json
/kaggle/input/roberta-transformers-pytorch/roberta-base/tokenizer_config.json
/kaggle/input/roberta-transformers-pytorch/roberta-base/pytorch_model.bin
/kaggle/input/roberta-transformers-pytorch/roberta-base/special_tokens_map.json
/kaggle/input/ro

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import re
import seaborn as sns
from transformers import AutoModel,AutoConfig,AutoTokenizer,get_cosine_schedule_with_warmup
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import random
import os
import numpy as np
import torch
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import joblib


In [3]:
# Read the competition test set and the submission template, in that order.
test, sample = map(
    pd.read_csv,
    (
        "/kaggle/input/commonlitreadabilityprize/test.csv",
        "/kaggle/input/commonlitreadabilityprize/sample_submission.csv",
    ),
)

In [4]:
def feature(excerpt, stop_words):
    """Compute five hand-crafted text statistics for one excerpt.

    Args:
        excerpt: The text to analyse (a ``str``).
        stop_words: Container of lower-case words to exclude from the
            "processed" word count.

    Returns:
        A 5-tuple ``(dialogue_counts, num_sentence, processed_num_words,
        text_shrinkage, avg_sentence_length)`` where

        * ``dialogue_counts`` is the number of double-quote characters / 2,
        * ``num_sentence`` is ``len(sent_tokenize(excerpt))``,
        * ``processed_num_words`` counts tokens not in ``stop_words``,
        * ``text_shrinkage`` is processed words / all words,
        * ``avg_sentence_length`` is all words / sentences.

        Both ratios are ``0.0`` when their denominator is zero (an excerpt
        with no alphabetic words or no sentences) instead of raising
        ``ZeroDivisionError``.
    """
    dialogue_counts = excerpt.count('"') / 2
    num_sentence = len(sent_tokenize(excerpt))

    # Everything except ASCII letters becomes a space before word tokenization.
    words = word_tokenize(re.sub("[^a-zA-Z]", " ", excerpt).lower())
    initial_num_words = len(words)
    processed_num_words = sum(1 for word in words if word not in stop_words)

    # Guard the ratios: digit/punctuation-only or empty text has zero words.
    text_shrinkage = processed_num_words / initial_num_words if initial_num_words else 0.0
    avg_sentence_length = initial_num_words / num_sentence if num_sentence else 0.0

    return dialogue_counts, num_sentence, processed_num_words, text_shrinkage, avg_sentence_length

In [5]:
# Character count of each excerpt, taken before the newline replacement below.
test["len"] = test.excerpt.apply(len)
# Replace newlines with spaces before the text reaches feature().
test["excerpt"] = test.excerpt.apply(lambda x: x.replace("\n"," "))
stop_words = set(stopwords.words("english"))
# feature() returns one 5-tuple per excerpt; hold the tuples in a temporary column.
test['feature'] = test.excerpt.apply(feature, stop_words= stop_words)
# Transpose the per-row tuples into five separate columns, in feature()'s return order.
# NOTE(review): get_prediction reads these columns by position (df.iloc[:,5:]), so the
# column order created here presumably has to match what the saved scalers were fit on —
# confirm before reordering or adding columns.
test["dialogue"], test["num_sentence"], test["num_processed_words"], test['text_shrinkage'], \
test["avg_sentence_length"] = zip(*test.feature)
# The temporary tuple column is no longer needed.
test.drop(columns="feature", inplace=True)

In [6]:
# Inference settings shared by the cells below.
config = {
    'lr': 2e-5,
    'wd':0.01,
    'batch_size':16,
    'valid_step':10,
    'max_len':500,
    'epochs':5,
    'nfolds':5,
    'seed':9527,
    'model_path':'../input/roberta-transformers-pytorch/roberta-base',
}

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.

    Side effects:
        Sets ``os.environ['PYTHONHASHSEED']`` and the global
        ``torch.backends.cudnn`` flags.
    """
    random.seed(seed)
    # Correct variable name is PYTHONHASHSEED; it is read at interpreter start,
    # so this mainly affects subprocesses (e.g. DataLoader workers that spawn).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which
    # defeats the deterministic flag above; keep it off for reproducibility.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

In [7]:
class CLRPDataset(Dataset):
    """Map-style dataset pairing tokenized excerpts with a per-row feature vector.

    Each item is ``(encoding, feature)`` or, when ``train`` is true,
    ``(encoding, feature, target)``. ``feature`` is indexed by row position.
    """

    def __init__(self,df,tokenizer,feature,max_len=256, train=True):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.feature = feature
        self.train = train
        self.excerpt = df['excerpt'].to_numpy()
        # Targets are only read (and only required in df) in training mode.
        if train:
            self.targets = df['target'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        # Pad/truncate every excerpt to exactly max_len tokens, returned as tensors.
        tokenizer_kwargs = dict(
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        encoded = self.tokenizer(self.excerpt[idx], **tokenizer_kwargs)
        extra = torch.tensor(self.feature[idx], dtype=torch.float)

        if not self.train:
            return encoded, extra

        label = torch.tensor(self.targets[idx], dtype=torch.float)
        return encoded, extra, label

In [8]:
class Model(nn.Module):
    """RoBERTa regressor on the first-token states of the last four hidden layers.

    The first-token vector of each of the last four entries of the encoder's
    hidden-state tuple is concatenated with the 5 extra features and mapped to
    one output by a linear layer.
    """

    def __init__(self,path):
        """Load config and encoder from ``path`` with hidden states exposed."""
        super(Model,self).__init__()
        self.config = AutoConfig.from_pretrained(path)
        self.config.update({'output_hidden_states':True,"hidden_dropout_prob": 0.0})
        self.roberta = AutoModel.from_pretrained(path,config=self.config)
        # The original passed a positional `1` here, which is nn.Linear's `bias`
        # argument; spell it out so it is not mistaken for a dimension.
        self.linear = nn.Linear(self.config.hidden_size*4+5, 1, bias=True)

    def forward(self,feature, **xb):
        """Return one prediction per row.

        Args:
            feature: Extra features, concatenated on the last dim after the
                pooled encoder vector.
            **xb: Keyword inputs forwarded unchanged to the encoder.
        """
        # Element 2 of the encoder output is indexed as the tuple of per-layer
        # hidden states (enabled via output_hidden_states above).
        hidden_states = self.roberta(**xb)[2]
        # Slice token 0 from each of the last four layers directly instead of
        # stacking every layer and concatenating full sequences first; the
        # resulting values are identical but far less memory is materialized.
        cls_states = [hidden_states[layer][:, 0] for layer in (-1, -2, -3, -4)]
        x = torch.cat(cls_states + [feature], -1)
        return self.linear(x)

In [9]:
class AttentionHead(nn.Module):
    """Attention pooling over dim 1 of the input.

    A score per position is computed as ``V(tanh(W(features)))``, normalised
    with softmax along dim 1, and used to form a weighted sum of ``features``.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        # Keep W before V so parameter creation order is unchanged.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        scores = self.V(self.W(features).tanh())
        weights = scores.softmax(dim=1)
        # Weighted sum along dim 1 collapses that axis.
        return (weights * features).sum(dim=1)
    
class ATHeadModel(nn.Module):
    """Encoder + attention-pooling head + linear regressor over pooled state and 5 extra features."""

    def __init__(self,path):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(path)
        self.config = AutoConfig.from_pretrained(path)
        width = self.config.hidden_size
        self.head = AttentionHead(width, width)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(width + 5, 1)

    def forward(self,feature,**xb):
        # Element 0 of the encoder output is pooled by the attention head.
        token_states = self.roberta(**xb)[0]
        pooled = self.dropout(self.head(token_states))
        # Append the extra features on the last dim before the regressor.
        return self.linear(torch.cat((pooled, feature), -1))

In [10]:
class AttentionPoolingModel(nn.Module):
    """Encoder regressor that attention-pools per-layer token-0 states.

    Token 0 of hidden-state entries 1..num_hidden_layers is collected, pooled
    by `attention` using the learned parameters `q` and `w_h`, concatenated
    with the 5 extra features and mapped to one output by a linear layer.
    """
    def __init__(self, path):
        """Load the encoder from `path` and create the pooling/regression parameters."""
        super(AttentionPoolingModel, self).__init__() 
        self.config = AutoConfig.from_pretrained(path)
        self.config.update({'output_hidden_states':True,"hidden_dropout_prob": 0.0})
        self.roberta = AutoModel.from_pretrained(path,config=self.config)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.config.hidden_size+5,1)

        # q (1, hidden) and w_h (hidden, hidden) start as N(0, 0.1) draws from NumPy's global RNG.
        q_t = np.random.normal(loc=0.0, scale=0.1, size=(1, self.config.hidden_size))
        self.q = nn.Parameter(torch.from_numpy(q_t).float())
        w_ht = np.random.normal(loc=0.0, scale=0.1, size=(self.config.hidden_size, self.config.hidden_size))
        self.w_h = nn.Parameter(torch.from_numpy(w_ht).float())

    def forward(self, feature, **xb):
        """Run the encoder on `xb`, pool across layers, append `feature`, and regress to one value."""
        x = self.roberta(**xb)
        # Element 2 of the encoder output is stacked along a new leading (layer) axis.
        x = torch.stack(x[2])
        # Token 0 of entries 1..num_hidden_layers (entry 0 is skipped), stacked with the layer axis LAST.
        x = torch.stack([x[layer_i][:, 0].squeeze() for layer_i in range(1, self.config.num_hidden_layers+1)], dim=-1)
        # NOTE(review): the stack above yields (..., hidden, layers) while this view asks for
        # (-1, layers, hidden). `view` reshapes without permuting axes, so this looks like a
        # memory reinterpretation rather than a transpose. The checkpoints loaded in
        # get_prediction were presumably trained with this exact forward — confirm, and do
        # not "fix" this without retraining, since it would change the predictions.
        x = x.view(-1, self.config.num_hidden_layers, self.config.hidden_size)
        x = self.attention(x)
        x = self.dropout(x)
        x = torch.cat((x,feature), -1)
        x = self.linear(x)
        return x

    def attention(self, h):
        """Pool `h` over its second-to-last axis with softmax(q·h) weights, then project with w_h^T."""
        # Scores: q against every row of h, softmax-normalised over the last axis.
        v = torch.matmul(self.q, h.transpose(-2, -1)).squeeze(1)
        v = F.softmax(v, -1)
        # Weighted sum of the rows of h, laid out as a column vector for the projection.
        v_temp = torch.matmul(v.unsqueeze(1), h).transpose(-2, -1)
        v = torch.matmul(self.w_h.transpose(1, 0), v_temp).squeeze(2)
        return v

In [11]:
def get_prediction(df,Fold,MODEL,path,model_path,device='cuda'):
    """Average the predictions of ``Fold`` fold checkpoints of one model class.

    Args:
        df: DataFrame with an ``excerpt`` column; columns from position 5
            onward are used as the numeric features fed to each fold's scaler.
        Fold: Number of fold checkpoints to load (indices ``0..Fold-1``).
        MODEL: Model class, constructed as ``MODEL(model_path)``.
        path: Checkpoint path prefix; fold ``f`` is read from
            ``path + f"{f}/model.bin"`` (the index is appended directly).
        model_path: Pretrained model directory for the model and tokenizer.
        device: Device used for loading and inference.

    Returns:
        ``np.ndarray`` of length ``len(df)`` holding the mean prediction
        across folds.
    """
    # Running sum over folds. The original reused this name for per-batch
    # outputs, which replaced the accumulator with a list and made the final
    # division fail.
    fold_sum = np.zeros(len(df))

    # The tokenizer depends only on model_path, so build it once for all folds.
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    for f in range(Fold):
        model = MODEL(model_path)
        model.load_state_dict(torch.load(path+f"{f}/model.bin",map_location=device))
        model.eval()
        model.to(device)

        # NOTE: joblib.load unpickles the file; only load scalers from a trusted source.
        scaler = joblib.load(f"../input/clrp-models/scaler/scaler_fold{f}")
        df_feature = scaler.transform(df.iloc[:,5:].to_numpy())

        test_ds = CLRPDataset(df,tokenizer,df_feature, config['max_len'],train=False)
        test_dl = DataLoader(test_ds,
                            batch_size = config["batch_size"],
                            shuffle=False,
                            num_workers = 4,
                            pin_memory=True)

        predictions = list()
        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            for inputs, extra in test_dl:
                # Flatten each encoding tensor to (batch, -1) before the forward pass.
                inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
                batch_out = model(extra.to(device), **inputs)
                predictions.extend(batch_out.cpu().numpy().ravel().tolist())

        fold_sum += np.array(predictions)

        # Drop this fold's model before clearing the CUDA cache.
        del model
        torch.cuda.empty_cache()
    return fold_sum/Fold

In [12]:
# One fold-averaged prediction vector per model family, run in this order.
answer1 = get_prediction(
    df=test,
    Fold=config["nfolds"],
    MODEL=Model,
    path="../input/clrp-models/concat_model",
    model_path=config['model_path'],
    device=device,
)
answer2 = get_prediction(
    df=test,
    Fold=config["nfolds"],
    MODEL=ATHeadModel,
    path="../input/clrp-models/atHead_model",
    model_path=config['model_path'],
    device=device,
)
answer3 = get_prediction(
    df=test,
    Fold=config["nfolds"],
    MODEL=AttentionPoolingModel,
    path="../input/clrp-models/atPool_model",
    model_path=config['model_path'],
    device=device,
)

In [13]:
# Equal-weight blend of the three model families, written out as the submission file.
model_outputs = (answer1, answer2, answer3)
answer = (model_outputs[0] + model_outputs[1] + model_outputs[2]) / len(model_outputs)
sample['target'] = answer
sample.to_csv('submission.csv', index=False)