In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import random
from typing import Tuple
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor
import math
import time

In [2]:
main_df = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/train.csv')


In [3]:
test_df = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/test.csv')
test_df['cohesion'] = 1.0
test_df['syntax'] = 1.0
test_df['vocabulary'] = 1.0 
test_df['phraseology'] = 1.0 
test_df['grammar'] = 1.0
test_df['conventions'] = 1.0 

In [4]:
train_df, val_df = train_test_split(main_df, test_size=0.03, random_state=6)

In [5]:
tokenizer = get_tokenizer('spacy', language='en')
MAX_SENT_LEN = 1000

def mark_bos_eos(text):
    final_text = ''
    final_text = final_text.strip()
    l = text.split('.')[:-1]
    for sentence in l:
        final_text = final_text + ' <bos> ' + sentence + ' <eos> '
    final_text = final_text.replace('  ', ' ')
    final_text = final_text.strip()
    
    return final_text

def pad_sequence(tok_seq, pad_tok, max_len=1000):
    cur_len = len(tok_seq)
    if cur_len >= max_len:
        return tok_seq[:max_len]

    padding = [pad_tok]*(max_len - cur_len)
    text = tok_seq + padding
    
    return text

def build_vocab(df, tokenizer):
    counter = Counter()
    df['full_text'].apply(lambda string_ : counter.update(tokenizer(string_)))
    
    return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

vocab_ = build_vocab(train_df, tokenizer)
vocab_.set_default_index(vocab_['<unk>'])

def data_process(df):
    data = []
    df['full_text_'] = df['full_text'].apply(lambda text : mark_bos_eos(text))
    for raw_en, label in zip(df['full_text_'], df[['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']].values.tolist()):
        tokens = [vocab_[token] for token in tokenizer(raw_en)]
        token_seq = pad_sequence(tokens, vocab_['<pad>'], MAX_SENT_LEN)
        data.append((token_seq, label))
        
    return data

train_data = data_process(train_df)
val_data = data_process(val_df)
test_data = data_process(test_df)



In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

BATCH_SIZE = 128
PAD_IDX = vocab_['<pad>']
BOS_IDX = vocab_['<bos>']
EOS_IDX = vocab_['<eos>']

def generate_batch(data_batch):
    en_batch = []
    label_batch = []
    
    for (en_item, label_item) in data_batch:
        en_batch.append(en_item)
        label_batch.append(label_item)
    
    en_batch = torch.tensor(en_batch, dtype=torch.long)
    label_batch = torch.tensor(label_batch, dtype=torch.float)
    
    return en_batch, label_batch

def generate_batch2(data_batch):
    en_batch = []
    label_batch = []
    
    for (en_item, label_item) in data_batch:
        en_batch.append(en_item)
        label_batch.append(label_item)
    
    en_batch = torch.tensor(en_batch, dtype=torch.float)
    label_batch = torch.tensor(label_batch, dtype=torch.float)
    
    return en_batch, label_batch

train_iter = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)


cpu


In [19]:
class MODEL(nn.Module):
    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 out_dim: int,
                 dropout: float):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.out_dim = out_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.lstm = nn.LSTM(emb_dim, enc_hid_dim, num_layers=1, bidirectional=False, batch_first=True, dropout=dropout)
        
        self.fc = nn.Linear(MAX_SENT_LEN * enc_hid_dim, out_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src: Tensor) -> Tuple[Tensor]:
        
        embedded = self.dropout(self.embedding(src))
        
        outputs, _ = self.lstm(embedded)
        
        outputs = torch.reshape(outputs, (outputs.shape[0], outputs.shape[1]*outputs.shape[2]))
        
        fc_out = self.fc(outputs)
        
        return fc_out
    

In [20]:
INPUT_DIM = len(vocab_)
EMB_DIM = 256
GRU_HID_DIM = 128
OUTPUT_DIM = 6
DROPOUT = 0.2


In [21]:
model = MODEL(INPUT_DIM, EMB_DIM, GRU_HID_DIM, OUTPUT_DIM, DROPOUT)
model.to(device)

def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            

model.apply(init_weights)

MODEL(
  (embedding): Embedding(24777, 256)
  (lstm): LSTM(256, 128, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=128000, out_features=6, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [22]:
class MCRMSE(nn.Module):
    def __init__(self, output_dim):
        super().__init__()
        self.mse = nn.MSELoss(reduction='sum')
        self.no_cols = output_dim
        
    def forward(self, target, output):
        rmse = torch.pow(torch.div(self.mse(target, output), output.shape[0]), 0.5)
        mcrmse = torch.div(rmse, self.no_cols)
        
        return mcrmse

In [23]:
criterion = MCRMSE(OUTPUT_DIM)

optimizer = optim.Adam(model.parameters())

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,308,550 trainable parameters


In [24]:
def train(model: nn.Module,
          iterator: torch.utils.data.DataLoader,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    
    model.train()
    
    epoch_loss = 0
    
    for _, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)
        
        optimizer.zero_grad()
        
        output = model(src)
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def evaluate(model: nn.Module,
             iterator: torch.utils.data.DataLoader,
             criterion: nn.Module):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        
        for _, (src, trg) in enumerate(iterator):
            src, trg = src.to(device), trg.to(device)
            
            output = model(src)
            
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)

def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
    
    

Epoch: 01 | Time: 1m 30s
	Train Loss: 0.446 | Train PPL:   1.561
	 Val. Loss: 0.276 | Val. PPL:   1.318
Epoch: 02 | Time: 1m 30s
	Train Loss: 0.240 | Train PPL:   1.272
	 Val. Loss: 0.219 | Val. PPL:   1.245
Epoch: 03 | Time: 1m 31s
	Train Loss: 0.207 | Train PPL:   1.231
	 Val. Loss: 0.252 | Val. PPL:   1.287
Epoch: 04 | Time: 1m 30s
	Train Loss: 0.220 | Train PPL:   1.246
	 Val. Loss: 0.282 | Val. PPL:   1.326
Epoch: 05 | Time: 1m 31s
	Train Loss: 0.212 | Train PPL:   1.236
	 Val. Loss: 0.279 | Val. PPL:   1.322
Epoch: 06 | Time: 1m 30s
	Train Loss: 0.208 | Train PPL:   1.231
	 Val. Loss: 0.252 | Val. PPL:   1.287
Epoch: 07 | Time: 1m 30s
	Train Loss: 0.190 | Train PPL:   1.209
	 Val. Loss: 0.243 | Val. PPL:   1.275
Epoch: 08 | Time: 1m 30s
	Train Loss: 0.163 | Train PPL:   1.177
	 Val. Loss: 0.307 | Val. PPL:   1.360
Epoch: 09 | Time: 1m 31s
	Train Loss: 0.173 | Train PPL:   1.189
	 Val. Loss: 0.314 | Val. PPL:   1.369
Epoch: 10 | Time: 1m 30s
	Train Loss: 0.168 | Train PPL:   1.182

In [13]:
train_output = np.empty((0,6), float)

with torch.no_grad():
    for _, (src, trg) in enumerate(train_iter):
        src, trg = src.to(device), trg.to(device)
        pred = model(src)
        pred = pred.to('cpu')
        train_output = np.append(train_output, pred, axis=0)
        
train_output = train_output.T
print(train_output.shape)

test_output_mid = np.empty((0,6), float)

with torch.no_grad():
    for _, (src, trg) in enumerate(test_iter):
        src, trg = src.to(device), trg.to(device)
        pred = model(src)
        pred = pred.to('cpu')
        test_output_mid = np.append(test_output_mid, pred, axis=0)
        
test_output_mid = test_output_mid.T
print(test_output_mid.shape)

val_output = np.empty((0,6), float)

with torch.no_grad():
    for _, (src, trg) in enumerate(valid_iter):
        src, trg = src.to(device), trg.to(device)
        pred = model(src)
        pred = pred.to('cpu')
        val_output = np.append(val_output, pred, axis=0)
        
val_output = val_output.T
print(val_output.shape)

(6, 3793)
(6, 3)
(6, 118)


In [74]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [91]:
def model_result_train(data):
    models = []
    data_res = np.array([x[1] for x in data]).T
    #print(data_res)
    for i in range(6):
        rand_arr = (np.random.rand(data_res.shape[1]) * 2) - 1
        data_sel = np.delete(data_res, i, 0)
        data_target = data_res[i,:]
        data_res[i,:] = data_res[i,:] + rand_arr
        reg = LinearRegression()
        reg.fit(data_res.T, data_target)
        models.append(reg)
    return(models)

models = model_result_train(train_data)
targets = np.array([x[1] for x in train_data]).T
model_results = train_output.T
def model_result_exec(models, model_results, targets):
    for i in range(6):
        target = targets[i,:]
        res = model_results
        print(target.shape, res[:,i].shape)
        or_rmse = mean_squared_error(target, res[:, i], squared=False)
        print(or_rmse)
        model = models[i]
        pred = model.predict(res)
        rmse = mean_squared_error(target, pred, squared=False)
        print(rmse)

model_result_exec(models, model_results, targets)


(3793,) (3793,)
0.8761160652287004
0.8761160652287012
(3793,) (3793,)
0.8742526816911792
0.8742526816911796
(3793,) (3793,)
0.7963907271281019
0.7963907271281009
(3793,) (3793,)
0.9039069913636952
0.9039069913636955
(3793,) (3793,)
1.0369511423012887
1.0369511423012887
(3793,) (3793,)
0.9073516457285837
0.9073516457285841


In [14]:
coh, syn, voc, phr, gra, conv = train_df['cohesion'], train_df['syntax'], train_df['vocabulary'], train_df['phraseology'], train_df['grammar'], train_df['conventions']
train_outputs = []
for idx, (c, s, v, p, g, cv) in enumerate(zip(coh, syn, voc, phr, gra, conv)):
    data = (list(train_output[:,idx]), [c, s, v, p, g, cv])
    train_outputs.append(data)

coh, syn, voc, phr, gra, conv = val_df['cohesion'], val_df['syntax'], val_df['vocabulary'], val_df['phraseology'], val_df['grammar'], val_df['conventions']
val_outputs = []
for idx, (c, s, v, p, g, cv) in enumerate(zip(coh, syn, voc, phr, gra, conv)):
    data = (list(val_output[:,idx]), [c, s, v, p, g, cv])
    val_outputs.append(data)
    
coh, syn, voc, phr, gra, conv = test_df['cohesion'], test_df['syntax'], test_df['vocabulary'], test_df['phraseology'], test_df['grammar'], test_df['conventions']
test_outputs = []
for idx, (c, s, v, p, g, cv) in enumerate(zip(coh, syn, voc, phr, gra, conv)):
    data = (list(test_output_mid[:,idx]), [c, s, v, p, g, cv])
    test_outputs.append(data)

[([3.2467923164367676, 3.0404694080352783, 3.668672800064087, 3.3395426273345947, 3.252776861190796, 3.1035242080688477], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]), ([2.953334331512451, 2.997270345687866, 3.081468343734741, 2.9999661445617676, 3.074998617172241, 3.1529500484466553], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]), ([3.3455190658569336, 3.01790189743042, 3.626939296722412, 3.644810199737549, 3.4584615230560303, 3.3398725986480713], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])]


In [15]:
submission_df = pd.DataFrame({'text_id':test_df['text_id'], 'cohesion':test_output_final[0],
                             'syntax': test_output_final[1], 'vocabulary': test_output_final[2],
                             'phraseology': test_output_final[3], 'grammar': test_output_final[4],
                             'conventions': test_output_final[5]})
for col in submission_df.columns:
    if col != 'text_id':
        submission_df[col] = submission_df[col].abs()

NameError: name 'test_output_final' is not defined

In [None]:
print(submission_df)

In [None]:
submission_df.to_csv('submission.csv', index=False)