In [4]:
# Fetch the competition archive with the Kaggle CLI, extract it into a directory
# of the same name, and remove the archive afterwards.
# The archive is deleted only when extraction succeeds (`&&`), so a failed or
# interrupted unzip no longer throws away the ~108 MB download.
! kaggle competitions download -c linking-writing-processes-to-writing-quality
! unzip linking-writing-processes-to-writing-quality.zip -d linking-writing-processes-to-writing-quality && rm linking-writing-processes-to-writing-quality.zip

Downloading linking-writing-processes-to-writing-quality.zip to /home/hieunguyen/Desktop/NLP/linking_writing_process
 97%|██████████████████████████████████████▊ | 105M/108M [00:03<00:00, 36.0MB/s]
100%|████████████████████████████████████████| 108M/108M [00:03<00:00, 30.5MB/s]
Archive:  linking-writing-processes-to-writing-quality.zip
  inflating: linking-writing-processes-to-writing-quality/sample_submission.csv  
  inflating: linking-writing-processes-to-writing-quality/test_logs.csv  
  inflating: linking-writing-processes-to-writing-quality/train_logs.csv  
  inflating: linking-writing-processes-to-writing-quality/train_scores.csv  


# RNN

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from utils.data import EssayDataset, generate_batch, activities,\
                       df, label_df, c_feat, d_feat
from utils.utils import train, evaluate, EarlyStopping, CosineAnnealingWarmupRestarts
from utils.models import Grader

# Experiment configuration: every tunable value used by the cells below.

## Model (forwarded to Grader)
HIDDEN_DIM = 200
LINEAR_HIDDEN_DIM = 10
EMBEDDING_SIZE = 10

## Data
SPLIT = 0.2      # test_size handed to train_test_split (validation fraction)
NOF_DATA = 100   # how many unique essay ids are taken from df

## Training
EPOCHS = 3       # number of passes of the outer training loop
CLIP = 1         # forwarded to train(); presumably a gradient-clipping threshold - TODO confirm
BATCH = 10       # batch_size of both DataLoaders
LR = 0.001       # learning rate given to Adam
DELTA = 0.001    # `delta` argument of EarlyStopping
PATIENCE = 3     # `patience` argument of EarlyStopping
MIN_LR = 0.001   # `min_lr` argument of CosineAnnealingWarmupRestarts
MAX_LR = 0.1     # `max_lr` argument of CosineAnnealingWarmupRestarts

## Run identifier: encodes the settings above; used as the checkpoint file stem.
# NOTE(review): the tag `E` appears twice (EMBEDDING_SIZE, then EPOCHS); kept
# as-is so previously saved checkpoints keep their names.
BASE_NAME = (
    f"rnn_H{HIDDEN_DIM}_LH{LINEAR_HIDDEN_DIM}_E{EMBEDDING_SIZE}_"
    f"S{SPLIT}_N{NOF_DATA}_E{EPOCHS}_C{CLIP}_B{BATCH}_"
    f"LR{LR}_D{DELTA}_P{PATIENCE}_MinLR{MIN_LR}_MaxLR{MAX_LR}"
)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define the data: pick the first NOF_DATA essay ids, split them into train and
# validation ids, and wrap each subset in an EssayDataset + DataLoader.
from sklearn.model_selection import train_test_split

# Seed for the train/validation split. Without a fixed random_state the
# partition changes on every run, so losses are not comparable between runs
# and a saved checkpoint cannot be re-evaluated on the same validation essays.
SPLIT_SEED = 42

ids = df['id'].unique()[0:NOF_DATA]
train_ids, val_ids = train_test_split(ids, test_size = SPLIT,
                                      random_state = SPLIT_SEED)

train_dataset = EssayDataset(df, train_ids, label_df, c_feat, d_feat)
val_dataset = EssayDataset(df, val_ids, label_df, c_feat, d_feat)

# generate_batch is the project's collate function for turning samples into a batch.
# NOTE(review): the training loader keeps DataLoader's default shuffle=False, so
# batches arrive in the same order every epoch - confirm this is intended.
train_dataloader = DataLoader(train_dataset, batch_size = BATCH,
                        collate_fn = generate_batch)
val_dataloader = DataLoader(val_dataset, batch_size = BATCH,
                        collate_fn = generate_batch)

# Build everything the training loop needs: network, loss, optimiser,
# early-stopping helper and learning-rate schedule.

# One entry per categorical input; its size is the number of known activities.
class_size_dict = {'activity': len(activities)}

model = Grader(
    HIDDEN_DIM,
    LINEAR_HIDDEN_DIM,
    class_size_dict,
    EMBEDDING_SIZE,
    c_feat,
    d_feat,
).to(device)

# Total optimiser steps over the run: batches per epoch times number of epochs.
training_steps = len(train_dataloader) * EPOCHS
# The first 10% of those steps are handed to the scheduler as warm-up.
warmup_steps = int(training_steps * 0.1)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = LR)

# Checkpoint file name is derived from the run identifier.
early_stop = EarlyStopping(
    patience = PATIENCE,
    delta = DELTA,
    file_name = f'{BASE_NAME}.pt',
)
lr_scheduler = CosineAnnealingWarmupRestarts(
    optimizer,
    training_steps,
    min_lr = MIN_LR,
    max_lr = MAX_LR,
    warmup_steps = warmup_steps,
)

In [None]:
# Train for EPOCHS rounds; after each round, score the model on the validation
# loader and print both losses.
# NOTE(review): lr_scheduler is not passed to train() and is not stepped in this
# loop - confirm it is advanced somewhere, otherwise the schedule has no effect.
# NOTE(review): early_stop is handed to train(), but this loop never checks its
# state, so all EPOCHS rounds always start - confirm that is intended.
for epoch in range(EPOCHS):
    train_loss = train(
        model, train_dataloader, val_dataloader, optimizer, criterion, CLIP,
        evaluate, early_stop, BASE_NAME,
    )
    valid_loss = evaluate(model, val_dataloader, criterion)

    # Epoch numbers are reported 1-based and zero-padded to two digits.
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    # 0.723 - figure left by the original author, presumably an earlier validation loss
    print(f'\t Val. Loss: {valid_loss:.3f}')