In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization, Flatten, Dense, Input, Conv1D, LSTM, Concatenate
from tensorflow import keras
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.metrics import mean_squared_error
import numpy as np
import gc
import pickle

In [None]:
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-1.8.3.post1-py3-none-any.whl (798 kB)
[K     |████████████████████████████████| 798 kB 7.0 MB/s 
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.10.3-py3-none-any.whl (529 kB)
[K     |████████████████████████████████| 529 kB 44.9 MB/s 
[?25hCollecting tensorboardX>=2.2
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 71.6 MB/s 
Collecting lightning-utilities==0.3.*
  Downloading lightning_utilities-0.3.0-py3-none-any.whl (15 kB)
Collecting fire
  Downloading fire-0.4.0.tar.gz (87 kB)
[K     |████████████████████████████████| 87 kB 8.6 MB/s 
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=115943 sha256=053e0144a028fd055623378c

In [None]:
import numpy as np 
import pandas as pd 
import os, gc, re, warnings
warnings.filterwarnings("ignore")
from keras.preprocessing.text import Tokenizer
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gc
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from nltk.tokenize import word_tokenize

In [None]:
from google.colab import drive

# Mount Google Drive; the pickled inputs loaded later live under this path.
GDRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code during deserialization;
    # only load files that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Precomputed sentence embeddings for the train and test splits.
train_sent_emb = _load_pickle('/content/gdrive/MyDrive/ProjectEmbeddings/train_sent_emb.pkl')
test_sent_emb = _load_pickle('/content/gdrive/MyDrive/ProjectEmbeddings/test_sent_emb.pkl')

# Matching targets for each split.
y = _load_pickle('/content/gdrive/MyDrive/ProjectEmbeddings/y_train.pkl')
y_test = _load_pickle('/content/gdrive/MyDrive/ProjectEmbeddings/y_test.pkl')

In [None]:
# Print the shape of the train embeddings, then of the test embeddings.
for split_emb in (train_sent_emb, test_sent_emb):
    print(split_emb.shape)

(3128, 768)
(783, 768)


In [None]:

# Target column names; the length of this list sets the model's output size.
classes = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]

In [None]:
class EssayDataset:
    """Map-style dataset that pairs one feature row with its six target scores.

    Args:
        x_embeddings: Indexable collection of per-sample feature vectors; each
            item is converted with ``torch.tensor`` when it is fetched.
        labels: Object supporting ``.loc[:, columns]`` (e.g. a pandas
            DataFrame) with one column per name in ``self.classes``.
            Required unless ``test`` is truthy.
        test: When truthy, no labels are stored and ``__getitem__`` returns
            only the feature tensor.
        text_dtype: dtype of the returned feature tensor. Defaults to
            ``torch.long`` to stay compatible with models that feed the
            features into ``nn.Embedding``.
            NOTE(review): with ``torch.long`` any floating-point input is
            truncated toward zero, which discards most of the information in
            real-valued embeddings - confirm this is intended, or pass a
            float dtype together with a model that accepts it.

    Raises:
        ValueError: If labels are required but missing, or if the number of
            label rows differs from the number of feature rows.
    """

    # Labels are divided by this factor in ``__getitem__``.
    LABEL_SCALE = 5.

    def __init__(self, x_embeddings, labels=False, test=False, text_dtype=torch.long):
        # Normalise the flag once so __init__ and __getitem__ always agree,
        # even for non-bool inputs such as 0/1 or numpy booleans.
        self.test = bool(test)
        self.texts = x_embeddings
        self.text_dtype = text_dtype
        self.classes = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions',]
        if not self.test:
            # Identity checks avoid evaluating the truthiness of a DataFrame.
            if labels is None or labels is False:
                raise ValueError("labels must be provided when test is False")
            self.labels = labels.loc[:, self.classes].values
            if len(self.labels) != len(self.texts):
                raise ValueError(
                    "x_embeddings and labels must have the same number of rows "
                    f"(got {len(self.texts)} and {len(self.labels)})"
                )

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``(features, scaled_labels)``, or just ``features`` in test mode."""
        text = torch.tensor(self.texts[index], dtype=self.text_dtype)
        if self.test:
            return text

        label = self.labels[index, :] / self.LABEL_SCALE
        label = torch.tensor(label, dtype=torch.float32)
        return text, label

In [None]:
# Embeddings and labels are sliced with the same index, so they must have the
# same number of rows for each sample to stay paired with its labels.
assert len(train_sent_emb) == len(y), "embeddings and labels must have the same number of rows"

# Sequential 80/20 split in stored order (no shuffling): the first 80% of the
# rows are used for training, the remainder for validation.
split_idx = int(0.8 * len(train_sent_emb))

train = train_sent_emb[:split_idx]
validation = train_sent_emb[split_idx:]
y_train = y[:split_idx]
y_valid = y[split_idx:]

train_ds = EssayDataset(train, labels=y_train)
valid_ds = EssayDataset(validation, labels=y_valid)

In [None]:
# Hyperparameters consumed by RNNModel, the Trainer and the data loaders.
config = {
    'embed_dim': 15,               # nn.Embedding output size / LSTM input size
    'hidden_dim': 8,               # LSTM hidden size
    'seq_len': 768,                # stored on the model; not otherwise read in the visible code
    'vocab': 768,                  # number of rows in the nn.Embedding table
    'n_layers': 4,                 # stacked LSTM layers
    'output_dim': len(classes),    # one output per target column
    'lr': 0.025,                   # AdamW learning rate
    'epochs': 25,                  # Trainer max_epochs
    'batch_size': 16,
    'model_name': 'lstm-embeddings'
}

# Both loaders share the configured batch size; only the training loader shuffles.
_loader_kwargs = {'batch_size': config['batch_size']}
train_loader = torch.utils.data.DataLoader(train_ds, shuffle=True, **_loader_kwargs)
val_loader = torch.utils.data.DataLoader(valid_ds, **_loader_kwargs)

In [None]:
class RNNModel(pl.LightningModule):
    """LSTM regressor over integer index sequences, trained with a column-wise RMSE loss.

    Pipeline (see ``forward``): ``nn.Embedding`` -> stacked ``nn.LSTM`` ->
    average pooling of the last layer's final hidden state -> ``nn.Linear``.

    Args:
        config: Mapping that provides the keys ``'vocab'``, ``'embed_dim'``,
            ``'hidden_dim'``, ``'seq_len'``, ``'n_layers'``, ``'output_dim'``
            and ``'lr'``.
    """

    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters()
        self.config = config
        self.vocab_size = self.config['vocab']
        self.embed_dim = self.config['embed_dim']
        self.hidden_dim = self.config['hidden_dim']
        self.seq_len = self.config['seq_len']
        self.n_layers = self.config['n_layers']
        self.output_dim = self.config['output_dim']

        self.lr = config['lr']

        self.embedding = nn.Embedding(self.vocab_size, self.embed_dim)

        # Dropout in nn.LSTM acts between stacked layers only, and PyTorch
        # warns when it is non-zero for a single-layer LSTM, so it is disabled
        # in that case.
        lstm_dropout = 0.3 if self.n_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size=self.embed_dim,
                            hidden_size=self.hidden_dim,
                            num_layers=self.n_layers,
                            batch_first=True,
                            dropout=lstm_dropout,
                            )
        # Pooling with window 4 shrinks the hidden vector to hidden_dim // 4
        # values, which is the input width of the output layer.
        self.mean_pool = nn.AvgPool1d(4)
        self.linear = nn.Linear(self.hidden_dim // 4, self.output_dim)

        # Filled by test_step; emptied at the start of every test run.
        self.test_preds = []

    def forward(self, x):
        """Map a batch of index sequences to ``output_dim`` raw predictions.

        Args:
            x: Integer tensor of embedding indices in ``[0, vocab)``;
                presumably shaped ``(batch, seq_len)`` - confirm against the
                dataset that feeds this model.

        Returns:
            Tensor with ``output_dim`` values per sample.
        """
        x = self.embedding(x)
        x, (h, c) = self.lstm(x)
        # h[-1] is the final hidden state of the last LSTM layer, one row per
        # sample. AvgPool1d receives it as a 2-D tensor and pools along the
        # last (hidden) dimension.
        pooled = self.mean_pool(h[-1])
        x = self.linear(pooled)
        return x

    def loss_fn(self, outputs, targets):
        """Return the mean over columns of the per-column RMSE (batch axis reduced first)."""
        colwise_mse = torch.mean(torch.square(targets - outputs), dim=0)
        loss = torch.mean(torch.sqrt(colwise_mse), dim=0)
        return loss

    def configure_optimizers(self):
        """Return AdamW with a cosine-annealing learning-rate schedule."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        # NOTE(review): T_max is fixed at 20 and is not derived from
        # config['epochs'] - confirm this is the intended schedule length.
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20, eta_min=1e-6)
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one ``(inputs, targets)`` batch."""
        x, y = batch
        outputs = self(x)

        loss = self.loss_fn(outputs, y)

        self.log('train_loss', loss.item(), on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one ``(inputs, targets)`` batch."""
        x, y = batch

        outputs = self(x)

        loss = self.loss_fn(outputs, y)

        self.log('val_loss', loss.item(), on_epoch=True)

    def on_test_epoch_start(self):
        """Drop predictions left over from an earlier test run.

        Without this reset, calling ``trainer.test`` more than once (for
        example by re-running a notebook cell) would keep appending to
        ``test_preds`` and ``get_predictions`` would return duplicated rows.
        """
        self.test_preds = []

    def test_step(self, batch, batch_idx):
        """Predict for an unlabeled batch and store the result on the CPU."""
        sample = batch
        # Outputs are multiplied by 5 before being stored.
        # NOTE(review): presumably this undoes a /5 scaling applied to the
        # training targets - confirm against the dataset.
        preds = self(sample) * 5.
        self.test_preds.append(preds.detach().cpu())

    def get_predictions(self):
        """Return all stored test predictions concatenated into one NumPy array.

        ``torch.cat`` is applied to the stored list as-is, so at least one
        test batch must have been processed before calling this.
        """
        return torch.cat(self.test_preds).numpy()

In [None]:
model = RNNModel(config)

# Early stopping watches val_loss (lower is better) with a patience of 5.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)
trainer = pl.Trainer(
    max_epochs=config['epochs'],
    callbacks=[early_stopping],
)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model, train_loader, val_loader)

# Metrics most recently logged by the trainer.
metrics = trainer.logged_metrics

# Plain Python floats for the epoch-level training loss and the validation loss.
logs = {
    name: metrics[key].item()
    for name, key in (('train_loss', 'train_loss_epoch'), ('val_loss', 'val_loss'))
}
print(metrics)

INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 11.5 K
1 | lstm      | LSTM      | 2.5 K 
2 | mean_pool | AvgPool1d | 0     
3 | linear    | Linear    | 18    
----------------------------------------
14.1 K    Trainable params
0         Non-trainable params
14.1 K    Total params
0.056     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

{'train_loss_step': tensor(0.1240), 'val_loss': tensor(0.1276), 'train_loss_epoch': tensor(0.1336)}


In [None]:
# Wrap the test embeddings without labels and feed them one sample per batch,
# in their stored order.
test_ds = EssayDataset(test_sent_emb, test=True)
test_loader = torch.utils.data.DataLoader(
    test_ds,
    shuffle=False,
    batch_size=1,
)
trainer.test(model, test_loader)

# Predictions stored by the model's test_step, concatenated into one NumPy array.
p = model.get_predictions()

Testing: 0it [00:00, ?it/s]

In [None]:
# Show the shape of the test embeddings again before predictions are scored against y_test.
print(test_sent_emb.shape)

(783, 768)


In [None]:
def mcrmse(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The squared error is averaged over axis 0 to obtain one MSE per column;
    the square roots of those values are then averaged into a single scalar.

    Args:
        y_true: Ground-truth values, one column per target.
        y_pred: Predicted values; must support ``y_true - y_pred``.

    Returns:
        A scalar TensorFlow tensor.
    """
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=0)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=0)

# Pass the arguments in the order the signature declares (truth first). The
# value is the same either way because the difference is squared.
print(mcrmse(y_test, p))

tf.Tensor(0.6493355784899156, shape=(), dtype=float64)
