<a href="https://colab.research.google.com/github/mjain2/csci544-group32/blob/main/CNN_token_Embed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pytorch_lightning
from os.path import join
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pandas as pd
import pickle
from torch import nn
from torch.nn import functional as F
from sklearn.metrics import mean_squared_error as MSE
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
import pytorch_lightning as pl
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
def get_target_cols():
    """Return a fresh list with the six target column names, in fixed order."""
    return [
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
    ]

In [None]:
# Mount Google Drive at /content/drive so the pickled data files and the
# model checkpoint referenced below can be read and written (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory that holds every pickled input used by this notebook.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning'


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # SECURITY: pickle.load can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


X_token_embed = _load_pickle(join(DATA_DIR, 'train_tok_emb.pkl'))
X_test_embed = _load_pickle(join(DATA_DIR, 'test_tok_emb.pkl'))
print(X_test_embed.shape)
# NOTE(review): reshape() keeps the flat element order, so this does NOT swap
# the last two axes the way a transpose would. If an axis swap is intended,
# swapaxes(1, 2) is needed here AND in the matching X_test reshape, changed
# together so train and test stay consistent. TODO confirm intent.
X = X_token_embed.reshape(X_token_embed.shape[0], X_token_embed.shape[2], X_token_embed.shape[1])

print(X.shape)
Y = _load_pickle(join(DATA_DIR, 'y_train.pkl'))
y_test = _load_pickle(join(DATA_DIR, 'y_test.pkl'))

(783, 640, 768)
(3128, 768, 640)


In [None]:
# Give the test embeddings the same (N, dim2, dim1) shape as the training data.
# NOTE(review): reshape() preserves flat element order, so this is not a
# transpose of the last two axes; confirm this is intended and keep it
# consistent with how the training array X is built.
test_shape = X_test_embed.shape
X_test = X_test_embed.reshape(test_shape[0], test_shape[2], test_shape[1])

In [None]:
# X_token_embed = pickle.load(open('/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/train_tok_emb.pkl','rb'))
# X_test_embed = pickle.load(open('/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/test_tok_emb.pkl','rb'))
# #print(X_token_embed.shape)
# X = X_token_embed.reshape(X_token_embed.shape[0], X_token_embed.shape[2], X_token_embed.shape[1])
# X_test = X_token_embed.reshape(X_test_embed.shape[0], X_test_embed.shape[2], X_test_embed.shape[1])
# print(X.shape)
# Y = pickle.load(open('/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/y_train.pkl','rb'))
# y_test = pickle.load(open('/content/gdrive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/y_test.pkl','rb'))

In [None]:
# Ordered (unshuffled) 80/20 split: the first 80% of rows are used for
# training and the remaining rows for validation.
train_end_X = int(0.8 * len(X))
train_end_Y = int(0.8 * len(Y))
X_train, X_valid = X[:train_end_X], X[train_end_X:]
y_train, y_valid = Y[:train_end_Y], Y[train_end_Y:]
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)

(2502, 768, 640) (2502, 6)
(626, 768, 640) (626, 6)


In [None]:
class ELLDataset(Dataset):
    """Map-style dataset over per-sample 2-D feature arrays and optional labels.

    Args:
        X: indexable collection; ``X[idx]`` must be a 2-D array-like.
        y: labels aligned with ``X``. Either a pandas object (rows are read
            positionally through ``.iloc``) or any positionally indexable
            array. May be ``None`` when ``test`` is truthy.
        transform: when equal to ``"avg"``, each sample is averaged over
            axis 1 (kept as a size-1 axis) before the axes are swapped; any
            other value leaves the features untouched.
        max_len: stored on the instance only; no padding or truncation is
            applied by this class.
        test: when truthy, ``__getitem__`` returns only the features.
    """

    def __init__(self, X, y=None, transform=False, max_len=768, test=False):
        self.X = X
        self.y = y
        self.transform = transform
        self.max_len = max_len
        self.test = test

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``features`` (test mode) or ``(features, label)`` as float32 tensors.

        Raises:
            ValueError: if labels are requested but ``y`` was not provided.
        """
        features = torch.tensor(np.asarray(self.X[idx]))

        if self.transform == "avg":
            # Collapse axis 1 to its mean, keeping it as a size-1 axis.
            features = torch.mean(features, 1, True)

        # Swap the two axes so the sample's second axis comes first.
        features = features.permute(1, 0).float()

        if self.test:
            return features

        if self.y is None:
            raise ValueError("ELLDataset needs labels `y` unless test=True")

        # pandas objects need positional access via .iloc; plain arrays
        # (numpy, lists, tensors) are indexed directly.
        row = self.y.iloc[idx] if hasattr(self.y, "iloc") else self.y[idx]
        label = torch.tensor(np.asarray(row)).float()
        return features, label


In [None]:
# Hyperparameters
batch_size = 64  # samples per batch for the train/validation DataLoaders
max_len = 768  # forwarded to ELLDataset as max_len
emb_dim = 640  # forwarded to CNNExtractor (input channels of its first Conv1d)
learning_rate = 0.01  # learning rate of the SGD optimizer
epochs = 20  # number of iterations of the training loop
dropout_p = 0.3  # dropout probability read by CNNExtractor

# Full path of the checkpoint file; this is the path given to torch.load when
# the model is evaluated on the test set.
model_name = '/content/drive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/cnn_extractor.pt'

In [None]:
# Training batches are reshuffled on every pass. The validation loader keeps a
# fixed order: shuffling it changes which samples land in the smaller last
# batch, which makes the batch-averaged validation loss (used to pick the best
# checkpoint) vary from run to run for no benefit.
train_ds = ELLDataset(X_train, y_train, max_len=max_len)
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_ds = ELLDataset(X_valid, y_valid, max_len=max_len)
valid_dataloader = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

In [None]:
# print(y_train.iloc[1620])
# print(y_train.shape)
# print(type(y_train))
# Sanity check: pull only the first training batch and print the feature
# tensor's shape and the label tensor, then stop iterating.
for x, y in train_dataloader:
    
    print(x.shape)
    print(y)
    break

In [None]:
class CNNExtractor(pl.LightningModule):
    """Two-layer 1-D CNN followed by two linear layers, producing 6 outputs.

    The convolutions use kernel size 5, stride 1 and no padding, so each one
    shortens the last input axis by 4.

    Args:
        emb_dim: number of input channels of the first convolution; inputs are
            expected as ``(batch, emb_dim, seq_len)``.
        seq_len: length of the last input axis. The default of 768 gives the
            flattened feature length of ``emb_dim // 4 * 760`` that was
            previously hard-coded.
    """

    def __init__(self, emb_dim, seq_len=768):
        super().__init__()
        kernel_size = 5
        self.conv1 = nn.Conv1d(emb_dim, emb_dim//2, kernel_size, 1)
        self.conv2 = nn.Conv1d(emb_dim//2, emb_dim//4, kernel_size, 1)
        # Registered on the module but not applied in forward() or extract().
        self.pool = nn.MaxPool1d(2)
        # dropout_p is read from the notebook's hyperparameter cell.
        self.dropout = nn.Dropout(dropout_p)
        # Two unpadded stride-1 convolutions each remove (kernel_size - 1) steps.
        conv_out_len = seq_len - 2 * (kernel_size - 1)
        self.fc1 = nn.Linear(emb_dim//4 * conv_out_len, 60)
        self.fc2 = nn.Linear(60, 6)
        self.test_preds = []

    def forward(self, x):
        """Return the 6 regression outputs for a batch ``x``."""
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = F.relu(self.conv2(x))
        x = self.dropout(x)
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def extract(self, x):
        """Return the 60-dimensional activations of ``fc1`` (no dropout, no ``fc2``)."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        return x


    def loss_fn(self, outputs, targets):
        """Return the mean over columns of the per-column root mean squared error."""
        colwise_mse = torch.mean(torch.square(targets - outputs), dim=0)
        loss = torch.mean(torch.sqrt(colwise_mse), dim=0)
        return loss

    def configure_optimizers(self):
        """Return an SGD optimizer over this module's parameters and a cosine LR schedule."""
        # Optimise this module's own parameters; the previous code reached for
        # the notebook-global `net`, which fails (or trains the wrong model)
        # whenever the instance is not that exact global.
        optimizer = torch.optim.SGD(self.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=20,eta_min=1e-6)
        return [optimizer], [scheduler]

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
# Build the model, move it to the selected device and show its layers, then
# create the loss and the optimizer used by the manual training loop.
net = CNNExtractor(emb_dim)
net = net.to(device)
print(net)
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)

CNNExtractor(
  (conv1): Conv1d(640, 320, kernel_size=(5,), stride=(1,))
  (conv2): Conv1d(320, 160, kernel_size=(5,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=121600, out_features=60, bias=True)
  (fc2): Linear(in_features=60, out_features=6, bias=True)
)


In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: iterable of ``(X, y)`` tensor batches exposing ``.dataset``.
        model: ``torch.nn.Module`` to train; it is switched to training mode.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimizer that updates ``model``'s parameters.
        device: device the batches are moved to before the forward pass.

    Returns:
        None. The loss of every 100th batch (starting with the first) is printed.
    """
    # Make sure dropout and similar layers are active, even if the model was
    # left in eval mode by an earlier evaluation.
    model.train()
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X = X.to(device)
        y = y.to(device)
        # Compute prediction and loss; calling the module (not .forward)
        # keeps registered hooks working.
        pred = model(X)
        loss = loss_fn(pred, y)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn, device):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0
    mses = []
    actual = []
    preds = []

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            pred = model.forward(X)
            actual.append(y.cpu().numpy())
            preds.append(pred.cpu().numpy())
            test_loss += loss_fn(pred, y).item()
            mses.append(MSE(y.cpu().numpy(), pred.cpu().numpy(), multioutput="raw_values"))
    
    test_loss /= num_batches
    mses = np.array(mses).mean(axis=0)
    print(f"Test Error: \n MSE: {mses}, Avg loss: {test_loss:>8f} \n")
    print("mcrmse from def is :", )
    return (test_loss, mses)

def comp_score(y_true,y_pred):
    """Return the mean column-wise RMSE between ``y_true`` and ``y_pred``.

    Args:
        y_true: array-like of shape ``(n_samples, n_targets)``.
        y_pred: array-like with the same shape as ``y_true``.

    Returns:
        The RMSE of each column, averaged over all columns of the inputs.
    """
    # Computed with numpy over every column present; the previous version
    # referenced the undefined names `mean_squared_error` and `target_cols`
    # and therefore raised NameError when called.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    colwise_mse = np.mean((y_true - y_pred) ** 2, axis=0)
    return np.mean(np.sqrt(colwise_mse))

In [None]:
def save_best_model(actual, best, model, t):
    """Checkpoint ``model`` when ``actual`` improves on ``best``.

    Args:
        actual: loss of the current epoch.
        best: lowest loss seen so far.
        model: module whose ``state_dict()`` is saved on improvement.
        t: epoch number, used only in the log message.

    Returns:
        ``actual`` if it is strictly lower than ``best`` (after saving),
        otherwise ``best``.
    """
    if actual < best:
        import os

        checkpoint_dir = "/content/drive/MyDrive/Colab Notebooks/NLP/Project/feedback-prize-english-language-learning/"
        # os.path.join ignores checkpoint_dir when model_name is already an
        # absolute path and prefixes it when model_name is a bare file name.
        # Plain string concatenation produced a non-existent nested path for
        # an absolute model_name.
        checkpoint_path = os.path.join(checkpoint_dir, model_name)
        torch.save(model.state_dict(), checkpoint_path)
        print('model saved at epoch', t)
        return actual
    else:
        return best

In [None]:
# Train for `epochs` passes; after each pass evaluate on the validation set,
# record the metrics and checkpoint the model whenever the loss improves.
epoch_loss = []
mses_loss = []
best_loss = 100000
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, net, loss_fn, optimizer, device)
    actual_loss, actual_mses = test_loop(valid_dataloader, net, loss_fn, device)
    epoch_loss.append(actual_loss)
    mses_loss.append(actual_mses)
    best_loss = save_best_model(actual_loss, best_loss, net, t + 1)
print("Done!")

Epoch 1
-------------------------------
loss: 9.518644  [    0/ 2502]
Test Error: 
 MSE: [8.059034  7.9597335 8.329003  8.767467  7.70586   7.601561 ], Avg loss: 8.070444 

mcrmse from def is :
model saved at epoch 1
Epoch 2
-------------------------------
loss: 7.937765  [    0/ 2502]
Test Error: 
 MSE: [6.5020676 6.1108294 6.92467   6.894745  5.955697  6.2242174], Avg loss: 6.435371 

mcrmse from def is :
model saved at epoch 2
Epoch 3
-------------------------------
loss: 6.961603  [    0/ 2502]
Test Error: 
 MSE: [3.6226528 3.3803558 3.8779588 3.882473  3.5034378 3.5454338], Avg loss: 3.635385 

mcrmse from def is :
model saved at epoch 3
Epoch 4
-------------------------------
loss: 3.604785  [    0/ 2502]
Test Error: 
 MSE: [0.48513636 0.49037686 0.39762494 0.37813392 0.79916215 0.58951294], Avg loss: 0.523325 

mcrmse from def is :
model saved at epoch 4
Epoch 5
-------------------------------
loss: 0.455741  [    0/ 2502]
Test Error: 
 MSE: [0.6399897  0.60397    0.7283614  0.5

In [None]:
# Evaluate the saved checkpoint on the held-out test set, one sample per batch.
test_ds = ELLDataset(X_test, y_test, max_len=max_len)
test_dataloader = DataLoader(test_ds, batch_size=1, shuffle=False)
# map_location lets a checkpoint written on a GPU load on a CPU-only runtime
# (and the other way round) instead of failing during deserialisation.
net.load_state_dict(torch.load(model_name, map_location=device))
# `mcmse` receives the per-column MSE array returned by test_loop.
loss,mcmse = test_loop(test_dataloader, net, loss_fn, device)
print(loss)
print(mcmse)

Test Error: 
 MSE: [0.27157864 0.24219915 0.21472056 0.26234493 0.297478   0.24750508], Avg loss: 0.255971 

mcrmse from def is :
0.2559710854348057
[0.27157864 0.24219915 0.21472056 0.26234493 0.297478   0.24750508]
