In [None]:
import os
import pandas as pd

import logging as log

# Read fold 0 of the Task 1 split; both files are tab-separated.
train, val = (
    pd.read_csv(csv_path, sep="\t")
    for csv_path in (
        "../deft_corpus/data/Task1_folds/train_0.csv",
        "../deft_corpus/data/Task1_folds/val_0.csv",
    )
)
(train.shape, val.shape)

In [None]:
# Sum of the has_def column over the training and validation folds together.
sum(fold['has_def'].sum() for fold in (train, val))

In [None]:
# Distinct values of the filename column (set iteration order is arbitrary).
list({*train['filename'].values})

In [None]:
# The Task 1 dev file (tab-separated) is loaded under the name `test`.
test = pd.read_csv("../deft_corpus/data/task1_dev.csv", sep="\t")
# Show the first ten rows as a sanity check.
test.head(10)

In [None]:
import spacy
import string

# Load the small English spaCy pipeline; only its tokenizer is used in `tokenizer` below.
spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
    """Strip punctuation and digits from ``text`` and split it with spaCy.

    Every character in ``string.punctuation`` and ``string.digits`` is
    deleted, surrounding whitespace is trimmed, and the remainder is passed
    to the spaCy tokenizer.

    Returns:
        list[str]: the text of each resulting token, in order.
    """
    unwanted = string.punctuation + string.digits
    cleaned = text.translate(str.maketrans('', '', unwanted)).strip()
    return [token.text for token in spacy_en.tokenizer(cleaned)]

# Add a 'preprocessed' column holding the token list produced by `tokenizer` for each row's text.
train['preprocessed'] = train['text'].map(tokenizer)

In [None]:
# Inspect the first ten rows, including the new 'preprocessed' column.
train.head(10)

In [None]:
from torchtext.data import Field



# Text field: sequences are tokenized with `tokenizer`, lower-cased and mapped
# through a vocabulary; "<PAD>" is the padding token.
TEXT = Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
    use_vocab=True,
    pad_token="<PAD>",
)

# Label field: a single, already-numeric value per example, so no vocabulary.
LABEL = Field(
    sequential=False,
    use_vocab=False,
)

In [None]:
from torchtext.data import TabularDataset

# Fields are matched to the TSV columns by position; a None field means the
# column (here: filename) is not loaded.
tv_datafields = [
    ("text", TEXT),
    ("has_def", LABEL),
    ("filename", None),
]
trn, vld = TabularDataset.splits(
    path="../deft_corpus/data/Task1_folds/",  # root directory that holds both fold files
    train='train_0.csv',
    validation="val_0.csv",
    format='tsv',
    skip_header=True,  # the files begin with a header row; skip it so it is not parsed as data
    fields=tv_datafields,
)

In [None]:
# Token counts of three hand-picked training examples.
tuple(len(trn[idx].text) for idx in (65, 1, 48))

In [None]:
# Print the index of every example among the first 100 with fewer than 10 tokens.
for i in range(100):
    n_tokens = len(trn[i].text)
    if n_tokens >= 10:
        continue
    print(i)

In [None]:
# Look at the tokenized text of one training example.
trn[3].text

In [None]:
# import torchtext.vocab as vocab
# import os
# TEXT.build_vocab(trn, vld)
# vectors = vocab.Vectors(os.path.join(emb_path, "glove.6B.100d.txt"), cache=emb_path)
# TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

In [None]:
# Build the vocabulary from both splits and attach 100-d GloVe vectors.
# At most 20000 entries are kept, and only tokens seen at least 50 times.
TEXT.build_vocab(
    trn,
    vld,
    vectors="glove.6B.100d",
    max_size=20000,
    min_freq=50,
)
# TEXT.build_vocab(train, vectors="glove.6B.100d")

In [None]:
from torchtext.data import Iterator, BucketIterator
import torch
# Bucketed batch iterators over the two datasets, batching examples of
# similar length together.
train_iter, val_iter = BucketIterator.splits(
    (trn, vld),                       # datasets the iterators draw from
    batch_sizes=(128, 128),           # one batch size per dataset
    device=torch.device('cpu'),       # pass a CUDA device here to batch on the GPU
    sort_key=lambda x: len(x.text),   # examples are grouped by token count
    sort_within_batch=False,
    repeat=False,                     # one pass per epoch; the iterator is wrapped later
)

In [None]:
class BatchWrapper:
    """Turn an iterable of attribute-style batches into ``(x, y)`` pairs.

    Args:
        dl: the underlying batch iterable; it must also support ``len()``.
        x_var: name of the batch attribute used as the model input.
        y_vars: list of batch attribute names used as targets, or ``None``
            when there are no targets.
    """

    def __init__(self, dl, x_var, y_vars):
        self.dl = dl
        self.x_var = x_var
        self.y_vars = y_vars

    def __iter__(self):
        """Yield one ``(x, y)`` tuple per underlying batch.

        Each target attribute gets a trailing axis, the targets are joined
        along that axis and cast to float. Without targets, ``y`` is a
        one-element zero tensor.
        """
        for batch in self.dl:
            inputs = getattr(batch, self.x_var)  # exactly one input attribute is supported
            if self.y_vars is None:
                targets = torch.zeros((1))
            else:
                columns = [getattr(batch, name).unsqueeze(1) for name in self.y_vars]
                targets = torch.cat(columns, dim=1).float()

            yield (inputs, targets)

    def __len__(self):
        """Number of batches, as reported by the wrapped iterable."""
        return len(self.dl)

# Wrap both iterators so they yield (text, has_def) tensor pairs.
train_dl, valid_dl = (
    BatchWrapper(batch_iter, 'text', ['has_def'])
    for batch_iter in (train_iter, val_iter)
)


In [None]:
# Number of batches per training epoch (delegates to the wrapped iterator's length).
len(train_dl)

In [None]:
# Pull a single batch to check the input and target tensor shapes.
x, y = next(iter(train_dl))
print(x.shape, y.shape)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


class SimpleBiLSTMBaseline(nn.Module):
    """LSTM sentence classifier on top of the pretrained ``TEXT`` vocabulary vectors.

    The model embeds a sequence-first batch of token ids, runs a single-layer
    LSTM over it, takes the output at the last time step, passes it through
    ``num_linear - 1`` hidden-to-hidden linear layers and finally projects it
    to one logit per example.

    Args:
        hidden_dim: size of the LSTM hidden state and of the linear layers.
        emb_dim: input size of the LSTM; must equal the dimensionality of
            ``TEXT.vocab.vectors``.
        spatial_dropout: accepted for interface compatibility; not used.
        recurrent_dropout: accepted for interface compatibility; not used
            (see the note on the encoder below).
        num_linear: total number of linear layers including the predictor.

    NOTE(review): despite the class name, the encoder is built without
    ``bidirectional=True`` and is therefore unidirectional. The stacked linear
    layers have no activation between them. The last time step is read
    regardless of padding. These are left unchanged here because altering them
    changes the model itself.
    """

    def __init__(self, hidden_dim, emb_dim=100,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=2):
        super().__init__()  # required before registering sub-modules
        # from_pretrained keeps the embedding weights frozen by default.
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(TEXT.vocab.vectors))
        # nn.LSTM only applies `dropout` between stacked layers. With
        # num_layers=1 it has no effect and triggers a UserWarning on every
        # construction, so it is not passed.
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.linear_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_linear - 1)]
        )
        self.predictor = nn.Linear(hidden_dim, 1)

    def forward(self, seq):
        """Return one logit per example.

        Args:
            seq: token-id tensor, indexed as (time step, example).

        Returns:
            Tensor with one row per example and a single logit column.
        """
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]  # encoder output at the final time step
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [None]:
em_sz = 100  # embedding size; matches the 100-d GloVe vectors loaded into TEXT
nh = 100     # LSTM hidden size
nl = 3       # NOTE(review): never passed to the model, which uses its default num_linear
model = SimpleBiLSTMBaseline(nh, emb_dim=em_sz).to('cpu')
model

In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weight of the positive class (index 1 of the [0, 1] class list),
# wrapped in a one-element float tensor for use by the loss.
# `classes` and `y` are passed by keyword and `classes` as an ndarray:
# recent scikit-learn releases reject the positional / plain-list form.
class_weights = torch.FloatTensor([
    compute_class_weight(
        'balanced',
        classes=np.array([0, 1]),
        y=train['has_def'].values,
    )[1]
])
class_weights

In [None]:
# log.basicConfig(
#     filename = 'trainlog.txt', 
#     format   = '%(asctime)s : %(message)s',
#     datefmt  = '%Y-%m-%d %H:%M:%S', 
#     level = log.INFO)
import time
# Timestamp (local time, to the second) used to give each run its own log file.
timestr = time.strftime("%Y%m%d-%H%M%S")
# Preview the log file name that the next cell opens.
"log_{}.log".format(timestr)

In [None]:
# Open this run's log for writing; the training cell redirects stdout into it and closes it when done.
log_file = open("log_{}.log".format(timestr),"w")

In [None]:
import tqdm
import numpy as np
from sklearn.metrics import classification_report

import sys
old_stdout = sys.stdout

opt = optim.Adam(model.parameters(), lr=1e-2)
# `pos_weight` is the BCEWithLogitsLoss argument that up-weights positive
# examples. `weight` rescales every element alike, so a one-element `weight`
# only multiplies the whole loss by a constant and balances nothing.
loss_func = nn.BCEWithLogitsLoss(pos_weight=class_weights)

epochs = 2

# Everything printed inside the try block goes to the log file. The finally
# clause restores stdout and closes the file even if training raises, so the
# notebook is never left with its output redirected.
sys.stdout = log_file
try:
    for epoch in range(1, epochs + 1):
        running_loss = 0.0
        model.train()  # turn on training mode
        train_preds = []
        train_truth = []
        for x, y in tqdm.tqdm(train_dl):
            opt.zero_grad()

            preds = model(x)
            loss = loss_func(preds, y)
            loss.backward()
            opt.step()
            train_preds.extend(torch.sigmoid(preds).detach().cpu().numpy())
            train_truth.extend(y.cpu().numpy())
            # Weight the batch-mean loss by the number of examples. y has one
            # row per example, whereas x is sequence-first (TEXT was built
            # without batch_first), so x.size(0) would be the sequence length.
            running_loss += loss.item() * y.size(0)

        epoch_loss = running_loss / len(trn)

        # Validation loss and predictions for this epoch.
        val_loss = 0.0
        val_preds = []
        val_truth = []
        model.eval()  # turn on evaluation mode
        with torch.no_grad():  # no gradients are needed for evaluation
            for x, y in valid_dl:
                preds = model(x)
                loss = loss_func(preds, y)
                val_loss += loss.item() * y.size(0)
                val_preds.extend(torch.sigmoid(preds).cpu().numpy())
                val_truth.extend(y.cpu().numpy())

        val_loss /= len(vld)
        print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))
        print("classification report Train")
        # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
        train_preds = np.where(np.array(train_preds)<0.5, 0, 1).flatten()
        print(classification_report(train_truth, train_preds))

        print("classification report Validation")
        val_preds = np.where(np.array(val_preds)<0.5, 0, 1).flatten()
        print(classification_report(val_truth, val_preds))
finally:
    sys.stdout = old_stdout
    log_file.close()