In [None]:
import os
import pandas as pd
# Relative path to the data directory of the local deft_corpus copy.
DATA_FOLDER = "../deft_corpus/data"

# Task 1 split name -> directory containing that split's files.
TASK1 = dict(
    Train=os.path.join(DATA_FOLDER, "Task1/train"),
    Dev=os.path.join(DATA_FOLDER, "Task1/dev"),
)

In [None]:
def create_data(folder_path, dev = True, output_type = 'dataframe'):
    """Load every tab-separated file in ``folder_path`` into a single table.

    Each file is read without a header as two columns, ``text`` and
    ``has_def``. A ``filename`` column is added holding the second
    ``_``-separated piece of the file name (taken from the part before the
    first ``.``); names without an underscore fall back to that whole part.
    Rows containing any missing value are dropped.

    Parameters
    ----------
    folder_path : str
        Directory whose regular files are read. Sub-directories are skipped.
    dev : bool, default True
        When true, return a seeded random sample (``random_state=2``) of at
        most 50 rows instead of the full table.
    output_type : str, default 'dataframe'
        ``'dataframe'`` returns a ``pandas.DataFrame``; any other value
        returns the records as a JSON string.

    Returns
    -------
    pandas.DataFrame or str
        The (possibly sampled) table, in the representation selected by
        ``output_type``. The resulting shape is printed as a side effect.
    """
    columns = ['text', 'has_def']
    frames = []
    # os.listdir order is arbitrary; sorting keeps the positional, seeded
    # sample below reproducible across machines and file systems.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        if not os.path.isfile(file_path):
            # e.g. a .ipynb_checkpoints directory next to the data files
            continue
        temp_df = pd.read_csv(file_path, names=columns, sep="\t")
        stem_parts = str(file).split(".")[0].split("_")
        temp_df['filename'] = stem_parts[1] if len(stem_parts) > 1 else stem_parts[0]
        frames.append(temp_df)

    # Concatenate once instead of growing the frame inside the loop.
    if frames:
        df = pd.concat(frames)
    else:
        df = pd.DataFrame(columns=columns + ['filename'])
    df = df.dropna()

    if dev and len(df) > 0:
        # Cap the sample size so small folders do not raise ValueError.
        df = df.sample(n=min(50, len(df)), random_state=2)
    print(df.shape)

    if output_type=='dataframe':
        return df
    return df.to_json(orient='records')
# Load the Task 1 training split. `dev` is left at its default (True), so
# this is the small random sample rather than the full split.
df = create_data(folder_path=TASK1["Train"], output_type="dataframe")
print(type(df))

In [None]:
from sklearn.model_selection import StratifiedKFold
# Two folds stratified on the label column; each fold's training rows are
# only previewed here, nothing is stored.
skf = StratifiedKFold(n_splits=2)
fold_splits = skf.split(df["text"], df["has_def"])
for i, (train_index, test_index) in enumerate(fold_splits):
    print(i, df.iloc[train_index].head())

In [None]:
# The dataset-loading cell further down reads both "sample_train.csv" and
# "sample_val.csv", so both files are written here. The two halves of the
# first stratified fold are used so that the validation rows do not overlap
# the training rows.
# NOTE(review): this assumes the stratified split was meant to produce the
# two files — confirm against how sample_train.csv was originally created.
_train_idx, _val_idx = next(skf.split(df["text"], df["has_def"]))
df.iloc[_train_idx].to_csv("sample_train.csv", sep=",", index=False)
df.iloc[_val_idx].to_csv("sample_val.csv", sep=",", index=False)

In [None]:
import spacy
# spaCy pipeline loaded by name; its `.tokenizer` is what `tokenizer()` uses.
spacy_en = spacy.load("en_core_web_sm")

In [None]:
from torchtext.data import Field

def tokenizer(text):
    """Return the token strings produced by ``spacy_en.tokenizer`` for ``text``."""
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

# Sentence field: a token sequence produced by `tokenizer`, lower-cased.
TEXT = Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
)

# Label field: one value per example, taken as-is without a vocabulary.
LABEL = Field(
    sequential=False,
    use_vocab=False,
)

In [None]:
from torchtext.data import TabularDataset

# (column name, field) pairs, one per CSV column; "filename" has no field
# attached, so it is not processed.
tv_datafields = [
    ("text", TEXT),
    ("has_def", LABEL),
    ("filename", None),
]

trn, vld = TabularDataset.splits(
    path="",  # root directory that the file names below are relative to
    train='sample_train.csv',
    validation="sample_val.csv",
    format='csv',
    skip_header=True,  # the CSV files start with a header row; skip it so it is not parsed as data
    fields=tv_datafields,
)

In [None]:
# Directory holding the GloVe 6B embedding files. The location is
# machine-specific, so it can be overridden with the GLOVE_DIR environment
# variable; the original absolute path remains the fallback.
emb_path = os.environ.get(
    "GLOVE_DIR",
    "/media/mukesh/36AD331451677000/embeddings/glove.6B",
)

In [None]:
# import torchtext.vocab as vocab
# import os
# TEXT.build_vocab(trn, vld)
# vectors = vocab.Vectors(os.path.join(emb_path, "glove.6B.100d.txt"), cache=emb_path)
# TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

In [None]:
# Build the TEXT vocabulary from both datasets and attach the
# "glove.6B.100d" pretrained vectors.
# Earlier variant kept for reference: TEXT.build_vocab(train, vectors="glove.6B.100d")
# NOTE(review): min_freq=50 drops every token seen fewer than 50 times, which
# leaves very little vocabulary on a small sample — confirm this is intended.
TEXT.build_vocab(
    trn,
    vld,
    vectors="glove.6B.100d",
    max_size=20000,
    min_freq=50,
)

In [None]:
from torchtext.data import Iterator, BucketIterator
import torch
# Batch iterators over the train/validation datasets. The device falls back
# to the CPU when CUDA is not available, so the cell also runs on machines
# without a GPU.
train_iter, val_iter = BucketIterator.splits(
    (trn, vld),  # the datasets the iterators draw examples from
    batch_sizes=(32, 32),
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    sort_key=lambda x: len(x.text),  # examples are grouped by the length of their text
    sort_within_batch=False,
    repeat=False,  # one pass per iteration; the iterators are wrapped by BatchWrapper
)

In [None]:
class BatchWrapper:
    """Turn an iterable of attribute-style batches into ``(x, y)`` pairs.

    ``dl`` is the wrapped iterable, ``x_var`` names the single input
    attribute read from each batch, and ``y_vars`` is a list of target
    attribute names (or ``None`` when there are no targets).
    """

    def __init__(self, dl, x_var, y_vars):
        self.dl = dl
        self.x_var = x_var
        self.y_vars = y_vars

    def __len__(self):
        """Number of batches, as reported by the wrapped iterable."""
        return len(self.dl)

    def __iter__(self):
        """Yield ``(x, y)`` for every batch of the wrapped iterable."""
        for batch in self.dl:
            inputs = getattr(batch, self.x_var)  # exactly one input attribute is supported
            if self.y_vars is None:
                # No targets requested: hand back a one-element zero placeholder.
                targets = torch.zeros((1))
            else:
                # Each target becomes one column; columns are joined side by
                # side and cast to float.
                columns = []
                for name in self.y_vars:
                    columns.append(getattr(batch, name).unsqueeze(1))
                targets = torch.cat(columns, dim=1).float()

            yield (inputs, targets)

# Wrap both iterators so they yield (text, has_def) tensor pairs.
train_dl = BatchWrapper(dl=train_iter, x_var="text", y_vars=["has_def"])
valid_dl = BatchWrapper(dl=val_iter, x_var="text", y_vars=["has_def"])


In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


class SimpleBiLSTMBaseline(nn.Module):
    """LSTM encoder over pretrained embeddings with a single-logit head.

    Note that, despite the class name, the encoder is created without
    ``bidirectional=True`` and is therefore a unidirectional ``nn.LSTM``.

    Parameters
    ----------
    hidden_dim : int
        Size of the LSTM hidden state and of the optional linear layers.
    emb_dim : int, default 100
        Width of the embedding vectors; must match the pretrained vectors.
    spatial_dropout : float, default 0.05
        Accepted for interface compatibility; not used by this model.
    recurrent_dropout : float, default 0.1
        Dropout passed to ``nn.LSTM``. It is only applied between stacked
        layers, so it is forwarded only when ``num_layers > 1``; with one
        layer it would be a no-op that just triggers a warning.
    num_linear : int, default 1
        ``num_linear - 1`` extra ``hidden_dim -> hidden_dim`` linear layers
        are inserted before the predictor.
    num_layers : int, default 1
        Number of stacked LSTM layers.
    pretrained_vectors : tensor-like, optional
        ``(vocab_size, emb_dim)`` embedding matrix. Defaults to the
        module-level ``TEXT.vocab.vectors``.

    Raises
    ------
    ValueError
        If the embedding matrix is not 2-D or its width differs from
        ``emb_dim``.
    """

    def __init__(self, hidden_dim, emb_dim=100,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1,
                 num_layers=1, pretrained_vectors=None):
        super().__init__() # don't forget to call this!
        if pretrained_vectors is None:
            # Original behaviour: use the vectors attached to the global TEXT field.
            pretrained_vectors = TEXT.vocab.vectors
        weights = torch.as_tensor(pretrained_vectors, dtype=torch.float)
        if weights.dim() != 2 or weights.size(1) != emb_dim:
            raise ValueError(
                "pretrained vectors must have shape (vocab_size, {}), got {}".format(
                    emb_dim, tuple(weights.shape)))
        # from_pretrained keeps the embedding weights frozen by default.
        self.embedding = nn.Embedding.from_pretrained(weights)
        lstm_dropout = recurrent_dropout if num_layers > 1 else 0.0
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=num_layers,
                               dropout=lstm_dropout)
        self.linear_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_linear - 1)]
        )
        self.predictor = nn.Linear(hidden_dim, 1)

    def forward(self, seq):
        """Return one logit per example for a batch of token indices.

        ``seq`` is indexed sequence-first (the LSTM is built with the default
        ``batch_first=False``), i.e. ``(seq_len, batch)``; the result has
        shape ``(batch, 1)``.
        """
        hdn, _ = self.encoder(self.embedding(seq))
        # Output of the last time step.
        # NOTE(review): for padded batches this position may be padding for
        # the shorter sequences — confirm whether lengths should be used.
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [None]:

em_sz = 100  # embedding width passed to the model as emb_dim
nh = 500     # LSTM hidden size
nl = 3       # NOTE(review): never passed to the model, so num_linear stays at its default — confirm intent

# Move the model to the GPU when one is available, otherwise stay on the CPU
# so the cell does not fail on machines without CUDA.
model = SimpleBiLSTMBaseline(nh, emb_dim=em_sz).to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)
model

In [None]:
import tqdm

opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()

epochs = 200

for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0  # initialised as before; not updated in this loop
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # the wrapper yields (input, target) pairs
        opt.zero_grad()

        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()

        # Weight the mean batch loss by the number of examples in the batch.
        # y is (batch, n_labels); x is sequence-first, so x.size(0) would be
        # the sequence length rather than the batch size.
        running_loss += loss.item() * y.size(0)

    epoch_loss = running_loss / len(trn)

    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    with torch.no_grad():  # no gradients are needed for validation
        for x, y in valid_dl:
            preds = model(x)
            loss = loss_func(preds, y)
            val_loss += loss.item() * y.size(0)

    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))