In [4]:
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.data import Field, BucketIterator
from torchtext.datasets import UDPOS
# from torchcrf import CRF

from tqdm import tqdm
import spacy
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
spacy_en = spacy.load("en_core_web_sm")

In [7]:
def tokenizer_en(text):
    return [token.text for token in spacy_en.tokenizer(text.lower())]

In [11]:
train_deft_file = "../deft_corpus/data/deft_files/train/t1_biology_0_0.deft"
with open(train_deft_file,'r') as fl:
    lines = fl.read().split("\n\n")

In [13]:
lines[0].split('\n')

['5\tdata/source_txt/train/t1_biology_0_0.txt\t 250\t 251\t O\t -1\t -1\t 0',
 '.\tdata/source_txt/train/t1_biology_0_0.txt\t 251\t 252\t O\t -1\t -1\t 0']

In [None]:
# TOKEN TXT_SOURCE_FILE START_CHAR END_CHAR TAG TAG_ID ROOT_ID RELATION

In [30]:
def get_text_labels(sequence_tags):
    # TOKEN TXT_SOURCE_FILE START_CHAR END_CHAR TAG TAG_ID ROOT_ID RELATION
    text = [data[0].strip() for data in sequence_tags]
    tags = [data[4].strip() for data in sequence_tags]
    return {"text":text, "labels":tags}
def parse_deft(deft_file):
    with open(deft_file, 'r') as deft:
        all_text = deft.read()
    all_sequences = []
    for lines in all_text.split("\n\n"):
        sents = []
        for token_data in lines.split("\n"):
            if len(token_data.split("\t"))==8:
                sents.append(token_data.split("\t"))
        all_sequences.append(get_text_labels(sents))
    return all_sequences
parse_deft(train_deft_file)


[{'text': ['5', '.'], 'labels': ['O', 'O']},
 {'text': ['Science',
   'includes',
   'such',
   'diverse',
   'fields',
   'as',
   'astronomy',
   ',',
   'biology',
   ',',
   'computer',
   'sciences',
   ',',
   'geology',
   ',',
   'logic',
   ',',
   'physics',
   ',',
   'chemistry',
   ',',
   'and',
   'mathematics',
   '(',
   '[',
   'link',
   ']',
   ')',
   '.'],
  'labels': ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']},
 {'text': ['However',
   ',',
   'those',
   'fields',
   'of',
   'science',
   'related',
   'to',
   'the',
   'physical',
   'world',
   'and',
   'its',
   'phenomena',
   'and',
   'processes',
   'are',
   'considered',
   'natural',
   'sciences',
   '.'],
  'labels': ['O',
   'O',
   'B-Definition',
   'I-Definition',
   'I-Definition',
   'I-Definition',
   'I-De

In [16]:
def get_text_labels(sequence_tags):
    # TOKEN TXT_SOURCE_FILE START_CHAR END_CHAR TAG TAG_ID ROOT_ID RELATION
    text = [data[0].strip for data in sequence_tags]
    tags = [data[4].strip() for data in sequence_tags]
    return {"text":text, "labels":tags}
get_text_labels(parse_deft(train_deft_file)[1])

{'text': ['Science',
  'includes',
  'such',
  'diverse',
  'fields',
  'as',
  'astronomy',
  ',',
  'biology',
  ',',
  'computer',
  'sciences',
  ',',
  'geology',
  ',',
  'logic',
  ',',
  'physics',
  ',',
  'chemistry',
  ',',
  'and',
  'mathematics',
  '(',
  '[',
  'link',
  ']',
  ')',
  '.'],
 'labels': [' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O',
  ' O']}

In [17]:

TEXT = Field(
    sequential=True, 
    tokenize=tokenizer_en,
    init_token="<sos>",
    eos_token="<eos>",
#     batch_first=True,
    lower=True
)
LABELS = Field(
    sequential=True, 
    init_token="<sos>",
    eos_token="<eos>",
#     batch_first=True,
    is_target=True
)

In [None]:
from torchtext.data import TabularDataset

tv_datafields = [("text", TEXT), ("labels", LABEL)]
trn, vld = TabularDataset.splits(
               path="../deft_corpus/data/Task1_folds/", # the root directory where the data lies
               train='train_0.csv',validation="val_0.csv",
               format='tsv',
               skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=tv_datafields)