In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import random
import csv
import json 
from tqdm import tqdm

In [2]:
from transformers import ElectraTokenizer, ElectraModel, ElectraConfig

# Checkpoint name used for both the tokenizer and the encoder below.
ELECTRA_CHECKPOINT = 'google/electra-small-discriminator'

# WordPiece tokenizer matching the checkpoint's vocabulary.
tokenizer = ElectraTokenizer.from_pretrained(ELECTRA_CHECKPOINT)

# Default ELECTRA configuration object; nothing in this cell consumes it.
configuration = ElectraConfig()

# Pre-trained encoder weights; the outputs also carry every layer's hidden states.
model = ElectraModel.from_pretrained(ELECTRA_CHECKPOINT, output_hidden_states = True)

# Switch to evaluation mode (dropout layers become inactive). eval() returns
# the module itself, so as the last expression it makes the cell display the
# model architecture.
model.eval()

ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(30522, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=256, out_features=256, bias=True)
            (key): Linear(in_features=256, out_features=256, bias=True)
            (value): Linear(in_features=256, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=256, out_features=256, bias=True)
            (LayerNorm): LayerNorm((256,), eps=1e-12, eleme

In [3]:
# Reproducibility: seed every random number generator this notebook imports
# and ask cuDNN to pick deterministic kernels.
SEED = 123

torch.backends.cudnn.deterministic = True
random.seed(SEED)                  # Python's built-in generator
np.random.seed(SEED)               # NumPy's legacy global generator
torch.manual_seed(SEED)            # PyTorch default generator
torch.cuda.manual_seed_all(SEED)   # every CUDA device; ignored when CUDA is unavailable

In [4]:
# Python program for KMP Algorithm 
def computeLPSArray(pat, M, lps):
    """Fill ``lps`` in place with the KMP failure table of ``pat``.

    After the call, ``lps[i]`` holds the length of the longest proper prefix
    of ``pat[:i + 1]`` that is also a suffix of it.

    Args:
        pat: Indexable pattern (a string or a list of tokens).
        M: Number of leading elements of ``pat`` to process.
        lps: Pre-allocated mutable sequence with at least ``M`` slots. It is
            overwritten; the function returns nothing.
    """
    if M <= 0:
        # Empty pattern: there is no lps[0] to write, so leave lps untouched.
        return

    lps[0] = 0       # a single element has no proper prefix
    prefix_len = 0   # length of the previous longest prefix-suffix
    i = 1

    # Compute lps[i] for i = 1 .. M-1.
    while i < M:
        if pat[i] == pat[prefix_len]:
            prefix_len += 1
            lps[i] = prefix_len
            i += 1
        elif prefix_len != 0:
            # Mismatch after a partial match: fall back to the next shorter
            # border and retry the same i (e.g. "AAACAAAA" at i = 7).
            prefix_len = lps[prefix_len - 1]
        else:
            lps[i] = 0
            i += 1

def KMPSearch(pat, txt):
    """Find the first occurrence of ``pat`` inside ``txt`` with Knuth-Morris-Pratt.

    Args:
        pat: Pattern to look for (a string or a list of tokens).
        txt: Sequence to search in (same element type as ``pat``).

    Returns:
        The list of consecutive indices of ``txt`` covered by the first
        match, i.e. ``[start, start + 1, ..., start + len(pat) - 1]``.
        An empty list when there is no match or when ``pat`` is empty.
    """
    M = len(pat)
    N = len(txt)

    # An empty pattern has nothing to locate (and would index past the end of
    # an empty failure table); a pattern longer than the text cannot match.
    if M == 0 or M > N:
        return []

    # Failure table: longest proper prefix of pat[:k + 1] that is also a suffix.
    lps = [0] * M
    computeLPSArray(pat, M, lps)

    i = 0  # index into txt
    j = 0  # index into pat
    while i < N:
        if pat[j] == txt[i]:
            i += 1
            j += 1
            if j == M:
                # Whole pattern matched; it ends just before position i.
                return list(range(i - M, i))
        elif j != 0:
            # Mismatch after j matches: reuse the already matched border
            # instead of restarting from the beginning of the pattern.
            j = lps[j - 1]
        else:
            i += 1

    return []

In [13]:
# TSV files pooled into one frame, in this order: single-word train/trial,
# multi-word train/trial, then the single-word and multi-word test files.
dataset_paths = [
    "../Group 1 Dataset/Dataset 1/lcp_single_train.tsv",
    "../Group 1 Dataset/Dataset 1/lcp_single_trial.tsv",
    "../Group 1 Dataset/Dataset 2/lcp_multi_train.tsv",
    "../Group 1 Dataset/Dataset 2/lcp_multi_trial.tsv",
    "../Group 1 Dataset/Test/lcp_single_test.tsv",
    "../Group 1 Dataset/Test/lcp_multi_test.tsv",
]

# DataFrame.append was removed in pandas 2.0; read every file and concatenate
# once. ignore_index=True yields the same fresh 0..n-1 index that
# reset_index(drop=True) produced.
# NOTE(review): read_csv's default NA parsing turns literal cells such as
# "null" or "NA" into NaN, which str() below renders as "nan" — confirm
# whether keep_default_na=False is wanted for the 'token' column.
df = pd.concat(
    [
        pd.read_csv(path, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
        for path in dataset_paths
    ],
    ignore_index=True,
)

# Add the special tokens to every sentence with one column-level assignment.
# Per-cell chained assignment (df['sentence'][i] = ...) raises
# SettingWithCopyWarning and is not guaranteed to write through.
df['sentence'] = "[CLS] " + df['sentence'] + " [SEP]"

# Maps each row's 'id' to the embedding of its target token.
Embed = {}

rows = zip(df['id'], df['sentence'], df['token'])
for sample_id, sentence, token in tqdm(rows, total=len(df)):
    tokenized_text = tokenizer.tokenize(sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    word_token = tokenizer.tokenize(str(token))

    # Positions of the target's word pieces inside the sentence's word pieces.
    # An empty target is treated as "not found" rather than searched for.
    token_indices = KMPSearch(word_token, tokenized_text) if word_token else []

    tokens_tensor = torch.tensor([indexed_tokens])
    # The model's second positional parameter is the attention mask, so the
    # all-ones tensor is passed by keyword to make that explicit: every
    # position is attended to, and token type ids keep their default.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(input_ids=tokens_tensor, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

    # Drop the batch dimension: one vector per word piece.
    token_embeddings = torch.squeeze(last_hidden_state, dim=0)

    # Average the vectors of the target's word pieces. When the target is not
    # found in the sentence, the embedding stays all zeros.
    embedding = np.zeros(token_embeddings.shape[-1], dtype = np.float32)
    if len(token_indices) > 0:
        for j in token_indices:
            embedding += token_embeddings[j].numpy()
        embedding = embedding/len(token_indices)
    Embed[sample_id] = embedding.tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|████████████████████████████████████████████████████████████████████████████| 10800/10800 [05:44<00:00, 31.35it/s]


In [14]:
# Persist the id -> embedding mapping as pretty-printed JSON.
embeddings_path = "Electra_embeddings.json"
with open(embeddings_path, "w") as json_file:
    json.dump(Embed, json_file, indent = 4)