In [None]:
import gensim.downloader as api
import similarity

# Source passage. Written as adjacent string literals purely for readability;
# Python joins them into exactly the same single string at compile time.
txt = (
    "While scanning the water for these hydrodynamic signals at a swimming speed in the order of meters per second, the seal keeps its long and flexible whiskers in an abducted position, largely perpendicular to the swimming direction. "
    "Remarkably, the whiskers of harbor seals possess a specialized undulated surface structure, the function of which was, up to now, unknown. "
    "Here, we show that this structure effectively changes the vortex street behind the whiskers and reduces the vibrations that would otherwise be induced by the shedding of vortices from the whiskers (vortex-induced vibrations). "
    "Using force measurements, flow measurements and numerical simulations, we find that the dynamic forces on harbor seal whiskers are, by at least an order of magnitude, lower than those on sea lion (Zalophus californianus) whiskers, which do not share the undulated structure. "
    "The results are discussed in the light of pinniped sensory biology and potential biomimetic applications."
)
# Query text that the passage is compared against.
target = "A small diameter fiber with an undulated surface structure reduces vibrations caused by drag forces"
# Pretrained vectors fetched (and cached) through gensim's downloader.
model = api.load("word2vec-google-news-300")

# NOTE(review): W2VTextSimilarity lives in the local `similarity` module; from
# its use here, compute_similarity() yields a score-keyed mapping plus the
# scores themselves - confirm against that module.
sim = similarity.W2VTextSimilarity(txt, target, model)
mapping, scores = sim.compute_similarity()

# Report every score in ascending order next to its mapped entry.
scores.sort()
for score in scores:
    print(" ".join([str(score), mapping[score]]))

In [None]:
from transformers import pipeline
# Build the library's default sentiment-analysis pipeline and classify two
# sample reviews; the returned list is the cell's displayed value.
classifier = pipeline("sentiment-analysis")
classifier(["This is a great movie", "This is a bad movie"])

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
import torch

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
raw_text = ["This is a great movie", "This is a bad movie"]

# Tokenizer and classification model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Encode both sentences as one padded batch of PyTorch tensors.
inputs = tokenizer(raw_text, padding=True, truncation=True, return_tensors="pt")

# Forward pass, then turn the logits into per-class probabilities
# (softmax over the last axis).
outputs = model(**inputs)
predictions = torch.softmax(outputs.logits, dim=-1)
print(predictions)

In [None]:
from transformers import BertConfig, BertModel

# Load the pretrained cased BERT encoder, then write a copy of it into the
# local "models" directory.
model = BertModel.from_pretrained("bert-base-cased")
model.save_pretrained("models")



In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sequence = "Using a transformer network is simple"

# Round trip through the tokenizer, one stage per line:
tokens = tokenizer.tokenize(sequence)            # text   -> token strings
ids = tokenizer.convert_tokens_to_ids(tokens)    # tokens -> vocabulary ids
decoded_string = tokenizer.decode(ids)           # ids    -> text
print(decoded_string)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life"

# tokenize() only splits text into token strings and never builds tensors, so
# the stray return_tensors="pt" argument has been removed.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# Wrapping `ids` in a list gives the tensor a leading batch dimension.
input_ids = torch.tensor([ids])

# Take the padding id from the tokenizer instead of hard-coding a number, so
# the batch below is padded with the id this checkpoint really uses.
padding_id = tokenizer.pad_token_id

# Two toy sequences of different length, first alone and then as one batch in
# which the shorter sequence is padded to the longer one's length.
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, padding_id],
]

# Reference logits for each sequence run on its own.
print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)

# The attention mask has a 0 at the padded position of the second row and 1
# everywhere else.
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])
outputs = model(torch.tensor(batched_ids), attention_mask=attention_mask)
print(outputs.logits)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
sequences = ["I've been waiting for a HuggingFace course my whole life", "So have I!"]

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# One tokenizer call handles the whole batch: padding, truncation and
# conversion to PyTorch tensors. The result is unpacked straight into the model.
tokens = tokenizer(sequences, return_tensors="pt", truncation=True, padding=True)
output = model(**tokens)

In [None]:
import torch
# transformers' own AdamW is deprecated in favour of the PyTorch optimizer
# (and is absent from recent transformers releases), so import it from torch.
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
# Passing labels with the inputs makes the model output carry a loss.
batch["labels"] = torch.tensor([1, 1])

# eps and weight_decay are pinned to the defaults of the transformers
# implementation this cell used before, so the update step stays comparable.
optimizer = AdamW(model.parameters(), eps=1e-6, weight_decay=0.0)
loss = model(**batch).loss
loss.backward()
optimizer.step()

In [7]:
import json
import torch
from transformers import AdamW, AutoTokenizer
import tokenizations

checkpoint = "allenai/scibert_scivocab_cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# FOBIE Dataset
# Read through a context manager so the file handle is closed right after
# parsing, and decode as UTF-8 explicitly rather than relying on the platform
# default encoding.
with open("data/dev_set.json", encoding="utf-8") as dev_file:
    data = json.load(dev_file)

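# Word positions shift once words are split into sub-word pieces, so a span
# given in word positions has to be remapped to piece positions, e.g.:
#   1   2 3    4     5  6     7
# this is a testing of the spanning
#   1   2 3   4   5  6   7   8    9
# this is a test ing of the span ing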

    


# Reference tokens of one sentence, lower-cased before alignment.
ground = [
    word.lower()
    for word in [
        'Another', 'interesting', 'finding', 'was', 'the', 'lack', 'of',
        'correlation', 'between', 'resting', 'levels', 'of', 'both', 'HSP',
        '90β', 'and', 'HSC', '70', 'and', 'CTmax.',
    ]
]
# The same sentence as word pieces ("##" marks a continuation piece).
split = [
    'another', 'interesting', 'finding', 'was', 'the', 'lack', 'of',
    'correlation', 'between', 'resting', 'levels', 'of', 'both', 'hs', '##p',
    '90', '##β', 'and', 'hs', '##c', '70', 'and', 'ct', '##max', '.',
]

# a2b[i] holds the indices in `split` aligned with ground[i];
# b2a[j] holds the indices in `ground` aligned with split[j].
a2b, b2a = tokenizations.get_alignments(ground, split)
print(a2b)
print(b2a)

# Span passed to match_span_to_tokenizer below.
span = [14, 19]

def match_span_to_tokenizer(ground, split, span):
    """Translate an inclusive span over ``ground`` into one over ``split``.

    Args:
        ground: Reference tokens.
        split: Tokenizer output for the same text.
        span: ``[first, last]`` indices into ``ground``, both inclusive.

    Returns:
        ``[first, last]`` indices into ``split``, both inclusive, covering
        every ``split`` token aligned with a ``ground`` token inside ``span``.

    Raises:
        ValueError: If no ``split`` token is aligned with the span.
    """
    # a2b[i] lists the positions in `split` aligned with ground[i].
    a2b, _ = tokenizations.get_alignments(ground, split)
    covered = []
    for i in range(span[0], span[1] + 1):
        print(i)  # kept from the original: echoes each ground index visited
        covered.extend(a2b[i])
    if not covered:
        raise ValueError(f"no tokens in split align with span {span}")
    return [min(covered), max(covered)]


# Previously the function returned nothing, so this assignment replaced `span`
# with None; it now receives the token-level span.
span = match_span_to_tokenizer(ground, split, span)


[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13, 14], [15, 16], [17], [18, 19], [20], [21], [22, 23, 24]]
[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [13], [14], [14], [15], [16], [16], [17], [18], [19], [19], [19]]
14
15
16
17
18
19
