In [2]:
import json
import requests

In [3]:
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
headers = {"Authorization": f"Bearer {'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'}"}

def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


source_sentence="I'm very happy"
sentences=["I'm filled with happiness", "I'm happy"]


data = query(
    {
        "inputs": {
            "source_sentence": source_sentence,
            "sentences": sentences
        }
    })

data

[0.605808436870575, 0.8944038152694702]

In [4]:
from sentence_transformers import SentenceTransformer,util
import torch
import numpy


model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

source_embedding=model.encode(source_sentence,convert_to_tensor=True)
embedding_1= model.encode(sentences[0], convert_to_tensor=True)
embedding_2 = model.encode(sentences[1], convert_to_tensor=True)

print(util.pytorch_cos_sim(source_embedding, embedding_1),util.pytorch_cos_sim(source_embedding,embedding_2))

embedding_1_numpy=util.pytorch_cos_sim(source_embedding, embedding_1).numpy()
embedding_2_numpy=util.pytorch_cos_sim(source_embedding, embedding_2).numpy()

embedding_1_numpy=embedding_1_numpy.reshape([1,])
embedding_2_numpy=embedding_2_numpy.reshape([1,])

scores=list([embedding_1_numpy[0],embedding_2_numpy[0]])

print("After Conversion")
print(embedding_1_numpy,embedding_2_numpy)
print(scores)

tensor([[0.6058]]) tensor([[0.8944]])
After Conversion
[0.6058084] [0.8944038]
[0.6058084, 0.8944038]


In [5]:
def query_input(new_text, source_stories, story_ids=1):

    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    source_scores=list()
    new_embedding=model.encode(new_text,convert_to_tensor=True)

    res=dict()

    for i in source_stories:
        temp=model.encode(i,convert_to_tensor=True)
        temp=util.pytorch_cos_sim(new_embedding,temp).numpy().reshape([1,])
        source_scores.append(temp[0])

    for i in range(len(story_ids)):
        res[story_ids[i]]=source_scores[i]

        
    return res

In [6]:
source="timely"
compare_with=["quickly", "instantaneously","excited"]
x=list()
for i in range(len(compare_with)):
    x.append(i)
c=query_input(source,compare_with,story_ids=x)

In [7]:
list(c.values())
c

{0: 0.6002136, 1: 0.50213504, 2: 0.35151577}

In [8]:
lexical_AMB = ['bound', 'break', 'content', 'call', 'continue', 'contract', 'count', 'direct', 'even', 'express', 'form', 'forward', 'function', 'job', 'level', 'name', 'notice', 'number', 'out', 'position', 'record', 'reference', 'subject', 'string', 'switch', 'throw', 'translate', 'try', 'under']

referential_AMB = ['everyone', 'everything', 'someone', 'something', 'anything', 'anyone', 'itself', 'yourself']

coordination_AMB = ['also', 'if then', 'unless', 'if and only if']

scope_AMB = ['all', 'any', 'few', 'little', 'many', 'much', 'several', 'some']

vague_AMB = ['good', 'better', 'worse', 'available', 'common', 'capability', 'easy', 'full', 'maximum', 'minimum', 'quickly', 'random', 'recently', 'sufficient', 'sufficiently', 'simple', 'useful', 'various']

In [20]:
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
  
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
  
# a denotes adjective in "pos"
print("better :", lemmatizer.lemmatize("all",pos="a"))

rocks : rock
corpora : corpus
better : all


In [59]:
from transformers import AutoTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [73]:
def create_stopwords_custom(amb_list,stopwords):
    amb_list=list(amb_list)
    for i in amb_list:
        if i in stopwords:
            stopwords.remove(i)
    return stopwords

stopwords_custom=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourselves',
 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'they', 
 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 
 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 
 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 
 'to', 'from', 'up', 'down', 'in', 'on', 'off', 'over', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 
 'where', 'why', 'how', 'both', 'each', 'more', 'most', 'other', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 
 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 
 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', 
 "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', 
 "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

punctuation=['.',',',';','?']

print(type(stopwords_custom))

<class 'list'>


In [110]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [111]:
def sentence_ambiguity(sentence):

    tokens=tokenizer.backend_tokenizer.normalizer.normalize_str(sentence)
    lemmatizer=WordNetLemmatizer()
    tokens=word_tokenize(sentence)
    filtered_tokens=list()
    for token in tokens:
        if token not in stopwords_custom:
            filtered_tokens.append(token)
    
    for i in filtered_tokens:
        filtered_tokens[filtered_tokens.index(i)]=i.lower()
        if i in punctuation:
            filtered_tokens.remove(i)
    
    lexical=dict()
    scope=dict()
    referential=dict()
    vague=dict()
    coordination=dict()
    ambiguity=dict()

    for i in filtered_tokens:
        for j in lexical_AMB:
            temp=model.encode(i,convert_to_tensor=True)
            temp2=model.encode(j,convert_to_tensor=True)
            cos_sim=util.pytorch_cos_sim(temp,temp2).numpy().reshape([1,])
            if(cos_sim[0]>=0.6):
                lexical[i+"+"+j]=cos_sim[0]
        
        for j in scope_AMB:
            temp=model.encode(i,convert_to_tensor=True)
            temp2=model.encode(j,convert_to_tensor=True)
            cos_sim=util.pytorch_cos_sim(temp,temp2).numpy().reshape([1,])
            if(cos_sim[0]>=0.6):
                scope[i+"+"+j]=cos_sim[0]
        
        for j in referential_AMB:
            temp=model.encode(i,convert_to_tensor=True)
            temp2=model.encode(j,convert_to_tensor=True)
            cos_sim=util.pytorch_cos_sim(temp,temp2).numpy().reshape([1,])
            if(cos_sim[0]>=0.6):
                referential[i+"+"+j]=cos_sim[0]
        
        for j in vague_AMB:
            temp=model.encode(i,convert_to_tensor=True)
            temp2=model.encode(j,convert_to_tensor=True)
            cos_sim=util.pytorch_cos_sim(temp,temp2).numpy().reshape([1,])
            if(cos_sim[0]>=0.6):
                vague[i+"+"+j]=cos_sim[0]

        for j in coordination_AMB:
            temp=model.encode(i,convert_to_tensor=True)
            temp2=model.encode(j,convert_to_tensor=True)
            cos_sim=util.pytorch_cos_sim(temp,temp2).numpy().reshape([1,])
            if(cos_sim[0]>=0.6):
                coordination[i+"+"+j]=cos_sim[0]
        
        ambiguity["lexical"]=lexical
        ambiguity["referential"]=referential
        ambiguity["scope"]=scope
        ambiguity["vague"]=vague
        ambiguity["coordination"]=coordination
    
    print(filtered_tokens)
    print(ambiguity)

In [114]:
amb_samples=["The tool shall monitor the unit under diagnosis for a certain period of time.",
"The system must make it easy to correct mistakes.",
"As a repoadmin, I want to provide a better interface to the repository.",
"As an app developer, I want to deal with existing datasets if their properties are different but compatible when creating a dataset as part of app deployment.",
"As a researcher, I want to create a log book page for an experiment and attach a directory that contains various files.",
"As a researcher, I want to collect data as a library of data, which can then be used by some experiments that are defined at a later stage.",
"As a consumer, I want to buy ALFRED for a low price.",
"As a MedicalCaregiver, I want to have feedback on the activity level of the user.",
"As a legalofficer, I want something to inform me about data sensitivity, so that I can establish sharing options.",
"As a manager, I want everyone to be able to contact repositories.",
"The system must divide resources and use up as little storage as possible.",
"The system must break if it reaches minimum level of power activity.",
"The test can only continue if it receives all inputs from previous page."]

len(amb_samples)

13

In [115]:
for i in amb_samples:
    sentence_ambiguity(i)

['the', 'tool', 'shall', 'monitor', 'unit', 'under', 'diagnosis', 'certain', 'period', 'time']
{'lexical': {'under+under': 0.99999994}, 'referential': {}, 'scope': {'certain+some': 0.60891855}, 'vague': {}, 'coordination': {}}
['the', 'system', 'must', 'make', 'easy', 'correct', 'mistakes']
{'lexical': {}, 'referential': {}, 'scope': {}, 'vague': {'easy+easy': 0.9999999, 'easy+simple': 0.7983675}, 'coordination': {}}
['as', 'repoadmin', 'I', 'want', 'provide', 'better', 'interface', 'repository']
{'lexical': {}, 'referential': {}, 'scope': {'want+any': 0.6034759}, 'vague': {'better+better': 1.0000002, 'better+worse': 0.6654369}, 'coordination': {}}
['as', 'app', 'developer', 'I', 'want', 'deal', 'existing', 'datasets', 'properties', 'different', 'compatible', 'creating', 'dataset', 'part', 'app', 'deployment']
{'lexical': {}, 'referential': {}, 'scope': {'want+any': 0.6034759}, 'vague': {}, 'coordination': {}}
['as', 'researcher', 'I', 'want', 'create', 'log', 'book', 'page', 'experime

In [117]:
import pickle

In [136]:
lexical_encoded=dict()
for i in vague_AMB:
    temp=model.encode(i,convert_to_tensor=True)
    lexical_encoded[i]=temp

In [137]:
pickle.dump(lexical_encoded,open("vague_encoded.pickel","wb"))

In [125]:
a=pickle.load(open("lexical_encoded.pickel","rb"))
a

{'bound': tensor([ 3.3665e-02,  8.6533e-02, -1.2689e-02,  1.7100e-02, -4.0627e-02,
          3.7022e-02,  1.9857e-02,  7.0380e-02, -1.9325e-02, -5.1245e-02,
         -7.7717e-02,  1.1778e-05,  4.7559e-02, -4.9199e-02, -3.6979e-02,
         -7.7072e-03,  5.7109e-02, -9.4658e-02, -1.5983e-01,  2.2128e-02,
          9.0241e-03, -2.9926e-02,  3.0744e-02,  2.0341e-02, -2.9316e-02,
         -1.0104e-02, -9.7430e-04, -2.0542e-02,  1.1142e-01, -6.6432e-02,
         -8.6479e-02, -3.2890e-02,  3.0363e-02,  2.1406e-02, -3.4783e-02,
          5.7885e-03, -9.6222e-03, -2.2681e-02,  6.6181e-02, -1.6226e-02,
         -2.8015e-02, -4.7305e-02,  4.2412e-02,  4.5166e-02,  5.4705e-02,
          5.0281e-02, -1.3724e-02,  6.5922e-02, -1.1170e-02, -4.6128e-02,
         -3.2347e-02,  2.7554e-02, -2.4255e-02,  7.4346e-02, -5.2612e-02,
         -4.8489e-02, -1.1072e-02, -3.9049e-02, -3.6673e-02,  1.7665e-02,
          3.9063e-02, -8.5334e-03, -1.4819e-02,  1.8115e-02, -7.8742e-03,
         -2.5276e-02,  5.5063