In [35]:
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation/FutureWarning messages
# raised by the libraries used below are hidden as well.
warnings.filterwarnings("ignore")

In [None]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from MRRandNDCG import *
from preprocessing import *
from Pretrained_versions import *

In [3]:
# Instantiate the project helpers used throughout the notebook.
# NOTE(review): MRR_NDCG, preprocessing and pretrained are not imported by name;
# presumably they come from the star imports of MRRandNDCG, preprocessing and
# Pretrained_versions -- confirm against those modules.
mrr_and_ndcg = MRR_NDCG()        # exposes .MRR(...) and .NDCG(...), called once per model below
preprocess = preprocessing()     # exposes .processing(...), used to build the corpus
pretrained_ver = pretrained()    # exposes the *_embedding helpers used for pre-trained vectors

In [12]:
# Load the raw requirements and build one sentence per row: "<feature>, <benefit>."
data = pd.read_csv("data/requirements.csv")
data['requirements'] = data['feature'] + ", " + data['benefit'] + '.'

# Working frame: requirement text plus its application domain (the class name).
d = pd.DataFrame(
    list(zip(data['requirements'], data['application_domain'])),
    columns=['requirements', 'class'],
)

# Application-domain name -> integer class id.
CLASS_TO_ID = {'Health': 0, 'Energy': 1, 'Entertainment': 2, 'Safety': 3, 'Other': 4}

# Assign the encoded column explicitly. The previous
# `d['n_class'].replace(..., inplace=True)` was a chained in-place update on a
# column selection; under pandas Copy-on-Write it does not modify `d`, and the
# warning about it is hidden by the global warnings filter.
# Values outside CLASS_TO_ID are left untouched, as before.
d['n_class'] = d['class'].replace(CLASS_TO_ID)

labels = d['n_class']                    # integer class ids passed to MRR/NDCG
namelabels = data['application_domain']  # original class names

In [None]:
# Pre-processing: run the project's preprocessing over the requirement sentences.
# The call returns five values. As used later in this notebook:
#   corpus       -- indexed/iterated per document and passed to the gensim models
#   corp         -- passed to the sklearn vectorizers and sentence encoders
#   freq, wavg   -- passed to weighted_avg_pretrained_embedding
#   allvocab     -- not used in the cells visible here
# NOTE(review): exact types/shapes are defined in preprocessing.processing -- confirm there.
corpus,corp,allvocab,freq,wavg = preprocess.processing(d['requirements'])

In [15]:
####### Bag of Words(1,2-gram) #######
from sklearn.feature_extraction.text import CountVectorizer

# Raw (non-binary) term counts over unigrams and bigrams.
vect = CountVectorizer(ngram_range=(1, 2), binary=False)
bag_of_words = vect.fit_transform(corp)
doc_term_matrix = bag_of_words.todense()

# Evaluate and report one metric at a time, in the same order as before.
bow_mrr_score = mrr_and_ndcg.MRR(doc_term_matrix, vect, labels)
print(f"Model: Bag of Words(1,2-gram) \nMRR: {bow_mrr_score}")
bow_ndcg_score = mrr_and_ndcg.NDCG(doc_term_matrix, vect, labels)
print(f"Model: Bag of Words(1,2-gram) \nNDCG: {bow_ndcg_score}")

100%|██████████| 2966/2966 [01:03<00:00, 46.41it/s]


Model: Bag of Words(1,2-gram) 
MRR: 0.687


100%|██████████| 2966/2966 [01:15<00:00, 39.41it/s]

Model: Bag of Words(1,2-gram) 
NDCG: 0.769





In [16]:
#######  TF-IDF(1,2-gram) #######
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer(binary = False, ngram_range = (1,2))

# Fit the vectorizer once and reuse the sparse result for both dense views.
# The original called fit_transform twice, repeating the fit for the same matrix.
tfidf_sparse = v.fit_transform(corp)
tf_idf = tfidf_sparse.todense()      # matrix form handed to MRR/NDCG
tfidfarray = tfidf_sparse.toarray()  # ndarray form indexed as tfidfarray[doc][column] in later cells
tflist = list(v.get_feature_names_out())  # column index -> term, aligned with tfidfarray columns

print(f"Model: TF-IDF(1,2-gram) \nMRR: {mrr_and_ndcg.MRR(tf_idf,v,labels)}")
print(f"Model: TF-IDF(1,2-gram) \nNDCG: {mrr_and_ndcg.NDCG(tf_idf,v,labels)}")

100%|██████████| 2966/2966 [01:06<00:00, 44.78it/s]


Model: TF-IDF(1,2-gram) 
MRR: 0.699


100%|██████████| 2966/2966 [01:14<00:00, 40.07it/s]

Model: TF-IDF(1,2-gram) 
NDCG: 0.768





In [19]:
####### Doc2Vec #######
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Wrap each tokenized document in gensim's TaggedDocument, tagged with its position.
# The loop variable is no longer named `d`, which read as if it touched the DataFrame `d`.
tagged_data = [TaggedDocument(tokens, [doc_id]) for doc_id, tokens in enumerate(corpus)]
dmodel = Doc2Vec(tagged_data, vector_size=100, window=5, min_count=1,epochs=30)

# One learned vector per document, in corpus order. The count comes from the data
# instead of the hard-coded 2966, so a different dataset size neither truncates the
# embeddings nor raises on a missing tag.
doc2vec_embedd = [dmodel.dv[doc_id] for doc_id in range(len(tagged_data))]

print(f"Model: Doc2Vec \nMRR: {mrr_and_ndcg.MRR(doc2vec_embedd,dmodel,labels)}")
print(f"Model: Doc2Vec \nNDCG: {mrr_and_ndcg.NDCG(doc2vec_embedd,dmodel,labels)}")

100%|██████████| 2966/2966 [01:04<00:00, 46.10it/s]


Model: Doc2Vec 
MRR: 0.672


100%|██████████| 2966/2966 [01:16<00:00, 38.61it/s]

Model: Doc2Vec 
NDCG: 0.767





In [20]:
####### Word2Vec(Self Trained) #######
from gensim.models import Word2Vec
word2vec = Word2Vec(corpus, min_count = 1,vector_size = 100,window = 5,sg = 1,epochs=30, seed = 1) # 1-> skipgram, 0-> cbow

# key_to_index is a dict, so membership is a hash lookup. The original tested
# against index_to_key, a list, which is a linear scan of the vocabulary per token.
w2v_vocab = word2vec.wv.key_to_index

# Document embedding = mean of the vectors of its in-vocabulary tokens.
avgword2vec = []
for x in corpus:
    avgword2vec.append(np.mean([word2vec.wv[token] for token in x if token in w2v_vocab],axis=0))

print(f"Model: Word2Vec(Self Trained) \nMRR: {mrr_and_ndcg.MRR(avgword2vec,word2vec,labels)}")
print(f"Model: Word2Vec(Self Trained) \nNDCG: {mrr_and_ndcg.NDCG(avgword2vec,word2vec,labels)}")

100%|██████████| 2966/2966 [01:08<00:00, 43.50it/s]


Model: Word2Vec(Self Trained) 
MRR: 0.726


100%|██████████| 2966/2966 [01:10<00:00, 41.96it/s]

Model: Word2Vec(Self Trained) 
NDCG: 0.769





In [21]:
####### Tf-IDF Word2Vec(Self Trained) ####### 
# Build the lookups once: term -> TF-IDF column, and the Word2Vec vocabulary dict.
# The original called tflist.index(token) and tested against index_to_key for every
# token, both linear scans over the whole vocabulary.
tfidf_column = {term: col for col, term in enumerate(tflist)}
w2v_vocab = word2vec.wv.key_to_index

tfidfword2vec = []
for x in range(len(corpus)):
    # Single-character tokens are skipped, as before.
    z1 = [r for r in corpus[x] if len(r) > 1]
    # Mean of word vectors, each scaled by that token's TF-IDF weight in document x.
    # A token absent from the TF-IDF vocabulary still raises (now KeyError, not ValueError).
    tfidfword2vec.append(np.mean([word2vec.wv[token]*tfidfarray[x][tfidf_column[token]] for token in z1 if token in w2v_vocab],axis=0))

print(f"Model: Tf-IDF Word2Vec(Self Trained) \nMRR: {mrr_and_ndcg.MRR(tfidfword2vec,word2vec,labels)}")
print(f"Model: Tf-IDF Word2Vec(Self Trained) \nNDCG: {mrr_and_ndcg.NDCG(tfidfword2vec,word2vec,labels)}")

100%|██████████| 2966/2966 [01:03<00:00, 46.52it/s]


Model: Tf-IDF Word2Vec(Self Trained) 
MRR: 0.715


100%|██████████| 2966/2966 [01:14<00:00, 39.57it/s]

Model: Tf-IDF Word2Vec(Self Trained) 
NDCG: 0.769





In [24]:
#######  Word2Vec(Pre Trained) & Tf-IDF Word2Vec(Pre Trained) ####### 
from gensim.models import KeyedVectors
import gensim.downloader as api

# Pre-trained vectors fetched through the gensim downloader.
w = api.load('word2vec-google-news-300')

# Two document-level embeddings from the same vectors: plain and TF-IDF based.
word2vec_pretrained = pretrained_ver.avg_pretrained_embedding(w, corpus)
word2vec_tfidf = pretrained_ver.tfidf_embedding(w, corpus, tfidfarray, tflist)

w2v_pre_mrr = mrr_and_ndcg.MRR(word2vec_pretrained, w, labels)
print(f"Model: Word2Vec(Pre Trained) \nMRR: {w2v_pre_mrr}")
w2v_pre_ndcg = mrr_and_ndcg.NDCG(word2vec_pretrained, w, labels)
print(f"Model: Word2Vec(Pre Trained) \nNDCG: {w2v_pre_ndcg}")

print("\n\n")

w2v_tfidf_mrr = mrr_and_ndcg.MRR(word2vec_tfidf, w, labels)
print(f"Model: Tf-IDF Word2Vec(Pre Trained)  \nMRR: {w2v_tfidf_mrr}")
w2v_tfidf_ndcg = mrr_and_ndcg.NDCG(word2vec_tfidf, w, labels)
print(f"Model: Tf-IDF Word2Vec(Pre Trained)  \nNDCG: {w2v_tfidf_ndcg}")

174


100%|██████████| 2966/2966 [01:10<00:00, 41.90it/s]


Model: Word2Vec(Pre Trained) 
MRR: 0.722


100%|██████████| 2966/2966 [01:11<00:00, 41.54it/s]


Model: Word2Vec(Pre Trained) 
NDCG: 0.77





100%|██████████| 2966/2966 [01:04<00:00, 46.34it/s]


Model: Tf-IDF Word2Vec(Pre Trained)  
MRR: 0.712


100%|██████████| 2966/2966 [01:16<00:00, 38.82it/s]

Model: Tf-IDF Word2Vec(Pre Trained)  
NDCG: 0.77





In [25]:
#######  FastText(Self Trained) ####### 
from gensim.models import FastText

# Train FastText on the tokenized corpus.
fasttext_model = FastText(corpus, vector_size=100, window=5, min_count=1, epochs=30)

# Document embedding = mean of the FastText vectors of all its tokens.
avgfasttext = [
    np.mean([fasttext_model.wv[token] for token in doc_tokens], axis=0)
    for doc_tokens in corpus
]

ft_self_mrr = mrr_and_ndcg.MRR(avgfasttext, fasttext_model, labels)
print(f"Model: FastText(Self Trained)  \nMRR: {ft_self_mrr}")
ft_self_ndcg = mrr_and_ndcg.NDCG(avgfasttext, fasttext_model, labels)
print(f"Model: FastText(Self Trained)  \nNDCG: {ft_self_ndcg}")

100%|██████████| 2966/2966 [00:57<00:00, 51.45it/s]


Model: FastText(Self Trained)  
MRR: 0.672


100%|██████████| 2966/2966 [01:17<00:00, 38.20it/s]

Model: FastText(Self Trained)  
NDCG: 0.767





In [26]:
####### TF-IDF Fasttext(Self Trained) ####### 
# term -> TF-IDF column, built once. The original called tflist.index(token) for
# every token, a linear scan of the whole (1,2)-gram vocabulary each time.
tfidf_column = {term: col for col, term in enumerate(tflist)}

tfidf_fasttext = []
for x in range(len(corpus)):
    # Single-character tokens are skipped, as before.
    z1 = [r for r in corpus[x] if len(r) > 1]
    # Mean of FastText vectors, each scaled by that token's TF-IDF weight in document x.
    # A token absent from the TF-IDF vocabulary still raises (now KeyError, not ValueError).
    tfidf_fasttext.append(np.mean([fasttext_model.wv[token]*tfidfarray[x][tfidf_column[token]] for token in z1],axis=0))

print(f"Model: TF-IDF Fasttext(Self Trained)  \nMRR: {mrr_and_ndcg.MRR(tfidf_fasttext,fasttext_model,labels)}") 
print(f"Model: TF-IDF Fasttext(Self Trained)  \nNDCG: {mrr_and_ndcg.NDCG(tfidf_fasttext ,fasttext_model,labels)}")

100%|██████████| 2966/2966 [01:06<00:00, 44.35it/s]


Model: TF-IDF Fasttext(Self Trained)  
MRR: 0.682


100%|██████████| 2966/2966 [01:09<00:00, 42.69it/s]

Model: TF-IDF Fasttext(Self Trained)  
NDCG: 0.767





In [27]:
#######  FastText(Pre Trained) & TF-IDF Fasttext(Pre Trained) ####### 
# Pre-trained FastText vectors fetched through the gensim downloader.
modelfasttext = api.load("fasttext-wiki-news-subwords-300")
fasttext_pretrained =  pretrained_ver.avg_pretrained_embedding(modelfasttext, corpus)
tfidf_ft = pretrained_ver.tfidf_embedding(modelfasttext,corpus,tfidfarray,tflist)

print(f"Model: FastText(Pre Trained)  \nMRR: {mrr_and_ndcg.MRR(fasttext_pretrained ,modelfasttext ,labels)}")
print(f"Model: FastText(Pre Trained)  \nNDCG: {mrr_and_ndcg.NDCG(fasttext_pretrained  ,modelfasttext ,labels)}")

print("\n\n")

# Pass the FastText model that produced tfidf_ft. The original passed `w`, the
# Word2Vec model from the previous section, which was a copy-paste slip.
print(f"Model: TF-IDF Fasttext(Pre Trained)  \nMRR: {mrr_and_ndcg.MRR(tfidf_ft ,modelfasttext,labels)}")
print(f"Model: TF-IDF Fasttext(Pre Trained) \nNDCG: {mrr_and_ndcg.NDCG(tfidf_ft ,modelfasttext,labels)}")

170


100%|██████████| 2966/2966 [01:02<00:00, 47.28it/s]


Model: FastText(Pre Trained)  
MRR: 0.704


100%|██████████| 2966/2966 [01:07<00:00, 43.69it/s]


Model: FastText(Pre Trained)  
NDCG: 0.769





100%|██████████| 2966/2966 [01:08<00:00, 43.57it/s]


Model: TF-IDF Fasttext(Pre Trained)  
MRR: 0.696


100%|██████████| 2966/2966 [01:12<00:00, 40.64it/s]

Model: TF-IDF Fasttext(Pre Trained) 
NDCG: 0.769





In [28]:
###### Glove(Pre Trained) & TF-IDF Glove(Pre Trained) #######
# Pre-trained GloVe vectors fetched through the gensim downloader.
glove = api.load('glove-wiki-gigaword-300')
glove_pretrained = pretrained_ver.avg_pretrained_embedding(glove, corpus)
# NOTE(review): wavg_glove is computed but not evaluated in this cell -- confirm
# whether it is used elsewhere or whether its metrics should be reported too.
wavg_glove = pretrained_ver.weighted_avg_pretrained_embedding(glove, corpus,wavg,freq)
tfidfglove = pretrained_ver.tfidf_embedding(glove,corpus,tfidfarray,tflist)

print(f"Model: Glove(Pre Trained) \nMRR: {mrr_and_ndcg.MRR(glove_pretrained,glove,labels)}")
# The value reported here is NDCG; the label previously said "MRR".
print(f"Model: Glove(Pre Trained) \nNDCG: {mrr_and_ndcg.NDCG(glove_pretrained,glove,labels)}")

print("\n\n")

print(f"Model: TF-IDF Glove(Pre Trained)  \nMRR: {mrr_and_ndcg.MRR(tfidfglove,glove,labels)}")
print(f"Model: TF-IDF Glove(Pre Trained)  \nNDCG: {mrr_and_ndcg.NDCG(tfidfglove,glove,labels)}")

286


100%|██████████| 2966/2966 [01:05<00:00, 44.95it/s]


Model: Glove(Pre Trained) 
MRR: 0.708


100%|██████████| 2966/2966 [01:08<00:00, 43.57it/s]


Model: Glove(Pre Trained) 
MRR: 0.769





100%|██████████| 2966/2966 [01:06<00:00, 44.31it/s]


Model: TF-IDF Glove(Pre Trained)  
MRR: 0.697


100%|██████████| 2966/2966 [01:06<00:00, 44.65it/s]

Model: TF-IDF Glove(Pre Trained)  
NDCG: 0.769





In [None]:
# !pip install glove-python-binary

In [None]:
# ###### Glove(Self Trained) ######
# import glove
# from glove import Corpus, Glove # creating a corpus object
# corpusa = Corpus()
# corpusa.fit(corpus,window = 5)
# glove = Glove(100,0.05)
# glove.fit(corpusa.matrix, epochs = 50,no_threads =4)
# glove.add_dictionary(corpusa.dictionary)

# # Glove(Self Trained)
# avgglove = []
# for x in corpus:
#     avgglove.append(np.mean([glove.word_vectors[glove.dictionary[token]] for token in x ],axis=0))
# print(f"Model: Glove(Self Trained)  \nMRR: {mrr_and_ndcg.MRR(avgglove,glove,labels)}")
# print(f"Model: Glove(Self Trained)  \nNDCG: {mrr_and_ndcg.NDCG(avgglove ,glove,labels)}")

In [31]:
##### LaBSE #####
# Encode the corpus with the LaBSE sentence-transformer checkpoint.
labse = SentenceTransformer('sentence-transformers/LaBSE')
labse_embeddings = labse.encode(corp)

labse_mrr = mrr_and_ndcg.MRR(labse_embeddings, labse, labels)
print(f"Model: LaBSE \nMRR: {labse_mrr}")
labse_ndcg = mrr_and_ndcg.NDCG(labse_embeddings, labse, labels)
print(f"Model: LaBSE \nNDCG: {labse_ndcg}")

100%|██████████| 2966/2966 [01:19<00:00, 37.32it/s]


Model: LaBSE 
MRR: 0.72


100%|██████████| 2966/2966 [01:40<00:00, 29.45it/s]

Model: LaBSE 
NDCG: 0.77





In [36]:
####### Universal Sentence Encoder (USE) #######
import tensorflow_hub as hub

# Load the encoder from TF Hub and embed the corpus by calling it directly.
url = "https://tfhub.dev/google/universal-sentence-encoder/4"
use = hub.load(url)
usi_embedd = use(corp)

use_mrr = mrr_and_ndcg.MRR(usi_embedd, use, labels)
print(f"Model: USE \nMRR: {use_mrr}")
use_ndcg = mrr_and_ndcg.NDCG(usi_embedd, use, labels)
print(f"Model: USE \nNDCG: {use_ndcg}")





100%|██████████| 2966/2966 [01:44<00:00, 28.51it/s]


Model: USE 
MRR: 0.722


100%|██████████| 2966/2966 [01:28<00:00, 33.40it/s]

Model: USE 
NDCG: 0.771





In [37]:
####### SBERT #######
# Encode the corpus with the paraphrase-MiniLM-L6-v2 sentence-transformer checkpoint.
sbert = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
s_embeddings = sbert.encode(corp)

sbert_mrr = mrr_and_ndcg.MRR(s_embeddings, sbert, labels)
print(f"Model: SBERT \nMRR: {sbert_mrr}")
sbert_ndcg = mrr_and_ndcg.NDCG(s_embeddings, sbert, labels)
print(f"Model: SBERT \nNDCG: {sbert_ndcg}")

100%|██████████| 2966/2966 [01:16<00:00, 38.59it/s]


Model: SBERT 
MRR: 0.731


100%|██████████| 2966/2966 [01:43<00:00, 28.61it/s]

Model: SBERT 
NDCG: 0.77





In [38]:
# S-RoBERTa
# Encode the corpus with the all-distilroberta-v1 sentence-transformer checkpoint.
sroberta = SentenceTransformer('sentence-transformers/all-distilroberta-v1')
sroberta_emb = sroberta.encode(corp)

sroberta_mrr = mrr_and_ndcg.MRR(sroberta_emb, sroberta, labels)
print(f"Model: S-RoBERTa \nMRR: {sroberta_mrr}")
sroberta_ndcg = mrr_and_ndcg.NDCG(sroberta_emb, sroberta, labels)
print(f"Model: S-RoBERTa \nNDCG: {sroberta_ndcg}")

100%|██████████| 2966/2966 [01:08<00:00, 43.46it/s]


Model: S-RoBERTa 
MRR: 0.738


100%|██████████| 2966/2966 [01:28<00:00, 33.34it/s]

Model: S-RoBERTa 
NDCG: 0.771



