In [1]:
! pip install sentence_transformers



In [2]:
import urllib
import urllib.request

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model shared by the cells below (get_similarity calls
# model.encode on each sentence pair).
# NOTE(review): presumably this fetches the pretrained 'bert-base-nli-mean-tokens'
# weights on first use -- confirm before running offline.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [3]:
# Extract data for the STS tasks
def extract_test_data(X_train_path, category):
  """Download one STS input file and return its sentence pairs as a DataFrame.

  Each line of the file is decoded as UTF-8, stripped of surrounding
  whitespace and split on tabs; the first two fields form the sentence pair
  and any further fields are ignored.

  Args:
    X_train_path: URL of the tab-separated input file (anything accepted by
      urllib.request.urlopen).
    category: Label stored in the 'category' column of every returned row.

  Returns:
    A DataFrame with the columns 'sent1', 'sent2', 'score' and 'category',
    one row per input line. 'score' is a 0.0 placeholder to be filled later.

  Raises:
    IndexError: If a line has fewer than two tab-separated fields.
  """
  columns = ['sent1', 'sent2', 'score', 'category']
  records = []

  # The context manager closes the response even if parsing a line fails.
  with urllib.request.urlopen(X_train_path) as response_X:
    for line_X in response_X:
      # Decode and split each line once instead of once per column.
      fields = line_X.decode("utf-8").strip().split('\t')
      records.append({'sent1': fields[0],
                      'sent2': fields[1],
                      'score': 0.0,
                      'category': category})

  # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
  # copied the whole frame on every call, which made the loop quadratic.
  return pd.DataFrame(records, columns=columns)
  
  
# Input file for each STS 2016 category, listed in category-id order (0-4):
# answer-answer, headlines, plagiarism, postediting, question-question.
sts2016_input_urls = [
    'https://raw.githubusercontent.com/nee2shaji/STS-Task/main/data/STS2016.input.answer-answer.txt',
    'https://raw.githubusercontent.com/nee2shaji/STS-Task/main/data/STS2016.input.headlines.txt',
    'https://raw.githubusercontent.com/nee2shaji/STS-Task/main/data/STS2016.input.plagiarism.txt',
    'https://raw.githubusercontent.com/nee2shaji/STS-Task/main/data/STS2016.input.postediting.txt',
    'https://raw.githubusercontent.com/nee2shaji/STS-Task/main/data/STS2016.input.question-question.txt',
]

# Load one frame per category and concatenate them once. DataFrame.append was
# removed in pandas 2.0; ignore_index=True yields the same fresh 0..N-1 index
# that the previous reset_index(drop=True) produced.
test_df = pd.concat(
    [extract_test_data(url, category)
     for category, url in enumerate(sts2016_input_urls)],
    ignore_index=True,
)

test_df

Unnamed: 0,sent1,sent2,score,category
0,Tasting it is the only reliable way.,The way you have it is fine.,0.0,0
1,I think it probably depends on your money.,It depends on your country.,0.0,0
2,You need to read a lot to know what you like a...,You don't have to know.,0.0,0
3,"Obviously, the best book for you depends a lot...",The answer will depend of course on what you'r...,0.0,0
4,I've had this same problem.,I had the same problem as you.,0.0,0
...,...,...,...,...
9178,what is the difference between Erebor and Moria?,What is the difference between sortition and d...,0.0,4
9179,What window part do I need for this window?,Do I need a header for an egress window?,0.0,4
9180,How to test if a toilet is leaking?,How can I test if a dehydrator is getting to t...,0.0,4
9181,How do I test the functionality of a sump pump?,Do I really need a redundant sump pump?,0.0,4


In [4]:
# Get the embeddings of the sentences and compare them using cosine similarity
def get_similarity(row):
  """Score one sentence pair by the cosine similarity of its embeddings.

  Args:
    row: A row holding 'sent1' and 'sent2' entries, as handed over by
      DataFrame.apply(..., axis=1).

  Returns:
    A copy of `row` whose 'score' entry is the cosine similarity between the
    embeddings that the module-level `model` produces for the two sentences.
  """
  # Work on a copy: pandas does not support functions that mutate the object
  # passed to them by apply.
  scored_row = row.copy()

  # Encode both sentences in a single call, passing a plain list of strings.
  sentence_embeddings = model.encode([row['sent1'], row['sent2']])

  # cosine_similarity takes 2-D inputs and returns a 1x1 matrix here.
  scored_row['score'] = cosine_similarity([sentence_embeddings[0]], [sentence_embeddings[1]])[0][0]
  return scored_row

# Run get_similarity on every row (axis=1) and rebind test_df to the scored frame.
# NOTE(review): this issues one model.encode call per row; encoding in batches
# may be considerably faster -- confirm before changing.
test_df = test_df.apply(get_similarity, axis=1)

In [5]:
# Display the frame after the 'score' column has been filled in.
test_df

Unnamed: 0,sent1,sent2,score,category
0,Tasting it is the only reliable way.,The way you have it is fine.,0.638406,0
1,I think it probably depends on your money.,It depends on your country.,0.699683,0
2,You need to read a lot to know what you like a...,You don't have to know.,0.197526,0
3,"Obviously, the best book for you depends a lot...",The answer will depend of course on what you'r...,0.569982,0
4,I've had this same problem.,I had the same problem as you.,0.915018,0
...,...,...,...,...
9178,what is the difference between Erebor and Moria?,What is the difference between sortition and d...,0.885300,4
9179,What window part do I need for this window?,Do I need a header for an egress window?,0.716262,4
9180,How to test if a toilet is leaking?,How can I test if a dehydrator is getting to t...,0.696114,4
9181,How do I test the functionality of a sump pump?,Do I really need a redundant sump pump?,0.710817,4


In [6]:
# STS 2016 is evaluated separately per category, so the scores of each
# category are written to their own output file (one score per line).
output_file_by_category = {
    0: 'STS2016_output_answer-answer.txt',
    1: 'STS2016_output_headlines.txt',
    2: 'STS2016_output_plagiarism.txt',
    3: 'STS2016_output_postediting.txt',
    4: 'STS2016_output_question-question.txt',
}

for category_id, output_file in output_file_by_category.items():
  category_scores = test_df[test_df['category'] == category_id]['score']
  category_scores.to_csv(output_file, sep=' ', index=False, header=False)