In [None]:
#install sentence transformers
!pip install sentence_transformers

In [None]:
import io
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from statistics import mean
from google.colab import files

In [None]:
# Index pairs whose cosine similarity is below this value are collected by
# the threshold cells further down.
COSINE_SIMILARITY_THRESHOLD = 0.7

# Sentence-embedding model used by the scoring cells below.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# IAM test dataset: prompt for 'testdataset.txt' through the Colab upload
# dialog and parse it as a header-less, tab-separated table.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['testdataset.txt']), header=None, sep='\t')
df.columns = ['labels', 'text_a', 'text_b', 'id', 'stance_labels']

# text_a is kept as the original topic sentences, text_b as the original claims.
original_topic_sentences = df['text_a'].tolist()
original_claim_sentences = df['text_b'].tolist()

Saving testdataset.txt to testdataset.txt


## **Synonym**

In [None]:
# Synonym dataset: upload 'synonym final.txt' and read it as a header-less TSV.
# The file is labelled with four columns; the last one is named 'nan'.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['synonym final.txt']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels', 'nan']

# The first column is kept as the synonym-perturbed sentences.
synonym_peturbed_sentences = df['topic_sentences'].tolist()

Saving synonym final.txt to synonym final.txt


In [None]:
# Original topic sentences vs. their synonym-perturbed versions, paired by row.
sentences1, sentences2 = original_topic_sentences, synonym_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
synonym_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Highest, lowest and mean synonym similarity score, one value per line.
print(max(synonym_scores), min(synonym_scores), mean(synonym_scores), sep='\n')


0.9959730505943298
0.7967403531074524
0.9427753845428796


In [None]:
# Positions (not score values) of the synonym pairs whose similarity is
# below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(synonym_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]

0


## **Location Scores**

In [None]:
# Location section, perturbed side: upload 'location final.txt' and read it
# as a header-less TSV with three labelled columns.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['location final.txt']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The text_a column is kept as the location-perturbed sentences.
location_peturbed_sentences = df['text_a'].tolist()

Saving location final.txt to location final.txt


In [None]:
# Location section, original side: upload 'location original.txt', a
# header-less file read as a single labelled column.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['location original.txt']), header=None, sep='\t')
df.columns = ['original_claims']

location_original_sentences = df['original_claims'].tolist()

Saving location original.txt to location original.txt


In [None]:
# Original sentences vs. their location-perturbed versions, paired by row.
sentences1, sentences2 = location_original_sentences, location_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
location_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Highest, lowest and mean location similarity score, one value per line.
print(max(location_scores), min(location_scores), mean(location_scores), sep='\n')

1.000000238418579
0.2596525251865387
0.8562364441955962


In [None]:
# Positions (not score values) of the location pairs whose similarity is
# below COSINE_SIMILARITY_THRESHOLD.
# The results are collected in `scores_under_threshold`, the same list that is
# printed; appending to a differently named list would raise NameError on the
# first low-scoring pair.
scores_under_threshold = [
    idx
    for idx, value in enumerate(location_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]
print(scores_under_threshold)

[]
32


## **French Scores**

In [None]:
# French back-translation, perturbed side: upload
# 'back-translation-french.tsv' and read it as a header-less TSV.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['back-translation-french.tsv']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The text_a column is kept as the back-translated sentences.
french_peturbed_sentences = df['text_a'].tolist()

Saving back-translation-french.tsv to back-translation-french.tsv


In [None]:
# French back-translation, original side: upload
# 'Translate original sentences.txt', read as a single labelled column.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['Translate original sentences.txt']), header=None, sep='\t')
df.columns = ['claim_sentences']

french_orginal_sentences = df['claim_sentences'].tolist()

Saving Translate original sentences.txt to Translate original sentences.txt


In [None]:
# Original sentences vs. their French back-translations, paired by row.
sentences1, sentences2 = french_orginal_sentences, french_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
french_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Positions (not score values) of the French back-translation pairs whose
# similarity is below COSINE_SIMILARITY_THRESHOLD.
# Iterates over french_scores itself, so the scan covers exactly the French
# results regardless of how many location pairs were scored earlier.
scores_under_threshold = [
    idx
    for idx, value in enumerate(french_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]
print(scores_under_threshold)

In [None]:
# Highest, lowest and mean French back-translation similarity score,
# one value per line.
print(max(french_scores), min(french_scores), mean(french_scores), sep='\n')

1.0000008344650269
-0.026197806000709534
0.924188569540845


## **Spanish Scores**

In [None]:
# Spanish back-translation: upload 'back-translation-spanish.tsv' and read it
# as a header-less TSV with three labelled columns.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['back-translation-spanish.tsv']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The text_a column is kept as the back-translated sentences.
spanish_peturbed_sentences = df['text_a'].tolist()

Saving back-translation-spanish.tsv to back-translation-spanish.tsv


In [None]:
# Original claim sentences vs. their Spanish back-translations, paired by row.
sentences1, sentences2 = original_claim_sentences, spanish_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
spanish_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Highest, lowest and mean Spanish back-translation similarity score,
# one value per line.
print(max(spanish_scores), min(spanish_scores), mean(spanish_scores), sep='\n')

1.0000009536743164
0.010606251657009125
0.9414451527440234


In [None]:
# Positions (not score values) of the Spanish back-translation pairs whose
# similarity is below COSINE_SIMILARITY_THRESHOLD.
# Iterates over spanish_scores itself, so the scan covers exactly the Spanish
# results regardless of how many location pairs were scored earlier.
scores_under_threshold = [
    idx
    for idx, value in enumerate(spanish_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]

## **Paraphrase Scores**


In [None]:
# Paraphrase dataset: upload 'paraphrased new.txt' and read it as a
# header-less TSV with three labelled columns.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['paraphrased new.txt']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The text_a column is kept as the paraphrased sentences.
paraphrased_sentences = df['text_a'].tolist()


In [None]:
# Original claim sentences vs. their paraphrases, paired by row.
sentences1, sentences2 = original_claim_sentences, paraphrased_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
paraphrase_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Standard deviation, highest, lowest and mean paraphrase similarity score,
# one value per line (in that order).
print(
    np.std(paraphrase_scores),
    max(paraphrase_scores),
    min(paraphrase_scores),
    mean(paraphrase_scores),
    sep='\n',
)

0.12418235405547658
1.0000008344650269
-0.00454594474285841
0.8840257031560967


In [None]:
# Positions (not score values) of the paraphrase pairs whose similarity is
# below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(paraphrase_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]


## **Deletion**

In [None]:
# Character-deletion dataset: upload 'topic-char-deletion.tsv' and read it as
# a header-less TSV with three labelled columns.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['topic-char-deletion.tsv']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The first column is kept as the deletion-perturbed sentences.
deletion_peturbed_sentences = df['topic_sentences'].tolist()

Saving topic-char-deletion.tsv to topic-char-deletion.tsv


In [None]:
# Original topic sentences vs. their deletion-perturbed versions, paired by row.
sentences1, sentences2 = original_topic_sentences, deletion_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
deletion_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Highest, lowest and mean deletion similarity score, one value per line.
print(max(deletion_scores), min(deletion_scores), mean(deletion_scores), sep='\n')

0.9955686330795288
0.34239718317985535
0.8627336754920376


In [None]:
# Positions (not score values) of the deletion pairs whose similarity is
# below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(deletion_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]


## Insertion

In [None]:
# Character-insertion dataset: upload 'topic-char-insertion.tsv' and read it
# as a header-less TSV with three labelled columns.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['topic-char-insertion.tsv']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The first column is kept as the insertion-perturbed sentences.
insertion_peturbed_sentences = df['topic_sentences'].tolist()

Saving topic-char-insertion.tsv to topic-char-insertion.tsv


In [None]:
# Original topic sentences vs. their insertion-perturbed versions, paired by row.
sentences1, sentences2 = original_topic_sentences, insertion_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
insertion_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Highest, lowest and mean insertion similarity score, one value per line.
print(max(insertion_scores), min(insertion_scores), mean(insertion_scores), sep='\n')

0.9966719746589661
-0.04158793017268181
0.838430548979192


In [None]:
# Positions (not score values) of the insertion pairs whose similarity is
# below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(insertion_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]

## Contraction

In [None]:
# Contraction section, perturbed side: upload 'contractions final.txt' and
# read it as a header-less TSV with three labelled columns.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['contractions final.txt']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The text_a column is kept as the contraction-perturbed sentences.
contraction_peturbed_sentences = df['text_a'].tolist()

Saving contractions final.txt to contractions final.txt


In [None]:
# Contraction section, original side: upload 'contract original.txt', read as
# a single labelled column.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['contract original.txt']), header=None, sep='\t')
df.columns = ['claim_sentences']

contraction_original_sentences = df['claim_sentences'].tolist()

Saving contract original.txt to contract original.txt


In [None]:
# Original sentences vs. their contraction-perturbed versions, paired by row.
sentences1, sentences2 = contraction_original_sentences, contraction_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
contraction_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Highest, lowest and mean contraction similarity score, one value per line.
print(max(contraction_scores), min(contraction_scores), mean(contraction_scores), sep='\n')

0.9998019933700562
0.9157149791717529
0.9942047954996783


In [None]:
# Positions (not score values) of the contraction pairs whose similarity is
# below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(contraction_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]

[]
0


## Number

In [None]:
# Number section, perturbed side: upload 'number final.txt' and read it as a
# header-less TSV with three labelled columns.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['number final.txt']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The text_a column is kept as the number-perturbed sentences.
number_peturbed_sentences = df['text_a'].tolist()

Saving number final.txt to number final.txt


In [None]:
# Number section, original side: upload 'number original.txt', read as a
# single labelled column.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['number original.txt']), header=None, sep='\t')
df.columns = ['claim_sentences']

number_original_sentences = df['claim_sentences'].tolist()

Saving number original.txt to number original.txt


In [None]:
# Original sentences vs. their number-perturbed versions, paired by row.
sentences1, sentences2 = number_original_sentences, number_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
number_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Highest, lowest and mean number-perturbation similarity score,
# one value per line.
print(max(number_scores), min(number_scores), mean(number_scores), sep='\n')

1.0000001192092896
0.169110506772995
0.9548028346465469


In [None]:
# Positions (not score values) of the number-perturbation pairs whose
# similarity is below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(number_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]

## Swapping


In [None]:
# Character-swapping section, perturbed side: upload
# 'claim_sentence-char-swapping.tsv' and read it as a header-less TSV.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['claim_sentence-char-swapping.tsv']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The text_a column is kept as the swap-perturbed sentences.
swap_peturbed_sentences = df['text_a'].tolist()

Saving claim_sentence-char-swapping.tsv to claim_sentence-char-swapping.tsv


In [None]:
# Character-swapping section, original side: upload 'swap_original.tsv', read
# as a single labelled column.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['swap_original.tsv']), header=None, sep='\t')
df.columns = ['claim_sentences']

swap_original_sentences = df['claim_sentences'].tolist()

Saving swap_original.tsv to swap_original.tsv


In [None]:
# Original sentences vs. their swap-perturbed versions, paired by row.
sentences1, sentences2 = swap_original_sentences, swap_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
swapping_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Highest, lowest and mean swapping similarity score, one value per line.
print(max(swapping_scores), min(swapping_scores), mean(swapping_scores), sep='\n')

1.0000005960464478
0.06563708931207657
0.9070992866313161


In [None]:
# Positions (not score values) of the swapping pairs whose similarity is
# below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(swapping_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]

## Repetition

In [None]:
# Character-repetition section, perturbed side: upload
# 'claim-sentence-char-repetition.tsv' and read it as a header-less TSV.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['claim-sentence-char-repetition.tsv']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The text_a column is kept as the repetition-perturbed sentences.
repetition_peturbed_sentences = df['text_a'].tolist()

Saving claim-sentence-char-repetition.tsv to claim-sentence-char-repetition.tsv


In [None]:
# Character-repetition section, original side: upload
# 'repetition_original.tsv', read as a single labelled column.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['repetition_original.tsv']), header=None, sep='\t')
df.columns = ['claim_sentences']

repetition_original_sentences = df['claim_sentences'].tolist()

Saving repetition_original.tsv to repetition_original.tsv


In [None]:
# Original sentences vs. their repetition-perturbed versions, paired by row.
sentences1, sentences2 = repetition_original_sentences, repetition_peturbed_sentences

# Encode both lists to embedding tensors (sentences1 first, then sentences2).
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (sentences1, sentences2)
)

# cos_sim scores every sentences1/sentences2 combination; only the diagonal
# entries [i][i] (same-row pairs) are kept, converted with .item().
cosine_scores = util.cos_sim(embeddings1, embeddings2)
repetition_scores = [cosine_scores[idx][idx].item() for idx in range(len(sentences1))]

In [None]:
# Highest, lowest and mean repetition similarity score, one value per line.
print(max(repetition_scores), min(repetition_scores), mean(repetition_scores), sep='\n')

0.9998999238014221
0.14464090764522552
0.926444690083081


In [None]:
# Positions (not score values) of the repetition pairs whose similarity is
# below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(repetition_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]

## Synonym and Repetition

In [None]:
# Combined synonym + repetition dataset: upload 'synonymandrepetition.tsv'
# and read it as a header-less TSV with three labelled columns.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['synonymandrepetition.tsv']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The first column is kept as the perturbed sentences.
synonym_and_rep_peturbed_sentences = df['topic_sentences'].tolist()

Saving synonymandrepetition.tsv to synonymandrepetition.tsv


In [None]:
# Original topic sentences vs. their synonym + repetition versions, paired
# by row.
sentences1 = original_topic_sentences
sentences2 = synonym_and_rep_peturbed_sentences

#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

#Compute cosine-similarities (every sentences1/sentences2 combination)
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Score every aligned pair. A bound of `len(sentences1) - 1` always drops the
# final pair; bounding by the shorter list keeps it when the lists match and
# still never indexes past the matrix when one file has fewer rows.
# NOTE(review): if the two lists differ in length, confirm the rows are still
# aligned one-to-one before trusting these scores.
pair_count = min(len(sentences1), len(sentences2))
synandrep_scores = [cosine_scores[i][i].item() for i in range(pair_count)]

In [None]:
# Highest, lowest and mean synonym + repetition similarity score,
# one value per line.
print(max(synandrep_scores), min(synandrep_scores), mean(synandrep_scores), sep='\n')

0.9936739802360535
-0.010243510827422142
0.8222818193742112


In [None]:
# Positions (not score values) of the synonym + repetition pairs whose
# similarity is below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(synandrep_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]

## French and insertion

In [None]:
# French + insertion section, perturbed side: upload
# 'paraphrasedandinsertion.tsv' and read it as a header-less TSV.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['paraphrasedandinsertion.tsv']), header=None, sep='\t')
df.columns = ['topic_sentences', 'text_a', 'stance_labels']

# The text_a column is kept as the perturbed sentences.
french_and_insert_peturbed_sentences = df['text_a'].tolist()

Saving paraphrasedandinsertion.tsv to paraphrasedandinsertion.tsv


In [None]:
# French + insertion section, original side: upload
# 'frenchparaandinsert_original.txt', read as a single labelled column.
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['frenchparaandinsert_original.txt']), header=None, sep='\t')
df.columns = ['text_a']

french_and_insert_original_sentences = df['text_a'].tolist()

Saving frenchparaandinsert_original.txt to frenchparaandinsert_original.txt


In [None]:
# Original sentences vs. their French + insertion versions, paired by row.
sentences1 = french_and_insert_original_sentences
sentences2 = french_and_insert_peturbed_sentences

#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

#Compute cosine-similarities (every sentences1/sentences2 combination)
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Score every aligned pair. A bound of `len(sentences1) - 1` always drops the
# final pair; bounding by the shorter list keeps it when the lists match and
# still never indexes past the matrix when one file has fewer rows.
# NOTE(review): if the two lists differ in length, confirm the rows are still
# aligned one-to-one before trusting these scores.
pair_count = min(len(sentences1), len(sentences2))
frenchandinsert_scores = [cosine_scores[i][i].item() for i in range(pair_count)]

In [None]:
# Highest, lowest and mean French + insertion similarity score,
# one value per line.
print(
    max(frenchandinsert_scores),
    min(frenchandinsert_scores),
    mean(frenchandinsert_scores),
    sep='\n',
)

0.9997437596321106
0.18691399693489075
0.9088459049092324


In [None]:
# Positions (not score values) of the French + insertion pairs whose
# similarity is below COSINE_SIMILARITY_THRESHOLD.
scores_under_threshold = [
    idx
    for idx, value in enumerate(frenchandinsert_scores)
    if value < COSINE_SIMILARITY_THRESHOLD
]