# Extracting and Cleaning

In [None]:
# Installing packages

!pip install pdfplumber
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install sentence-transformers

In [None]:
# Importing libraries

import pdfplumber
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer

In [None]:
# Saving report locally

import locale
# Replacement for `locale.getpreferredencoding` that always reports UTF-8.
# NOTE(review): presumably a workaround so the `!` shell commands below do not
# fail on a runtime whose locale is not UTF-8 — confirm it is still needed.
def getpreferredencoding(do_setlocale = True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    return "UTF-8"
# Monkey-patch the stdlib function for the rest of this kernel session.
locale.getpreferredencoding = getpreferredencoding

!curl "https://www.statkraft.com/globalassets/0/.com/0-start-page/2021-annual-report_sustainability-chapter.pdf" > sustainability_report.pdf
!pdfplumber < sustainability_report.pdf > sustainability_report.csv

!curl "https://raw.githubusercontent.com/gabriellecastilho/mba_tcc/f6217a3922fb0641f6c8aec3933d546f97bb7976/SDG%20Goal%204%20Clean.pdf" > sdg_report.pdf
!pdfplumber < sdg_report.pdf > sdg_report.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 15.7M  100 15.7M    0     0  10.7M      0  0:00:01  0:00:01 --:--:-- 10.7M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 29654  100 29654    0     0  83532      0 --:--:-- --:--:-- --:--:-- 83532


In [None]:
# Extracting report text

# Keyword arguments shared by every full-page `extract_text` call below.
TEXT_KWARGS = dict(x_tolerance=3, y_tolerance=3, layout=False, x_density=7.25, y_density=13)

# Fraction of the page width at which a page is split into its two columns.
COLUMN_SPLIT = 0.5


def normalize_text(text):
    """Lower-case `text` and collapse newlines / whitespace runs into single spaces.

    `None` (which some pdfplumber versions return for a page or crop without
    text) is treated as an empty string instead of raising.
    """
    return " ".join((text or "").lower().split())


def extract_page_text(page):
    """Return the normalised text of `page`, read across the full page width."""
    return normalize_text(page.extract_text(**TEXT_KWARGS))


def extract_two_column_text(page, split=COLUMN_SPLIT):
    """Return the normalised text of `page`, reading the left column before the right.

    The page is cropped into a left and a right half at `split` (a fraction of
    the page width); each half is extracted separately and the two results are
    joined with a space.
    """
    width = float(page.width)
    height = float(page.height)

    # pdfplumber bounding boxes are (x0, top, x1, bottom), measured from the
    # top-left corner of the page.
    left_bbox = (0.0, 0.0, split * width, height)
    right_bbox = (split * width, 0.0, width, height)

    left_text = page.crop(bbox=left_bbox).extract_text() or ""
    right_text = page.crop(bbox=right_bbox).extract_text() or ""
    return normalize_text(" ".join([left_text, right_text]))


with pdfplumber.open("sustainability_report.pdf") as pdf:
    # Naive full-width extraction: lines from the two columns end up
    # interleaved (compare the printed `page_three` with `page_three_correct`).
    page_three = extract_page_text(pdf.pages[2])
    page_four = extract_page_text(pdf.pages[3])

    # Column-aware extraction of every page.
    sus_content = [extract_two_column_text(page) for page in pdf.pages]

sus_rep = " ".join(sus_content)

# Extracting goal text

with pdfplumber.open("sdg_report.pdf") as pdf:
    sdg_content = [extract_page_text(page) for page in pdf.pages]

sdg = " ".join(sdg_content)

page_three_correct = sus_content[2]
page_four_correct = sus_content[3]

print(page_three)
print(page_three_correct)
print(sus_rep)
print(sdg)

41 sustainability how we manage sustainability statkraft aims to be a leading renewables company by 2025. a key activities, including acquisition and construction projects. the clear business strategy has been developed to achieve this. one code of conduct applies to our employees and all the companies of the enablers of the strategy is the way in which statkraft in the statkraft group. when it comes to our business partners operates as a company. this is reflected in the company’s and suppliers, they are expected to adhere to our supplier code commitment to sustainability and responsible business practices. of conduct. through its activities, statkraft aims to create value for society, the we have a system for registration and follow-up of non- environment and the company. compliance with external and internal requirements. it facilitates at statkraft, we recognise the importance of businesses in handling of cases, analysis of incidents, identification of contributing to the realisati

In [None]:
# Deleting punctuation

# Raw string: the literal contains a backslash followed by a comma, which is an
# invalid escape sequence in a normal string literal. The character set itself
# is unchanged.
punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

# Translation table that deletes every character listed in `punc`.
_PUNC_TABLE = str.maketrans("", "", punc)


def remove_punctuation(text):
    """Return `text` with every character listed in `punc` removed.

    NOTE(review): the typographic apostrophe (’) is not in `punc`, so it is
    kept (e.g. "company’s"), and removing "-" fuses hyphenated words
    (e.g. "follow-up" becomes "followup") — confirm this is intended.
    """
    return text.translate(_PUNC_TABLE)


page_three = remove_punctuation(page_three)
page_four = remove_punctuation(page_four)
page_three_correct = remove_punctuation(page_three_correct)
page_four_correct = remove_punctuation(page_four_correct)
sus_rep = remove_punctuation(sus_rep)
sdg = remove_punctuation(sdg)

print(page_three)
print(page_three_correct)
print(sus_rep)
print(sdg)

41 sustainability how we manage sustainability statkraft aims to be a leading renewables company by 2025 a key activities including acquisition and construction projects the clear business strategy has been developed to achieve this one code of conduct applies to our employees and all the companies of the enablers of the strategy is the way in which statkraft in the statkraft group when it comes to our business partners operates as a company this is reflected in the company’s and suppliers they are expected to adhere to our supplier code commitment to sustainability and responsible business practices of conduct through its activities statkraft aims to create value for society the we have a system for registration and followup of non environment and the company compliance with external and internal requirements it facilitates at statkraft we recognise the importance of businesses in handling of cases analysis of incidents identification of contributing to the realisation of the un susta

In [None]:
# Lemmatize words

# Small English spaCy pipeline; the lemmatizer component is fetched by name.
nlp = spacy.load("en_core_web_sm")
lemmatizer = nlp.get_pipe("lemmatizer")

# Run the pipeline over the two naively-extracted pages.
doc_three, doc_four = (nlp(page_text) for page_text in (page_three, page_four))

# One lemma string per token, in document order.
page_three_lem = [tok.lemma_ for tok in doc_three]
page_four_lem = [tok.lemma_ for tok in doc_four]

print(page_three_lem)

# TODO: stop-words are not removed yet — don't forget to remove them.

['41', 'sustainability', 'how', 'we', 'manage', 'sustainability', 'statkraft', 'aim', 'to', 'be', 'a', 'lead', 'renewable', 'company', 'by', '2025', 'a', 'key', 'activity', 'include', 'acquisition', 'and', 'construction', 'project', 'the', 'clear', 'business', 'strategy', 'have', 'be', 'develop', 'to', 'achieve', 'this', 'one', 'code', 'of', 'conduct', 'apply', 'to', 'our', 'employee', 'and', 'all', 'the', 'company', 'of', 'the', 'enabler', 'of', 'the', 'strategy', 'be', 'the', 'way', 'in', 'which', 'statkraft', 'in', 'the', 'statkraft', 'group', 'when', 'it', 'come', 'to', 'our', 'business', 'partner', 'operate', 'as', 'a', 'company', 'this', 'be', 'reflect', 'in', 'the', 'company', '’s', 'and', 'supplier', 'they', 'be', 'expect', 'to', 'adhere', 'to', 'our', 'supplier', 'code', 'commitment', 'to', 'sustainability', 'and', 'responsible', 'business', 'practice', 'of', 'conduct', 'through', 'its', 'activity', 'statkraft', 'aim', 'to', 'create', 'value', 'for', 'society', 'the', 'we', 'h

# Bag of Words - TF-IDF

Bag of Words is a collection of classical methods to extract features from texts and convert them into numeric embedding vectors. We then compare these embedding vectors by computing the cosine similarity between them. There are two popular ways of using the bag of words approach: Count Vectorizer and TFIDF Vectorizer.

**Count Vectorizer**

This algorithm maps each unique word in the entire text corpus to a unique vector index. The vector values for each document are the number of times each specific word appears in that text. Thus, the vector can consist of integer values, including 0, which indicates that the word does not appear in the text. While Count Vectorizer is simple to understand and implement, its main drawback is that it treats all words as equally important, irrespective of their actual importance.

**TFIDF Vectorizer**

To overcome the drawback of the Count Vectorizer, we can use the TFIDF vectorizer. This algorithm also maps each unique word in the entire text corpus to a unique vector index. But instead of a simple count, the values of the vector for each document are the product of two values: Term Frequency (TF) and Inverse Document Frequency (IDF).

In [None]:
# TF-IDF: naive full-width extraction, page_three vs page_four

corpus = [page_three, page_four]
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(corpus)
# TfidfVectorizer L2-normalises each row by default (norm="l2"), so the product
# of the matrix with its transpose is the pairwise cosine similarity.
# `@` is always matrix multiplication; `*` only means that for the legacy
# scipy sparse-matrix type and is element-wise for arrays.
pairwise_similarity = tfidf @ tfidf.T
pairwise_similarity.toarray()

array([[1.        , 0.29367253],
       [0.29367253, 1.        ]])

In [None]:
# TF-IDF: column-aware extraction, page_three_correct vs page_four_correct

corpus = [page_three_correct, page_four_correct]
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(corpus)
# TfidfVectorizer L2-normalises each row by default (norm="l2"), so the product
# of the matrix with its transpose is the pairwise cosine similarity.
# `@` is always matrix multiplication; `*` only means that for the legacy
# scipy sparse-matrix type and is element-wise for arrays.
pairwise_similarity = tfidf @ tfidf.T
pairwise_similarity.toarray()

array([[1.        , 0.28899387],
       [0.28899387, 1.        ]])

In [None]:
# TF-IDF: naive full-width extraction of page_three vs the SDG goal text

corpus = [page_three, sdg]
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(corpus)
# TfidfVectorizer L2-normalises each row by default (norm="l2"), so the product
# of the matrix with its transpose is the pairwise cosine similarity.
# `@` is always matrix multiplication; `*` only means that for the legacy
# scipy sparse-matrix type and is element-wise for arrays.
pairwise_similarity = tfidf @ tfidf.T
pairwise_similarity.toarray()

array([[1.        , 0.03029439],
       [0.03029439, 1.        ]])

In [None]:
# TF-IDF: column-aware extraction of page_three_correct vs the SDG goal text

corpus = [page_three_correct, sdg]
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(corpus)
# TfidfVectorizer L2-normalises each row by default (norm="l2"), so the product
# of the matrix with its transpose is the pairwise cosine similarity.
# `@` is always matrix multiplication; `*` only means that for the legacy
# scipy sparse-matrix type and is element-wise for arrays.
pairwise_similarity = tfidf @ tfidf.T
pairwise_similarity.toarray()

array([[1.        , 0.03017776],
       [0.03017776, 1.        ]])

# Universal Sentence Encoder - TensorFlow Cosine Similarity

In USE, researchers at Google first pre-trained a Transformer-based model on multi-task objectives and then used it for Transfer Learning. To calculate the textual similarity, we first use the pre-trained USE model to compute the contextual word embeddings for each word in the sentence. We then compute the sentence embedding by performing the element-wise sum of all the word vectors and dividing by the square root of the length of the sentence to normalize the sentence lengths. Once we have the USE embeddings for each text, we can calculate the cosine similarity between them using scikit-learn's `cosine_similarity`. The researchers have open-sourced the pre-trained model on TensorFlow Hub, which we’ll use directly.

In [None]:
# TensorFlow (Universal Sentence Encoder) cosine similarity

# Load the pre-trained model
module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model_flow = hub.load(module_url)

# Generate embeddings: each distinct text is embedded exactly once and the
# result is reused wherever that text appears in a comparison.
use_emb = {
    name: model_flow([text]).numpy()
    for name, text in {
        "page_three": page_three,
        "page_four": page_four,
        "page_three_correct": page_three_correct,
        "page_four_correct": page_four_correct,
        "sdg": sdg,
    }.items()
}

# Pair 1-2: naive extraction, page three vs page four.
sentence1_emb = use_emb["page_three"]
sentence2_emb = use_emb["page_four"]

# Pair 3-4: column-aware extraction, page three vs page four.
sentence3_emb = use_emb["page_three_correct"]
sentence4_emb = use_emb["page_four_correct"]

# Pair 5-6: naive page three vs the SDG goal text.
sentence5_emb = use_emb["page_three"]
sentence6_emb = use_emb["sdg"]

# Pair 7-8: column-aware page three vs the SDG goal text.
sentence7_emb = use_emb["page_three_correct"]
sentence8_emb = use_emb["sdg"]

cos_sim12 = cosine_similarity(sentence1_emb, sentence2_emb)
cos_sim34 = cosine_similarity(sentence3_emb, sentence4_emb)
cos_sim56 = cosine_similarity(sentence5_emb, sentence6_emb)
cos_sim78 = cosine_similarity(sentence7_emb, sentence8_emb)

# Cosine Similarity
print(cos_sim12)
print(cos_sim34)
print(cos_sim56)
print(cos_sim78)

[[0.6653944]]
[[0.6645857]]
[[0.35602468]]
[[0.3558625]]


# Sentence Transformer - CrossEncoder [NOT USED]

The advent of the Bidirectional Encoder Representations from Transformers (BERT) model in 2018 ushered in a new era in NLP by beating several benchmarks. Over time, researchers continued to improve over the vanilla BERT model resulting in several notable variants such as RoBERTa, DistilBERT, ALBERT, etc.

BERT derives its power from its self-supervised pre-training task called Masked Language Modeling (MLM), where we randomly hide some words and train the model to predict the missing words given the words both before and after the missing word. Training over a massive corpus of text allows BERT to learn the semantic relationships between the various words in the language.

We can use BERT as a Cross Encoder by adding a classification head to the output of the BERT model. The cross-encoder model takes a pair of text documents as input and directly outputs the probability that the two documents are similar. By fine-tuning the pre-trained BERT model on labeled STS datasets, we can achieve state-of-the-art results on STS tasks!

In [None]:
# Sentence Transformers CrossEncoder

# Load the STS-B fine-tuned RoBERTa cross-encoder.
model_cross = CrossEncoder('cross-encoder/stsb-roberta-base')
# A flat list of two strings is scored as one pair; the cell output is a single
# float. Here: naive extraction, page three vs page four.
# NOTE(review): whole pages are presumably longer than the model's input limit
# and get truncated — confirm.
model_cross.predict([page_three, page_four], show_progress_bar=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.5304151

In [None]:
# Same cross-encoder on the column-aware extraction: page three vs page four.
model_cross.predict([page_three_correct, page_four_correct], show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.59769106

In [None]:
# Same cross-encoder: naive extraction of page three vs the SDG goal text.
model_cross.predict([page_three, sdg], show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.33031276

In [None]:
# Same cross-encoder: column-aware extraction of page three vs the SDG goal text.
model_cross.predict([page_three_correct, sdg], show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.49858436

Another cross-encoder model (trained on a different dataset)

In [None]:
# Second cross-encoder checkpoint. For one pair it returns an array of three
# values (see the cell output) rather than a single similarity score.
# NOTE(review): presumably these are per-class NLI scores; check the label
# order against the model card before interpreting them.
model_cross2 = CrossEncoder('cross-encoder/nli-deberta-v3-xsmall')
# Naive extraction: page three vs page four.
model_cross2.predict([page_three, page_four], show_progress_bar=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/283M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/419 [00:00<?, ?B/s]

Downloading (…)"spm.model";:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

array([-1.5627148 ,  1.1108276 , -0.10906483], dtype=float32)

In [None]:
# Second cross-encoder on the column-aware extraction: page three vs page four.
model_cross2.predict([page_three_correct, page_four_correct], show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

array([-2.4545915 ,  1.4422362 ,  0.23099314], dtype=float32)

In [None]:
# Second cross-encoder: naive extraction of page three vs the SDG goal text.
model_cross2.predict([page_three, sdg], show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

array([-1.6064209 ,  1.3806623 , -0.35204595], dtype=float32)

In [None]:
# Second cross-encoder: column-aware extraction of page three vs the SDG goal text.
model_cross2.predict([page_three_correct, sdg], show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

array([-2.4213748 ,  1.7268385 , -0.06080657], dtype=float32)

# OSDG-CD

In [None]:
import nltk
# Download the Punkt tokenizer data; `prep_text` calls `sent_tokenize`.
# NOTE(review): newer NLTK releases may look for a 'punkt_tab' resource
# instead — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def prep_text(text):
    """
    Normalise `text` before it is passed to the tokenizer.

    The input is converted to `str` and split into sentences with
    `sent_tokenize`. Within each sentence the whitespace-separated tokens are
    stripped, lower-cased and re-joined with single spaces. The cleaned
    sentences are then joined with single spaces, surrounding spaces are
    trimmed, and every backtick and double-quote character is removed.

    Returns the cleaned text as one string.
    """
    cleaned_sentences = [
        ' '.join(str(token).strip().lower() for token in sentence.split())
        for sentence in sent_tokenize(str(text))
    ]
    result = ' '.join(cleaned_sentences).strip(' ')

    # Remove backticks first, then double quotes.
    for pattern in (r'`', r'"'):
        result = re.sub(pattern, "", result)
    return result

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from nltk.tokenize import sent_tokenize
import regex as re

# Load the SDG sequence-classification model and its matching tokenizer.
model_sdg = AutoModelForSequenceClassification.from_pretrained("sadickam/sdg-classification-bert")

tokenizer = AutoTokenizer.from_pretrained("sadickam/sdg-classification-bert")
# truncation=True with max_length=512: only the first 512 tokens of the
# pre-processed page are classified; anything beyond that is dropped.
tokenized_text = tokenizer(prep_text(page_three_correct), return_tensors="pt", truncation=True, max_length=512)

# Inference only: disable autograd so no gradient graph is built for the
# forward pass.
with torch.no_grad():
    text_logits = model_sdg(**tokenized_text).logits

# Softmax over the class dimension; [0] selects the single input text.
predictions = torch.softmax(text_logits, dim=1).tolist()[0]

In [None]:
# Display the class probabilities for `page_three_correct` (one float per class).
# NOTE(review): mapping from list index to SDG number is not shown here —
# check the model's label configuration before interpreting.
predictions

[0.00030661869095638394,
 0.0005443863337859511,
 0.000676224532071501,
 0.0004927902482450008,
 0.00043788229231722653,
 0.0013127019628882408,
 0.7287085056304932,
 0.0031785089522600174,
 0.04326698184013367,
 0.00098191830329597,
 0.004733347333967686,
 0.07705501466989517,
 0.1286289542913437,
 0.0013339851284399629,
 0.0016069310950115323,
 0.00673532672226429]

# Sentence Transformer - SBERT Bi-Encoder

Sentence Transformers (also known as SBERT) are the current state-of-the-art NLP sentence embeddings. It uses BERT and its variants as the base model and is pre-trained utilizing a type of metric learning called contrastive learning. In contrastive learning, the contrastive loss function compares whether two embeddings are similar (0) or dissimilar (1).

Though the Bi-Encoder Sentence Transformer has slightly lower performance than the Cross Encoder on our STSB dataset, Bi-Encoders shine when scaling to billions or even trillions of documents by combining them with vector search databases such as Milvus!

In [None]:
# Sentence Transformers cosine similarity

# Load the pre-trained model. Alternatives that were tried:
#   sentence-transformers/msmarco-MiniLM-L6-cos-v5
#   sentence-transformers/msmarco-MiniLM-L12-cos-v5
#   sentence-transformers/msmarco-distilbert-cos-v5
# NOTE(review): the selected checkpoint is the "dot" variant but is scored with
# cosine similarity below — confirm this is the intended pairing.
model_strans = SentenceTransformer('sentence-transformers/msmarco-distilbert-dot-v5')

# Generate embeddings: each distinct text is encoded exactly once.
strans_emb = {
    name: model_strans.encode([text], show_progress_bar=True)
    for name, text in {
        "page_three": page_three,
        "page_four": page_four,
        "page_three_correct": page_three_correct,
        "page_four_correct": page_four_correct,
        "sdg": sdg,
    }.items()
}

# Pair 1-2: naive extraction, page three vs page four.
sentence1_emb = strans_emb["page_three"]
sentence2_emb = strans_emb["page_four"]

# Pair 3-4: column-aware extraction, page three vs page four.
sentence3_emb = strans_emb["page_three_correct"]
sentence4_emb = strans_emb["page_four_correct"]

# Pair 5-6: naive page three vs the SDG goal text.
sentence5_emb = strans_emb["page_three"]
sentence6_emb = strans_emb["sdg"]

# Pair 7-8: column-aware page three vs the SDG goal text. This previously
# encoded `page_three` again, so cos_sim78 merely duplicated cos_sim56.
sentence7_emb = strans_emb["page_three_correct"]
sentence8_emb = strans_emb["sdg"]

cos_sim12 = cosine_similarity(sentence1_emb, sentence2_emb)
cos_sim34 = cosine_similarity(sentence3_emb, sentence4_emb)
cos_sim56 = cosine_similarity(sentence5_emb, sentence6_emb)
cos_sim78 = cosine_similarity(sentence7_emb, sentence8_emb)

# Cosine Similarity
print(cos_sim12)
print(cos_sim34)
print(cos_sim56)
print(cos_sim78)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[0.91848886]]
[[0.9107652]]
[[0.728255]]
[[0.728255]]


# Supervised - SimCSE

SimCSE is the NLP equivalent of SimCLR, a self-supervised algorithm for learning image embeddings using contrastive loss.

SimCSE stands for Simple Contrastive Learning of Sentence Embeddings. We can train it either as a supervised model if labeled data is available or in a completely unsupervised fashion!

SimCSE models are Bi-Encoder Sentence Transformer models trained using the SimCSE approach.

In [None]:
########## Supervised ##########
# Load the pre-trained model. Alternative that was tried:
#   princeton-nlp/sup-simcse-roberta-large
model_sup = SentenceTransformer('princeton-nlp/sup-simcse-bert-large-uncased')

# Generate embeddings: each distinct text is encoded exactly once.
sup_emb = {
    name: model_sup.encode([text], show_progress_bar=True)
    for name, text in {
        "page_three": page_three,
        "page_four": page_four,
        "page_three_correct": page_three_correct,
        "page_four_correct": page_four_correct,
        "sdg": sdg,
    }.items()
}

# Pair 1-2: naive extraction, page three vs page four.
sentence1_emb = sup_emb["page_three"]
sentence2_emb = sup_emb["page_four"]

# Pair 3-4: column-aware extraction, page three vs page four.
sentence3_emb = sup_emb["page_three_correct"]
sentence4_emb = sup_emb["page_four_correct"]

# Pair 5-6: naive page three vs the SDG goal text.
sentence5_emb = sup_emb["page_three"]
sentence6_emb = sup_emb["sdg"]

# Pair 7-8: column-aware page three vs the SDG goal text. This previously
# encoded `page_three` again, so cos_sim78 merely duplicated cos_sim56.
sentence7_emb = sup_emb["page_three_correct"]
sentence8_emb = sup_emb["sdg"]

cos_sim12 = cosine_similarity(sentence1_emb, sentence2_emb)
cos_sim34 = cosine_similarity(sentence3_emb, sentence4_emb)
cos_sim56 = cosine_similarity(sentence5_emb, sentence6_emb)
cos_sim78 = cosine_similarity(sentence7_emb, sentence8_emb)

# Cosine Similarity
print(cos_sim12)
print(cos_sim34)
print(cos_sim56)
print(cos_sim78)

Downloading (…)e316f/.gitattributes:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading (…)2f4d4e316f/README.md:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

Downloading (…)4d4e316f/config.json:   0%|          | 0.00/621 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/253 [00:00<?, ?B/s]

Downloading (…)2f4d4e316f/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[0.92067754]]
[[0.9175178]]
[[0.69518584]]
[[0.69518584]]


# Unsupervised - SimCSE

Despite being trained completely un-supervised just using Dropout to create “positive” pairs, unsupervised SimCSE could comfortably beat other methods such as WMD and USE. Thus, unsupervised SimCSE would be the go-to method in domains where sufficient labeled data is unavailable or expensive to collect.

In [None]:
########## Un-Supervised ##########
# Load the pre-trained model
model_unsup = SentenceTransformer('princeton-nlp/unsup-simcse-roberta-large')

# Generate embeddings: each distinct text is encoded exactly once.
unsup_emb = {
    name: model_unsup.encode([text], show_progress_bar=True)
    for name, text in {
        "page_three": page_three,
        "page_four": page_four,
        "page_three_correct": page_three_correct,
        "page_four_correct": page_four_correct,
        "sdg": sdg,
    }.items()
}

# Pair 1-2: naive extraction, page three vs page four.
sentence1_emb = unsup_emb["page_three"]
sentence2_emb = unsup_emb["page_four"]

# Pair 3-4: column-aware extraction, page three vs page four.
sentence3_emb = unsup_emb["page_three_correct"]
sentence4_emb = unsup_emb["page_four_correct"]

# Pair 5-6: naive page three vs the SDG goal text.
sentence5_emb = unsup_emb["page_three"]
sentence6_emb = unsup_emb["sdg"]

# Pair 7-8: column-aware page three vs the SDG goal text. This previously
# encoded `page_three` again, so cos_sim78 merely duplicated cos_sim56.
sentence7_emb = unsup_emb["page_three_correct"]
sentence8_emb = unsup_emb["sdg"]

cos_sim12 = cosine_similarity(sentence1_emb, sentence2_emb)
cos_sim34 = cosine_similarity(sentence3_emb, sentence4_emb)
cos_sim56 = cosine_similarity(sentence5_emb, sentence6_emb)
cos_sim78 = cosine_similarity(sentence7_emb, sentence8_emb)

# Cosine Similarity
print(cos_sim12)
print(cos_sim34)
print(cos_sim56)
print(cos_sim78)