# Connect To Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Install & Import Packages

In [None]:
!pip install sentence_transformers

In [None]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pickle


# Prepare Data

In [None]:
# The dataset is built from a few labeled samples plus many unlabeled ones.
# Load the human-labeled part here. Only unpickle files you trust:
# pickle.load can execute arbitrary code stored in the file.
data_dict = None
with open('/content/human_supervision_data.pickle', 'rb') as labeled_file:
    data_dict = pickle.load(labeled_file)


In [None]:
# Flatten data_dict into two parallel lists: prompt strings and integer labels.
# Each item is indexed as item[0] (fills the question slot) and item[1]
# (fills the sentence slot) of the prompt template.
prompt_template = "qnli question: qqqqqq sentence: cccccc"
label_by_key = {'high': 2, 'medium': 1}  # every other key is labeled 0
inputs = []
outputs = []
for level, pairs in data_dict.items():
  for pair in pairs:
    with_question = prompt_template.replace('qqqqqq', pair[0])
    inputs.append(with_question.replace('cccccc', pair[1]))
    outputs.append(label_by_key.get(level, 0))
labeled_data = (inputs, outputs)

In [None]:
# Load the unlabeled samples (same trust caveat as any pickle file applies).
with open('/content/unlabeled_data.pickle', 'rb') as unlabeled_file:
    unlabeled_data = pickle.load(unlabeled_file)

In [None]:

class Embedder:
    """Gathers labeled and unlabeled prompt texts and turns them into embeddings.

    Attributes:
        data: labeled texts followed by unlabeled texts; replaced by their
            embeddings once ``convert_texts_to_embeddings`` has run.
        labels: one label per entry of ``data``; unlabeled entries carry
            ``UNK_LABEL``.
    """

    def __init__(self, data, labeled_data=None):
        """Store the samples, labeled ones first.

        Args:
            data: iterable of unlabeled prompt strings,
                e.g. ``[prompted_q1_doc1, ...]``.
            labeled_data: optional pair ``(texts, labels)``,
                e.g. ``([prompted_q1_doc1, ...], [label1, ...])``.
                When omitted, only the unlabeled samples are stored.
        """
        self.labels = []
        self.data = []
        self.LOW_LABEL = 0
        # Spelling kept as-is: the attribute name is part of the public interface.
        self.AMBIGOUS_LABEL = 1
        self.HIGH_LABEL = 2
        self.UNK_LABEL = -1
        # labeled_data defaults to None, so it must be checked before indexing.
        if labeled_data is not None:
            # zip stops at the shorter of the two sequences.
            for text, label in zip(labeled_data[0], labeled_data[1]):
                self.data.append(text)
                self.labels.append(label)
        for text in data:
            self.data.append(text)
            self.labels.append(self.UNK_LABEL)  # marks samples without a label

    def convert_texts_to_embeddings(self, model_name='sentence-transformers/gtr-t5-large'):
        """Replace every entry of ``self.data`` with its embedding.

        Args:
            model_name: name of the SentenceTransformer model to load.

        Calling this a second time would feed the embeddings themselves back
        into the model, so it is meant to be run once per instance.
        """
        # No explicit device is forced: SentenceTransformer uses a GPU when one
        # is available and otherwise falls back to the CPU instead of failing.
        model = SentenceTransformer(model_name)
        embeddings = []
        # Despite the progress-bar text, this covers labeled AND unlabeled samples.
        for text in tqdm(self.data, desc="converting labeled data ..."):
            embeddings.append(model.encode(text))
        self.data = embeddings


In [None]:
# Combine labeled and unlabeled samples, then embed all of them.
embedder = Embedder(data=unlabeled_data, labeled_data=labeled_data)
embedder.convert_texts_to_embeddings()

In [None]:
# Persist the embeddings together with their labels.
with open('/content/embeddings_data.pickle', 'wb') as out_file:
    embedding_payload = {"data": embedder.data, "labels": embedder.labels}
    pickle.dump(embedding_payload, out_file)