# 1. Training Unsupervised SentenceTransformer

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from sentence_transformers import SentenceTransformer, models, InputExample, losses
from sentence_transformers import evaluation

from datasets import Dataset
from torch.utils.data import DataLoader

# Custom libraries
from utils.unsupervised_utils import generate_topic_model_input, generate_content_model_input, read_data
from utils.utils import read_config

In [2]:
config = read_config()

In [3]:
DATA_PATH = "../raw_data/"

In [4]:
topics, content, correlations = read_data(data_path=DATA_PATH,
                                          config_obj=config,
                                          read_mode="train")

 
--------------------------------------------------
topics.shape: (55266, 4)
content.shape: (154047, 3)
correlations.shape: (61517, 2)


In [5]:
topics.rename(columns=lambda x: "topic_" + x, inplace=True)
content.rename(columns=lambda x: "content_" + x, inplace=True)

In [6]:
content.head(3)

Unnamed: 0,content_id,content_title,content_model_input
0,c_77105b4b84cc,,Un espíritu revestido de piel humana Un espíri...
1,c_77574ef20c1f,,Recurso Educativo Abierto para Ciencias Natura...
2,c_200ae87baf4d,,Recurso Educativo Abierto para Geografía e His...


In [7]:
topics.head(3)

Unnamed: 0,topic_id,topic_title,topic_topic_tree,topic_model_input
0,t_3d9ad9931021,,Libretext Open Educational Resource Library > ...,en 3 [SEP] BC: BIOL 2 - Introduction to Human ...
1,t_59bf60f88801,R,Goalkicker Tech Books > R,en 1 [SEP] R A book about R exported from the ...
2,t_70e95cceae67,巧算,Khan Academy (中国大陆) > 数学 > 三年级 > 除数是一位数的除法 > 巧算,zh 4 [SEP] 巧算 用多种方法计算除数是一位数的除法 Khan Academy (中...


In [8]:
correlations["content_id"] = correlations["content_ids"].str.split(" ")
corr = correlations.explode("content_id").drop(columns=["content_ids"])
corr.head(3)

Unnamed: 0,topic_id,content_id
0,t_00004da3a1b2,c_1108dd0c7a5d
0,t_00004da3a1b2,c_376c5a8eb028
0,t_00004da3a1b2,c_5bc0e1e2cba0


In [9]:
corr = corr.merge(topics, how="left", on="topic_id")
corr = corr.merge(content, how="left", on="content_id")
corr.head(3)

Unnamed: 0,topic_id,content_id,topic_title,topic_topic_tree,topic_model_input,content_title,content_model_input
0,t_00004da3a1b2,c_1108dd0c7a5d,Откриването на резисторите,Khan Academy (български език) > Наука > Физика...,bg 4 [SEP] Откриването на резисторите Изследва...,Молив като резистор,Молив като резистор Моливът причинява промяна ...
1,t_00004da3a1b2,c_376c5a8eb028,Откриването на резисторите,Khan Academy (български език) > Наука > Физика...,bg 4 [SEP] Откриването на резисторите Изследва...,Да чуем променливото съпротивление,Да чуем променливото съпротивление Тук чертаем...
2,t_00004da3a1b2,c_5bc0e1e2cba0,Откриването на резисторите,Khan Academy (български език) > Наука > Физика...,bg 4 [SEP] Откриването на резисторите Изследва...,Променлив резистор (реостат) с графит от молив,Променлив резистор (реостат) с графит от молив...


In [10]:
corr["set"] = corr[["topic_model_input", "content_model_input"]].values.tolist()
train_df = pd.DataFrame(corr["set"])

In [11]:
train_df.head(3)

Unnamed: 0,set
0,[bg 4 [SEP] Откриването на резисторите Изследв...
1,[bg 4 [SEP] Откриването на резисторите Изследв...
2,[bg 4 [SEP] Откриването на резисторите Изследв...


In [12]:
dataset = Dataset.from_pandas(train_df)
dataset

Dataset({
    features: ['set', '__index_level_0__'],
    num_rows: 279919
})

In [13]:
train_examples = []
train_data = dataset["set"]
n_examples = dataset.num_rows

for i in range(n_examples):
    example = train_data[i]
    if example[0] == None:
        continue        
    train_examples.append(InputExample(texts=[str(example[0]), str(example[1])]))

### Training

In [14]:
TARGET_MODEL = config["unsupervised_model"]["base_name"]
OUT_MODEL = config["unsupervised_model"]["save_name"]
TARGET_MODEL, OUT_MODEL

('sentence-transformers/all-MiniLM-L6-v2',
 'trained_models/unsupervised/all-MiniLM-L6-v2')

In [15]:
model = SentenceTransformer(TARGET_MODEL)
model.max_seq_length = config["unsupervised_model"]["seq_len"]

word_embedding_model = model._first_module()
tokens = ["[SEP]"]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

Embedding(30522, 384, padding_idx=0)

In [16]:
train_dataloader = DataLoader(train_examples,
                              shuffle=True,
                              batch_size=config["unsupervised_model"]["batch_size"])

train_loss = losses.MultipleNegativesRankingLoss(model=model)

#k% of train data
num_epochs = config["unsupervised_model"]["epochs"]
warmup_steps = int(len(train_dataloader) * num_epochs * config["unsupervised_model"]["warmup_ratio"]) 

In [17]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,
         use_amp=True)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/850 [00:00<?, ?it/s]

Iteration:   0%|          | 0/850 [00:00<?, ?it/s]

In [18]:
model.save(OUT_MODEL)