In [12]:
!pip install git+https://github.com/mrpeerat/SCT

Collecting git+https://github.com/mrpeerat/SCT
  Cloning https://github.com/mrpeerat/SCT to /tmp/pip-req-build-m6iw3no0
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.1.0-py3-none-any.whl size=168236 sha256=6fb37ed0e39582a1602ccca3cfb125a29a02d38a5c0310dd75be2007bdfa26d4
  Stored in directory: /tmp/pip-ephem-wheel-cache-_kau24ag/wheels/74/6f/d8/f729b08480a286791bebe8a348b976f35e5b70a74fbbb38e5c
Successfully built sentence-transformers
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [1]:
import os

# Restrict CUDA to the device with index 1; this is set ahead of the torch import below.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [13]:
import os
import logging
from datetime import datetime
import io
import math
import numpy as np
import random
from glob import glob 
import pickle
import pandas as pd

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from sentence_transformers import models
from sentence_transformers import LoggingHandler, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

from sentence_transformers import SentenceTransformer, losses

In [3]:
# One fixed seed shared by every random number generator used in this notebook.
SEED = 1000
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# ตั้งค่า parameters
Best parameters: https://github.com/mrpeerat/SCT#parameters

In [4]:
# --- Student encoder ---
max_seq_length = 128            # passed to models.Transformer for the student

# --- Optimisation ---
train_batch_size = 128          # batch size of the training DataLoader
num_epochs = 20
learning_rate = 0.0001
early_stopping_patience = 7     # forwarded to student_model.fit

# --- SCT loss ---
queue_size = 2 ** 16            # number of rows in each instance queue (65536)
student_temp = teacher_temp = 0.5

# --- Evaluation / output ---
eval_batch_size = 16
model_save_path = "SCT_model"   # used as the loss `path_model` and the fit `output_path`

# โหลด Teacher model

In [5]:
# Checkpoint whose sentence embeddings the student is trained to imitate.
teacher_model_name_or_path = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Load the teacher; it is only used to encode the training sentences.
teacher_model = SentenceTransformer(model_name_or_path=teacher_model_name_or_path)

# โหลด dataset
Link: https://drive.google.com/file/d/1u7kCk9xpTfQkxpJ0zfILpo9SR5KNMfaj/view?usp=share_link

In [6]:
train_data_path = "back_translated_mt_sct_2020.txt"

# Each non-blank line holds one training pair: two sentences separated by a tab.
# The `with` block guarantees the file handle is closed once reading is done.
all_pairs = []
with open(train_data_path, mode="rt", encoding="utf-8") as pairs_file:
    for line_no, line in enumerate(pairs_file, start=1):
        fields = line.strip().split('\t')
        if fields == ['']:
            # Blank line (e.g. a trailing newline at the end of the file): skip it.
            continue
        if len(fields) < 2:
            # Fail here with the exact location instead of an IndexError further down.
            raise ValueError(
                f"{train_data_path}:{line_no}: expected two tab-separated sentences, "
                f"found {len(fields)} field(s)"
            )
        all_pairs.append(fields)

# Two parallel lists of sentences (first and second column of every pair).
sents1 = [pair[0] for pair in all_pairs]
sents2 = [pair[1] for pair in all_pairs]

In [7]:
def _load_or_encode(cache_path, sentences, label):
    """Return teacher embeddings for ``sentences``, cached on disk at ``cache_path``.

    The cache is tried first. If it cannot be read, or it holds a different
    number of rows than ``sentences``, the sentences are encoded with
    ``teacher_model`` and the result is written back to ``cache_path``.

    Args:
        cache_path: Path of the pickle file used as the cache.
        sentences: List of sentences to embed.
        label: Short name used in the progress messages.

    Returns:
        The embeddings as produced by ``teacher_model.encode(..., convert_to_tensor=True)``
        or as stored in the cache file.
    """
    print(f"Loading {label}....")
    try:
        # NOTE(security): pickle can execute arbitrary code while loading.
        # Only load cache files that this notebook produced itself.
        with open(cache_path, "rb") as cache_file:
            cached = pickle.load(cache_file)
        if len(cached) == len(sentences):
            return cached
        # A stale cache would be silently truncated by zip() when building samples.
        print(f"{cache_path} has {len(cached)} rows but {len(sentences)} sentences were given; re-encoding.")
    except Exception as err:
        # Any load failure falls back to re-encoding (as before), but Ctrl-C /
        # SystemExit are no longer swallowed and the reason is reported.
        print(f"Could not load {cache_path} ({err!r}); encoding {label} with the teacher model.")

    embeddings = teacher_model.encode(
        sentences,
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=device,
        show_progress_bar=True,
    )
    with open(cache_path, "wb") as cache_file:
        pickle.dump(embeddings, cache_file, protocol=4)
    return embeddings


rep_s1 = _load_or_encode("rep_s1.pkl", sents1, "Sent1")
rep_s2 = _load_or_encode("rep_s2.pkl", sents2, "Sent2")

# Width of the teacher embeddings; the student output must match it.
teacher_dimension = rep_s1.shape[1]

Loading Sent1....
Loading Sent1....


# สร้าง data loader

In [8]:
# One training example per sentence pair: the two texts as input and the two
# matching teacher embeddings as the label.
train_samples = [
    InputExample(texts=[first_text, second_text], label=[first_teacher, second_teacher])
    for first_text, second_text, first_teacher, second_teacher in zip(sents1, sents2, rep_s1, rep_s2)
]

train_dataloader = DataLoader(train_samples, batch_size=train_batch_size, shuffle=True)

# โหลด Student model

In [9]:
# Alternative student checkpoint: "airesearch/wangchanberta-base-att-spm-uncased"
student_model_name_or_path = "kornwtp/ConGen-WangchanBERT-Small"

# Transformer encoder followed by a pooling layer over its token embeddings.
student_word_embedding_model = models.Transformer(student_model_name_or_path, max_seq_length=max_seq_length)
student_dimension = student_word_embedding_model.get_word_embedding_dimension()
student_pooling_model = models.Pooling(student_dimension)

student_modules = [student_word_embedding_model, student_pooling_model]
if student_dimension != teacher_dimension:
    # Project the student output to the teacher's width so the two can be compared.
    dense_model = models.Dense(in_features=student_dimension, out_features=teacher_dimension, activation_function=nn.Tanh())
    student_modules.append(dense_model)

student_model = SentenceTransformer(modules=student_modules)

Downloading:   0%|          | 0.00/658 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/115M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/388 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/573k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/808k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

# สร้าง instance queue

In [10]:
def _random_unit_rows(num_rows, dim):
    """Draw a (num_rows, dim) standard-normal tensor, move it to `device`, and L2-normalise each row."""
    noise = torch.randn(num_rows, dim).to(device)
    return F.normalize(noise, p=2, dim=1)


# Two independently initialised queues, one per view (A and B), passed to the loss.
rep_instance_queue_edited_A = _random_unit_rows(queue_size, teacher_dimension)
rep_instance_queue_edited_B = _random_unit_rows(queue_size, teacher_dimension)

# เรียก training loss

In [14]:
# SCTLoss_distillation is expected to come from the SCT fork of sentence-transformers
# installed at the top of the notebook.
sct_loss_config = dict(
    instanceQ_A=rep_instance_queue_edited_A,
    instanceQ_B=rep_instance_queue_edited_B,
    model=student_model,
    student_temp=student_temp,
    teacher_temp=teacher_temp,
    device=device,
    sentence_embedding_dimension=teacher_dimension,
    path_model=model_save_path,
)
training_loss = losses.SCTLoss_distillation(**sct_loss_config)

# Warm up over the first 10% of all training steps (batches per epoch x epochs).
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)


AttributeError: module 'sentence_transformers.losses' has no attribute 'SCTLoss_distillation'

# สร้าง evaluator

In [15]:
evaluation_steps = 64

# Thai STS file: https://github.com/mrpeerat/Thai-Sentence-Vector-Benchmark/blob/main/sts-test_th.csv
# The file has no header row, so the column names are supplied here.
sts_columns = ['type','datasets','set','raw_score','score','sent1','sent2']
dev_df = pd.read_csv("sts-test_th.csv", header=None, names=sts_columns)

# One example per row; the gold score is divided by 5 to normalise it to the range 0 ... 1.
dev_samples = [
    InputExample(texts=[first_sentence, second_sentence], label=float(gold_score) / 5.0)
    for first_sentence, second_sentence, gold_score in zip(dev_df["sent1"], dev_df["sent2"], dev_df["score"])
]

dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    batch_size=eval_batch_size,
    name='sts-dev',
)


# Train model

In [None]:
# Optimizer settings handed to fit() unchanged.
optimizer_settings = {"lr": learning_rate, 'eps': 1e-6, 'correct_bias': False}

# Train the student with the SCT loss; the dev evaluator and output path are passed
# through to fit(). `early_stopping_patience` is presumably an option of the SCT
# fork's fit() - confirm against the installed package.
student_model.fit(
    train_objectives=[(train_dataloader, training_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluation_steps=evaluation_steps,
    output_path=model_save_path,
    optimizer_params=optimizer_settings,
    use_amp=True,
    early_stopping_patience=early_stopping_patience,
)