In [None]:
import torch
import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models

In [2]:
# Assemble a sentence-embedding model by hand: a KLUE RoBERTa encoder followed
# by a pooling layer sized to the encoder's word-embedding dimension.
word_embedding_model = models.Transformer("klue/roberta-base")
embedding_dimension = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dimension)

# Chain the two modules; printing the model shows the resulting pipeline.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
print(model)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


In [None]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dim 1, counting only unmasked tokens.

    Args:
        model_output: Indexable model output; element 0 holds the token
            embeddings (assumed (batch, seq, dim) -- TODO confirm with caller).
        attention_mask: Mask tensor that is expanded to the embeddings' shape;
            positions with value 0 contribute nothing to the average.

    Returns:
        Tensor of the masked sum over dim 1 divided by the mask sum over dim 1.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    masked_total = (token_embeddings * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row divides by 1e-9, not by zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)

    return masked_total / token_counts

In [None]:
def max_pooling(model_output, attention_mask):
    """Take the per-dimension maximum of token embeddings over dim 1, ignoring masked tokens.

    Masked positions (mask value 0) are replaced by -1e9 before the max so
    they cannot win. The replacement is done on a copy: the tensor stored in
    ``model_output[0]`` is left untouched, so the same model output can be
    reused afterwards (e.g. passed to another pooling function).

    Args:
        model_output: Indexable model output; element 0 holds the token
            embeddings (assumed (batch, seq, dim) -- TODO confirm with caller).
        attention_mask: Mask tensor that is expanded to the embeddings' shape;
            positions equal to 0 are excluded from the max.

    Returns:
        Tensor holding the maximum over dim 1. A row whose tokens are all
        masked yields -1e9 in every dimension.
    """
    token_embeddings = model_output[0]
    padding_positions = attention_mask.unsqueeze(-1).expand(token_embeddings.size()) == 0
    # masked_fill is out-of-place, unlike the indexed assignment it replaces.
    masked_embeddings = token_embeddings.masked_fill(padding_positions, -1e9)
    return torch.max(masked_embeddings, 1)[0]

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load a pretrained Korean SBERT checkpoint (rebinds `model`).
model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

sentences = [
    '잠이 안 옵니다',
    '졸음이 옵니다',
    '기차가 옵니다',
]
embs = model.encode(sentences)

# Pairwise cosine similarity between every pair of sentence embeddings.
cos_scores = util.cos_sim(embs, embs)
print(cos_scores)
# Recorded output:
# tensor([[1.0000, 0.6410, 0.1887],
#         [0.6410, 1.0000, 0.2730],
#         [0.1887, 0.2730, 1.0000]])

In [None]:
from PIL import Image
from sentence_transformers import SentenceTransformer, util

# CLIP checkpoint: the same `encode` call is used for images and for text.
model = SentenceTransformer('clip-ViT-B-32')

images = [Image.open(image_path) for image_path in ('dog.jpg', 'cat.jpg')]
captions = ['A dog on grass', 'Brown cat on yellow background']

img_embs = model.encode(images)
text_embs = model.encode(captions)

# Rows correspond to images, columns to captions.
cos_scores = util.cos_sim(img_embs, text_embs)
print(cos_scores)
# Recorded output:
# tensor([[0.2771, 0.1509],
#         [0.2071, 0.3180]])

In [None]:
# Train split of the KLUE MRC dataset, plus the sentence encoder used for
# dense retrieval in the cells below.
klue_mrc_dataset = load_dataset('klue', 'mrc', split='train')
sentence_model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

# Show both summaries, one per line.
print(klue_mrc_dataset, sentence_model, sep='\n')

In [None]:
# Keep only the first 1000 rows: with shuffle=False the 'train' part of the
# split is the leading 1000 examples.
dataset_splits = klue_mrc_dataset.train_test_split(train_size=1000, shuffle=False)
print(dataset_splits)

# NOTE: this rebinds `klue_mrc_dataset` to the 1000-row subset, so the cell is
# meant to be run once per kernel session.
klue_mrc_dataset = dataset_splits['train']

# Encode every context passage of the subset.
context_passages = klue_mrc_dataset['context']
embeddings = sentence_model.encode(context_passages)
print(embeddings.shape)

In [None]:
# Flat L2 index whose dimensionality matches the embedding width
# (second axis of `embeddings`), filled with all passage embeddings.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

In [None]:
# Free-form query: embed it and fetch the 3 nearest passages from the index.
query = "이번 연도에는 언제 비가 많이 올까?"
query_embedding = sentence_model.encode([query])

search_result = index.search(query_embedding, 3)
distances, indices = search_result
print(distances, indices, sep='\n')

In [None]:
# Print the passages behind the retrieved ids (first row = the single query).
all_contexts = klue_mrc_dataset['context']
for idx in indices[0]:
    print(all_contexts[idx])

In [None]:
# Use a question taken from the dataset itself (row 3) as the query.
query = klue_mrc_dataset[3]['question']
print(query, '\n')

query_embedding = sentence_model.encode([query])
distances, indices = index.search(query_embedding, 3)

# For each hit print a separator, the passage id, then the passage text.
separator = "=" * 50
all_contexts = klue_mrc_dataset['context']
for idx in indices[0]:
    print(separator)
    print(idx)
    print(all_contexts[idx])

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Document, VectorStoreIndex, ServiceContext, Settings

In [None]:
# Configure LlamaIndex through the global `Settings` object. An earlier variant
# built the same configuration with
# ServiceContext.from_defaults(embed_model=..., llm=None); it is no longer used.
embedding_model_name = "intfloat/multilingual-e5-large-instruct"

Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model_name)
# No LLM is configured.
Settings.llm = None

In [None]:
# Wrap the first 100 context passages as LlamaIndex documents.
text_list = klue_mrc_dataset[:100]['context']

documents = []
for context_text in text_list:
    documents.append(Document(text=context_text))

# Build the vector store index from those documents.
index_llama = VectorStoreIndex.from_documents(documents)

In [None]:
import math
import numpy as np

from typing import List
from collections import defaultdict
from transformers import PreTrainedTokenizer

In [None]:
class BM25:
    """BM25 lexical scorer over a corpus tokenized into token ids.

    The corpus is tokenized once at construction time; per-token IDF values
    and per-document term frequencies are precomputed so that scoring a query
    only reads those tables.
    """

    def __init__(self, corpus, tokenizer):
        """
        Args:
            corpus: List of document strings.
            tokenizer: Callable invoked as
                ``tokenizer(list_of_texts, add_special_tokens=False)`` that
                returns a mapping whose ``'input_ids'`` entry holds one list
                of token ids per input text.
        """
        self.tokenizer = tokenizer
        self.corpus = corpus

        # Tokenize every document.
        self.tokenized_corpus = self.tokenizer(corpus, add_special_tokens=False)['input_ids']

        # Number of tokenized documents.
        self.n_docs = len(self.tokenized_corpus)

        # Per-document token counts, computed once instead of on every query.
        self.doc_lens = [len(doc) for doc in self.tokenized_corpus]

        # Average document length; 0.0 for an empty corpus instead of raising
        # ZeroDivisionError.
        self.avg_doc_lens = sum(self.doc_lens) / self.n_docs if self.n_docs else 0.0

        # Inverse document frequency per token id.
        self.idf = self.calculate_idf()

        # Term frequency tables, one per document.
        self.term_freqs = self.calculate_term_freqs()

    def calculate_idf(self):
        """Return a mapping token id -> IDF.

        The document frequency of a token is the number of documents that
        contain it at least once (each document is reduced to a set first).
        IDF is ``log((N - df + 0.5) / (df + 0.5) + 1)`` with N = ``n_docs``.
        """
        doc_freqs = defaultdict(int)
        for doc in self.tokenized_corpus:
            # set(doc): a token counts once per document, however often it repeats.
            for token_id in set(doc):
                doc_freqs[token_id] += 1

        idf = defaultdict(float)
        for token_id, doc_frequency in doc_freqs.items():
            idf[token_id] = math.log(((self.n_docs - doc_frequency + 0.5) / (doc_frequency + 0.5)) + 1)

        return idf

    def calculate_term_freqs(self):
        """Return one ``token id -> occurrence count`` table per document."""
        term_freqs = [defaultdict(int) for _ in range(self.n_docs)]

        for i, doc in enumerate(self.tokenized_corpus):
            for token_id in doc:
                term_freqs[i][token_id] += 1

        return term_freqs

    def get_scores(self, query, k1=1.2, b=0.75):
        """Score every document against ``query``.

        Args:
            query: Query string; tokenized with the same tokenizer as the corpus.
            k1: Term-frequency saturation parameter of the scoring formula.
            b: Weight of the document-length normalization.

        Returns:
            numpy array of length ``n_docs`` with one score per document.
            Query tokens that occur more than once in the query contribute
            once per occurrence.
        """
        query_ids = self.tokenizer([query], add_special_tokens=False)['input_ids'][0]
        scores = np.zeros(self.n_docs)
        # Only reachable with avg 0 when every document is empty; then no
        # term matches anyway, the fallback just keeps the division defined.
        avg_len = self.avg_doc_lens or 1.0

        for token_id in query_ids:
            # .get() instead of [] so lookups never insert keys into the
            # defaultdict tables (which would grow them on every query).
            idf = self.idf.get(token_id)
            if idf is None:
                # Token never seen in the corpus: contributes 0 everywhere.
                continue
            for i, term_freq in enumerate(self.term_freqs):
                q_frequency = term_freq.get(token_id, 0)
                if q_frequency == 0:
                    # Numerator would be 0, so the contribution is 0.
                    continue
                length_norm = k1 * (1 - b + b * (self.doc_lens[i] / avg_len))
                scores[i] += idf * (q_frequency * (k1 + 1)) / (q_frequency + length_norm)

        return scores

    def get_top_k(self, query, k):
        """Return ``(scores, indices)`` of the ``k`` best documents, best first.

        ``k`` larger than the corpus returns all documents; ``k <= 0`` returns
        two empty arrays (a plain ``[-k:]`` slice would return everything for
        ``k == 0``).
        """
        scores = self.get_scores(query)
        ascending = np.argsort(scores)
        if k <= 0:
            top_k_indices = ascending[:0]
        else:
            top_k_indices = ascending[-k:][::-1]
        top_k_scores = scores[top_k_indices]

        return top_k_scores, top_k_indices

In [None]:
from transformers import AutoTokenizer

# Tokenizer shared by all BM25 examples below.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

# Tiny three-document corpus as a sanity check of the scorer.
toy_corpus = ['안녕하세요', '반갑습니다', '안녕 서울']
bm25 = BM25(toy_corpus, tokenizer)

# Bare last expression: the per-document scores are shown as the cell output.
bm25.get_scores('안녕')

In [None]:
# Rebuild the BM25 scorer over the context passages of the dataset subset.
bm25 = BM25(klue_mrc_dataset['context'], tokenizer)

# Same natural-language query as in the dense and hybrid search cells.
# The first word previously read '아번' (a typo for '이번'); BM25 matches
# tokens lexically, so the typo changed what was being searched for.
query = '이번 연도에는 언제 비가 많이 올까?'
_, bm25_search_ranking = bm25.get_top_k(query, 100)

# Show the first 50 characters of the three best-scoring passages.
for idx in bm25_search_ranking[:3]:
    print(klue_mrc_dataset['context'][idx][:50])

In [None]:
# BM25 search with a question taken from the dataset itself (row 3).
query = klue_mrc_dataset[3]['question']
bm25_search_ranking = bm25.get_top_k(query, 100)[1]

# First 50 characters of the three best-scoring passages.
all_contexts = klue_mrc_dataset['context']
for idx in bm25_search_ranking[:3]:
    print(all_contexts[idx][:50])

In [None]:
from collections import defaultdict

def reciprocal_rank_fusion(rankings, k=5):
    """Fuse several rankings into one by summing reciprocal-rank scores.

    Each document receives ``1 / (k + rank)`` from every ranking it appears
    in, where ``rank`` starts at 1 for the first entry of a ranking.

    Args:
        rankings: Iterable of rankings; each ranking is an iterable of
            hashable document ids ordered best first.
        k: Constant added to the rank in the denominator.

    Returns:
        List of ``(doc_id, fused_score)`` tuples sorted by score, highest
        first. Documents with equal scores keep first-seen order.
    """
    fused_scores = {}
    for ranked_ids in rankings:
        for position, doc_id in enumerate(ranked_ids):
            rank = position + 1
            fused_scores[doc_id] = fused_scores.get(doc_id, 0.0) + 1.0 / (k + rank)

    scored_docs = list(fused_scores.items())
    # list.sort is stable, also with reverse=True, so ties keep insertion order.
    scored_docs.sort(key=lambda pair: pair[1], reverse=True)
    return scored_docs

In [None]:
# Two toy rankings over document ids 1-6 to illustrate the fusion.
rankings = [
    [1, 4, 3, 5, 6],
    [2, 1, 3, 6, 4],
]
# Bare last expression: the fused (doc_id, score) list is the cell output.
reciprocal_rank_fusion(rankings)

In [None]:
def dense_vector_search(query, k):
    """Embed ``query`` and look up its ``k`` nearest entries in the vector index.

    Uses the notebook-level ``sentence_model`` for encoding and the
    notebook-level ``index`` for the search.

    Args:
        query: Query string.
        k: Number of results requested from the index.

    Returns:
        Tuple ``(distances, indices)`` for the single query, i.e. the first
        row of each array returned by ``index.search``.
    """
    encoded_query = sentence_model.encode([query])
    all_distances, all_indices = index.search(encoded_query, k)

    # Only one query was submitted, so row 0 holds its results.
    return all_distances[0], all_indices[0]

def hybrid_search(query, k=20):
    """Combine dense and BM25 retrieval with reciprocal rank fusion.

    The top 100 candidates of each retriever are fetched and their rankings
    are fused by ``reciprocal_rank_fusion``.

    Args:
        query: Query string.
        k: Passed through as the ``k`` constant of ``reciprocal_rank_fusion``.
            It is not the number of results returned.

    Returns:
        Whatever ``reciprocal_rank_fusion`` returns for the two rankings.
    """
    dense_ranking = dense_vector_search(query, 100)[1]
    lexical_ranking = bm25.get_top_k(query, 100)[1]

    return reciprocal_rank_fusion([dense_ranking, lexical_ranking], k=k)

In [None]:
# Run hybrid search for a free-form query and for a question from the dataset.
demo_queries = [
    "이번 연도에는 언제 비가 많이 올까?",
    # Row 3 question: "What did Robert Henry Dicke develop at the
    # Massachusetts laboratory in 1946?"
    klue_mrc_dataset[3]['question'],
]

for position, query in enumerate(demo_queries):
    if position > 0:
        # Separator line between the result blocks of consecutive queries.
        print("=" * 80)
    print("검색 쿼리 문장: ", query)

    results = hybrid_search(query)
    # First 50 characters of the three best fused results.
    for idx, score in results[:3]:
        print(klue_mrc_dataset['context'][idx][:50])