PIP

In [None]:
# ! wget -P ~/ https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# ! chmod +x ~/Miniconda3-latest-Linux-x86_64.sh
# ! ~/Miniconda3-latest-Linux-x86_64.sh -b
# ! export PATH=~/miniconda3/bin:$PATH
# ! conda init & conda config --set auto_activate_base false
# # close and start a new session
# ! conda activate base
# ! conda install cudatoolkit=11.0 -y
# !pip install sentence-transformers   transformers datasets peft accelerate bitsandbytes faiss-cpu faiss-gpu

Imports

In [61]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from sentence_transformers import SentenceTransformer, util
import faiss
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

if torch.backends.mps.is_available():  # Check for Apple Silicon GPU availability (requires PyTorch 1.12 or later)
    device = torch.device("mps")
elif torch.cuda.is_available():  # Check for NVIDIA GPU availability
    device = torch.device("cuda")
else:
    device = torch.device("cpu")  # Fall back to CPU

print(f"Using device: {device}")

Using device: mps


Dataset

In [2]:
dataset = load_dataset('ms_marco', 'v2.1')
print(dataset)

DatasetDict({
    validation: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 101093
    })
    train: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 808731
    })
    test: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 101092
    })
})


In [3]:
train_dataset = dataset['train'].select(range(1000))
filtered_train_dataset = train_dataset.filter(lambda example: example['wellFormedAnswers'] != [] and example['wellFormedAnswers'] != "")
print(len(filtered_train_dataset))

134


Unique Documents List

In [4]:
unique_passages = set()
for row in filtered_train_dataset:
    unique_passages.update(row['passages']['passage_text'])
print(len(unique_passages))
documents = list(unique_passages)

1333


HUgginface login

In [5]:
from huggingface_hub import login
login(token="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")

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/a.diudiun/.cache/huggingface/token
Login successful


Load SentenceTransformer

In [6]:
# SentenceTransformer("all-MiniLM-L6-v2")
SentenceTranformer = SentenceTransformer(
    'sentence-transformers/msmarco-bert-base-dot-v5',
    device = device,
    )

Test SentenceTransformer

In [20]:
query_embedding = SentenceTranformer.encode('How big is London')
print("embedding length: ", len(query_embedding))
document_embedding = SentenceTranformer.encode(
    [
        'London has 9,787,426 inhabitants at the 2011 census',
        'London is known for its finacial district',
    ])

print("Similarity:", util.dot_score(query_embedding, document_embedding))


embedding length:  768
Similarity: tensor([[166.5561, 159.5406]])


Generate Embeddings from all documents

In [50]:
# Encode documents
document_embeddings = SentenceTranformer.encode(
    documents, 
    show_progress_bar=True, 
    device = device,
    batch_size=100
)

Batches: 100%|██████████| 14/14 [00:14<00:00,  1.00s/it]


Create Faiss Index from all documents

In [55]:
# Build Faiss index
# index = faiss.IndexFlatL2(document_embeddings.shape[1])  # L2 distance
faiss.normalize_L2(document_embeddings)

In [56]:
index = faiss.IndexFlatIP(document_embeddings.shape[1])  # L2 distance
index.add(document_embeddings)

Store Faiss index to storage and read from storage

In [57]:
# Save the index to a file
faiss.write_index(index, "index_docs.index")
# Load the index from a file
# index = faiss.read_index("index_docs.index")

Test Faiss Index

In [60]:
# Query
query = "This is a query document."
query_embedding = SentenceTranformer.encode([query])

# Perform document similarity search
faiss.normalize_L2(query_embedding)
k = 5  # Number of similar documents to retrieve
D, I = index.search(query_embedding, k)

print(D)
print(I)
# for d in D:
#     print((d-11.71897)*1000)
# Print similar documents
D_tensor = torch.tensor(D)
D_softmax = F.softmax(D_tensor, dim=1)  # Apply softmax along the rows
# D_softmax_neg = F.softmax(-D_tensor, dim=1)  # Apply softmax along the rows

print("Softmax")
print(D_softmax)
# print(D_softmax_neg)
# Convert softmax result back to numpy array
D_softmax_np = D_softmax.numpy()

print("Most similar documents to the query:")
for i, idx in enumerate(I[0]):
    print(f"Rank {i+1}: {documents[idx]}")

[[0.8906977  0.88991797 0.88836265 0.88797027 0.8852686 ]]
[[ 389 1179  701   52  615]]
Softmax
tensor([[0.2005, 0.2003, 0.2000, 0.1999, 0.1994]])
Most similar documents to the query:
Rank 1: Definition of exhibit. 1  transitive verb. 2  1 : to submit (as a document) to a court or officer in course of proceedings; also : to present or offer officially or in legal form.
Rank 2: Definition of exhibit. 1  1 : a document or material object produced and identified in court or before an examiner for use as evidence. 2  2 : something exhibited. 3  3 : an act or instance of exhibiting : exhibition.
Rank 3: Quick Answer. The abbreviation et al. is short for the Latin phrase et alia, meaning and others.. When it appears on a property deed, it indicates that a list of items or persons named on the deed includes others as well. Continue Reading.
Rank 4: et al. n. abbreviation for the Latin phrase et alii meaning and others.. This is commonly used in shortening the name of a case, as in Pat Murgatr

Configure LoRA and sentenceTranformer of query

In [62]:
from sentence_transformers import SentenceTransformer
QueryTranformer = SentenceTransformer(
    'sentence-transformers/msmarco-bert-base-dot-v5',
    device = device,
)

Define sentenceTransformer in training mode 

Define LoRA and EncoderDecoder GPT2 

In [64]:
from transformers import GPT2Model, GPT2Tokenizer
# Generator = GPT2Model.from_pretrained('gpt2')
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
Generator = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side='left')

Define GPT2 in training mode

Define custom lost function???


Create training object combined????

In [71]:
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token("[PAD]")

inputs = tokenizer(
    train_dataset['query'], 
    max_length=512, 
    padding='max_length', 
    truncation=True, 
    pad_token = tokenizer.pad_token, 
    return_tensors='pt'
)

targets = tokenizer(
    train_dataset['wellFormedAnswers'], 
    max_length=512, 
    padding='max_length', 
    truncation=True, 
    pad_token = tokenizer.pad_token, 
    return_tensors='pt'
)

TypeError: 'str' object is not callable

In [None]:
def custom_loss(logits, labels):
    loss = torch.mean((logits - labels) ** 2)  # For example, mean squared error
    return loss

In [None]:
class QueryDataset(Dataset):
    def __init__(self, tokenizer, data):
        self.queries = data['query']
        self.answers = data['wellPreparedAnswer']
        self.device = device
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, idx):
        return {
            'query_ids': torch.tensor(self.tokenizer.encode(self.queries[idx]), dtype=torch.float),
            'answer_ids': torch.tensor(self.tokenizer.encode(self.answers[idx]), dtype=torch.float),
            'query': self.queries[idx],
        }


In [None]:
K = 2
batch_size = 2

In [None]:
TrainingDataset = QueryDataset(tokenizer, train_dataset)
TrainingDataloader = DataLoader(TrainingDataset, batch_size, shuffle=False, collate_fn=collate_fn)  

# item.toDevice(de)

In [57]:
class CustomModel(torch.nn.Module):
    def __init__(self, EncoderDecoder, index, documents):
        super().__init__()
        self.EncoderDecoder = EncoderDecoder
        self.QueryTranformer = QueryTranformer
        self.documents = documents
        self.index = index

    def forward(self, Q, K):
        query_embedding = self.QueryTranformer.encode([Q])  # Pass appropriate inputs

        D, I = index.search(query_embedding, K)
        D_tensor = torch.tensor(D)
        D_softmax = F.softmax(D_tensor, dim=1) 

        # tensor zero [Batch, K, seq_len]

        decoder_outputs = []
        for i, idx in enumerate(I[0]):
            input_text = self.documents[idx] + " " + Q
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            
        decoder_output = self.EncoderDecoder(
            input_ids=input_ids
        )

        decoder_outputs.append(decoder_output)


        # run decoder on K documents and Q 
        # avarage output from decoder 
        return gpt2_output, sentence_embedding

In [None]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = CustomModel(**inputs)
        logits = outputs.logits
        loss = custom_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Define your model, tokenizer, and training arguments
model = ...  # Define your model here
tokenizer = ...  # Define your tokenizer here
training_args = TrainingArguments(
    ...
)  # Define your training arguments here

In [None]:
trainer = CustomTrainer(
    modelok=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tenizer,
)

Run Training

In [None]:
trainer.train()

Inference test data

Store model

Build API 

Pack to Docker Container

Publish

In [None]:

# Select and Prepare a Pre-trained Seq2Seq Model
# Generate the Response
# Evaluation and Iteration
