PIP

In [None]:
# ! wget -P ~/ https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# ! chmod +x ~/Miniconda3-latest-Linux-x86_64.sh
# ! ~/Miniconda3-latest-Linux-x86_64.sh -b
# ! export PATH=~/miniconda3/bin:$PATH
# ! conda init & conda config --set auto_activate_base false
# # close and start a new session
# ! conda activate base
# ! conda install cudatoolkit=11.0 -y
# !pip install sentence-transformers   transformers datasets peft accelerate bitsandbytes faiss-cpu faiss-gpu

Imports

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from sentence_transformers import SentenceTransformer, util
import faiss
import pandas as pd
import torch

if torch.backends.mps.is_available():  # Check for Apple Silicon GPU availability (requires PyTorch 1.12 or later)
    device = torch.device("mps")
elif torch.cuda.is_available():  # Check for NVIDIA GPU availability
    device = torch.device("cuda")
else:
    device = torch.device("cpu")  # Fall back to CPU

print(f"Using device: {device}")

Dataset

In [None]:
dataset = load_dataset('ms_marco', 'v1.1')
print(dataset)
train_dataset = dataset['train']
test_dataset = dataset['test']

Unique Documents List

In [None]:
# Assuming dataframe is your DataFrame containing passages
print(train_dataset['passages'][:1])
# passages_list = train_dataset['passages']['passage_text'].tolist()

unique_passages = set()
for row in train_dataset:
    unique_passages.update(row['passages']['passage_text'])
print(len(unique_passages))
documents = list(unique_passages)

HUgginface login

In [None]:
from huggingface_hub import login
login(token="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")

Load SentenceTransformer

In [None]:
# SentenceTransformer("all-MiniLM-L6-v2")
SentenceTranformer = SentenceTransformer(
    'sentence-transformers/msmarco-bert-base-dot-v5',
    device = device,
    )

Test SentenceTransformer

In [None]:
query_embedding = SentenceTranformer.encode('How big is London')
print(len(query_embedding))
document_embedding = SentenceTranformer.encode(['London has 9,787,426 inhabitants at the 2011 census',
                                  'London is known for its finacial district'])

print("Similarity:", util.dot_score(query_embedding, document_embedding))


Generate Embeddings from all documents

In [30]:
# Encode documents
document_embeddings = SentenceTranformer.encode(
    documents, 
    show_progress_bar=True, 
    device = device,
    batch_size=100
)

Batches: 100%|██████████| 6270/6270 [3:07:58<00:00,  1.80s/it]  


Create Faiss Index from all documents

In [31]:
# Build Faiss index
# index = faiss.IndexFlatL2(document_embeddings.shape[1])  # L2 distance
faiss.normalize_L2(document_embeddings)

Store Faiss index to storage and read from storage

In [35]:
index = faiss.IndexFlatIP(document_embeddings.shape[1])  # L2 distance
index.add(document_embeddings)
faiss.write_index(index, "index_docs.index")


Test Faiss Index

In [54]:
# Query
query = "This is a query document."
query_embedding = SentenceTranformer.encode([query])

# Perform document similarity search
k = 10  # Number of similar documents to retrieve
D, I = index.search(query_embedding, k)

print(D)
# Print similar documents


print("Most similar documents to the query:")
for i, idx in enumerate(I[0]):
    print(f"Rank {i+1}: {documents[idx]}")

Most similar documents to the query:
Rank 1: A query language is a language in which a user requests information from the database. These languages are usually on a level higher than that of a standard programming lang … uage. Query languages can be categorized as either procedural or non procedural.
Rank 2: Overview. Structured Query Language (SQL) is a specialized language for updating, deleting, and requesting information from databases. SQL is an ANSI and ISO standard, and is the de facto standard database query language. 
Rank 3: A system specification document is used to present the functions, performance and limitations of a software product or system.
Rank 4: The PUB Document file format. PUB is the file extension which is generally used by the Microsoft Publisher application which is a part of the Microsoft Office product set. This file format can comprise various objects such as graphics, images, formatted text, or any other kind of object. 
Rank 5: Query Definition: Stored P

Configure LoRA and sentenceTranformer of query

In [None]:
from sentence_transformers import SentenceTransformer
QueryTranformer = SentenceTransformer(
    'sentence-transformers/msmarco-bert-base-dot-v5',
    device = device,
)

Define sentenceTransformer in training mode 

Define LoRA and EncoderDecoder GPT2 

In [None]:
from transformers import GPT2Model, GPT2Tokenizer
# EncoderDecoder = GPT2Model.from_pretrained('gpt2')
EncoderDecoder = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side='left')

Define GPT2 in training mode

In [None]:
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, AdamW
from sentence_transformers import SentenceTransformer
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training

In [None]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
lora_config = LoraConfig(lora_dim=32, lora_alpha=2, merge_weights=True)
peft_gpt2_model = PeftModel(gpt2_model, lora_config)
peft_gpt2_model = prepare_model_for_kbit_training(peft_gpt2_model, gpt2_config)

# doc_embeddings = torch.load('doc_embeddings.pt')
# faiss_index = faiss.IndexFlatL2(doc_embeddings.shape[1])
# faiss_index.add(doc_embeddings.numpy())


In [None]:
optimizer = AdamW(peft_gpt2_model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

def train(model, data_loader, epochs, device):
    model.train()
    model.to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

    for epoch in range(epochs):
        for inputs, labels in data_loader:
            inputs = inputs.to(device)  # Shape [batch_size, num_documents, seq_length, vocab_size]
            labels = labels.to(device)  # Shape adjusted if necessary

            batch_size, num_documents, seq_length, _ = inputs.shape
            loss_total = 0

            for b in range(batch_size):
                for n in range(num_documents):
                    input_ids = inputs[b, n]  # Get input for one document
                    output_labels = labels[b]  # Assuming labels are shared across documents

                    # Assuming your model can handle seq_length and vocab_size directly,
                    # otherwise adjust model input handling
                    outputs = model(input_ids)
                    loss = criterion(outputs.view(-1, outputs.shape[-1]), output_labels.view(-1))
                    loss.backward()
                    loss_total += loss.item()

            optimizer.step()
            optimizer.zero_grad()
            print(f"Epoch {epoch}, Batch Loss: {loss_total / (batch_size * num_documents)}")


In [19]:
# Example query and document
query = "What is the impact of climate change on polar bears?"

document = "Climate change is reducing the overall extent and seasonal duration of sea ice, impacting the polar bear's ability to survive."

model.eval()

# Concatenate query with the document context
input_text = query + " " + document

# Encode the text into tensor of integers using the tokenizer
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate a response from the model
output = model.generate(input_ids, max_length=200, num_return_sequences=1)

# Decode the output tensor to a string
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is the impact of climate change on polar bears? Climate change is reducing the overall extent and seasonal duration of sea ice, impacting the polar bear's ability to survive.reperepereperepereperepe Kingston Kingston apparently litres litres litres litresAnn) Bo Bo Programme Programme Programme Jim Jim Jim Jim transitioning Express Programme Programme Programme Programme Programme Programme Property Property Property PropertySSLSSLSSL apparentlyiologicaliologicaliologicaliologicalPowerPoweréeéeibus KHPowerxmlxml UNCLASSIFIED knees knees knees knees knees knees knees requests requests449SSLSSLSSLSSLSSLSSLSSLSSLSSL scr infrared pizz touchscreen US US US US US US US US US US US US US US US US US US US US US US goodwill goodwill goodwill goodwill goodwill Express badge badge vendors OmahaCt delic delic delic delic delic delic delic delic delic delic delic delic delic delic delicmanagementgradinggradinggradinggradinggrading Jim Jim eternityarticle vendors vendors vendors quant quant qu

Define custom lost function???


Create training object combined????

Run Training

Inference test data

Store model

Build API 

Pack to Docker Container

Publish

In [None]:

# Select and Prepare a Pre-trained Seq2Seq Model
# Generate the Response
# Evaluation and Iteration
