# Text Embedding

Resource: https://www.deeplearning.ai/short-courses/google-cloud-vertex-ai/

In [1]:
import warnings

warnings.filterwarnings('ignore')

## Helper functions

In [None]:
import os
from dotenv import load_dotenv
import json
import base64
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials
import matplotlib.pyplot as plt
import mplcursors
import numpy as np

def authenticate():
    """Return a ``(credentials, project_id)`` pair for the notebook.

    As written, the first statement returns the hard-coded pair
    ``("DALI-credentials", "DLAI-PROJECT")`` unconditionally, so none of the
    code after it (loading ``.env``, decoding the service-account key,
    building Google credentials, reading ``PROJECT_ID``) ever runs.

    NOTE(review): the early return looks like a placeholder for a hosted
    environment -- confirm whether it should be removed so that the real
    service-account flow below is used.
    """
    # Placeholder values; everything after this line is unreachable.
    return "DALI-credentials", "DLAI-PROJECT"
    # Load variables from a .env file into the process environment
    load_dotenv()
    
    # The key is read from SERVICE_ACCOUNT_KEY as base64 text and decoded
    # back into the JSON string of the service-account key.
    # (os.getenv returns None when the variable is unset, in which case
    # the .encode() call below would raise AttributeError.)
    SERVICE_ACCOUNT_KEY_STRING_B64 = os.getenv('SERVICE_ACCOUNT_KEY')
    SERVICE_ACCOUNT_KEY_BYTES_B64 = SERVICE_ACCOUNT_KEY_STRING_B64.encode("ascii")
    SERVICE_ACCOUNT_KEY_STRING_BYTES = base64.b64decode(SERVICE_ACCOUNT_KEY_BYTES_B64)
    SERVICE_ACCOUNT_KEY_STRING = SERVICE_ACCOUNT_KEY_STRING_BYTES.decode("ascii")

    # Parse the decoded JSON text into a dict
    SERVICE_ACCOUNT_KEY = json.loads(SERVICE_ACCOUNT_KEY_STRING)


    # Create credentials based on key from service account
    # Make sure your account has the roles listed in the Google Cloud Setup section
    credentials = Credentials.from_service_account_info(
        SERVICE_ACCOUNT_KEY,
        scopes=['https://www.googleapis.com/auth/cloud-platform'])

    # NOTE(review): refresh only happens when `expired` is true -- confirm
    # that freshly created credentials (no token yet) report as expired;
    # otherwise checking `valid` may be what is intended.
    if credentials.expired:
        credentials.refresh(Request())
    
    # Set project ID according to environment variable
    PROJECT_ID = os.getenv('PROJECT_ID')
        
    return credentials, PROJECT_ID

In [None]:
def plot_heatmap(data, x_labels=None, y_labels=None, title=None):
    """Draw a 2-D array as a heatmap and show it.

    Args:
        data: 2-D array-like of numbers; rows are drawn along the y axis and
            columns along the x axis. Converted with ``np.asarray`` so plain
            nested lists work as well as NumPy arrays.
        x_labels: Optional sequence with one label per column.
        y_labels: Optional sequence with one label per row.
        title: Optional title for the axes.
    """
    data = np.asarray(data)

    fig, ax = plt.subplots(figsize=(50, 3))
    heatmap = ax.pcolor(data, cmap='coolwarm', edgecolors='k', linewidths=0.1)

    # A color bar is created and immediately removed again, so the final
    # figure shows no color bar.
    cbar = plt.colorbar(heatmap, ax=ax)
    cbar.remove()

    # Set labels for each axis. The checks use `is not None` plus a length
    # test instead of truthiness so that NumPy arrays / pandas objects are
    # accepted (their truth value is ambiguous) while empty label sequences
    # are still skipped. Ticks sit at the center of each cell (+0.5).
    if x_labels is not None and len(x_labels) > 0:
        ax.set_xticks(np.arange(data.shape[1]) + 0.5, minor=False)
        ax.set_xticklabels(x_labels, rotation=45, ha="right")
    if y_labels is not None and len(y_labels) > 0:
        ax.set_yticks(np.arange(data.shape[0]) + 0.5, minor=False)
        ax.set_yticklabels(y_labels, va="center")

    # Set title
    if title:
        ax.set_title(title)

    plt.tight_layout()

    # Show the plot
    plt.show()
    
def plot_2D(x_values, y_values, labels):
    """Scatter-plot 2-D points and show the matching label on hover.

    Args:
        x_values: x coordinates of the points.
        y_values: y coordinates of the points.
        labels: Indexable collection; ``labels[i]`` is shown when point ``i``
            is hovered.
    """
    figure, axes = plt.subplots()
    points = axes.scatter(x_values, y_values, alpha=0.5, edgecolors='k', s=40)

    # Hover-driven cursor that manages the annotations for the points
    hover_cursor = mplcursors.cursor(points, hover=True)

    # Titles and axis names
    axes.set_title('Embedding visualization in 2D')
    axes.set_xlabel('X_1')
    axes.set_ylabel('X_2')

    def style_annotation(selection):
        # Text comes from the label at the hovered point's index
        selection.annotation.set_text(labels[selection.target.index])
        # Semi-transparent white background behind the text
        selection.annotation.get_bbox_patch().set(facecolor='white', alpha=0.5)
        selection.annotation.set_fontsize(12)

    # Run the styling callback every time an annotation is added
    hover_cursor.connect("add", style_annotation)

    plt.show()

In [None]:
def generate_batches(sentences, batch_size = 5):
    """Yield consecutive slices of ``sentences`` holding at most ``batch_size`` items."""
    batch_starts = range(0, len(sentences), batch_size)
    yield from (sentences[lo : lo + batch_size] for lo in batch_starts)

def encode_texts_to_embeddings(sentences):
    """Embed one batch of sentences with ``textembedding-gecko@001``.

    Args:
        sentences: Sequence of texts sent to the model in a single call.

    Returns:
        A list with one entry per sentence: the embedding values on success,
        or ``None`` for every sentence when the request fails.
    """
    import logging  # local import: not provided by the notebook's import cell

    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        # Best effort: a failed batch must not abort the whole run, but the
        # reason is logged instead of being dropped silently.
        logging.getLogger(__name__).warning(
            "Embedding request failed for a batch of %d sentences",
            len(sentences),
            exc_info=True,
        )
        return [None] * len(sentences)
        
def encode_text_to_embedding_batched(sentences, api_calls_per_second = 0.33, batch_size = 5):
    """Embed ``sentences`` in rate-limited batches and stack the results.

    Batches are submitted to a thread pool one at a time, sleeping
    ``1 / api_calls_per_second`` seconds after each submission.

    Args:
        sentences: Sequence of texts to embed.
        api_calls_per_second: Submission rate used to derive the sleep time.
        batch_size: Maximum number of sentences per request.

    Returns:
        ``np.squeeze`` of the stacked successful embeddings. Entries that
        came back as ``None`` (failed batches) are dropped, so rows only line
        up with ``sentences`` when no batch failed. Because of the squeeze, a
        single successful embedding is returned as a 1-D array.

    Raises:
        ValueError: If no embedding was produced (empty input or every batch
            failed).
    """
    # Local stdlib imports: the notebook's import cell does not bring these
    # names into scope.
    # NOTE(review): `tqdm` is still expected to exist in the notebook
    # namespace (it is not imported anywhere in this file) -- confirm.
    import math
    import time
    from concurrent.futures import ThreadPoolExecutor

    embeddings_list = []

    # Prepare the batches using a generator
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total = math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(executor.submit(encode_texts_to_embeddings, batch))
            # Throttle submissions to respect the requested call rate
            time.sleep(seconds_per_job)

        # Collect in submission order so embeddings follow the input order
        for future in futures:
            embeddings_list.extend(future.result())

    successful = [embedding for embedding in embeddings_list if embedding is not None]
    if not successful:
        # np.stack([]) would raise an opaque ValueError; say what happened.
        raise ValueError(
            "No embeddings were generated: the input was empty or every batch failed"
        )
    return np.squeeze(np.stack(successful))

def clusters_2D(x_values, y_values, labels, kmeans_labels):
    """Scatter-plot 2-D points colored by cluster, with hover annotations.

    Args:
        x_values: x coordinates of the points.
        y_values: y coordinates of the points.
        labels: Object whose ``category`` attribute is indexable;
            ``labels.category[i]`` is shown when point ``i`` is hovered.
        kmeans_labels: Per-point values used to color the points.
    """
    figure, axes = plt.subplots()
    points = axes.scatter(
        x_values,
        y_values,
        c=kmeans_labels,
        cmap='Set1',
        alpha=0.5,
        edgecolors='k',
        s=40,
    )

    # Hover-driven cursor that manages the annotations for the points
    hover_cursor = mplcursors.cursor(points, hover=True)

    # Titles and axis names
    axes.set_title('Embedding clusters visualization in 2D')
    axes.set_xlabel('X_1')
    axes.set_ylabel('X_2')

    def style_annotation(selection):
        # Text is the category stored at the hovered point's index
        selection.annotation.set_text(labels.category[selection.target.index])
        # Nearly opaque white background behind the text
        selection.annotation.get_bbox_patch().set(facecolor='white', alpha=0.95)
        selection.annotation.set_fontsize(14)

    # Run the styling callback every time an annotation is added
    hover_cursor.connect("add", style_annotation)

    plt.show()

In [None]:
def generate_batches(sentences, batch_size = 5):
    """Lazily produce ``sentences`` in chunks of up to ``batch_size`` items."""
    for offset in range(0, len(sentences), batch_size):
        chunk_end = offset + batch_size
        chunk = sentences[offset:chunk_end]
        yield chunk

def encode_texts_to_embeddings(sentences):
    """Embed one batch of sentences with ``textembedding-gecko@001``.

    Args:
        sentences: Sequence of texts sent to the model in a single call.

    Returns:
        A list with one entry per sentence: the embedding values on success,
        or ``None`` for every sentence when the request fails.
    """
    import logging  # local import: not provided by the notebook's import cell

    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        # Best effort: a failed batch must not abort the whole run, but the
        # reason is logged instead of being dropped silently.
        logging.getLogger(__name__).warning(
            "Embedding request failed for a batch of %d sentences",
            len(sentences),
            exc_info=True,
        )
        return [None] * len(sentences)
        
def encode_text_to_embedding_batched(sentences, api_calls_per_second = 0.33, batch_size = 5):
    """Embed ``sentences`` in rate-limited batches and stack the results.

    Batches are submitted to a thread pool one at a time, sleeping
    ``1 / api_calls_per_second`` seconds after each submission.

    Args:
        sentences: Sequence of texts to embed.
        api_calls_per_second: Submission rate used to derive the sleep time.
        batch_size: Maximum number of sentences per request.

    Returns:
        ``np.squeeze`` of the stacked successful embeddings. Entries that
        came back as ``None`` (failed batches) are dropped, so rows only line
        up with ``sentences`` when no batch failed. Because of the squeeze, a
        single successful embedding is returned as a 1-D array.

    Raises:
        ValueError: If no embedding was produced (empty input or every batch
            failed).
    """
    # Local stdlib imports: the notebook's import cell does not bring these
    # names into scope.
    # NOTE(review): `tqdm` is still expected to exist in the notebook
    # namespace (it is not imported anywhere in this file) -- confirm.
    import math
    import time
    from concurrent.futures import ThreadPoolExecutor

    embeddings_list = []

    # Prepare the batches using a generator
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total = math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(executor.submit(encode_texts_to_embeddings, batch))
            # Throttle submissions to respect the requested call rate
            time.sleep(seconds_per_job)

        # Collect in submission order so embeddings follow the input order
        for future in futures:
            embeddings_list.extend(future.result())

    successful = [embedding for embedding in embeddings_list if embedding is not None]
    if not successful:
        # np.stack([]) would raise an opaque ValueError; say what happened.
        raise ValueError(
            "No embeddings were generated: the input was empty or every batch failed"
        )
    return np.squeeze(np.stack(successful))

# configure ScaNN as a tree - asymmetric hash hybrid with reordering
# anisotropic quantization as described in the paper; see README
def create_index(embedded_dataset, 
                 num_leaves,
                 num_leaves_to_search,
                 training_sample_size):
    """Build a ScaNN searcher over row-normalized embeddings.

    Args:
        embedded_dataset: 2-D array-like with one embedding per row.
        num_leaves: Passed to the ScaNN ``tree`` step.
        num_leaves_to_search: Passed to the ScaNN ``tree`` step.
        training_sample_size: Passed to the ScaNN ``tree`` step.

    Returns:
        The searcher produced by the ScaNN builder chain
        (``tree`` -> ``score_ah`` -> ``reorder`` -> ``build``).
    """
    dataset = np.asarray(embedded_dataset)

    # Normalize each row to unit length so that the "dot_product" measure
    # used below behaves like cosine similarity.
    row_norms = np.linalg.norm(dataset, axis=1, keepdims=True)
    # An all-zero row has norm 0; dividing by it would put NaN into the
    # index. Dividing by 1 instead leaves such rows as zeros.
    row_norms[row_norms == 0] = 1
    normalized_dataset = dataset / row_norms

    searcher = (
        scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product")
        .tree(
            num_leaves = num_leaves,
            num_leaves_to_search = num_leaves_to_search,
            training_sample_size = training_sample_size,
        )
        .score_ah(2, anisotropic_quantization_threshold = 0.2)
        .reorder(100)
        .build()
    )
    return searcher

## Embeddings

In [None]:
# `credentials` and `PROJECT_ID` are used throughout the notebook but were
# never assigned; obtain them from the authenticate() helper defined above.
credentials, PROJECT_ID = authenticate()
print(PROJECT_ID)

In [None]:
REGION = 'us-central1'

In [None]:
# Import and initialize the Vertex AI Python SDK

import vertexai
vertexai.init(project = PROJECT_ID, 
              location = REGION, 
              credentials = credentials)

### Try embeddings model

In [None]:
from vertexai.language_models import TextEmbeddingModel

### Text embeddings

In [None]:
embedding_model = TextEmbeddingModel.from_pretrained(
    "textembedding-gecko@001")

In [None]:
embedding = embedding_model.get_embeddings(
    ["life"])

In [None]:
vector = embedding[0].values
print(f"Length = {len(vector)}")
print(vector[:10])

In [None]:
embedding = embedding_model.get_embeddings(
    ["What is the meaning of life?"])

In [None]:
vector = embedding[0].values
print(f"Length = {len(vector)}")
print(vector[:10])

### Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
emb_1 = embedding_model.get_embeddings(
    ["What is the meaning of life?"]) # 42!

emb_2 = embedding_model.get_embeddings(
    ["How does one spend their time well on Earth?"])

emb_3 = embedding_model.get_embeddings(
    ["Would you like a salad?"])

vec_1 = [emb_1[0].values]
vec_2 = [emb_2[0].values]
vec_3 = [emb_3[0].values]

In [None]:
print(cosine_similarity(vec_1,vec_2)) 
print(cosine_similarity(vec_2,vec_3))
print(cosine_similarity(vec_1,vec_3))

### Sentence embeddings

In [None]:
in_1 = "The kids play in the park."
in_2 = "The play was for kids in the park."

In [None]:
in_pp_1 = ["kids", "play", "park"]
in_pp_2 = ["play", "kids", "park"]

In [None]:
embeddings_1 = [emb.values for emb in embedding_model.get_embeddings(in_pp_1)]

In [None]:
import numpy as np
emb_array_1 = np.stack(embeddings_1)
print(emb_array_1.shape)

In [None]:
embeddings_2 = [emb.values for emb in embedding_model.get_embeddings(in_pp_2)]
emb_array_2 = np.stack(embeddings_2)
print(emb_array_2.shape)

In [None]:
emb_1_mean = emb_array_1.mean(axis = 0) 
print(emb_1_mean.shape)

In [None]:
emb_2_mean = emb_array_2.mean(axis = 0)

In [None]:
print(emb_1_mean[:4])
print(emb_2_mean[:4])

In [None]:
print(in_1)
print(in_2)

In [None]:
embedding_1 = embedding_model.get_embeddings([in_1])
embedding_2 = embedding_model.get_embeddings([in_2])

In [None]:
vector_1 = embedding_1[0].values
print(vector_1[:4])
vector_2 = embedding_2[0].values
print(vector_2[:4])

## Visualize embeddings

In [None]:
# Import and initialize the Vertex AI Python SDK

import vertexai
vertexai.init(project=PROJECT_ID, 
              location=REGION, 
              credentials = credentials)

### Meanings

In [None]:
in_1 = "Missing flamingo discovered at swimming pool"

in_2 = "Sea otter spotted on surfboard by beach"

in_3 = "Baby panda enjoys boat ride"


in_4 = "Breakfast themed food truck beloved by all!"

in_5 = "New curry restaurant aims to please!"


in_6 = "Python developers are wonderful people"

in_7 = "TypeScript, C++ or Java? All are great!" 


input_text_lst_news = [in_1, in_2, in_3, in_4, in_5, in_6, in_7]

In [None]:
import numpy as np
from vertexai.language_models import TextEmbeddingModel

embedding_model = TextEmbeddingModel.from_pretrained(
    "textembedding-gecko@001")

In [None]:
embeddings = []
for input_text in input_text_lst_news:
    emb = embedding_model.get_embeddings(
        [input_text])[0].values
    embeddings.append(emb)
    
embeddings_array = np.array(embeddings) 

In [None]:
print("Shape: " + str(embeddings_array.shape))
print(embeddings_array)

### Dimension reduction

In [None]:
from sklearn.decomposition import PCA

# Perform PCA for 2D visualization
PCA_model = PCA(n_components = 2)
PCA_model.fit(embeddings_array)
new_values = PCA_model.transform(embeddings_array)

In [None]:
print("Shape: " + str(new_values.shape))
print(new_values)

In [None]:
import matplotlib.pyplot as plt
import mplcursors
%matplotlib ipympl

from utils import plot_2D
plot_2D(new_values[:,0], new_values[:,1], input_text_lst_news)

### Similarity

In [None]:
in_1 = """He couldn’t desert 
          his post at the power plant."""

in_2 = """The power plant needed 
          him at the time."""

in_3 = """Cacti are able to 
          withstand dry environments.""" 

in_4 = """Desert plants can 
          survive droughts.""" 

input_text_lst_sim = [in_1, in_2, in_3, in_4]

In [None]:
embeddings = []
for input_text in input_text_lst_sim:
    emb = embedding_model.get_embeddings([input_text])[0].values
    embeddings.append(emb)
    
embeddings_array = np.array(embeddings) 

In [None]:
y_labels = input_text_lst_sim

# Plot the heatmap
plot_heatmap(embeddings_array, y_labels = y_labels, title = "Embeddings Heatmap")

### Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def compare(embeddings,idx1,idx2):
    """Return the cosine similarity between two entries of ``embeddings``.

    Each selected entry is wrapped in a list, so it is passed to
    ``cosine_similarity`` as a single-row input and the result is whatever
    matrix that call returns for one row against one row.
    """
    left_row = [embeddings[idx1]]
    right_row = [embeddings[idx2]]
    return cosine_similarity(left_row, right_row)

In [None]:
print(in_1)
print(in_2)
print(compare(embeddings,0,1))

In [None]:
print(in_1)
print(in_4)
print(compare(embeddings,0,3))

## Application of embeddings

### Load stackoverflow data

In [None]:
from google.cloud import bigquery
import pandas as pd

In [None]:
def run_bq_query(sql):
    """Run ``sql`` on BigQuery and return the result as a pandas DataFrame.

    The query is first submitted as a dry run (query cache disabled) and
    then submitted for real. The finished job's id is printed before the
    frame is returned.

    Args:
        sql: Query text to execute.

    Returns:
        The query result converted via Arrow to a pandas DataFrame.
    """
    # One client per call, bound to the notebook's project and credentials
    client = bigquery.Client(project = PROJECT_ID,
                             credentials = credentials)

    # Dry run first so problems with the query surface before it is executed
    dry_run_config = bigquery.QueryJobConfig(dry_run=True,
                                             use_query_cache=False)
    client.query(sql, job_config=dry_run_config)

    # Real execution with a default job configuration
    query_job = client.query(sql, job_config=bigquery.QueryJobConfig())
    job_id = query_job.job_id

    # result() blocks until the job is done; then convert to a data frame
    result_frame = query_job.result().to_arrow().to_pandas()
    print(f"Finished job_id: {job_id}")
    return result_frame

In [None]:
# define list of programming language tags we want to query

language_list = ["python", "html", "r", "css"]

In [None]:
# Collect one frame per language and concatenate once at the end: calling
# pd.concat inside the loop re-copies the accumulated rows on every pass and
# concatenating onto an empty seed frame triggers warnings on newer pandas.
language_frames = []

for language in language_list:
    
    print(f"generating {language} dataframe")
    
    # NOTE(review): REGEXP_CONTAINS does a partial match, so a short tag
    # such as "r" also matches any tag string containing that letter --
    # confirm this is the intended selection.
    query = f"""
    SELECT
        CONCAT(q.title, q.body) as input_text,
        a.body AS output_text
    FROM
        `bigquery-public-data.stackoverflow.posts_questions` q
    JOIN
        `bigquery-public-data.stackoverflow.posts_answers` a
    ON
        q.accepted_answer_id = a.id
    WHERE 
        q.accepted_answer_id IS NOT NULL AND 
        REGEXP_CONTAINS(q.tags, "{language}") AND
        a.creation_date >= "2020-01-01"
    LIMIT 
        500
    """

    
    language_df = run_bq_query(query)
    language_df["category"] = language
    language_frames.append(language_df)

# An empty language_list yields an empty frame, as before.
so_df = (pd.concat(language_frames, ignore_index = True)
         if language_frames else pd.DataFrame())

In [None]:
# Run this cell if you get any errors or you don't want to wait for the query to be completed
# so_df = pd.read_csv('so_database_app.csv')

In [None]:
so_df

### Generate text embeddings

In [None]:
from vertexai.language_models import TextEmbeddingModel

In [None]:
model = TextEmbeddingModel.from_pretrained(
    "textembedding-gecko@001")

In [None]:
import time
import numpy as np

In [None]:
# Generator function to yield batches of sentences

def generate_batches(sentences, batch_size = 5):
    """Generate successive ``batch_size``-sized slices of ``sentences``.

    The final slice is shorter when the length is not a multiple of
    ``batch_size``.
    """
    n_items = len(sentences)
    for first in range(0, n_items, batch_size):
        yield sentences[first : first + batch_size]

In [None]:
so_questions = so_df[0:200].input_text.tolist() 
batches = generate_batches(sentences = so_questions)

In [None]:
batch = next(batches)
len(batch)

### Get embeddings on a batch of data

In [None]:
def encode_texts_to_embeddings(sentences):
    """Embed one batch of sentences with the notebook-level ``model``.

    Args:
        sentences: Sequence of texts sent to the model in a single call.

    Returns:
        A list with one entry per sentence: the embedding values on success,
        or ``None`` for every sentence when the request fails.
    """
    import logging  # local import: not provided by the notebook's import cells

    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        # Best effort: a failed batch must not abort the whole run, but the
        # reason is logged instead of being dropped silently.
        logging.getLogger(__name__).warning(
            "Embedding request failed for a batch of %d sentences",
            len(sentences),
            exc_info=True,
        )
        return [None] * len(sentences)

In [None]:
batch_embeddings = encode_texts_to_embeddings(batch)

In [None]:
f"{len(batch_embeddings)} embeddings of size \
{len(batch_embeddings[0])}"

### Get entire data

In [None]:
so_questions = so_df.input_text.tolist()
question_embeddings = encode_text_to_embedding_batched(
                            sentences=so_questions,
                            api_calls_per_second = 20/60, 
                            batch_size = 5)

### Load data from file

In [None]:
so_df = pd.read_csv("../data/so_database_app.csv")
so_df.head()

In [None]:
import pickle

In [None]:
with open("../data/question_embeddings_app.pkl", 'rb') as file:
    question_embeddings = pickle.load(file)

In [None]:
print("Shape: " + str(question_embeddings.shape))
print(question_embeddings)

### Cluster embeddings

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
clustering_dataset = question_embeddings[:1000]

In [None]:
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, 
                random_state=0, 
                n_init = 'auto').fit(clustering_dataset)

In [None]:
kmeans_labels = kmeans.labels_

In [None]:
PCA_model = PCA(n_components=2)
PCA_model.fit(clustering_dataset)
new_values = PCA_model.transform(clustering_dataset)

In [None]:
import matplotlib.pyplot as plt
import mplcursors
%matplotlib ipympl

In [None]:
clusters_2D(x_values = new_values[:,0], y_values = new_values[:,1], 
            labels = so_df[:1000], kmeans_labels = kmeans_labels)

### Anomaly / outlier detection

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
input_text = """I am making cookies but don't 
                remember the correct ingredient proportions. 
                I have been unable to find 
                anything on the web."""

In [None]:
emb = model.get_embeddings([input_text])[0].values

In [None]:
embeddings_l = question_embeddings.tolist()
embeddings_l.append(emb)

In [None]:
embeddings_array = np.array(embeddings_l)

In [None]:
print("Shape: " + str(embeddings_array.shape))
print(embeddings_array)

In [None]:
# Add the outlier text to the end of the stack overflow dataframe
# (same data path as every other load of this CSV in the notebook)
so_df = pd.read_csv("../data/so_database_app.csv")
new_row = pd.Series([input_text, None, "baking"], 
                    index=so_df.columns)
# The frame read from CSV is indexed 0..len-1, so the next free label is
# len(so_df); using len(so_df)+1 would leave a gap in the index.
so_df.loc[len(so_df)] = new_row
so_df.tail()

In [None]:
clf = IsolationForest(contamination=0.005, 
                      random_state = 2) 

In [None]:
preds = clf.fit_predict(embeddings_array)

print(f"{len(preds)} predictions. Set of possible values: {set(preds)}")

In [None]:
so_df.loc[preds == -1]

In [None]:
so_df = so_df.drop(so_df.index[-1])

In [None]:
so_df

### Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# re-load the dataset from file
so_df = pd.read_csv("../data/so_database_app.csv")
X = question_embeddings
X.shape

In [None]:
y = so_df['category'].values
y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.2, 
                                                    random_state = 2)

In [None]:
clf = RandomForestClassifier(n_estimators=200)

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred) # compute accuracy
print("Accuracy:", accuracy)

In [None]:
# choose a number between 0 and 1999
i = 2
label = so_df.loc[i,'category']
question = so_df.loc[i,'input_text']

# get the embedding of this question and predict its category
question_embedding = model.get_embeddings([question])[0].values
pred = clf.predict([question_embedding])

print(f"For question {i}, the prediction is `{pred[0]}`")
print(f"The actual label is `{label}`")
print("The question text is:")
print("-"*50)
print(question)

## Text generation

### Prompt model

In [None]:
import vertexai
vertexai.init(project=PROJECT_ID, 
              location=REGION, 
              credentials = credentials)

In [None]:
from vertexai.language_models import TextGenerationModel

In [None]:
generation_model = TextGenerationModel.from_pretrained(
    "text-bison@001")

### Question answering

In [None]:
prompt = "I'm a high school student. \
Recommend me a programming activity to improve my skills."

In [None]:
print(generation_model.predict(prompt=prompt).text)

### Classify and elaborate

In [None]:
prompt = """I'm a high school student. \
Which of these activities do you suggest and why:
a) learn Python
b) learn Javascript
c) learn Fortran
"""

In [None]:
print(generation_model.predict(prompt=prompt).text)

### Extract information

In [None]:
prompt = """ A bright and promising wildlife biologist \
named Jesse Plank (Amara Patel) is determined to make her \
mark on the world. 
Jesse moves to Texas for what she believes is her dream job, 
only to discover a dark secret that will make \
her question everything. 
In the new lab she quickly befriends the outgoing \
lab tech named Maya Jones (Chloe Nguyen), 
and the lab director Sam Porter (Fredrik Johansson). 
Together the trio work long hours on their research \
in a hope to change the world for good. 
Along the way they meet the comical \
Brenna Ode (Eleanor Garcia) who is a marketing lead \
at the research institute, 
and marine biologist Siri Teller (Freya Johansson).

Extract the characters, their jobs \
and the actors who played them from the above message as a table
"""

In [None]:
response = generation_model.predict(prompt=prompt)

print(response.text)

### Adjust randomness

In [None]:
temperature = 0.0

In [None]:
prompt = "Complete the sentence: \
As I prepared the picture frame, \
I reached into my toolkit to fetch my:"

In [None]:
response = generation_model.predict(
    prompt=prompt,
    temperature=temperature,
)

In [None]:
print(f"[temperature = {temperature}]")
print(response.text)

In [None]:
temperature = 1.0

In [None]:
response = generation_model.predict(
    prompt=prompt,
    temperature=temperature,
)

In [None]:
print(f"[temperature = {temperature}]")
print(response.text)

### Top P

In [None]:
top_p = 0.2

In [None]:
prompt = "Write an advertisement for jackets \
that involves blue elephants and avocados."

In [None]:
response = generation_model.predict(
    prompt=prompt, 
    temperature=0.9, 
    top_p=top_p,
)

In [None]:
print(f"[top_p = {top_p}]")
print(response.text)

### Top K

In [None]:
top_k = 20
top_p = 0.7

In [None]:
response = generation_model.predict(
    prompt=prompt, 
    temperature=0.9, 
    top_k=top_k,
    top_p=top_p,
)

In [None]:
# This response was generated with both top_k and top_p set, so report both.
print(f"[top_k = {top_k}, top_p = {top_p}]")
print(response.text)

## Semantic Search, Building a Q&A System

### Load stackoverflow questions

In [None]:
import pandas as pd

In [None]:
so_database = pd.read_csv("../data/so_database_app.csv")

In [None]:
print("Shape: " + str(so_database.shape))
print(so_database)

### Load question embeddings

In [None]:
from vertexai.language_models import TextEmbeddingModel

In [None]:
embedding_model = TextEmbeddingModel.from_pretrained(
    "textembedding-gecko@001")

In [None]:
import numpy as np


In [None]:
so_questions = so_database.input_text.tolist()
question_embeddings = encode_text_to_embedding_batched(
            sentences = so_questions,
            api_calls_per_second = 20/60, 
            batch_size = 5)

In [None]:
import pickle
with open("../data/question_embeddings_app.pkl", "rb") as file:
      
    # Call load method to deserialze
    question_embeddings = pickle.load(file)
  
    print(question_embeddings)

In [None]:
so_database['embeddings'] = question_embeddings.tolist()

In [None]:
so_database

### Semantic search

In [None]:
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances_argmin as distances_argmin

In [None]:
query = ['How to concat dataframes pandas']

In [None]:
query_embedding = embedding_model.get_embeddings(query)[0].values

In [None]:
cos_sim_array = cosine_similarity([query_embedding],
                                  list(so_database.embeddings.values))

In [None]:
cos_sim_array.shape

In [None]:
index_doc_cosine = np.argmax(cos_sim_array)

In [None]:
index_doc_distances = distances_argmin([query_embedding], 
                                       list(so_database.embeddings.values))[0]

In [None]:
so_database.input_text[index_doc_cosine]

In [None]:
so_database.output_text[index_doc_cosine]

### Question answering with relevant context

In [None]:
from vertexai.language_models import TextGenerationModel

In [None]:
generation_model = TextGenerationModel.from_pretrained(
    "text-bison@001")

In [None]:
context = "Question: " + so_database.input_text[index_doc_cosine] +\
"\n Answer: " + so_database.output_text[index_doc_cosine]

In [None]:
# `query` is a one-element list, so interpolate its text (query[0]) rather
# than the list's repr; the stray double quote after the query was removed.
prompt = f"""Here is the context: {context}
             Using the relevant information from the context,
             provide an answer to the query: {query[0]}.
             If the context doesn't provide \
             any relevant information, \
             answer with \
             [I couldn't find a good match in the \
             document database for your query]
             """

In [None]:
from IPython.display import Markdown, display

t_value = 0.2
response = generation_model.predict(prompt = prompt,
                                    temperature = t_value,
                                    max_output_tokens = 1024)

display(Markdown(response.text))

In [None]:
query = ['How to make the perfect lasagna']

In [None]:
query_embedding = embedding_model.get_embeddings(query)[0].values

In [None]:
cos_sim_array = cosine_similarity([query_embedding], 
                                  list(so_database.embeddings.values))

In [None]:
cos_sim_array

In [None]:
index_doc = np.argmax(cos_sim_array)

In [None]:
# Same "Question: ... Answer: ..." layout as the context built for the
# first query, so both prompts present the retrieved document identically.
context = "Question: " + so_database.input_text[index_doc] + \
"\n Answer: " + so_database.output_text[index_doc]

In [None]:
# `query` is a one-element list, so interpolate its text (query[0]) rather
# than the list's repr; the stray double quote after the query was removed.
prompt = f"""Here is the context: {context}
             Using the relevant information from the context,
             provide an answer to the query: {query[0]}.
             If the context doesn't provide \
             any relevant information, answer with 
             [I couldn't find a good match in the \
             document database for your query]
             """

In [None]:
t_value = 0.2
response = generation_model.predict(prompt = prompt,
                                    temperature = t_value,
                                    max_output_tokens = 1024)
display(Markdown(response.text))

### Scale with approximate nearest neighbor search

In [None]:
import scann

#Create index using scann
index = create_index(embedded_dataset = question_embeddings, 
                     num_leaves = 25,
                     num_leaves_to_search = 10,
                     training_sample_size = 2000)

In [None]:
query = "how to concat dataframes pandas"

In [None]:
import time 

# Time the embedding request plus the approximate nearest-neighbor lookup
start = time.time()
query_embedding = embedding_model.get_embeddings([query])[0].values
neighbors, distances = index.search(query_embedding, final_num_neighbors = 1)
end = time.time()

# `doc_id` instead of `id`: a loop variable named `id` would shadow the
# builtin for every later cell of the notebook.
for doc_id, dist in zip(neighbors, distances):
    print(f"[docid:{doc_id}] [{dist}] -- {so_database.input_text[int(doc_id)][:125]}...")

print("Latency (ms):", 1000 * (end - start))

In [None]:
start = time.time()
query_embedding = embedding_model.get_embeddings([query])[0].values
cos_sim_array = cosine_similarity([query_embedding], list(so_database.embeddings.values))
index_doc = np.argmax(cos_sim_array)
end = time.time()

print(f"[docid:{index_doc}] [{np.max(cos_sim_array)}] -- {so_database.input_text[int(index_doc)][:125]}...")

print("Latency (ms):", 1000 * (end - start))