# Build and Save FAISS Index

### This notebook embeds ad descriptions as vectors, stores them in a FAISS index, and retrieves the ads most similar to a given prompt or context

## Installation & Import

In [1]:
pip install openai faiss-cpu pandas



## Import the required packages and load the dataset from GitHub

In [3]:
import ast
import os

import faiss
import numpy as np
import openai
import pandas as pd

# Configure the OpenAI API key without hardcoding it in the notebook.
# `AI_KEY` is honoured when an earlier cell defined it; otherwise the key is
# read from the OPENAI_API_KEY environment variable. Failing here with a clear
# message avoids a NameError on a fresh kernel where AI_KEY does not exist.
openai.api_key = globals().get("AI_KEY") or os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "No OpenAI API key found: define AI_KEY or set the OPENAI_API_KEY "
        "environment variable before running this cell."
    )

# Load advertisement data
csv_file = "https://raw.githubusercontent.com/m1chae11u/llm-ad-integration/refs/heads/main/sampled_ads.csv"
df = pd.read_csv(csv_file)

print(df.head())


   product_id    ad_id         user_search_query  \
0     2634449     6511                      spad   
1     1588292  3219024     e1796 bausch %26 lomb   
2     3285600   218399           ps4 used amazon   
3     4688264   147690  vegetable bouillon cubes   
4     4539593   890666       star sapphire rings   

                                            ad_title  \
0                                   Spad XIII French   
1  Bausch + Lomb Biotrue Multi-Purpose Solution -...   
2  Sony PlayStation 4 VR, Astro BOT Rescue Missio...   
3  Massel 7's, Vegan Bouillon Stock Cubes - Glute...   
4  Sterling Silver 7mm Created Sapphire Ring with...   

                                      ad_description                  url  \
0  Are you one of the legions of aircraft enthusi...  simpshopifyapps.com   
1  Inspired by The Biology of Your Eyes- Works li...           google.com   
2  VR gaming fun for the whole family. Color: White.          walmart.com   
3  QUALITY VEGETABLE STOCK - Turn normal m

## Parse through the dataframe and clean the data



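*   `ad_key_words`: parse the stringified list and join its entries into plain text (a non-string value becomes an empty string)
*   `ad_benefits`: handled the same way as `ad_key_words`

**Note:** this cleaning step is currently commented out in the cell below, so only `ad_description` is embedded.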



In [8]:
# NOTE: the richer preprocessing below (product + description + keywords +
# benefits) is disabled; only the raw ad description is embedded for now.
#
# def preprocess_ad(row):
#     keywords = " ".join(ast.literal_eval(row["ad_key_words"])) if isinstance(row["ad_key_words"], str) else ""
#     benefits = " ".join(ast.literal_eval(row["ad_benefits"])) if isinstance(row["ad_benefits"], str) else ""
#     return f"{row['product']} {row['ad_description']} {keywords} {benefits}"
#
# ad_texts = df.apply(preprocess_ad, axis=1).tolist()

# Ad IDs and the text to embed for each ad, both in DataFrame row order so
# that position i in one list refers to the same ad as position i in the other.
ad_ids = df["ad_id"].to_list()
ad_texts = df["ad_description"].to_list()

## Use the OpenAI API to embed the ads

Compute an embedding vector for each ad description in the dataset so that we can retrieve the ads most similar to a user-provided prompt.

In [9]:
# Function to get embeddings from OpenAI
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for ``text``.

    Args:
        text: The input to embed. Normally a non-empty string; a non-empty
            list is passed through to the API unchanged.
        model: Name of the OpenAI embedding model to use. Defaults to
            ``"text-embedding-ada-002"``.

    Returns:
        The ``embedding`` of the first item in the API response.

    Raises:
        ValueError: If ``text`` is empty or is neither a string nor a list
            (for example ``None`` or a NaN from a missing DataFrame cell).
    """
    # Fail fast with a clear message instead of spending a network round trip
    # on an input that cannot be embedded.
    if not isinstance(text, (str, list)) or len(text) == 0:
        raise ValueError(f"Cannot embed empty or non-text input: {text!r}")

    response = openai.embeddings.create(input=text, model=model)
    return response.data[0].embedding

# Embed every ad text (one get_embedding call per ad) and stack the vectors
# into a float32 array with one row per ad.
ad_embeddings = np.asarray(list(map(get_embedding, ad_texts)), dtype=np.float32)

## Create the FAISS index

The FAISS index stores all of the embedding vectors, which lets us retrieve the ads whose embeddings are most similar to the user's query.


In [18]:
# Build a flat L2-distance FAISS index sized to the embedding width
# (1536 for text-embedding-ada-002) and load every ad vector into it.
embedding_dim = ad_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(ad_embeddings)

# Persist the index, plus the ad metadata columns as a CSV next to it.
faiss.write_index(index, "ads_faiss.index")
df.loc[:, ["ad_id", "ad_title", "ad_description"]].to_csv(
    "ads_metadata.csv",
    index=False,
)

print("FAISS index and metadata saved successfully!")

FAISS index and metadata saved successfully!


## Test searching the FAISS index with an example query


In [19]:
# Function to search for similar ads
def search_similar_ads(query, top_k=5):
    """Return the ads whose embeddings are closest to ``query``.

    Args:
        query: Text to embed and search for.
        top_k: Maximum number of ads to return.

    Returns:
        A list of ``{"ad_id": ..., "distance": ...}`` dicts in the order the
        index returned them. Fewer than ``top_k`` entries are returned when
        the index cannot supply ``top_k`` results.
    """
    query_embedding = np.array(get_embedding(query)).astype("float32").reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads missing results with index -1; ad_ids[-1] would silently
    # return the last ad as a bogus match, so drop those placeholder slots.
    return [
        {"ad_id": ad_ids[idx], "distance": float(dist)}
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]

# Smoke test: look up the ads nearest to a sample shopping query.
query = "Looking for wireless earbuds with noise cancellation"
similar_ads = search_similar_ads(query=query)
print("Similar Ads:", similar_ads)

Similar Ads: [{'ad_id': 2084551, 'distance': 0.2493971884250641}, {'ad_id': 3418501, 'distance': 0.26797980070114136}, {'ad_id': 312758, 'distance': 0.292544960975647}, {'ad_id': 930698, 'distance': 0.29640138149261475}, {'ad_id': 2297709, 'distance': 0.30459341406822205}]
