# Install Required Packages

In [None]:
pip install -r requirements.txt

# Load Movie Data from CSV

In [None]:
import pandas as pd

# Load the movie catalogue from the CSV file next to the notebook.
df = pd.read_csv("movies.csv")

# Later cells read these two columns, so fail early with a clear message
# instead of a bare KeyError further down the notebook.
required_columns = {"movie_title", "description"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"movies.csv is missing required column(s): {sorted(missing_columns)}"
    )

print(df.head(5))

# Initialize OpenAI Client for Embedding Generation

In [None]:
from openai import OpenAI

import os

# Connection settings are read from environment variables so that secrets do
# not have to be pasted into (and saved with) the notebook. The placeholder
# strings remain as fallbacks, so editing them in place still works.
#   AZURE_OPENAI_BASE_URL             -> value passed to the client as base_url
#   AZURE_OPENAI_EMBEDDING_DEPLOYMENT -> deployment name used as the `model`
#   AZURE_OPENAI_API_KEY              -> API key
endpoint = os.environ.get("AZURE_OPENAI_BASE_URL", "<replace-with-your-endpoint>")
deployment_name = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "<replace-with-your-deployment_name>"
)
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "<replace-with-your-api-key>")

client = OpenAI(
    base_url=endpoint,
    api_key=api_key,
)

Note: This notebook uses the pretrained `text-embedding-3-small` model deployed through Azure AI Foundry, so make sure your Azure OpenAI endpoint, deployment name, and API key are configured before running the cells that call the client.

# Generate Embeddings for Movie Descriptions

In [None]:
# Every description is cast to str before being sent to the embeddings endpoint.
descriptions = df["description"].astype(str).tolist()

# The embeddings endpoint accepts a list of inputs, so send the descriptions in
# batches instead of paying one network round trip per movie.
batch_size = 64

embeddings = []
for batch_start in range(0, len(descriptions), batch_size):
    batch = descriptions[batch_start:batch_start + batch_size]
    response = client.embeddings.create(
        input=batch,
        model=deployment_name
    )
    # Each returned item carries its position within the submitted batch;
    # sort on it so every vector lines up with the row it was computed for.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered_items)

# Guard against a silently misaligned column.
if len(embeddings) != len(df):
    raise RuntimeError(
        f"Expected {len(df)} embeddings but received {len(embeddings)}."
    )

df["embedding"] = embeddings

# Show a preview only; printing the full frame floods the output with vectors.
print(df.head())

# Connect to Elasticsearch (ensure Elasticsearch Docker Container is running)

In [None]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node listening on localhost:9200.
es = Elasticsearch("http://localhost:9200")

# ping() yields a boolean; print it so the connectivity check is visible.
is_reachable = es.ping()
print(is_reachable)

# Define Index and Mapping in Elasticsearch

In [None]:
from elasticsearch import Elasticsearch

index_name = "movies"
es = Elasticsearch("http://localhost:9200")

# Field definitions: two full-text fields and one indexed dense vector that is
# compared with l2_norm similarity.
text_field = {"type": "text"}
vector_field = {
    "type": "dense_vector",
    "index": True,
    "similarity": "l2_norm",
}

mapping = {
    "mappings": {
        "properties": {
            "movie_title": dict(text_field),
            "description": dict(text_field),
            "embedding": vector_field,
        }
    }
}

# Drop any previous copy of the index first; 400/404 responses are ignored so
# the cell also works when the index does not exist yet.
es.options(ignore_status=[400, 404]).indices.delete(index=index_name)

es.indices.create(index=index_name, body=mapping)
print(f"Index '{index_name}' created.")

# Index Documents into Elasticsearch

In [16]:
from elasticsearch import Elasticsearch, helpers

# Name of the index the documents are written to.
index_name = "movies"
# Client for the Elasticsearch node listening on localhost:9200.
es = Elasticsearch("http://localhost:9200")

def create_documents(df, index=None):
    """Yield one bulk-index action per row of ``df``.

    Args:
        df: DataFrame providing the "movie_title", "description" and
            "embedding" columns.
        index: Name of the target index. When omitted, the notebook-level
            ``index_name`` variable is used, which matches the previous
            behaviour of this function.

    Yields:
        dict: An action with "_index", "_id" (the row's index label) and a
        "_source" holding the title, description and embedding.
    """
    # Resolve the target once; passing `index` explicitly removes the
    # dependency on notebook-global state.
    target_index = index_name if index is None else index

    for row_index, row_data in df.iterrows():
        yield {
            "_index": target_index,
            "_id": row_index,
            "_source": {
                "movie_title": row_data["movie_title"],
                "description": row_data["description"],
                # Copy into a plain list regardless of the stored sequence type.
                "embedding": list(row_data["embedding"]),
            },
        }

# helpers.bulk returns the number of successfully executed actions together
# with the collected errors; report that count rather than assuming every row
# of the frame was indexed.
indexed_count, bulk_errors = helpers.bulk(es, create_documents(df))

# Refresh so the new documents are visible to the searches in the following
# cells even when the notebook is executed top to bottom without pauses.
es.indices.refresh(index=index_name)

print(f"Inserted {indexed_count} documents into Elasticsearch index '{index_name}'.")


Inserted 49 documents into Elasticsearch index 'movies'.


# Retrieve All Documents from Index (Basic Query)

In [17]:
# match_all query returning at most 100 documents.
match_all_query = {"match_all": {}}
search_body = {"query": match_all_query, "size": 100}

response = es.search(index=index_name, body=search_body)

# Raw response first, then one formatted line per returned document.
print(response)

for hit in response["hits"]["hits"]:
    record = hit["_source"]
    movie_title, description, embedding = (
        record["movie_title"],
        record["description"],
        record["embedding"],
    )
    print(f"Title: {movie_title} |  Description: {description} | Embedding: {embedding}")



{'took': 36, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 49, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'movies', '_id': '0', '_score': 1.0, '_source': {'movie_title': 'The Shawshank Redemption', 'description': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.', 'embedding': [-0.01915094442665577, 0.028280040249228477, 0.013074479065835476, 0.0654875859618187, -0.01900695264339447, -0.02476663514971733, 0.005957671441137791, 0.047574978321790695, 0.02884160913527012, -0.05483217537403107, 0.03182224556803703, -0.014269613660871983, -0.01170655433088541, 0.024781033396720886, 0.05249950662255287, -0.010878600180149078, 0.07406951487064362, 0.031851042062044144, -0.027488084509968758, 0.0625501498579979, -0.012678501196205616, -0.005651688203215599, 0.0528738833963871, 0.030411122366786003, -0.023945879191160202, 0.034558095037937164

# Delete Index (Optional Cleanup — skip this until you are done; the vector search below needs the index)

In [None]:
# Cleanup is opt-in: this cell sits before the vector search cell, so deleting
# unconditionally would make a top-to-bottom run fail on the following search.
delete_index = False  # set to True to drop the index when you are finished

if delete_index:
    es.indices.delete(index=index_name)

# Perform Vector Search Query

In [None]:
query_text = "A man finds out his world is fake and tries to escape it"

# Number of nearest neighbours to return; used for both the query size and the
# printed header so the two cannot drift apart.
top_k = 3

# Embed the query with the same deployment that embedded the descriptions.
embedding_response = client.embeddings.create(
    input=query_text,
    model=deployment_name
)

query_vector = embedding_response.data[0].embedding

body = {
    "size": top_k,
    # Only the printed fields are fetched; the stored vectors are not needed
    # in the response.
    "_source": ["movie_title", "description"],
    "query": {
        "knn": {
            "field": "embedding",
            "query_vector": query_vector,
        }
    }
}

response = es.search(index=index_name, body=body)

print(f"Top {top_k} results:")

for hit in response["hits"]["hits"]:
    movie_title = hit["_source"]["movie_title"]
    description = hit["_source"]["description"]
    print(f"Movie: {movie_title} | Description: {description}")