# Install Required Packages

In [None]:
pip install pandas
pip install openai
pip install elasticsearch==8.13.0

# Load Movie Data from CSV

In [None]:
import pandas as pd

# Read the movie CSV into a DataFrame and show its first five rows.
movies_csv_path = "movies.csv"
df = pd.read_csv(movies_csv_path)

preview = df.head(5)
print(preview)

# Initialize OpenAI Client for Embedding Generation

In [None]:
import os

from openai import OpenAI

# Connection settings for the embedding service.
# They are read from environment variables so that secrets never have to be
# saved inside the notebook. The empty-string fallbacks can still be replaced
# by hand for a quick local experiment, but do not commit real keys.
endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "")
deployment_name = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "")
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "")

# Fail fast with a readable message instead of letting a later embedding
# request fail against a blank endpoint, deployment, or key.
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("endpoint", endpoint),
        ("deployment_name", deployment_name),
        ("api_key", api_key),
    )
    if not setting_value
]
if _missing_settings:
    raise ValueError(
        "Missing OpenAI configuration value(s): "
        + ", ".join(_missing_settings)
        + ". Set AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_EMBEDDING_DEPLOYMENT and "
        "AZURE_OPENAI_API_KEY, or fill in the values in this cell."
    )

# Client used by the later cells to request embeddings.
client = OpenAI(
    base_url=endpoint,
    api_key=api_key,
)

**Note:** Replace the `endpoint`, `deployment_name`, and `api_key` values with your own Azure OpenAI service details before running the cells below.

# Generate Embeddings for Movie Descriptions

In [None]:
# Every description is cast to str before being sent to the embedding model.
descriptions = df["description"].astype(str).tolist()

# Number of descriptions sent per embeddings request. Batching avoids one
# network round trip per row; lower this if the service rejects the batch size.
EMBEDDING_BATCH_SIZE = 16

embeddings = []
for batch_start in range(0, len(descriptions), EMBEDDING_BATCH_SIZE):
    batch = descriptions[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=batch,
        model=deployment_name
    )
    # Each returned item carries the position of its input within the batch;
    # sort on it so vectors stay aligned with the DataFrame rows.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered_items)

# One embedding vector per row, in the same order as `descriptions`.
df["embedding"] = embeddings

# Preview only the first rows; printing the whole frame would dump every vector.
print(df.head())

# Connect to Elasticsearch (ensure Elasticsearch Docker Container is running)

In [None]:
from elasticsearch import Elasticsearch

# Create a client for the local node and print the result of pinging it.
es = Elasticsearch("http://localhost:9200")

is_reachable = es.ping()
print(is_reachable)

# Define Index and Mapping in Elasticsearch

In [None]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

index_name = "movies"

# Field definitions: two text fields plus an indexed dense_vector field that
# uses l2_norm as its similarity.
text_field = {"type": "text"}
vector_field = {
    "type": "dense_vector",
    "index": True,
    "similarity": "l2_norm",
}

mapping = {
    "mappings": {
        "properties": {
            "movie_title": dict(text_field),
            "description": dict(text_field),
            "embedding": vector_field,
        }
    }
}

# Delete the index first; 400 and 404 responses to the delete are ignored.
tolerant_es = es.options(ignore_status=[400, 404])
tolerant_es.indices.delete(index=index_name)

es.indices.create(index=index_name, body=mapping)
print(f"Index '{index_name}' created.")

# Index Documents into Elasticsearch

In [None]:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch("http://localhost:9200")
index_name = "movies"

def create_documents(df):
    """Yield one bulk-index action per row of ``df``.

    Each action targets ``index_name``, uses the row's index label as the
    document ``_id`` and stores the ``movie_title``, ``description`` and
    ``embedding`` columns as the document source.
    """
    for row_index, row_data in df.iterrows():
        movie_title = row_data["movie_title"]
        description = row_data["description"]
        # Copy the vector into a plain list for the document source.
        embedding_vector = list(row_data["embedding"])

        yield {
            "_index": index_name,
            "_id": row_index,
            "_source": {
                "movie_title": movie_title,
                "description": description,
                "embedding": embedding_vector
            }
        }

# helpers.bulk returns (number of successful actions, list of errors);
# report the real count instead of assuming every row was indexed.
success_count, _ = helpers.bulk(es, create_documents(df))

# Make the new documents visible to searches right away so the following
# cells also work when the notebook is executed top to bottom without pauses.
es.indices.refresh(index=index_name)

print(f"Inserted {success_count} documents into Elasticsearch index '{index_name}'.")


# Retrieve All Documents from Index (Basic Query)

In [None]:
# Fetch up to 100 documents from the index with a match_all query.
search_body = {
    "query": {"match_all": {}},
    "size": 100
}

response = es.search(index=index_name, body=search_body)

hits = response["hits"]["hits"]
print(f"Retrieved {len(hits)} documents from index '{index_name}'.")

# Number of leading vector components shown per document. Printing the raw
# response or the full vectors would flood the output.
EMBEDDING_PREVIEW_LENGTH = 5

for doc in hits:
    source = doc["_source"]
    movie_title = source["movie_title"]
    description = source["description"]
    embedding = source["embedding"]

    embedding_preview = embedding[:EMBEDDING_PREVIEW_LENGTH]
    print(
        f"Title: {movie_title} |  Description: {description} | "
        f"Embedding ({len(embedding)} dims, first {len(embedding_preview)}): {embedding_preview}"
    )



# Delete Index (Optional Cleanup)

In [None]:
# Optional cleanup. The vector search cell that follows queries this index,
# so deletion is opt-in: executing the notebook top to bottom must not remove
# the index before it is searched. Set DELETE_INDEX = True to drop it.
DELETE_INDEX = False

if DELETE_INDEX:
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' deleted.")
else:
    print(f"Skipped deleting index '{index_name}' (set DELETE_INDEX = True to delete it).")

# Perform Vector Search Query

In [None]:
# Text to search for; replace the placeholder with a real query.
query_text = "query_text"

# Number of nearest neighbours to return; used for both the request size and
# the heading printed below.
top_k = 3

# Embed the query with the same deployment that embedded the descriptions.
embedding_response = client.embeddings.create(
    input=query_text,
    model=deployment_name
)

query_vector = embedding_response.data[0].embedding

body = {
    "size": top_k,
    # Only the fields that are printed are requested; the stored embedding
    # vector is not needed in the response.
    "_source": ["movie_title", "description"],
    "query": {
        "knn": {
            "field": "embedding",
            "query_vector": query_vector,
            # Set explicitly rather than relying on a server-side default.
            "num_candidates": max(50, top_k),
        }
    }
}

response = es.search(index=index_name, body=body)

print(f"Top {top_k} results:")

for hit in response["hits"]["hits"]:
    movie_title = hit["_source"]["movie_title"]
    description = hit["_source"]["description"]
    score = hit["_score"]
    print(f"Movie: {movie_title} | Description: {description} | Score: {score}")