In [1]:
%%sh
pip install opensearch-py
pip install boto3
pip install sentence-transformers 




[notice] A new release of pip available: 22.3.1 -> 23.3.2
[notice] To update, run: pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 23.3.2
[notice] To update, run: pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 23.3.2
[notice] To update, run: pip install --upgrade pip


In [11]:
import pandas as pd

# Path to the TMDB 5000 movies CSV, resolved relative to the working directory.
file_path = 'tmdb_5000_movies.csv'

# Restrict the load to the identifier plus the two text columns used by later cells.
columns = ['id', 'original_title', 'overview']

# Parse only the selected columns into the movies frame.
df_movies = pd.read_csv(filepath_or_buffer=file_path, usecols=columns)

# Preview the first five rows as plain text.
print(df_movies.head(n=5))


       id                            original_title  \
0   19995                                    Avatar   
1     285  Pirates of the Caribbean: At World's End   
2  206647                                   Spectre   
3   49026                     The Dark Knight Rises   
4   49529                               John Carter   

                                            overview  
0  In the 22nd century, a paraplegic Marine is di...  
1  Captain Barbossa, long believed to be dead, ha...  
2  A cryptic message from Bond’s past sends him o...  
3  Following the death of District Attorney Harve...  
4  John Carter is a war-weary, former military ca...  


In [12]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings("ignore", message="Unverified HTTPS request")
warnings.filterwarnings("ignore", message="InsecureRequestWarning")

In [4]:
from opensearchpy import OpenSearch

# Default endpoint of the OpenSearch cluster this notebook talks to.
CLUSTER_URL = 'https://localhost:9200'

def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='admin',
                  verify_certs=False):
    '''
    Build an OpenSearch client that authenticates with HTTP basic auth.

    :param cluster_url: cluster URL including scheme and port,
        e.g. https://localhost:9200
    :param username: user name sent as the first element of ``http_auth``
    :param password: password sent as the second element of ``http_auth``
    :param verify_certs: forwarded to the client's ``verify_certs`` option;
        defaults to False (the previously hard-coded value) so existing
        callers behave the same. Pass True to have the TLS certificate checked.
    :return: OpenSearch client
    '''
    # NOTE(review): the admin/admin defaults and disabled certificate
    # verification look like local-demo settings - override both for any
    # shared or production cluster.
    return OpenSearch(
        hosts=[cluster_url],
        http_auth=(username, password),
        verify_certs=verify_certs,
    )

# Shared client used by every later cell; points at the default CLUSTER_URL.
client = get_os_client(cluster_url=CLUSTER_URL)



In [8]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-transformers checkpoint; the same model embeds both the
# indexed movie overviews and the search query.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Ask the model for its output size instead of running inference on a
# throwaway sentence; if the model does not report a size, fall back to
# measuring one sample embedding.
EMBEDDING_DIM = (
    model.get_sentence_embedding_dimension()
    or model.encode(["Sample sentence"])[0].shape[0]
)


In [9]:
# Name of the OpenSearch index that stores the movie documents.
index_name = "movies"

# Graph-construction settings for the vector field (HNSW, L2 distance, nmslib engine).
_hnsw_method = dict(
    name="hnsw",
    space_type="l2",
    engine="nmslib",
    parameters=dict(ef_construction=128, m=24),
)

# The embedding is stored as a knn_vector whose size matches the encoder output.
_embedding_field = {
    "type": "knn_vector",
    "dimension": EMBEDDING_DIM,
    "method": _hnsw_method,
}

# Index definition: k-NN switched on in the index settings, and only the
# "embedding" field mapped explicitly (no other field is declared here).
index_body = {
    "settings": {
        "index": {"knn": True, "knn.algo_param.ef_search": 100},
    },
    "mappings": {
        "properties": {"embedding": _embedding_field},
    },
}

In [22]:
# (Re)create the index from a clean slate so this step can be re-run safely:
# drop any index left over from a previous run first, then create it with the
# k-NN mapping. Running create and delete as separate cells in document order
# would delete the index that was just created.
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)
response = client.indices.create(index=index_name, body=index_body)

In [23]:
import pandas as pd

# Embed and index the first 50 movies, one document per DataFrame row.
for index, row in df_movies.head(50).iterrows():
    print(f"Id: {row['id']}, Title: {row['original_title']}, Overview: {row['overview']}")
    original_title = row['original_title']
    overview = row['overview']
    movie_id = row['id']  # named movie_id so the built-in `id` is not rebound

    # An empty CSV cell is loaded by pandas as NaN (a float), which the
    # encoder cannot embed; skip such rows instead of crashing mid-loop.
    if not isinstance(overview, str) or not overview.strip():
        print(f"Skipping row {index}: no overview text")
        continue

    # Sentence transformer model takes a list of documents and returns a list of embeddings.
    embedding = model.encode([overview])[0]

    # Each document carries "id", "title", "plot" and the "embedding" vector.
    # The vector is converted to a plain list so the body is JSON-serializable
    # without relying on the client's handling of numpy arrays.
    my_doc = {
        "id": movie_id,
        "title": original_title,
        "plot": overview,
        "embedding": embedding.tolist(),
    }
    # The DataFrame row label is used as the document id.
    res = client.index(
        index=index_name,
        body=my_doc,
        id=str(index),
    )

# Refresh once after the loop (instead of after every document) so all newly
# indexed movies are visible to the search below.
client.indices.refresh(index=index_name)

Id: 19995, Title: Avatar, Overview: In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.
Id: 285, Title: Pirates of the Caribbean: At World's End, Overview: Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems.
Id: 206647, Title: Spectre, Overview: A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit to reveal the terrible truth behind SPECTRE.
Id: 49026, Title: The Dark Knight Rises, Overview: Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation and is subsequently hunted by the Gotham City Police Department. Eigh

In [30]:
# Example query text
user_query = "A spy goes on a mission"

# Embed the query with the same model that embedded the documents
query_embedding = model.encode(user_query)

# k-NN search on the "embedding" field; only the listed fields are returned,
# not the full _source.
# NOTE(review): "size" (4) and "k" (3) differ - confirm that is intentional.
query_body = {
    "size": 4,
    "query": {
        "knn": {
            "embedding": {
                "vector": query_embedding,
                "k": 3,
            },
        },
    },
    "_source": False,
    "fields": ["id", "title", "plot"],
}

results = client.search(index=index_name, body=query_body)

# Print the hits as a 1-based ranked list.
for rank, hit in enumerate(results["hits"]["hits"], start=1):
    hit_fields = hit['fields']
    plot, title = hit_fields['plot'][0], hit_fields['title'][0]
    score = hit['_score']
    print(f"{rank}. Title: {title}, Score: {score}, Plot: {plot}")


1. Title: Spectre, Score: 0.47271392, Plot: A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit to reveal the terrible truth behind SPECTRE.
2. Title: Skyfall, Score: 0.47144258, Plot: When Bond's latest assignment goes gravely wrong and agents around the world are exposed, MI6 is attacked forcing M to relocate the agency. These events cause her authority and position to be challenged by Gareth Mallory, the new Chairman of the Intelligence and Security Committee. With MI6 now compromised from both inside and out, M is left with one ally she can trust: Bond. 007 takes to the shadows - aided only by field agent, Eve - following a trail to the mysterious Silva, whose lethal and hidden motives have yet to reveal themselves.
3. Title: TRON: Legacy, Score: 0.44935665, Plot: Sam Flynn, the tech-savvy and daring son of Kevin Flynn, investigates his fat