In [1]:
import os

import pandas as pd
import streamlit as st
from opensearchpy import OpenSearch

In [2]:
# Load the movie dataset; later cells read columns such as Series_Title,
# Overview, Director and IMDB_Rating from this frame.
df = pd.read_csv('imdb_top_1000.csv')

In [3]:
from sentence_transformers import SentenceTransformer
# Embedding model shared by the indexing step and the search query.
# NOTE(review): the index mapping declares a 1024-dimensional vector field —
# confirm that this model's output size matches.
model = SentenceTransformer('intfloat/multilingual-e5-large')

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Encode all titles in one batched call instead of one model.encode() call per
# row; encode() accepts a list of sentences and returns one embedding per
# input, in input order.
title_embeddings = model.encode(df["Series_Title"].tolist())
# list() splits the 2-D result along its first axis, giving one 1-D vector per row.
df["Vector"] = list(title_embeddings)

In [86]:
# Connection settings are read from the environment so that credentials do not
# have to be stored in the notebook. The fallbacks reproduce the previous
# hard-coded local development values.
client = OpenSearch(
    hosts=[{
        'host': os.environ.get('OPENSEARCH_HOST', 'localhost'),
        'port': int(os.environ.get('OPENSEARCH_PORT', '9200')),
    }],
    http_auth=(
        os.environ.get('OPENSEARCH_USER', 'admin'),
        os.environ.get('OPENSEARCH_PASSWORD', 'admin'),
    ),
)

# Name of the index that is created, populated and searched by this notebook.
index_name = 'imdb_movie'

def create_index_with_knn_vector(client, index_name):
    """Create ``index_name`` from scratch with a k-NN vector field.

    An index that already exists under this name is deleted first, so any
    documents it holds are discarded.

    Args:
        client: Object exposing ``indices.exists``, ``indices.delete`` and
            ``indices.create`` (the OpenSearch client in this notebook).
        index_name: Name of the index to (re)create.

    Returns:
        None.
    """
    # Start from a clean slate when the index is already present.
    already_present = client.indices.exists(index=index_name)
    if already_present:
        client.indices.delete(index=index_name)

    # Non-vector fields as (name, mapping type), in mapping order.
    scalar_fields = (
        ("Series_Title", "text"),
        ("Released_Year", "text"),
        ("Runtime", "keyword"),
        ("Genre", "text"),
        ("IMDB_Rating", "float"),
        ("Overview", "text"),
        ("Director", "text"),
        ("Star1", "text"),
        ("Star2", "text"),
        ("Star3", "text"),
        ("Star4", "text"),
        ("No_of_Votes", "integer"),
    )

    # The embedding field comes first, followed by the scalar fields.
    properties = {"vector": {"type": "knn_vector", "dimension": 1024}}
    for field_name, field_type in scalar_fields:
        properties[field_name] = {"type": field_type}

    client.indices.create(
        index=index_name,
        body={
            "settings": {"index.knn": True},
            "mappings": {"properties": properties},
        },
    )

# Recreate the index on every run: an existing index with this name is deleted first.
create_index_with_knn_vector(client, index_name)


In [87]:
def index_documents():
    """Send every row of the module-level ``df`` to OpenSearch as one document.

    Each document carries the title, its embedding (stored under ``vector``)
    and the descriptive columns listed below. Documents are indexed one
    request at a time into ``index_name`` through the module-level ``client``.
    """
    # Columns copied verbatim into the document after the title and vector.
    passthrough_columns = (
        'Released_Year',
        'Runtime',
        'Genre',
        'IMDB_Rating',
        'Overview',
        'Director',
        'Star1',
        'Star2',
        'Star3',
        'Star4',
        'No_of_Votes',
    )
    for _, movie in df.iterrows():
        payload = {
            'Series_Title': movie['Series_Title'],
            # The DataFrame column is "Vector"; the index field is "vector".
            'vector': movie['Vector'],
        }
        for column in passthrough_columns:
            payload[column] = movie[column]
        client.index(index=index_name, body=payload)
# Index every DataFrame row (one request per row).
index_documents()

In [85]:
def search_movies(query="good", size=5):
    """Return the movies whose title embeddings are closest to ``query``.

    The query text is embedded with the module-level ``model`` and every
    document is scored against its ``vector`` field by the k-NN scoring
    script using cosine similarity.

    Args:
        query: Free-text search string. Defaults to ``"good"``, the value that
            was previously hard-coded, so calls without arguments still work.
        size: Maximum number of hits to return.

    Returns:
        The raw response returned by ``client.search``.
    """
    # .tolist() yields plain Python floats for the request body.
    query_vector = model.encode(query).tolist()
    search_body = {
        "size": size,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "vector",
                        "query_value": query_vector,
                        "space_type": "cosinesimil",
                    },
                },
            }
        },
    }
    return client.search(index=index_name, body=search_body)



def main():
    """Render the Streamlit search page and list the hits for the entered query."""
    st.title('IMDb Movie Search')
    query = st.text_input('Enter your search query:', 'Movie')

    if st.button('Search'):
        response = search_movies(query)
        # The response is a dict; the matching documents are the entries of
        # response['hits']['hits'], not the items of the dict itself.
        hits = response['hits']['hits']
        st.subheader('Search Results:')
        if not hits:
            st.write('No matching movies found.')
        for hit in hits:
            movie = hit['_source']
            st.write(f"ID: {hit['_id']}")
            st.write(f"Title: {movie['Series_Title']}")
            st.write(f"Overview: {movie['Overview']}")
            st.write(f"IMDB Rating: {movie['IMDB_Rating']}")
            st.write(f"Director: {movie['Director']}")
            st.write("---")

# Build the search page when this file is executed as the main module.
if __name__ == '__main__':
    main()