In [1]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    get_response_synthesizer
)
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms import HuggingFaceLLM, OpenAI
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.schema import Document
from llama_index.postprocessor import SimilarityPostprocessor

from tqdm import tqdm
from glob import glob
from IPython.display import Markdown, display
import pandas as pd
import os
import openai

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def add_concatenated_column(df):
    df['concatenated'] = df.apply(
        lambda row: '\n'.join(
            [
                f"Title: {str(row['Series_Title'])}",
                f"Overview: {str(row['Overview'])}",
                f"Released year: {str(row['Released_Year'])}",
                f"Runtime: {str(row['Runtime'])}",
                f"Genre: {str(row['Genre'])}",
                f"IMDB Rating: {str(row['IMDB_Rating'])}",
                f"Meta Score: {str(row['Meta_score'])}",
                f"Number of Votes: {str(row['No_of_Votes'])}",
                f"Gross sales: {str(row['Gross'])}",
                f"Director: {str(row['Director'])}",
                f"Stars: {str(row['Star1'])}, {str(row['Star2'])}, {str(row['Star3'])}, {str(row['Star4'])}",
                f"Director: {str(row['Director'])}"
            ]), 
        axis=1)
    
    return df

In [3]:
data_folder = './data'

In [4]:
movies_df = pd.read_csv(data_folder + '/imdb_top_1000.csv')
movies_df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [5]:
movies_df = add_concatenated_column(movies_df)
movies_df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,concatenated
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469,Title: The Shawshank Redemption\nOverview: Two...
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411,Title: The Godfather\nOverview: An organized c...
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444,Title: The Dark Knight\nOverview: When the men...
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000,Title: The Godfather: Part II\nOverview: The e...
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000,Title: 12 Angry Men\nOverview: A jury holdout ...


In [6]:
docs = []
movies_df.apply(lambda row: docs.append(Document(text=row['concatenated'])), axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
995    None
996    None
997    None
998    None
999    None
Length: 1000, dtype: object

In [None]:
print(len(docs))
print(docs[0].text)

In [7]:
client = QdrantClient(
    host="localhost", grpc_port=6334, prefer_grpc=True
)
# client.delete_collection("test")

In [8]:
embed_model = HuggingFaceEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [9]:
os.environ["OPENAI_API_KEY"] = "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"
openai.api_key = os.environ["OPENAI_API_KEY"]
llm_model = OpenAI(model="gpt-3.5-turbo", temperature=0.0)

In [None]:
llm_model = HuggingFaceLLM(context_window=2048,
                           max_new_tokens=256,
                           generate_kwargs={"temperature": 0.25, "do_sample": False},
                           query_wrapper_prompt=query_wrapper_prompt,
                           tokenizer_name="Writer/camel-5b-hf",
                           model_name="Writer/camel-5b-hf",
                           device_map="auto",
                           tokenizer_kwargs={"max_length": 2048})

In [10]:
client.delete_collection("movies")

True

In [11]:
service_context = ServiceContext.from_defaults(llm=llm_model, embed_model=embed_model)
vector_store = QdrantVectorStore(client=client, collection_name="movies")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(docs, storage_context=storage_context, service_context=service_context, show_progress=True) 

Parsing nodes: 100%|██████████| 1000/1000 [00:00<00:00, 4543.13it/s]
Generating embeddings: 100%|██████████| 1000/1000 [00:05<00:00, 186.14it/s]


In [None]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(llm_model, verbose=True)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)]
)

In [None]:
response = query_engine.query("Father tries to save the Earth by travelling through time")

In [12]:
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context", streaming=True
)

In [13]:
response_stream = chat_engine.chat("Can you name the movie that a father tries to save the Earth by travelling through time")
response_stream

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AgentChatResponse(response='Yes, the movie you are referring to is "Back to the Future". In this film, the main character Marty McFly, with the help of his eccentric scientist friend Doc Brown, travels through time in a DeLorean car to save the Earth and fix the timeline. Marty\'s goal is not specifically to save the Earth, but rather to ensure that his parents meet and fall in love in the past, which will secure his own existence in the future.', sources=[ToolOutput(content='Title: Back to the Future\nOverview: Marty McFly, a 17-year-old high school student, is accidentally sent thirty years into the past in a time-traveling DeLorean invented by his close friend, the eccentric scientist Doc Brown.\nReleased year: 1985\nRuntime: 116 min\nGenre: Adventure, Comedy, Sci-Fi\nIMDB Rating: 8.5\nMeta Score: 87.0\nNumber of Votes: 1058081\nGross sales: 210,609,762\nDirector: Robert Zemeckis\nStars: Michael J. Fox, Christopher Lloyd, Lea Thompson, Crispin Glover\nDirector: Robert Zemeckis\n\nTi

In [16]:
response_stream.sources[0].content

'Title: Back to the Future\nOverview: Marty McFly, a 17-year-old high school student, is accidentally sent thirty years into the past in a time-traveling DeLorean invented by his close friend, the eccentric scientist Doc Brown.\nReleased year: 1985\nRuntime: 116 min\nGenre: Adventure, Comedy, Sci-Fi\nIMDB Rating: 8.5\nMeta Score: 87.0\nNumber of Votes: 1058081\nGross sales: 210,609,762\nDirector: Robert Zemeckis\nStars: Michael J. Fox, Christopher Lloyd, Lea Thompson, Crispin Glover\nDirector: Robert Zemeckis\n\nTitle: The Man from Earth\nOverview: An impromptu goodbye party for Professor John Oldman becomes a mysterious interrogation after the retiring scholar reveals to his colleagues he has a longer and stranger past than they can imagine.\nReleased year: 2007\nRuntime: 87 min\nGenre: Drama, Fantasy, Mystery\nIMDB Rating: 7.9\nMeta Score: nan\nNumber of Votes: 174125\nGross sales: nan\nDirector: Richard Schenkman\nStars: David Lee Smith, Tony Todd, John Billingsley, Ellen Crawford\n

In [None]:
response_stream = chat_engine.chat("The movie I am looking for was directed by Christophe Nolan")
response_stream

In [None]:
response_stream = chat_engine.chat("What is its rating?")
response_stream

In [None]:
response_stream = chat_engine.chat("How did it go in the Box Office? Also, please remind me of the actors that play in the movie please")
response_stream

In [None]:
query_engine = index.as_query_engine(similarity_top_k=6, response_mode="no_text", verbose=True)
response = query_engine.query("Father tries to save the Earth by travelling through time")

In [None]:
for node in response.source_nodes:
    print(node.text)
    print("\n")