# Integrating LangChain and LLMs to store and embed our podcast transcript 

In [1]:
pip install langchain==0.0.153

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Set OpenAI API key as environment variable
# (Left disabled on purpose: do not commit a real key in this cell. The key is
# collected interactively with getpass further down in this notebook.)
# import os
# os.environ["OPENAI_API_KEY"] = ""

In [None]:
# Test if the api key is stored properly in the environment
# (Left disabled on purpose: printing the key would write the secret into the
# notebook's saved output.)
# api_key = os.environ["OPENAI_API_KEY"]
# print(api_key)

In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
pip install "unstructured[local-inference]"

Collecting unstructured[local-inference]
  Downloading unstructured-0.6.2-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting pypandoc
  Downloading pypandoc-1.11-py3-none-any.whl (20 kB)
Collecting python-pptx
  Downloading python-pptx-0.6.21.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0mm
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting argilla
  Downloading argilla-1.6.0-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pillow
  Downloading Pillow-9.5.0-cp38-cp38-macosx_10_10_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m10.3 MB/s[0m eta [36m0:0

### Load Data Files

In [3]:
# Keep the transcript location in one named place so the input file is easy to swap.
transcript_pdf_path = "./justin_interview.pdf"
loader = UnstructuredPDFLoader(transcript_pdf_path)

In [5]:
# Download the NLTK "punkt" tokenizer data before the PDF is loaded.
# NOTE(review): presumably needed by the unstructured loader (the load step
# below also downloads NLTK tagger data) -- confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/mike24dzy/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# Parse the PDF with the loader configured above. Per the recorded output, this
# step also downloads an NLTK tagger and falls back to the "fast" partitioning
# strategy because detectron2 is not installed.
data = loader.load()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mike24dzy/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


### Chunk data into smaller documents

In [7]:
# Split the loaded transcript into chunks with a size limit of 1500 and no
# overlap between neighbouring chunks.
# NOTE(review): chunk_size is presumably counted in characters -- confirm
# against the splitter's length function.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [8]:
# Report how many chunks the splitter produced.
chunk_report = f'Now you have {len(texts)} documents'
print(chunk_report)

Now you have 66 documents


### Create embeddings of the document with ChromaDB and OpenAI Embeddings

In [9]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-0.3.21-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m307.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting sentence-transformers>=2.2.2
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting clickhouse-connect>=0.5.7
  Downloading clickhouse_connect-0.5.22-cp38-cp38-macosx_10_9_x86_64.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.1/236.1 kB[0m [31m921.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting duckdb>=0.7.1
  Downloading duckdb-0.7.1-cp38-cp38-macosx_10_9_x86_64.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollect

In [11]:
# Pass OpenAI API Key
# Pass OpenAI API Key
# Reuse a key that is already exported in the environment so a full re-run does
# not block on a prompt; otherwise ask for it without echoing it to the screen.
import os
from getpass import getpass

# Strip stray whitespace/newlines that often come along with a pasted key.
OPENAI_API_KEY = (os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")).strip()
if not OPENAI_API_KEY:
    # Fail here with a clear message instead of later inside an API request.
    raise ValueError("An OpenAI API key is required to run the rest of this notebook.")

In [12]:
import os

# Expose the key through the process environment in addition to the Python
# variable, so it is available under the OPENAI_API_KEY name for this kernel.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [13]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

In [14]:
# Embedding client, authenticated explicitly with the key collected above.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY) # type: ignore

In [16]:
pip install tiktoken

Collecting tiktoken
[0m  Downloading tiktoken-0.3.3-cp38-cp38-macosx_10_9_x86_64.whl (736 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m736.3/736.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.3.3
Note: you may need to restart the kernel to use updated packages.


In [20]:
# Build a Chroma index from the chunks using the OpenAI embedding client.
# No persistence location is passed; per the recorded output the store is
# transient, so the index has to be rebuilt on every run.
db = Chroma.from_documents(texts, embeddings)

Using embedded DuckDB without persistence: data will be transient


### Query data with LLM

In [25]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [26]:
# Chat model used to answer questions, configured with temperature 0 and the
# key collected earlier.
llm = ChatOpenAI(  # type: ignore
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

# Question-answering chain built on that model.
# NOTE(review): chain_type="stuff" presumably places all supplied documents
# into a single prompt -- confirm against the LangChain documentation.
chain = load_qa_chain(llm, chain_type="stuff")

In [44]:
# Question put to the model, also used as the search text for retrieval.
query = "I wish to divide the whole interview into 3 chapters, please give me the suggestions on when should I mark each chapter based on the content with timestamps"
# Retrieve the chunks most similar to the query from the Chroma index.
# NOTE(review): only the retrieved chunks reach the model, not the whole
# interview (presumably a handful, given the library's default result count --
# confirm). The recorded answer lists chapters out of chronological order,
# which suggests a whole-transcript approach would suit this question better.
docs = db.similarity_search(query)

In [41]:
# Inspect the text of the third retrieved chunk (index 2).
third_hit = docs[2]
print(third_hit.page_content)

00:16:04.980 --> 00:16:12.440 Justin: fast track your product development. You learn how to manage teams and motivate teams out of necessity.

147 00:16:12.860 --> 00:16:23.120 Justin: You You learn a lot of things that way, so I ultimately didn't go to business school. I went into the start up instead, and it and it kept me in Japan

148 00:16:23.220 --> 00:16:34.460 Justin: in Asia. I did a couple of years in Singapore, setting us up a subsidiary of our startup in sample. But ultimately that opportunity and migrating my career to the Internet

149 00:16:34.540 --> 00:16:37.000 Justin: kept me in Japan. And

150 00:16:37.900 --> 00:16:41.190 at that point I just decided more to commit to the long haul.

151

00:16:41.700





> 00:16:42.430

Daniel Tedesco: Wow!

152 00:16:45.420 --> 00:16:51.370 Daniel Tedesco: And that that was, I guess, a a Japanese company, right?

153 00:16:53.130 --> 00:17:03.510 Daniel Tedesco: But I I guess international operations. And you've also spent a lot

In [45]:
# Run the QA chain on the retrieved chunks with the same query; the returned
# string is shown as the cell output.
chain.run(input_documents=docs, question=query)

"Chapter 1: 00:16:04.980 - 00:17:21.619\nThis chapter covers Justin's background, education, and early career experiences.\n\nChapter 2: 00:44:24.910 - 01:09:01.640\nThis chapter covers Justin's experiences in the gaming industry, including his work with LINE and Disney, as well as his thoughts on staying ahead of the curve and investing in successful ideas.\n\nChapter 3: 00:31:10.180 - 00:32:34.390\nThis chapter covers Justin's experiences in business development and negotiating with people from different cultural backgrounds."