In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

# Secrets are taken from the environment only (load_dotenv() above pulls them in
# from a local .env file). Never paste a key into a notebook cell: it ends up in
# version control and in every shared copy of the notebook.
# NOTE: a Pinecone key used to be hardcoded in this cell; treat that key as
# compromised and rotate it in the Pinecone console.
OPENAI_API_KEY = os.getenv("OPENAI_KEY")

# The vector-store cell further down passes no key explicitly, so it relies on
# PINECONE_API_KEY being present in the process environment. Add
# `PINECONE_API_KEY=...` next to `OPENAI_KEY=...` in the .env file.
_missing_secrets = [
    name for name in ("OPENAI_KEY", "PINECONE_API_KEY") if not os.getenv(name)
]
if _missing_secrets:
    # Surface the problem here instead of as an authentication error later on.
    print(
        "WARNING: missing environment variable(s): "
        + ", ".join(_missing_secrets)
        + ". Add them to your .env file before running the cells below."
    )



In [2]:
from langchain_openai.chat_models import ChatOpenAI

# Chat model used by the chains built in the following cells.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [3]:
from langchain_core.output_parsers import StrOutputParser

# Turns the chat model's message output into a plain string.
parser = StrOutputParser()
chain = model | parser

# Smoke test: no prompt template and no retrieval, just the bare model.
smoke_test_question = "What MLB team won the World Series during the COVID-19 pandemic?"
chain.invoke(smoke_test_question)

'The Los Angeles Dodgers won the World Series during the COVID-19 pandemic in 2020.'

In [4]:
from langchain.prompts import ChatPromptTemplate

# Prompt text that tells the model to answer from the supplied context only and
# to reply "I don't know" otherwise. {context} and {question} are placeholders
# filled in when a chain using this prompt is invoked.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

# Wrap the raw string in a chat prompt template so it can be piped into the model.
prompt = ChatPromptTemplate.from_template(template)


In [5]:
# Prompt -> model -> string parser; the context is supplied by hand here.
chain = prompt | model | parser

toy_inputs = dict(
    context="Mary's sister is Susana",
    question="Who is Mary's sister?",
)
chain.invoke(toy_inputs)

'Susana'

In [6]:
import os
import pdfplumber
from langchain.schema import Document

def _extract_pdf_text(pdf_path):
    """Return the text of every page of one PDF, each page followed by a newline."""
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Pages for which extract_text() yields nothing (None / "") are skipped.
        return "".join(page_text + "\n" for page_text in page_texts if page_text)


def load_text_from_pdfs(pdf_paths):
    """Build one LangChain ``Document`` per PDF file.

    Args:
        pdf_paths: Iterable of paths to PDF files.

    Returns:
        A list with one ``Document`` per input path, in input order. Each
        document's ``page_content`` is the concatenated text of all pages and
        its metadata records the originating path under ``"source"``.
    """
    return [
        Document(page_content=_extract_pdf_text(path), metadata={"source": path})
        for path in pdf_paths
    ]

# Source PDFs, given as relative paths (expected next to the notebook).
pdf_paths = [
    "dr_arif_butt_1.pdf",
    "dr_arif_butt_courses.pdf",
    "dr_arif_butt_current.pdf",
    "dr_arif_butt_publications.pdf",
    "dr_arif_butt_resume.pdf",
]

# One Document per PDF.
text_documents = load_text_from_pdfs(pdf_paths)

# Quick verification: document count plus the first 500 characters of each file.
print(f"Loaded {len(text_documents)} documents.")
for loaded in text_documents:
    source = loaded.metadata['source']
    preview = loaded.page_content[:500]
    print(f"Source: {source}", f"Content Preview: {preview}...\n", sep="\n")


Loaded 5 documents.
Source: dr_arif_butt_1.pdf
Content Preview: Dr. Muhammad Arif Butt is an Assistant Professor at the Department of Data Science,
University of the Punjab (PU), Lahore, Pakistan. He received his MSc and MPhil
degrees both with a Gold Medal from PUCIT, University of the Punjab. Dr. Butt also
earned his Ph.D. in Computer Science from the same University. His research focuses
on applying fuzzy inference models in operating, embedded, and cloud-based
systems/services, where decision making is involved under imprecise and vague
parameters.
His t...

Source: dr_arif_butt_courses.pdf
Content Preview: Dr arif butt teaches machine learning, natural language processing, web scraping for data
scientists, data visualization with matplotlib and seaborn, pandas for data scientists, numpy for
data scientists, basic python programming, descriptive and inferential statistics for data
scientists, linear algebra for data scientists, tools and techniques for data scientists, computer
org

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, named so they are easy to find and tune.
# NOTE(review): with these values the previewed chunks end mid-sentence;
# larger chunks may give the retriever more coherent context -- worth testing.
CHUNK_SIZE = 100
CHUNK_OVERLAP = 20

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Break every loaded document into smaller pieces for embedding.
split_texts = text_splitter.split_documents(text_documents)

# Show the total and the first five chunks as a sanity check.
print(f"Number of chunks created: {len(split_texts)}")
for position, piece in enumerate(split_texts[:5], start=1):
    print(f"Chunk {position}:", piece.page_content, "-" * 80, sep="\n")


Number of chunks created: 236
Chunk 1:
Dr. Muhammad Arif Butt is an Assistant Professor at the Department of Data Science,
--------------------------------------------------------------------------------
Chunk 2:
University of the Punjab (PU), Lahore, Pakistan. He received his MSc and MPhil
--------------------------------------------------------------------------------
Chunk 3:
degrees both with a Gold Medal from PUCIT, University of the Punjab. Dr. Butt also
--------------------------------------------------------------------------------
Chunk 4:
earned his Ph.D. in Computer Science from the same University. His research focuses
--------------------------------------------------------------------------------
Chunk 5:
on applying fuzzy inference models in operating, embedded, and cloud-based
--------------------------------------------------------------------------------


In [8]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding client, authenticated with the same OpenAI key as the chat model.
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# Embed one example question to inspect what an embedding looks like.
sample_query = "Which course does Dr. Arif Butt teach?"
embedded_query = embeddings.embed_query(sample_query)

# Report the vector's length and a short preview of its values.
leading_values = embedded_query[:10]
print(f"Embedding length: {len(embedded_query)}")
print("First 10 values of the embedded query:", leading_values)


Embedding length: 1536
First 10 values of the embedded query: [-0.0054530249908566475, -0.004338966216892004, -0.017024751752614975, 0.0009045261540450156, 0.007919130846858025, 0.025937221944332123, -0.03189726546406746, 0.00019896987942047417, -0.032614678144454956, -0.006398078054189682]


In [9]:
# Two candidate sentences to compare against the sample query:
# the first is on-topic (teaching), the second is not (his car).
sentence1, sentence2 = (
    embeddings.embed_query(candidate)
    for candidate in (
        'Dr. Arif butt teaches machine learning and information security',
        'Dr. Arif butt drives a red kia sportage',
    )
)


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def _similarity_to_query(candidate_embedding):
    """Cosine similarity between the embedded sample query and one embedding."""
    # cosine_similarity works on 2-D inputs, hence the one-row wrappers and [0][0].
    return cosine_similarity([embedded_query], [candidate_embedding])[0][0]


query_sentence1_similarity = _similarity_to_query(sentence1)
query_sentence2_similarity = _similarity_to_query(sentence2)

query_sentence1_similarity, query_sentence2_similarity

(0.9134719464873107, 0.8521212634683148)

In [11]:
from langchain_pinecone import PineconeVectorStore

# Name of the Pinecone index that receives the chunk embeddings.
index_name = "dr-arif-butt"

# Embed every chunk and store it in the index.
# NOTE(review): re-running this cell presumably uploads the chunks again;
# check whether that leaves duplicate vectors in the index.
pinecone = PineconeVectorStore.from_documents(
    documents=split_texts,
    embedding=embeddings,
    index_name=index_name,
)

  from tqdm.autonotebook import tqdm


In [12]:
# Retrieve the chunks closest to the question and show at most three of them.
# NOTE(review): the saved output of this cell is an empty list -- possibly the
# index had not finished ingesting when it ran; confirm on a re-run.
closest_chunks = pinecone.similarity_search("What does dr arif butt teach?")
closest_chunks[:3]

[]

In [13]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

In [15]:
# Retrieval step: the incoming question goes to the Pinecone retriever (to
# fetch context) and is also passed through unchanged as the question itself.
retriever = pinecone.as_retriever()
retrieval_inputs = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
)

# Full RAG pipeline: retrieve -> fill prompt -> chat model -> plain string.
chain = retrieval_inputs | prompt | model | parser

chain.invoke("What the color of the car of dr arif butt?")

"The color of Dr. Arif Butt's car is red."

In [16]:
# Follow-up phrased with "he": the chain keeps no chat history, so the pronoun
# can only be resolved through whatever context the retriever returns.
chain.invoke('which car does he have?')

'Dr. Arif Butt drives a Kia Sportage.'

In [17]:
# Probe: place of residence, answered from the retrieved context.
chain.invoke('where does he live')

'Dr. Arif Butt currently lives in Lahore.'

In [18]:
# Probe: current profession.
# NOTE(review): the saved answer says this is not explicitly mentioned, although
# the loaded PDF preview describes him as an Assistant Professor -- the relevant
# chunk may not be reaching the prompt; verify retrieval.
chain.invoke('what is his current profession?')

'Based on the context provided, his current profession is not explicitly mentioned.'

In [19]:
# Probe: length of his management and teaching experience.
chain.invoke('what is his management and teaching span?')

'His management and teaching span since 2008.'

In [20]:
# Probe: list of publications.
# NOTE(review): the saved answer contains citation fragments (volume/page
# numbers) rather than titles -- possibly a side effect of the small chunk size
# used when splitting; confirm.
chain.invoke('please some of his publications')

'Some of his publications include:\n1. "1661-8289, vol. 7, no. 3, pp. 367-378, 2013 (I.F. 0.76, HEC W Category)"\n2. "SpringerPlus/SpringerOpen, vol.5, no. 1, pp. 1-17, 2016 (IF 1.130, HEC W Category)"'

In [21]:
# Probe: academic qualifications.
chain.invoke('what are his qualifications')

'His qualifications are M.Phil and Ph.D.'

In [22]:
# Probe: designation at the university.
# NOTE(review): the saved answer says "Professor" while the loaded PDF preview
# says "Assistant Professor" -- check which chunks are retrieved for this query.
chain.invoke('what is his designation in university of punjab')

'His designation at the University of Punjab is Professor.'