# CS Courses QA with Sources

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

# Read the OpenAI key from the process environment (populated by the
# load_dotenv() call above or by the shell that launched the kernel).
API_KEY = os.getenv('OPENAI_API_KEY')
if not API_KEY:
    # os.environ only accepts str values, so assigning None below would fail
    # with an opaque "str expected, not NoneType" TypeError. Fail early with
    # an actionable message instead.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "before running this notebook."
    )
# Re-export the key so code that looks it up in os.environ can find it.
os.environ["OPENAI_API_KEY"] = API_KEY

## Prepare Data

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.vectorstores.faiss import FAISS
import pickle

In [3]:
from langchain.document_loaders import UnstructuredFileLoader

# Parse the course-catalog PDF into raw documents. The loader is run in
# "elements" mode; the resulting documents carry a 'category' entry in their
# metadata (used by the inspection code further down).
loader = UnstructuredFileLoader(
    "./data/csci.pdf",
    mode="elements",
)
raw_documents = loader.load()

In [6]:
# Report how many documents the loader produced.
num_parsed = len(raw_documents)
print("Num parsed documents: ", num_parsed)

# Debugging aid (disabled): dump every parsed document with its category.
# for doc in raw_documents:
#     print(doc)
#     print(doc.metadata['category'], doc.page_content)

# TODO: ideas for better chunking
# - Merge NarrativeText elements up to some maximum length.
# - Include the previous title(s) before each NarrativeText. Probably just the
#   single previous title, but maybe multiple? Maybe use overlap?
# - Remove UncategorizedText?

Num parsed documents:  261


In [8]:
# Split the parsed documents into chunks. The splitter is configured with a
# chunk size of 2000 and an overlap of 200 between neighbouring chunks.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200

text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
documents = text_splitter.split_documents(raw_documents)

# Spot-check a single chunk (index 22 is just a sample).
documents[22]

Document(page_content='To be eligible for admission to the major, a student must have completed at least two Computer Science courses, including Computer Science 136, as well as fulfilled the Discrete Mathematics Proficiency Requirement by the end of the sophomore year. A Mathematics course at the 200-level or higher (except for MATH 200) must be completed by the end of the junior year. Students are urged to have completed two of the four core courses (Computer Science 237, 256, 334, and 361) by the end of the sophomore year and must normally have completed at least three out of the four core courses by the end of the junior year.', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 2, 'category': 'NarrativeText'})

In [10]:
# Embed the chunks with OpenAI embeddings and build a FAISS vector store
# from them.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# Persist the populated store to ./vectorstore.pkl.
# NOTE(review): this is a pickle file; only unpickle it from a trusted source.
with open("./vectorstore.pkl", mode="wb") as pkl_file:
    pickle.dump(vectorstore, pkl_file)

In [11]:
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI

# Assemble the conversational retrieval chain from its parts: an OpenAI LLM
# at temperature=0, a retriever backed by the vector store, and a chain that
# is asked to return its source documents alongside the answer.
llm = OpenAI(temperature=0)
retriever = vectorstore.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever,
    return_source_documents=True,
)

In [14]:
def ask_no_history(qa, query):
    """Ask `qa` a single question with an empty chat history.

    Args:
        qa: A callable that accepts a dict with "question" and
            "chat_history" keys.
        query: The question text to send.

    Returns:
        Whatever `qa` returns for that input, unmodified.
    """
    # A fresh empty list is built on every call, so no history carries over
    # from one question to the next.
    payload = dict(question=query, chat_history=[])
    return qa(payload)

In [15]:
# Ask for a course title, without any prior chat history, and display the
# chain's full result.
query = "What is the full name of the course 'Computer Science 256?'"
result = ask_no_history(qa, query)
result

{'question': "What is the full name of the course 'Computer Science 256?'",
 'chat_history': [],
 'answer': ' Computer Science 256 is an upper-level course titled Algorithms and Data Structures.',
 'source_documents': [Document(page_content='Computer Science 134 Introduction to Computer Science', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 1, 'category': 'NarrativeText'}),
  Document(page_content='CSCI 136 and CSCI 256 or permission of instructor', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 14, 'category': 'NarrativeText'}),
  Document(page_content='Prerequisites: CSCI 256 and CSCI 237', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 12, 'category': 'NarrativeText'}),
  Document(page_content='There are several sequences of courses appropriate for those primarily interested in developing skills in programming for use in other areas. For general programming, Computer Science 134 followed by 136 and

In [16]:
# Ask about a departmental requirement, without any prior chat history, and
# display the chain's full result.
query = "What is the Discrete Mathematics Proficiency Requirement?"
result = ask_no_history(qa, query)
result

{'question': 'What is the Discrete Mathematics Proficiency Requirement?',
 'chat_history': [],
 'answer': ' The Discrete Mathematics Proficiency Requirement is to demonstrate proficiency in discrete mathematics by either passing the departmental Discrete Mathematics Proficiency Exam or by earning a grade of C- or better in MATH 200.',
 'source_documents': [Document(page_content='Students must demonstrate proficiency in discrete mathematics by either passing the departmental Discrete Mathematics Proficiency Exam or by earning a grade of C- or better in MATH 200. This requirement must be met by the end of the sophomore year.', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 2, 'category': 'NarrativeText'}),
  Document(page_content='Prerequisites: CSCI 136 and fulfillment of the Discrete Mathematics Proficiency requirement', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 16, 'category': 'NarrativeText'}),
  Document(page_content='Quan

In [34]:
query = "How many CS courses do I need to take to graduate?"
# ask_no_history returns the chain's whole result dict (question,
# chat_history, answer, source_documents). Display only the answer text,
# which is what this cell's recorded output contains, so the cell reproduces
# that output on a fresh Restart & Run All.
ask_no_history(qa, query)["answer"]

' You need to take a minimum of 8 courses in Computer Science, including the four core courses (Computer Science 237, 256, 334, and 361) and two or more electives (bringing the total number of Computer Science courses to at least 8).'

In [35]:
query = "I haven't taken CS237. Can I take Computer Graphics?"
# ask_no_history returns the chain's whole result dict (question,
# chat_history, answer, source_documents). Display only the answer text,
# which is what this cell's recorded output contains, so the cell reproduces
# that output on a fresh Restart & Run All.
ask_no_history(qa, query)["answer"]

' No, you cannot take Computer Graphics since it requires CSCI 256 and CSCI 237 as prerequisites.'