# CS Courses QA with Sources

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

API_KEY = os.getenv('OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = API_KEY

## Prepare Data

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.vectorstores.faiss import FAISS
import pickle

In [3]:
from langchain.document_loaders import UnstructuredFileLoader

# load raw text data from pdf
loader = UnstructuredFileLoader("./data/csci.pdf", mode="elements")
raw_documents = loader.load()

In [43]:
print("Num parsed documents: ", len(raw_documents))

for i in range(len(raw_documents)):
    print(raw_documents[i])
    # print(raw_documents[i].metadata['category'], raw_documents[i].page_content)
    
# Merge NarrativeText up to some max length
# Include previous title(s) before Narrative Text
# Prob just the single prev title, but maybe multiple? Maybe use overlap?
# Remove UncategorizedText?

Num parsed documents:  261
page_content='Daniel W. Barowy, Assistant Professor of Computer Science  James M. Bern, Assistant Professor of Computer Science  Rohit Bhattacharya, Assistant Professor of Computer Science  Stephen N. Freund, Chair and A. Barton Hepburn Professor of Computer Science  Mark Hopkins, Assistant Professor of Computer Science  Iris Howley, Assistant Professor of Computer Science  Bill K. Jannen, Assistant Professor of Computer Science  Katie A. Keith, Assistant Professor of Computer Science  Samuel M' metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 1, 'category': 'ListItem'}
page_content='auley, Assistant Professor of Computer Science; on leave' metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 1, 'category': 'ListItem'}
page_content='-' metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 1, 'category': 'ListItem'}
page_content='Kelly A. Shaw, Professor of Computer Science  Shikha Singh, Ass

In [8]:
# Split text
text_splitter = CharacterTextSplitter(chunk_size = 2000, chunk_overlap = 200)
documents = text_splitter.split_documents(raw_documents)
documents[22]

Document(page_content='To be eligible for admission to the major, a student must have completed at least two Computer Science courses, including Computer Science 136, as well as fulfilled the Discrete Mathematics Proficiency Requirement by the end of the sophomore year. A Mathematics course at the 200-level or higher (except for MATH 200) must be completed by the end of the junior year. Students are urged to have completed two of the four core courses (Computer Science 237, 256, 334, and 361) by the end of the sophomore year and must normally have completed at least three out of the four core courses by the end of the junior year.', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 2, 'category': 'NarrativeText'})

In [10]:
# Load Data to vectorstore
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# Save vectorstore
with open("./vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)

In [30]:
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI

qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), vectorstore.as_retriever(search_kwargs={"k": 6}), return_source_documents=True)

In [24]:
def ask_no_history(qa, query):
    return qa({"question": query, "chat_history": []})

def ask_with_history_and_append(qa, query, history):
    res = qa({"question": query, "chat_history": history})
    history.append((query, res["answer"]))
    return res

In [45]:
query = "What is the full name of the course 'Computer Science 256?'"
ask_no_history(qa, query)

{'question': "What is the full name of the course 'Computer Science 256?'",
 'chat_history': [],
 'answer': ' Computer Science 256 is a course called Algorithm and Data Structure Design.',
 'source_documents': [Document(page_content='Computer Science 134 Introduction to Computer Science', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 1, 'category': 'NarrativeText'}),
  Document(page_content='CSCI 136 and CSCI 256 or permission of instructor', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 14, 'category': 'NarrativeText'}),
  Document(page_content='Prerequisites: CSCI 256 and CSCI 237', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 12, 'category': 'NarrativeText'}),
  Document(page_content='There are several sequences of courses appropriate for those primarily interested in developing skills in programming for use in other areas. For general programming, Computer Science 134 followed by 136 and 256 wil

In [47]:
query = "What is the Discrete Mathematics Proficiency Requirement?"
ask_no_history(qa, query)

{'question': 'What is the Discrete Mathematics Proficiency Requirement?',
 'chat_history': [],
 'answer': ' The Discrete Mathematics Proficiency Requirement is a requirement that students must meet by either passing the departmental Discrete Mathematics Proficiency Exam or by earning a grade of C- or better in MATH 200.',
 'source_documents': [Document(page_content='Students must demonstrate proficiency in discrete mathematics by either passing the departmental Discrete Mathematics Proficiency Exam or by earning a grade of C- or better in MATH 200. This requirement must be met by the end of the sophomore year.', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 2, 'category': 'NarrativeText'}),
  Document(page_content='Prerequisites: CSCI 136 and fulfillment of the Discrete Mathematics Proficiency requirement', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 16, 'category': 'NarrativeText'}),
  Document(page_content='Quantative/Formal

In [34]:
query = "How many CS courses do I need to take to graduate?"
ask_no_history(qa, query)

' You need to take a minimum of 8 courses in Computer Science, including the four core courses (Computer Science 237, 256, 334, and 361) and two or more electives (bringing the total number of Computer Science courses to at least 8).'

In [48]:
query = "I haven't taken CS237. Can I take Computer Graphics?"
ask_no_history(qa, query)

{'question': "I haven't taken CS237. Can I take Computer Graphics?",
 'chat_history': [],
 'answer': ' No, you need to have taken CSCI 256 and CSCI 237 to take CSCI 371 Computer Graphics.',
 'source_documents': [Document(page_content='CSCI 371 (S) Computer Graphics (QFR)', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 13, 'category': 'Title'}),
  Document(page_content='Computer Science 134 Introduction to Computer Science', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 1, 'category': 'NarrativeText'}),
  Document(page_content='Prerequisites: CSCI 256 and CSCI 237', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 12, 'category': 'NarrativeText'}),
  Document(page_content='To be taken by students registered for Computer Science 493-494.', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 21, 'category': 'NarrativeText'}),
  Document(page_content='Computer Science', metadata={'sou

In [31]:
chat_history = []
query = "I'm a junior interested in learning to program but I've never taken a CS course. What courses am I able to take?"
ask_with_history_and_append(qa, query, chat_history)

{'question': "I'm a junior interested in learning to program but I've never taken a CS course. What courses am I able to take?",
 'chat_history': [("I'm a junior interested in learning to program but I've never taken a CS course. What courses am I able to take?",
   ' You are able to take Computer Science 134 Introduction to Computer Science.')],
 'answer': ' You are able to take Computer Science 134 Introduction to Computer Science.',
 'source_documents': [Document(page_content='To be eligible for admission to the major, a student must have completed at least two Computer Science courses, including Computer Science 136, as well as fulfilled the Discrete Mathematics Proficiency Requirement by the end of the sophomore year. A Mathematics course at the 200-level or higher (except for MATH 200) must be completed by the end of the junior year. Students are urged to have completed two of the four core courses (Computer Science 237, 256, 334, and 361) by the end of the sophomore year and mus

In [32]:
chat_history

[("I'm a junior interested in learning to program but I've never taken a CS course. What courses am I able to take?",
  ' You are able to take Computer Science 134 Introduction to Computer Science.')]

In [33]:
query = "Are there any electives I can take? What about CSCI 103?"
ask_with_history_and_append(qa, query, chat_history)

{'question': 'Are there any electives I can take? What about CSCI 103?',
 'chat_history': [("I'm a junior interested in learning to program but I've never taken a CS course. What courses am I able to take?",
   ' You are able to take Computer Science 134 Introduction to Computer Science.'),
  ('Are there any electives I can take? What about CSCI 103?',
   ' Students who have not taken a CS course before can take the four core courses required of majors to provide a broad knowledge of topics underlying all of computer science. They can also take one of the upper-level electives. Appropriate mathematics classes numbered 300 or above, and appropriate statistics courses numbered 200 or above may also be substituted for one Computer Science elective with the advance permission of the department.')],
 'answer': ' Students who have not taken a CS course before can take the four core courses required of majors to provide a broad knowledge of topics underlying all of computer science. They can 

In [38]:
query = "How many CS courses do I need to take to major in CS? Please respond as concisely as possible with an exact answer, or say 'I don't know' if you don't know"
ask_no_history(qa, query)

{'question': "How many CS courses do I need to take to major in CS? Please respond as concisely as possible with an exact answer, or say 'I don't know' if you don't know",
 'chat_history': [],
 'answer': ' 8 courses',
 'source_documents': [Document(page_content='Students seeking to develop an extensive knowledge of computer science without majoring in the department are encouraged to use the major requirements as a guide. In particular, the four core courses required of majors are intended to provide a broad knowledge of topics underlying all of computer science. Students seeking a concentration in Computer Science are urged to complete at least two of these courses followed by one of our upper-level electives. Such a program would typically require the completion of a total of five Computer Science courses in addition to the Discrete', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 4, 'category': 'NarrativeText'}),
  Document(page_content='To be eligible fo

In [39]:
query = "How many biology courses do I need to take to major in CS? Please respond as concisely as possible with an exact answer, or say 'I don't know' if you don't know"
ask_no_history(qa, query)

{'question': "How many biology courses do I need to take to major in CS? Please respond as concisely as possible with an exact answer, or say 'I don't know' if you don't know",
 'chat_history': [],
 'answer': " I don't know.",
 'source_documents': [Document(page_content='Students seeking to develop an extensive knowledge of computer science without majoring in the department are encouraged to use the major requirements as a guide. In particular, the four core courses required of majors are intended to provide a broad knowledge of topics underlying all of computer science. Students seeking a concentration in Computer Science are urged to complete at least two of these courses followed by one of our upper-level electives. Such a program would typically require the completion of a total of five Computer Science courses in addition to the Discrete', metadata={'source': './csci.pdf', 'filename': './csci.pdf', 'page_number': 4, 'category': 'NarrativeText'}),
  Document(page_content='A minimu

In [44]:
query = "How do I graduate with a degree with honors in computer science?"
ask_no_history(qa, query)

{'question': 'How do I graduate with a degree with honors in computer science?',
 'chat_history': [],
 'answer': ' To graduate with a degree with honors in computer science, you must demonstrate outstanding intellectual achievement in study extending beyond the requirements of the regular major. This includes mastery of core material, ability to pursue independent study of computer science, originality in methods of investigation, and creativity. You must also have completed at least two Computer Science courses, including Computer Science 136, as well as fulfilled the Discrete Mathematics Proficiency Requirement by the end of the sophomore year. You must also have completed two of the four core courses (Computer Science 237, 256, 334, and 361) by the end of the sophomore year and must normally have completed at least three out of the four core courses by the end of the junior year. Additionally, you must attend at least twenty Computer Science colloquia and have completed two or more 

In [56]:
texts = []
with open("./data/cs_gold.txt", "r") as f:
    for line in f:
        texts.append(line)

In [58]:
db2 = FAISS.from_texts(texts, embeddings)

# Save vectorstore
with open("./db_cs_gold.pkl", "wb") as f:
    pickle.dump(db2, f)

In [59]:
qa2 = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), db2.as_retriever(search_kwargs={"k": 6}), return_source_documents=True)

In [60]:
query = "I'm a junior interested in learning to program but I've never taken a CS course. What courses am I able to take?"
res = ask_no_history(qa2, query)
res["answer"]

In [62]:
query = "Can I study abroad if I’m majoring in CS?"
res = ask_no_history(qa2, query)
print(res["answer"])
print(res["source_documents"])

' Yes, you can study abroad if you are majoring in Computer Science. You should discuss your plans in advance with the chair of the department or the departmental study away advisor to ensure that you will be able to complete the major.'

In [67]:
query = "Which programming languages are taught in CS 134?"
res = ask_no_history(qa2, query)
print(res["answer"])
print(res["source_documents"])

 Java and Python.
[Document(page_content='INTRODUCTORY COURSES: Computer Science 134 provides an introduction to computer science with a focus on developing computer programming skills. These skills are essential to most upper-level courses in the department. As a result, Computer Science 134 together with Computer Science 136, are required as a prerequisite to most advanced courses in the department. Those students intending to take several Computer Science courses are urged to take 134 early.\n', metadata={}), Document(page_content='MAJOR REQUIREMENTS Introductory Courses: Computer Science 134 Introduction to Computer Science, Computer Science 136 Data Structures and Advanced Programming \n', metadata={}), Document(page_content='PLANS OF STUDY FOR NON-MAJORS: There are several sequences of courses appropriate for those primarily interested in developing skills in programming for use in other areas. For general programming, Computer Science 134 followed by 136 and 256 will provide stu

In [72]:
import json

cs_courses = None

with open("./data/courses.json", "r") as f:
    course_dict = json.load(f)
    courses = course_dict["courses"]
    
    print(f"Found info on {len(courses)} courses")
    
    cs_courses = [course for course in courses if course["department"] == "CSCI"]

Found info on 1785 courses


In [73]:
with open("./data/cs_courses.txt", "w") as f:
    for course in cs_courses:
        f.write(str(course) + "\n")

In [74]:
cs_course_json_linearized = []

with open("./data/cs_courses.txt", "r") as f:
    for line in f:
        cs_course_json_linearized.append(line)

In [76]:
db_courses = FAISS.from_texts(cs_course_json_linearized, embeddings)

In [78]:
db2.merge_from(db_courses)

# Save vectorstore
with open("./db_cs_gold_plus_courses.pkl", "wb") as f:
    pickle.dump(db2, f)

In [87]:
db2.docstore._dict

{'619822b0-5f69-4383-bf46-3e5636bb39b5': Document(page_content='COMPUTER SCIENCE (Div III) \n', metadata={}),
 '09596503-5cba-4d42-978b-a21ed14cdbb6': Document(page_content='Chair: Professor Stephen Freund \n', metadata={}),
 '4c40a3b7-0011-4eb5-8981-76d83cc57b53': Document(page_content='Jeannie R Albrecht, Professor of Computer Science \n', metadata={}),
 '05d415e6-56da-4ada-9df6-8b7ceb1b241e': Document(page_content='Daniel W. Barowy, Assistant Professor of Computer Science \n', metadata={}),
 '77725527-b0e1-4653-83e4-b9661775089c': Document(page_content='James M. Bern, Assistant Professor of Computer Science \n', metadata={}),
 'd34f93cd-9a78-4a7b-b6de-3b981a2013f4': Document(page_content='Rohit Bhattacharya, Assistant Professor of Computer Science \n', metadata={}),
 '79395120-7b66-46cd-b9b3-244170779954': Document(page_content='Stephen N. Freund, Chair and A. Barton Hepburn Professor of Computer Science \n', metadata={}),
 'b5be3b1b-df8e-4551-a81f-fcd3b424d5d7': Document(page_conte

In [79]:
qa3 = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), db2.as_retriever(search_kwargs={"k": 6}), return_source_documents=True)

In [80]:
query = "Which CS course talks about Turing Machines?"
res = ask_no_history(qa3, query)
print(res["answer"])
print(res["source_documents"])

 Computer Science 361 Theory of Computation.
[Document(page_content='INTRODUCTORY COURSES: Those students interested in learning more about exciting new ideas in computer science, but not necessarily interested in developing extensive programming skills, should consider Computer Science 102 The Socio-Techno Web, 103 Electronic Textiles, or 104 Data Science and Computation for All.\n', metadata={}), Document(page_content='INTRODUCTORY COURSES: The department offers a choice of introductory courses; Computer Science 102 The Socio-Techno Web, 103 Electronic Textiles, 104 Data Science and Computation for All, and 134 Introduction to Computer Science.\n', metadata={}), Document(page_content='COMPUTER SCIENCE 134: Introduction to Computer Science covers fundamental concepts in the design, implementation and testing of computer programs including loops, conditionals, functions, elementary data types and recursion. There is a strong focus on constructing correct, understandable and efficient p

In [81]:
query = "How many sections of CS 134 are being offered in fall 2024?"
res = ask_no_history(qa3, query)
print(res["answer"])
print(res["source_documents"])

 I don't know.
[Document(page_content='Students who take Computer Science 102T, 103, or 104 prior to Fall 2023 may use that course as one of the two electives required for the major in Computer Science. Those classes cannot be counted toward the major if taken in Fall 2023 or later semesters. Computer Science 102T, 103, 104, and 134 are not open to students who have taken a Computer Science course numbered 136 or higher.\n', metadata={}), Document(page_content='Alternate Year Courses: Computer Science 102, 103, 104, and our electives are each usually offered at least every other year. All other Computer Science courses are normally offered every year.\n', metadata={}), Document(page_content='INTRODUCTORY COURSES: Computer Science 134 provides an introduction to computer science with a focus on developing computer programming skills. These skills are essential to most upper-level courses in the department. As a result, Computer Science 134 together with Computer Science 136, are require

In [82]:
query = "Can I take CS 136 as a fifth course?"
res = ask_no_history(qa3, query)
print(res["answer"])
print(res["source_documents"])

 No, CS 136 is a required prerequisite for most advanced courses in the department and must be taken as one of the two electives required for the major in Computer Science.
[Document(page_content='Students who take Computer Science 102T, 103, or 104 prior to Fall 2023 may use that course as one of the two electives required for the major in Computer Science. Those classes cannot be counted toward the major if taken in Fall 2023 or later semesters. Computer Science 102T, 103, 104, and 134 are not open to students who have taken a Computer Science course numbered 136 or higher.\n', metadata={}), Document(page_content='INTRODUCTORY COURSES: Computer Science 134 provides an introduction to computer science with a focus on developing computer programming skills. These skills are essential to most upper-level courses in the department. As a result, Computer Science 134 together with Computer Science 136, are required as a prerequisite to most advanced courses in the department. Those student

In [84]:
query = "What's the full name for CSCI 374 and what topics does it cover?"
res = ask_no_history(qa3, query)
print(res["answer"])
print(res["source_documents"])

 I don't know.
[Document(page_content='MAJOR REQUIREMENTS Introductory Courses: Computer Science 134 Introduction to Computer Science, Computer Science 136 Data Structures and Advanced Programming \n', metadata={}), Document(page_content='INTRODUCTORY COURSES: Computer Science 134 provides an introduction to computer science with a focus on developing computer programming skills. These skills are essential to most upper-level courses in the department. As a result, Computer Science 134 together with Computer Science 136, are required as a prerequisite to most advanced courses in the department. Those students intending to take several Computer Science courses are urged to take 134 early.\n', metadata={}), Document(page_content='COMPUTER SCIENCE 134: Introduction to Computer Science covers fundamental concepts in the design, implementation and testing of computer programs including loops, conditionals, functions, elementary data types and recursion. There is a strong focus on constructi

In [88]:
courses_prefixed_separated = []
with open("./data/courses_prefixed_separated.txt", "r") as f:
    for line in f:
        courses_prefixed_separated.append(line)

# courses_prefixed_separated

["{course: CSCI 104 Section 01, year: 2024, semester: Fall, courseID: 022117, department: CSCI, number: 104, section: 01, sectionType: in-person, peoplesoftNumber: 1165, consent: N, gradingBasisDesc: Pass/Fail Available, Fifth Course Unavailable, classType: Lecture, titleLong: Data Science And Computing For All, titleShort: Data Science And Computing, instructors: [{'id': 2346, 'name': 'Stephen N. Freund'}], meetings: [{'days': 'MWF', 'start': '10:00', 'end': '10:50', 'facility': ''}], courseAttributes: {'div1': False, 'div2': False, 'div3': True, 'dpe': False, 'qfr': True, 'wac': False, 'passFail': True, 'fifthCourse': False}, classFormat: , classReqEval: Weekly lab assignments involving programming, a project, and examinations., extraInfo: , prereqs: None; previous programming experience or statistics is not required., departmentNotes: Additional details about the class are available here: https://www.cs.williams.edu/~cs104. Please see the Computer Science Department website for more

In [89]:
db_courses_processed = FAISS.from_texts(courses_prefixed_separated, embeddings)

# Save vectorstore
with open("./db_courses_processed.pkl", "wb") as f:
    pickle.dump(db_courses_processed, f)

In [90]:
qa_processed_data = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), db_courses_processed.as_retriever(search_kwargs={"k": 6}), return_source_documents=True)

In [98]:
query = "Which professors are teaching 134?"
res = ask_no_history(qa_processed_data, query)
print(res["answer"])
for source in res["source_documents"]:
    print(source)

 Mark Hopkins and Bill K. Jannen.
page_content='CSCI 136: This course builds on the programming skills acquired in Computer Science 134. It couples work on program design, analysis, and verification with an introduction to the study of data structures. Data structures capture common ways in which to store and manipulate data, and they are important in the construction of sophisticated computer programs. Students are introduced to some of the most important and frequently used data structures: lists, stacks, queues, trees, hash tables, graphs, and files. Students will be expected to write several programs, ranging from very short programs to more elaborate systems. Emphasis will be placed on the development of clear, modular programs that are easy to read, debug, verify, analyze, and modify.\n' metadata={}
page_content="{course: CSCI 134 Section 01, year: 2024, semester: Fall, courseID: 010801, department: CSCI, number: 134, section: 01, sectionType: in-person, peoplesoftNumber: 1176, c

In [99]:
with open("./data/courses_prefixed_separated.txt", "r") as f:
    for line in f:
        print(len(line))

1395
1393
1396
1396
1394
1394
1346
1342
1344
1340
1344
1340
1345
1341
1347
1343
1341
1345
1345
1341
1347
1343
1345
1341
1207
1204
1204
1207
1208
1205
1205
1208
1205
1208
1208
1205
902
907
908
902
908
903
903
903
903
1048
1047
1047
964
966
966
936
934
891
893
893
1083
1085
1085
1085
1085
1085
1035
1035
1081
932
909
909
822
824
948
996
862
822
824
776
911
754
492
782
593
480
884
573
1069
829
994
451
60
60
401
478
81
60
60
