In [1]:
import langchain
from langchain_community.document_loaders import DataFrameLoader
import json
import pandas as pd
import getpass
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings 
import openai 
from langchain_core.documents import Document

  from tqdm.autonotebook import tqdm


In [3]:
with open('../data/subject_areas.json', 'r') as f:
    subject_areas = json.load(f)

In [4]:
subject_dict = [{k: v for k, v in d.items() if k in ['code', 'title'] } for d in subject_areas]

In [5]:
print(subject_dict[1])
print(len(subject_dict))

{'code': 'A E', 'title': 'AEROSPACE ENGINEERING'}
469


In [6]:
abbrv_desc = [f'{title}: {code}' for code, title in zip([d['code'] for d in subject_dict], [d['title'] for d in subject_dict])]

In [7]:
print(abbrv_desc)

['AERONAUTICS & ASTRONAUTICS: A A', 'AEROSPACE ENGINEERING: A E', 'AEROSPACE STUDIES (AIR FORCE ROTC): A S', 'ASIAN AMERICAN STUDIES: AAS', 'UNIVERSITY ACADEMY: ACADEM', 'ACCOUNTING: ACCTG', 'ADMINISTRATION: ADMIN', 'AMERICAN ETHNIC STUDIES: AES', 'AFRICAN-AMERICAN STUDIES: AFRAM', 'AMERICAN INDIAN STUDIES: AIS', 'APPLIED MATHEMATICS: AMATH', 'AMHARIC: AMHAR', 'ANESTHESIOLOGY: ANEST', 'ANTHROPOLOGY: ANTH', 'ARABIC: ARAB', 'ARAMAIC: ARAMIC', 'ARCHITECTURE: ARCH', 'ARCHAEOLOGY: ARCHY', 'ARCTIC STUDIES: ARCTIC', 'ART: ART', 'ART HISTORY: ART H', 'INTERDISCIPLINARY ARTS: ARTS', 'ARTS AND SCIENCES: ARTSCI', 'ASIAN LANGUAGES & LITERATURE: ASIAN', 'AMERICAN SIGN LANGUAGE: ASL', 'ASTROBIOLOGY: ASTBIO', 'ASTRONOMY: ASTR', 'ATMOSPHERIC SCIENCES: ATM S', 'BUSINESS ADMINISTRATION: B A', 'ACCOUNTING(BOTHELL): B ACCT', 'ARABIC - BOTHELL: B ARAB', 'BUSINESS ECONOMICS (BOTHELL CAMPUS): B BECN', 'BIOLOGY - UW BOTHELL: B BIO', 'BUSINESS SKILLS (BOTHELL CAMPUS): B BSKL', 'BUSINESS ADMINISTRATION (BOTHELL

In [16]:
departments = ['SWA', 'ARCTIC', 'TAGLG', 'AAS', 'CHSTU', 'BENG', 'CFRM', 'AFRAM', 'AES', 'ASIAN',
 'AMATH', 'ART', 'DESIGN', 'AIS', 'INDO', 'INDN', 'ARCHY', 'URDU', 'HINDI', 'SNKRT',
 'CHIN', 'ASTBIO', 'KOREAN', 'HUM', 'VIET', 'JAPAN', 'ASTR', 'CMS', 'GREEK', 'CLAS',
 'LATIN', 'DXARTS', 'ANTH', 'INDIV', 'CHID', 'CHEM', 'TXTDS', 'BIOL', 'DANCE',
 'COM', 'FRENCH', 'ITAL', 'HSTAFM', 'DRAMA', 'CSE', 'ECON', 'HSTLAC', 'HSTAM',
 'HPS', 'HSTRY', 'GWSS', 'HSTAS', 'HSTCMP', 'LABOR', 'GEOG', 'ASL', 'HSTEU',
 'GERMAN', 'RELIG', 'ARAMIC', 'COPTIC', 'GEEZ', 'EGYPT', 'LADINO', 'MICROM', 'LSJ',
 'JSIS', 'ARAB', 'TURKIC', 'CHGTAI', 'LING', 'BIBHEB', 'MODHEB', 'PRSAN', 'MATH',
 'ENGL', 'KYRGYZ', 'KAZAKH', 'UGARIT', 'UZBEK', 'UYGUR', 'TKISH', 'MUSAP', 'MUSED',
 'NEUSCI', 'ETHICS', 'MUSEN', 'MUHST', 'MELC', 'FINN', 'DANISH', 'ESTO', 'LITH',
 'LATV', 'NORW', 'BCMS', 'SWED', 'CZECH', 'BULGR', 'MUSICP', 'ROMN', 'PHIL',
 'POLSH', 'GLITS', 'UKR', 'SLVN', 'PORT', 'SCAND', 'PHYS', 'SPLING', 'MUSIC',
 'SLAVIC', 'RUSS', 'PSYCH', 'CEP', 'CM', 'SPHSC', 'STAT', 'ACCTG', 'ARCH', 'ENTRE',
 'SOC', 'FIN', 'SPAN', 'QMETH', 'OPMGT', 'MGMT', 'DPHS', 'MKTG', 'ORALM', 'OHS',
 'ORTHO', 'ECE', 'PERIO', 'ECFS', 'EDUC', 'NME', 'EDSPE', 'CEE', 'EDPSY', 'EDLPS',
 'ENGR', 'HCDE', 'BSE', 'FISH', 'FHL', 'MARBIO', 'ENVIR', 'SMEA', 'GRDSCH', 'ESRM',
 'INFO', 'OCEAN', 'ESS', 'LAW', 'NUTR', 'UCONJ', 'BIOC', 'ANEST', 'BIME', 'BIOEN',
 'IMMUN', 'GENOME', 'CONJ', 'PATH', 'MED', 'FAMED', 'OPHTH', 'OTOHN', 'ORTHP',
 'PHCOL', 'IECMH', 'RADGY', 'UROL', 'PBSCI', 'NEURL', 'SURG', 'NMETH', 'NCLIN',
 'PEDS', 'REHAB', 'MEDCH', 'PCEUT', 'PHARM', 'NURS', 'PUBPOL', 'HIHIM', 'BIOST',
 'EPI', 'PABIO', 'PHG', 'SPH', 'LEAD', 'HONORS', 'HSERV']


print(len(departments))



195


In [8]:
#Setting up keys
pinecone_api_key = os.environ.get("pinecone_API")

pc = Pinecone(api_key=pinecone_api_key)

api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = api_key


In [9]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

pc.create_index(
    name= 'department-abbrev-db',
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)



In [10]:
index = pc.Index("department-abbrev-db")
vector_store = PineconeVectorStore(index , embedding = embeddings)

In [22]:
vector_store.delete(delete_all = True)

In [6]:
index = pc.Index("department-abbrev-db")

In [11]:
documents = [Document(page_content = doc) for doc in abbrv_desc]
print(documents[0])

page_content='AERONAUTICS & ASTRONAUTICS: A A'


In [12]:

vector_store.add_documents(documents)



['bae232dd-5889-4faa-9bf8-67d1cd32ac02',
 'ca351f52-ac0b-47d8-8cc7-4b9a2118d003',
 '44101f0d-0e2c-4fb2-bced-62f0a46b42d2',
 'f5cd5621-a583-4d2a-9b1a-d7256a0acd6e',
 'd08ab2b3-856b-498d-8295-41c6b654239c',
 'f19de0f2-9821-4675-9938-3dba492026f3',
 'ec3d6598-2109-4fe7-8086-15f801adec20',
 '15ae292b-9175-4f45-af80-a29773e128fa',
 '8c737da1-7208-4dca-8955-204aa911f41a',
 'f6349a83-33ad-46a0-98cb-7dfc7853cfee',
 'cdbac2e4-f1ad-4a57-92da-7519a7813c68',
 'a1712dea-d1fb-4b14-b837-34b6cf6d1062',
 '20d20f42-5777-4001-b12d-7fda263f3c03',
 '6fef9c92-d7d6-4f33-ac43-9c076a8bb388',
 'c0f122bc-17cf-42e8-9732-8e5054f07c7a',
 'a6733b9a-4e7b-43c2-acf4-297ba9fe3810',
 '58ad57fd-1f8f-40b9-9a03-2cc94bcf784d',
 '6c3f375d-1ad6-41d5-9df6-07339182988a',
 '0ad53309-2dfa-4a63-b9e4-51b12edda709',
 '8796ab0e-f24c-4659-97bc-b810d785f165',
 '9388d9b1-609e-479f-b595-beb7f9f98e05',
 'f8de473c-c4a5-4d6b-b96d-53362c8081b4',
 '48a3612a-199f-438e-9cb0-21ac6ed45160',
 '90f4730f-1420-4282-8380-a1925924399d',
 'f1caa59f-42c3-

In [34]:
results = vector_store.similarity_search(
    "I want to take a 400 level class relating to the environment which has a rate my professor score of 4 or higher",
    k=4,
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* COLLEGE OF THE ENVIRONMENT: C ENV [{}]
* ENVIRONMENTAL SCIENCE - UW BOTHELL: BES [{}]
* CIVIL & ENVIRONMENTAL ENGINEERING: CEE [{}]
* ENVIRONMENTAL SUSTAINABILITY: TEST [{}]
