In [1]:
import random
import pandas as pd
import json

In [2]:

def prompt_generator(prompt_stems, answer_stems, prompt_keyword, answer_keyword):
    """Build one user/assistant pair from randomly chosen templates.

    Args:
        prompt_stems: Sequence of question templates; each is formatted with
            ``prompt_keyword`` only.
        answer_stems: Sequence of answer templates; each is formatted with
            both ``prompt_keyword`` and ``answer_keyword``.
        prompt_keyword: Value substituted for ``{prompt_keyword}``.
        answer_keyword: Value substituted for ``{answer_keyword}``.

    Returns:
        A dict with the keys ``"user"`` (filled-in question) and
        ``"assistant"`` (filled-in answer).
    """
    # Draw the question template first, then the answer template.
    question_template = random.choice(prompt_stems)
    reply_template = random.choice(answer_stems)

    return {
        "user": question_template.format(prompt_keyword=prompt_keyword),
        "assistant": reply_template.format(
            prompt_keyword=prompt_keyword,
            answer_keyword=answer_keyword,
        ),
    }

# Generating datasets for Majors and Schools

In [3]:
# Load the majors table; the loop that builds the major prompts reads its
# 'Department Name', 'Subject Area Name' and 'School' columns.
df_majors = pd.read_csv("majors_to_scrape.csv")

# Two-letter school code (looked up from the 'School' column of the majors
# table) -> full school name.
school_dict = dict(
    LS="College of Letters and Science",
    AA="School of the Arts and Architecture",
    EN="School of Engineering",
    MN="School of Medicine",
    PH="School of Public Health",
    EI="School of Education and Information Studies",
    MU="School of Music",
    PA="School of Public Affairs",
    LW="School of Law",
    MG="School of Management",
    NS="School of Nursing",
    TF="School of Theater, Film, and Television",
)

# Question templates asking which school a major belongs to.
# Every entry ends with a comma: without one, Python silently concatenates
# adjacent string literals, which previously fused the last two questions
# into a single stem.
prompt_stem_major = [
    "What school does the {prompt_keyword} major belong to?",
    "Which school is the {prompt_keyword} major under?",
    "Which school should I apply to in order to study the {prompt_keyword} major?",
    "Which college or professional school houses the {prompt_keyword} major?",
]
# Answer templates; prompt_generator fills in both keywords.
solution_stem_major = ["The {prompt_keyword} major belongs to the {answer_keyword}."]
solution_stem_department = ["The {prompt_keyword} major falls under the Department of {answer_keyword}."]

# Question templates asking which department a major falls under.
prompt_stem_department = [
    "What department does the {prompt_keyword} major fall under?",
    "Which department has the {prompt_keyword} major?",
    "To find out more about the {prompt_keyword} major, which department should I go to?",
    "In order to enroll in the {prompt_keyword} major, which department should I speak to?",
]
# Display the column names of the majors table for reference.
df_majors.columns

Index(['Department Name', 'Abbreviation', 'Subject Area Name',
       'Abbreviation.1', 'Div', 'School', 'are_classes'],
      dtype='object')

In [4]:
# Compose two separate prompt lists: one mapping each major to its school,
# the other mapping each major to its department.
major_to_school_prompts = []
major_to_department_prompts = []

# Walk the three needed columns in lockstep rather than re-slicing the frame
# with .iloc once per field for every row.
for dep_name, maj_name, school_abbrev in zip(
    df_majors['Department Name'],
    df_majors['Subject Area Name'],
    df_majors['School'],
):
    # Raises KeyError if the table holds a school code missing from school_dict.
    school = school_dict[school_abbrev]

    # Generate two prompts per data entry. The stems are drawn at random, so
    # the two prompts for one entry may use the same stem. The loop variable
    # is a throwaway so it no longer shadows an outer index.
    for _ in range(2):
        major_to_school_prompts.append(
            prompt_generator(prompt_stem_major, solution_stem_major, maj_name, school)
        )
        major_to_department_prompts.append(
            prompt_generator(prompt_stem_department, solution_stem_department, maj_name, dep_name)
        )


In [5]:
# Write each prompt list to its own JSON file, pretty-printed with a
# four-space indent.
for json_path, prompt_records in (
    ('prompts_major_school.json', major_to_school_prompts),
    ('prompts_major_department.json', major_to_department_prompts),
):
    with open(json_path, 'w') as handle:
        json.dump(prompt_records, handle, indent=4)

# Generating Datasets for Course Information

In [7]:
# Load the class / professor-rating table.
df_classes = pd.read_csv('prof_ratings_and_class_data_F_24.csv')

# Keep only rows whose day code is at most five characters long (rows with a
# missing class_dates compare False and are dropped as well). The .copy()
# makes the filtered result an independent frame, so the column assignments
# below do not write into a slice of the original (SettingWithCopyWarning).
df_classes = df_classes[df_classes['class_dates'].str.len() <= 5].copy()

# NOTE(review): these fixed-width slices assume every class_name ends with a
# five-character section label preceded by one separator character
# (e.g. "... Lec 1"); a longer label such as "Lec 10" would be split in the
# wrong place -- confirm against the data.
df_classes['lec'] = df_classes['class_name'].str[-5:]
df_classes['class_name'] = df_classes['class_name'].str[:-6]

# Vectorized concatenation instead of a row-wise apply.
df_classes['year_semester'] = df_classes['year'].astype(str) + " " + df_classes['semester']

df_classes.head()

Unnamed: 0,class_name,class_dates,class_times,class_professor,year,semester,prof_rating,bruin_walk_url,lec,year_semester
0,Aerospace Studies (AERO ST) 1A - Heritage and ...,F,12pm-12:50pm,"Malone, M.B.",2024,fall,,https://bruinwalk.com/professors/morgan-b-malo...,Lec 1,2024 fall
1,Aerospace Studies (AERO ST) 20A - Team and Lea...,F,1pm-1:50pm,"Malone, M.B.",2024,fall,,https://bruinwalk.com/professors/morgan-b-malone,Lec 1,2024 fall
2,Aerospace Studies (AERO ST) 130A - Air Force L...,F,12pm-2:50pm,"Everhart, R.",2024,fall,,https://bruinwalk.com/professors/robert-everha...,Lec 1,2024 fall
3,Aerospace Studies (AERO ST) 140A - National Se...,F,12pm-2:50pm,"Allison, M.A.",2024,fall,,https://bruinwalk.com/professors/mae-li-a-alli...,Lec 1,2024 fall
4,African American Studies (AF AMER) 1 - Introdu...,MW,1pm-2:50pm,"Streeter, C.A.",2024,fall,,https://bruinwalk.com/professors/caroline-stre...,Lec 1,2024 fall


In [8]:
# Single-letter weekday codes, in calendar order, with their full names.
_weekday_names = {
    "M": "Monday",
    "T": "Tuesday",
    "W": "Wednesday",
    "R": "Thursday",
    "F": "Friday",
}
_weekday_order = list(_weekday_names)

# Enumerate every subset of the weekday codes; letters inside each subset stay
# in calendar order because each code is only ever appended to earlier ones.
_code_subsets = [""]
for _code in _weekday_order:
    _code_subsets += [_prefix + _code for _prefix in _code_subsets]

# Drop the empty subset and order the rest by length, then by calendar
# position of their letters (M, T, ..., MT, MW, ..., MTWRF).
_code_subsets = sorted(
    (_subset for _subset in _code_subsets if _subset),
    key=lambda subset: (len(subset), [_weekday_order.index(ch) for ch in subset]),
)

# Day-code string (e.g. "MWF") -> space-separated weekday names.
day_map = {
    _subset: " ".join(_weekday_names[ch] for ch in _subset)
    for _subset in _code_subsets
}


def concat_class_string(lec, day, hour):
    """Return ``"<lec>: <weekday names> at <hour>"`` for one section.

    ``day`` is looked up in ``day_map``; an unknown code raises KeyError.
    All three pieces must be strings.
    """
    days_spelled_out = day_map[day]
    return "".join([lec, ": ", days_spelled_out, " at ", hour])
    

In [10]:
# Build one "<section>: <weekdays> at <time>" string per row for the language
# model. A comprehension over the three columns avoids the overhead of
# constructing a Series for every row, as the row-wise apply did.
df_classes['class_info_string'] = [
    concat_class_string(lec, days, times)
    for lec, days, times in zip(
        df_classes['lec'], df_classes['class_dates'], df_classes['class_times']
    )
]

# Collect the section strings of each (class, term, professor, rating, url)
# combination into a list; dropna=False keeps groups whose rating is missing.
grouped = df_classes.groupby(
    ['class_name', 'year_semester', 'class_professor', 'prof_rating', 'bruin_walk_url'],
    dropna=False,
)['class_info_string'].agg(list)

# class_name -> [semester, professor, rating, url, list of section strings].
# NOTE: the value is positional, and a class that appears under more than one
# (semester, professor, rating, url) group keeps only the last group iterated.
nested_schedule = {}
for (class_name, semester, class_professor, prof_rating, bruin_walk_url), info in grouped.items():
    nested_schedule[class_name] = [semester, class_professor, prof_rating, bruin_walk_url, info]

grouped.head(20)

class_name                                                                                                                                           year_semester  class_professor     prof_rating  bruin_walk_url                                                       
Aerospace Studies (AERO ST) 130A - Air Force Leadership Studies                                                                                      2024 fall      Everhart, R.        NaN          https://bruinwalk.com/professors/robert-everhart/aero-st-130a/                             [Lec 1: Friday at 12pm-2:50pm]
Aerospace Studies (AERO ST) 140A - National Security Affairs/Preparation for Active Duty                                                             2024 fall      Allison, M.A.       NaN          https://bruinwalk.com/professors/mae-li-a-allison/aero-st-140a/                            [Lec 1: Friday at 12pm-2:50pm]
Aerospace Studies (AERO ST) 1A - Heritage and Values                                           

In [11]:
def class_schedule_to_string(class_name, class_schedule):
    """Describe a class's scheduled sections in one sentence block.

    Args:
        class_name: Name used as the subject of the summary sentence.
        class_schedule: Sequence of strings shaped like
            ``"<section label>: <when>"``.

    Returns:
        ``"<class_name> is offered in <n> section(s). <label> is offered on
        <when>, and ..."``. The count is the number of entries in
        ``class_schedule``; these are sections, not quarters, so the sentence
        says "section(s)". An empty schedule yields only the summary sentence.

    Raises:
        ValueError: If an entry does not contain the ``": "`` separator.
    """
    section_count = len(class_schedule)
    noun = "section" if section_count == 1 else "sections"
    summary = f"{class_name} is offered in {section_count} {noun}."

    # Nothing to list: return the summary alone rather than a dangling ". .".
    if not class_schedule:
        return summary.strip()

    section_details = []
    for entry in class_schedule:
        # Split on the first ": " only; times such as "12:50pm" contain a colon.
        section_label, section_time = entry.split(': ', 1)
        section_details.append(f"{section_label} is offered on {section_time}")

    return f"{summary} {', and '.join(section_details)}.".strip()


In [12]:
# Build one descriptive text block per class for the embedding input file.

embedding_sentences = []

for class_name, schedule_entry in nested_schedule.items():
    # class_name looks like "<class id> - <class title>". Split on the first
    # separator only so a title that itself contains " - " stays intact; a
    # name without the separator yields an empty title instead of an error.
    class_id, _, class_title = class_name.partition(' - ')

    # Positional layout: [semester, professor, rating, url, section strings].
    class_prof = schedule_entry[1]
    prof_rating = schedule_entry[2]
    prof_url = schedule_entry[3]
    class_schedule = schedule_entry[4]

    # The title sentence now uses class_title (it previously repeated class_id).
    definition_string = f"{class_name} has class ID {class_id}. {class_name} has class title {class_title}. "
    definition_string += class_schedule_to_string(class_name, class_schedule)
    definition_string += f" {class_name} has professor {class_prof} with a Bruin Walk rating of {prof_rating}. More info can be found at {prof_url}"
    embedding_sentences.append(definition_string)

In [13]:
# Write the embedding sentences as UTF-8 text, separated by newlines and with
# no trailing newline.
with open('embedding_input_followup.txt', mode='wt', encoding='utf-8') as out_file:
    embedding_text = '\n'.join(embedding_sentences)
    out_file.write(embedding_text)

In [7]:
# Answer template for class-schedule questions; {answer_keyword} receives the
# formatted schedule text.
answer_stem_class = ["The class {prompt_keyword} has the following time schedules:\n\n{answer_keyword}"]

# Question templates asking about a class's schedule.
prompt_stem_class = [
    "What lecture times are available for {prompt_keyword}?",
    "What is the schedule like for {prompt_keyword}?",
    "How many offerings are out there for {prompt_keyword}?",
    "If I want to take {prompt_keyword}, what time slots do I need to free out?",
]

In [49]:
# Build schedule prompts keyed three ways: full class name, class ID, and
# class title.

class_prompts = []

for class_name, schedule_entry in nested_schedule.items():
    # class_name looks like "<class id> - <class title>"; split on the first
    # separator only so titles containing " - " stay intact.
    class_id, _, class_title = class_name.partition(' - ')

    # nested_schedule values are positional lists laid out as
    # [semester, professor, rating, url, section strings]. They are not dicts
    # keyed by semester, so indexing them by semester name raised TypeError.
    semester = schedule_entry[0]
    section_strings = schedule_entry[4]
    answer_keyword = "In " + semester + " quarter:\n" + '\n'.join(section_strings) + "\n\n"

    # One prompt per keyword variant (three per class). A blank variant, which
    # occurs when the name has no " - " separator, is skipped.
    for prompt_keyword in (class_name, class_id, class_title):
        if not prompt_keyword:
            continue
        class_prompts.append(
            prompt_generator(prompt_stem_class, answer_stem_class, prompt_keyword, answer_keyword)
        )



In [50]:
# Save the class prompts as JSON, pretty-printed with a four-space indent.
with open('prompts_classes.json', mode='w') as handle:
    json.dump(obj=class_prompts, fp=handle, indent=4)

In [8]:
# Report how many prompts each list holds, one count per line:
# class prompts, major-to-school prompts, major-to-department prompts.
print(
    len(class_prompts),
    len(major_to_school_prompts),
    len(major_to_department_prompts),
    sep="\n",
)

NameError: name 'class_prompts' is not defined