In [33]:
import itertools
import json
import random

import pandas as pd

In [25]:

def prompt_generator(prompt_stems, answer_stems, prompt_keyword, answer_keyword):
    """Build one user/assistant exchange from randomly chosen templates.

    Args:
        prompt_stems: Sequence of question templates; each may contain a
            ``{prompt_keyword}`` placeholder.
        answer_stems: Sequence of answer templates; each may contain
            ``{prompt_keyword}`` and ``{answer_keyword}`` placeholders.
        prompt_keyword: Value substituted for ``{prompt_keyword}``.
        answer_keyword: Value substituted for ``{answer_keyword}``.

    Returns:
        A dict with the formatted question under ``"user"`` and the
        formatted answer under ``"assistant"``.
    """
    # Draw the question template before the answer template so the sequence
    # of random draws stays the same for a given seed.
    question_template = random.choice(prompt_stems)
    reply_template = random.choice(answer_stems)

    # The question only receives prompt_keyword; the answer receives both.
    question = question_template.format(prompt_keyword=prompt_keyword)
    reply = reply_template.format(
        prompt_keyword=prompt_keyword,
        answer_keyword=answer_keyword,
    )

    return dict(user=question, assistant=reply)

# Generating Datasets for Majors and Schools

In [30]:
# Load the majors table. The generation loop further down reads its
# 'Department Name', 'Subject Area Name' and 'School' columns.
df_majors = pd.read_csv("majors_to_scrape.csv")

# Maps the two-letter codes looked up from the 'School' column to the full
# school name that is inserted into the generated answers.
school_dict = {
    "LS": "College of Letters and Science",
    "AA": "School of the Arts and Architecture",
    "EN": "School of Engineering",
    "MN": "School of Medicine",
    "PH": "School of Public Health",
    "EI": "School of Education and Information Studies",
    "MU": "School of Music",
    "PA": "School of Public Affairs",
    "LW": "School of Law",
    "MG": "School of Management",
    "NS": "School of Nursing",
    "TF": "School of Theater, Film, and Television"
}

# Question templates asking which school a major belongs to.
# {prompt_keyword} is filled in with the major name via str.format.
# Every entry needs its own trailing comma: adjacent string literals without
# one are silently concatenated into a single template.
prompt_stem_major = [
    "What school does the {prompt_keyword} major belong to?",
    "Which school is the {prompt_keyword} major under?",
    "Which school should I apply to in order to study the {prompt_keyword} major?",
    "Which college or professional school houses the {prompt_keyword} major?",
]
# Question templates asking which department a major belongs to.
# {prompt_keyword} is filled in with the major name via str.format.
prompt_stem_department = [
    "What department does the {prompt_keyword} major fall under?",
    "Which department has the {prompt_keyword} major?",
    "To find out more about the {prompt_keyword} major, which department should I go to?",
    "In order to enroll in the {prompt_keyword} major, which department should I speak to?"
]
# Answer templates for the major -> school questions.
# {answer_keyword} is filled in with the full school name.
solution_stem_major = [
    "The {prompt_keyword} major belongs to the {answer_keyword}."
]
# Answer templates for the major -> department questions.
# {answer_keyword} is filled in with the 'Department Name' value.
solution_stem_department = [
    "The {prompt_keyword} major falls under the Department of {answer_keyword}."
]
# Display the available column names of the majors table.
df_majors.columns

Index(['Department Name', 'Abbreviation', 'Subject Area Name',
       'Abbreviation.1', 'Div', 'School', 'are_classes'],
      dtype='object')

In [31]:
# Compose two separate prompt lists: one for major -> school, the other for
# major -> department. Each row of df_majors contributes two prompts to each.
major_to_school_prompts = []
major_to_department_prompts = []

# Walk the three needed columns in lockstep instead of repeating a positional
# row lookup for every field.
major_rows = zip(
    df_majors['Department Name'],
    df_majors['Subject Area Name'],
    df_majors['School'],
)
for dep_name, maj_name, school_abbrev in major_rows:
    # Raises KeyError if the table contains a school code missing from school_dict.
    school = school_dict[school_abbrev]

    # Generate two prompts per data entry. The loop counter is unused, so it
    # is named `_` rather than shadowing an outer index variable.
    for _ in range(2):
        major_to_school_prompts.append(
            prompt_generator(prompt_stem_major, solution_stem_major, maj_name, school)
        )
        major_to_department_prompts.append(
            prompt_generator(prompt_stem_department, solution_stem_department, maj_name, dep_name)
        )


In [34]:
# Write each prompt list (a list of {"user", "assistant"} dicts) to its own
# JSON file, pretty-printed with a 4-space indent.
json_outputs = (
    ('prompts_major_school.json', major_to_school_prompts),
    ('prompts_major_department.json', major_to_department_prompts),
)
for json_path, prompt_records in json_outputs:
    with open(json_path, 'w') as file:
        json.dump(prompt_records, file, indent=4)

# Generating Datasets for Course Information

In [59]:
# Load the raw class listing and keep it under its own name so the unfiltered
# data stays available for inspection.
classes_raw = pd.read_csv('ucla_class_info.csv')

# Keep only rows whose day-code string is at most five characters long.
has_short_day_code = classes_raw['class_dates'].str.len() <= 5
# .copy() makes the filtered frame independent of classes_raw, so the column
# assignments below act on a real frame rather than a slice (this avoids
# pandas' SettingWithCopyWarning).
df_classes = classes_raw[has_short_day_code].copy()

# The last five characters of class_name are taken as the lecture tag
# (e.g. "Lec 1"); the tag plus the one character before it is then removed
# from class_name.
# NOTE(review): a longer tag such as "Lec 10" would be cut incorrectly by the
# fixed-width slices - confirm against the data.
df_classes['lec'] = df_classes['class_name'].str[-5:]
df_classes['class_name'] = df_classes['class_name'].str[:-6]

df_classes.head()

Unnamed: 0,class_name,class_dates,class_times,year,semester,lec
0,Aerospace Studies (AERO ST) 1A - Heritage and ...,F,1pm-1:50pm,2020,fall,Lec 1
1,Aerospace Studies (AERO ST) 1A - Heritage and ...,T,9am-9:50am,2020,fall,Lec 2
2,Aerospace Studies (AERO ST) 20A - Team and Lea...,F,2pm-2:50pm,2020,fall,Lec 1
3,Aerospace Studies (AERO ST) 20A - Team and Lea...,W,1pm-1:50pm,2020,fall,Lec 2
4,Aerospace Studies (AERO ST) 130A - Air Force L...,T,8am-10:50am,2020,fall,Lec 1


In [46]:
# Single-letter weekday codes in calendar order ("R" stands for Thursday).
_DAY_NAMES = {
    "M": "Monday",
    "T": "Tuesday",
    "W": "Wednesday",
    "R": "Thursday",
    "F": "Friday",
}

# Maps every non-empty, calendar-ordered combination of weekday codes
# (e.g. "MWF") to its space-separated day names ("Monday Wednesday Friday").
# Generating the 31 entries from the five base codes avoids typos or omissions
# in a hand-written table; keys appear shortest first, then in calendar order.
day_map = {
    "".join(codes): " ".join(_DAY_NAMES[code] for code in codes)
    for size in range(1, len(_DAY_NAMES) + 1)
    for codes in itertools.combinations(_DAY_NAMES, size)
}


def concat_class_string(lec, day, hour):
    """Format one lecture slot as "<lec>: <day names> at <hour>".

    Args:
        lec: Lecture label string, e.g. "Lec 1".
        day: Day-code string used as a key into ``day_map``, e.g. "MWF".
        hour: Time-range string, e.g. "1pm-1:50pm".

    Returns:
        The combined description string.

    Raises:
        KeyError: If ``day`` is not a key of ``day_map``.
    """
    label_part = lec + ": "
    schedule_part = day_map[day] + " at " + hour
    return label_part + schedule_part
    

In [70]:
# Build a human-readable schedule string for every row, for use by the
# language model.
def _format_schedule_row(row):
    """Return the lecture/day/time description string for one df_classes row."""
    return concat_class_string(row['lec'], row['class_dates'], row['class_times'])


df_classes['class_info_string'] = df_classes.apply(_format_schedule_row, axis=1)

# Collect the schedule strings into a dict: class name -> set of strings.
# The values are sets, so duplicates collapse and order is not preserved.
# Grouping is by class_name only, so rows from different year/semester values
# land in the same set.
info_by_class = df_classes.groupby('class_name')['class_info_string']
course_schedule_dict = info_by_class.agg(set).to_dict()
print(course_schedule_dict['Aerospace Studies (AERO ST) 1A - Heritage and Values'])

{'Lec 1: Friday at 1pm-1:50pm', 'Lec 1: Friday at 12pm-12:50pm', 'Lec 2: Tuesday at 9am-9:50am'}
