In [None]:
# Notebook Parameters

# Folder holding the input/output CSV files. Kept with a trailing "/" because
# later cells build file paths by plain string concatenation.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running on another computer.
fold_pth = "N:/Code/GITHUB/Analytics_Day/Analytics_Day_Fall2022" + "/data/"

# File (inside fold_pth) read by the "Data Import" cell.
input_data_name = "MERGED_PP" + ".csv"

# File (inside fold_pth) written by the "Data Save" cell.
output_data_name = "ratemyprof_reviews" + ".csv"

In [None]:
# Class Creation

import requests, json, math, csv, os
import pandas as pd

from professor import Professor
# This code has been tested using Python 3.6 interpreter and Linux (Ubuntu).
# It should run under Windows, if anything you may need to make some adjustments for the file paths of the CSV files.


class ProfessorNotFound(Exception):
    """Raised when a professor lookup matches nothing.

    Attributes:
        search_argument: the value the client searched for, e.g. "Pattis".
        search_parameter: the criterion it was matched against, e.g. "Last Name".
    """

    def __init__(self, search_argument, search_parameter: str = "Name"):
        super().__init__(search_argument, search_parameter)

        # What the client is looking for. Ex: "Professor Pattis"
        self.search_argument = search_argument

        # The search criteria. Ex: Last Name
        self.search_parameter = search_parameter

    def __str__(self):
        """Return a human-readable description of the failed lookup."""
        return (
            "Professor not found."
            + f" The search argument {self.search_argument} did not"
            + f" match with any professor's {self.search_parameter}"
        )



class RateMyProfApi:
    """Scraper for the RateMyProfessors JSON endpoints of one school.

    Constructing an instance creates a ``SchoolID_<id>`` directory in the
    current working directory (the target of ``WriteReviewsListToCSV``) and
    immediately scrapes the school's professor listing over HTTP.

    Attributes:
        prnt: when truthy, every raw professor record is printed while scraping.
        UniversityId: the school id this instance was created for.
        professors: dict mapping ``Professor.ratemyprof_id`` to ``Professor``.
        professorlist: the raw professor records (dicts) returned by the site,
            in scrape order; consumed by ``WriteProfessorListToCSV``.
        indexnumber: ``False`` until ``search_professor`` succeeds, afterwards
            the matched ``Professor``.
    """

    # Both listing endpoints are treated as paginated in pages of 20 records.
    _PAGE_SIZE = 20

    def __init__(self, school_id: str = "1074", testing: bool = False, prnt = True):
        """Create the per-school output directory and scrape the professors.

        Args:
            school_id: RateMyProfessors school id (default "1074").
            testing: when True, stop the professor scrape after two pages.
            prnt: when truthy, print each scraped professor record.
        """
        self.prnt = prnt
        self.UniversityId = school_id
        os.makedirs("SchoolID_" + str(self.UniversityId), exist_ok=True)

        self.indexnumber = False
        self.professorlist = []

        # dict of Professor
        self.professors = self.scrape_professors(testing)

    def scrape_professors(self, testing: bool = False):
        """Download every page of the school's professor listing.

        Also stores the raw JSON records in ``self.professorlist``.

        Args:
            testing: when True, stop after the second page.

        Returns:
            dict mapping ``Professor.ratemyprof_id`` to ``Professor``.
        """
        professors = dict()
        raw_records = []
        num_of_prof = self.get_num_of_professors(self.UniversityId)
        num_of_pages = math.ceil(num_of_prof / self._PAGE_SIZE)
        for i in range(1, num_of_pages + 1):
            page = requests.get(
                "http://www.ratemyprofessors.com/filter/professor/?&page="
                + str(i)
                + "&filter=teacherlastname_sort_s+asc&query=*%3A*&queryoption=TEACHER&queryBy=schoolId&sid="
                + str(self.UniversityId)
            )
            json_response = json.loads(page.content)

            for json_professor in json_response["professors"]:
                if self.prnt:
                    print(json_professor)
                professor = Professor(
                    json_professor["tid"],
                    json_professor["tFname"],
                    json_professor["tLname"],
                    json_professor["tNumRatings"],
                    json_professor["overall_rating"])

                professors[professor.ratemyprof_id] = professor
                raw_records.append(json_professor)

            # for test cases, limit to 2 iterations
            if testing and (i > 1):
                break

        self.professorlist = raw_records
        return professors

    def get_num_of_professors(self, id):
        """Return the number of professors listed for the school ``id``.

        Computed as the first page's ``remaining`` field plus one page (20).
        NOTE(review): presumably ``remaining`` counts the records that follow
        the first page — confirm against the live endpoint.
        """
        page = requests.get(
            "http://www.ratemyprofessors.com/filter/professor/?&page=1&filter=teacherlastname_sort_s+asc&query=*%3A*&queryoption=TEACHER&queryBy=schoolId&sid="
            + str(id))  # get request for page

        temp_jsonpage = json.loads(page.content)

        return temp_jsonpage["remaining"] + self._PAGE_SIZE

    def search_professor(self, ProfessorName):
        """Look up a professor by last name, print it and return it.

        The match is also remembered in ``self.indexnumber``.

        Raises:
            ProfessorNotFound: if no professor has that last name.
        """
        self.indexnumber = self.get_professor_by_last_name(ProfessorName)
        self.print_professor_info()
        return self.indexnumber

    def print_professor_info(self):
        """Print the professor selected by the last ``search_professor`` call."""
        if self.indexnumber is False:
            print("No professor selected; call search_professor first.")
        else:
            print(self.indexnumber)

    def get_professor_by_last_name(
        self, last_name
    ) -> "Professor":
        '''
        Return the first professor with the matching last name.
        Case insensitive.

        Raises ProfessorNotFound when nothing matches.
        '''
        last_name = last_name.lower()
        for professor in self.professors.values():
            if last_name == professor.last_name.lower():
                return professor

        # Raise error if no matching professor found
        raise ProfessorNotFound(last_name, "Last Name")

    def WriteProfessorListToCSV(self):
        """Write the raw professor records to ``SchoolID_<id>.csv``.

        The file is created in the current working directory. Records holding
        a key outside ``csv_columns`` make ``csv.DictWriter`` raise ValueError.
        """
        csv_columns = [
            "tDept",
            "tSid",
            "institution_name",
            "tFname",
            "tMiddlename",
            "tLname",
            "tid",
            "tNumRatings",
            "rating_class",
            "contentType",
            "categoryType",
            "overall_rating",
        ]
        csv_file = "SchoolID_" + str(self.UniversityId) + ".csv"
        # newline="" lets the csv module control line endings (otherwise blank
        # rows appear on Windows); utf-8 keeps non-ASCII names writable.
        with open(csv_file, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            writer.writerows(self.professorlist)

    def create_reviews_list(self, tid):
        """Return every review (raw dicts) of the professor with id ``tid``."""
        tempreviewslist = []
        num_of_reviews = self.get_num_of_reviews(tid)
        # RMP only loads 20 reviews per page,
        # so num_of_pages tells us how many pages we need to get all the reviews
        num_of_pages = math.ceil(num_of_reviews / self._PAGE_SIZE)
        for i in range(1, num_of_pages + 1):
            page = requests.get(
                "https://www.ratemyprofessors.com/paginate/professors/ratings?tid="
                + str(tid)
                + "&filter=&courseCode=&page="
                + str(i)
            )
            temp_jsonpage = json.loads(page.content)
            tempreviewslist.extend(temp_jsonpage["ratings"])
        return tempreviewslist

    def get_num_of_reviews(self, id):
        """Return the number of reviews for the professor ``id``.

        Computed as the first page's ``remaining`` field plus one page (20).
        """
        page = requests.get(
            "https://www.ratemyprofessors.com/paginate/professors/ratings?tid="
            + str(id)
            + "&filter=&courseCode=&page=1"
        )
        temp_jsonpage = json.loads(page.content)
        return temp_jsonpage["remaining"] + self._PAGE_SIZE

    def WriteReviewsListToCSV(self, rlist, tid):
        """Write the reviews ``rlist`` to ``SchoolID_<id>/TeacherID_<tid>.csv``.

        Args:
            rlist: iterable of review dicts, e.g. from ``create_reviews_list``.
            tid: professor id, used only to build the file name.
        """
        csv_columns = [
            "attendance",
            "clarityColor",
            "easyColor",
            "helpColor",
            "helpCount",
            "id",
            "notHelpCount",
            "onlineClass",
            "quality",
            "rClarity",
            "rClass",
            "rComments",
            "rDate",
            "rEasy",
            "rEasyString",
            "rErrorMsg",
            "rHelpful",
            "rInterest",
            "rOverall",
            "rOverallString",
            "rStatus",
            "rTextBookUse",
            "rTimestamp",
            "rWouldTakeAgain",
            "sId",
            "takenForCredit",
            "teacher",
            "teacherGrade",
            "teacherRatingTags",
            "unUsefulGrouping",
            "usefulGrouping",
        ]
        csv_file = (
            "SchoolID_" + str(self.UniversityId) + "/TeacherID_" + str(tid) + ".csv"
        )
        # newline="" lets the csv module control line endings; utf-8 keeps
        # free-text review comments writable regardless of the OS default.
        with open(csv_file, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            writer.writerows(rlist)

## example
# uci = RateMyProfApi(1074)

In [None]:
# Functions

def get_ratemyprof_sid(school_name):
    """Search RateMyProfessors for a school and return its top result link.

    Args:
        school_name: name to search for; converted with ``str()``.

    Returns:
        The ``href`` of the first anchor on the search page that contains
        ``"school?sid"``, or ``None`` when the page has no such link.
    """
    import requests
    from urllib.parse import quote
    from bs4 import BeautifulSoup

    # Percent-encode the whole name: characters such as "&", "#" or "+"
    # (e.g. "Texas A&M University") would otherwise truncate or alter the query.
    url = "https://www.ratemyprofessors.com/search/schools?query=" + quote(str(school_name), safe="")
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")

    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if "school?sid" in str(href):
            return href
    return None

def get_prof_review_list(school, x):
    """Best-effort fetch of one professor's reviews.

    Args:
        school: object exposing ``create_reviews_list(tid)``.
        x: the professor id passed to ``create_reviews_list``.

    Returns:
        Whatever ``school.create_reviews_list(x)`` returns, or ``None`` if it
        raised an ordinary exception.
    """
    try:
        return school.create_reviews_list(x)
    except Exception:
        # Deliberately best-effort, but only for regular errors:
        # KeyboardInterrupt / SystemExit must still stop a long scrape.
        return None

def get_prof_reviews(professor_tid, school_sid, school):
    """Return one professor's reviews as a DataFrame tagged with both ids.

    Returns ``None`` unchanged when ``get_prof_review_list`` yields ``None``;
    otherwise the reviews are flattened with ``pd.json_normalize`` and the
    columns ``professor_tid`` and ``school_sid`` are added.
    """
    raw_reviews = get_prof_review_list(school, professor_tid)

    # Nothing could be fetched for this professor: hand the None back as-is.
    if raw_reviews is None:
        return raw_reviews

    review_frame = pd.json_normalize(raw_reviews)
    review_frame["professor_tid"] = professor_tid
    review_frame["school_sid"] = school_sid
    return review_frame

def get_school_reviews(school_sid):
    """Scrape every professor review of one school into a single DataFrame.

    Args:
        school_sid: RateMyProfessors school id.

    Returns:
        The concatenation of all per-professor review frames, or an empty
        DataFrame when no professor produced any frame.
    """
    # Create a School's object and scrape the professors at the school
    school = RateMyProfApi(school_sid, prnt = False)

    # Scrape the reviews for each professor. Failed professors come back as
    # None and are dropped here: pd.concat raises ValueError when every
    # object it receives is None.
    review_frames = [
        frame
        for frame in (
            get_prof_reviews(professor_tid, school_sid, school)
            for professor_tid in school.professors
        )
        if frame is not None
    ]

    if review_frames:
        return pd.concat(review_frames)
    return pd.DataFrame()

    

In [None]:
# Data Import

import pandas as pd


# Read the input table from the data folder; the first CSV column becomes the index.
df = pd.read_csv(
    filepath_or_buffer = fold_pth + input_data_name,
    index_col = 0,
    low_memory = False,
)


In [None]:
# Get the School's RateMyProfessor ID by searching RMP and pulling the SID for the top result.
# One HTTP search is issued per distinct institution name, so this cell is slow.

rmp_df = pd.DataFrame(
    [[school_name, get_ratemyprof_sid(school_name)] for school_name in df.INSTNM.drop_duplicates().tolist()],
    columns = ["INSTNM", "ratemyprof_sid"],
)

# Keep only schools with a search hit. The .copy() makes rmp_df an independent
# frame so the column assignment below does not write into a slice
# (SettingWithCopyWarning).
rmp_df = rmp_df[rmp_df["ratemyprof_sid"].notna()].copy()

# The link has the form "...school?sid=<id>"; keep the part after the first "=".
rmp_df["sid"] = [x.split("=")[1] for x in rmp_df.ratemyprof_sid]

# reset_index(drop = True) discards the old index whatever its name is;
# reset_index().drop("index", ...) fails with KeyError when the index is named.
df = pd.merge(left = df.reset_index(drop = True),
              right = rmp_df.reset_index(drop = True),
              how = "left",
              on = "INSTNM"
             )
# df.to_csv(data_pth) 

In [None]:
# Data Subset to include only GA schools

# The merge names the RMP id column "sid_y" only when df already carried a
# "sid" column beforehand (input file or a repeated merge); otherwise it is
# plain "sid". Accept both so the cell also runs on a fresh kernel.
# NOTE(review): confirm which of the two cases the input CSV produces.
sid_col = "sid_y" if "sid_y" in df.columns else "sid"

subdf = df.loc[df[sid_col].notna()]

subdf = subdf.rename(columns = {sid_col: "sid"})

# Number of distinct schools with an RMP id, before the state filter.
print(len(subdf.sid.drop_duplicates().tolist()))

subdf = subdf[subdf["STABBR"] == "GA"]

# Number of distinct GA schools with an RMP id.
print(len(subdf.sid.drop_duplicates().tolist()))


In [11]:
# Scrape Rate My Professor

# Kennesaw State is scraped first. Its id is cast to int so that it compares
# equal to the int ids in supdf_sid_list; left as a string it never matched
# and the school was scraped a second time inside the loop.
ksu_sid = int(rmp_df[rmp_df["INSTNM"] == "Kennesaw State University"].sid.iloc[0])
school_results = get_school_reviews(ksu_sid)

# Number of schools already processed; raise it by hand to resume a run
# further down the list.
coun_ter = 0

supdf_sid_list = [int(x) for x in subdf.sid.drop_duplicates().tolist()[coun_ter:]]

for school_sid in supdf_sid_list:

    if school_sid != ksu_sid:
        temp = get_school_reviews(school_sid)
        # Grown incrementally on purpose: school_results stays usable in the
        # kernel if the scrape is interrupted part-way.
        school_results = pd.concat([school_results, temp])
        coun_ter = coun_ter + 1

        # Checkpoint after every 5th scraped school.
        if coun_ter % 5 == 0:
            print("CHECKPOINT")
            print("NUMBER OF SCHOOLS LEFT:", len(supdf_sid_list) - coun_ter)
            print("NUMBER OF REVIEWS FOUND FOR MOST RECENT SCHOOL:", temp.shape)
            print("NUMBER OF REVIEWS FOUND TOTAL:", school_results.shape)
            school_results.to_csv("ratemyprof_reviews_IMPORTANT.csv")

# Scrape Rate My Professor - Alternative

# school_results = pd.concat([get_school_reviews(school_sid) for school_sid in [int(x) for x in subdf.sid.drop_duplicates().tolist()[coun_ter:]]])

In [None]:
# Data Save
# Write the combined review table to the data folder as CSV.

school_results.to_csv(path_or_buf = fold_pth + output_data_name)