In [170]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [171]:
df = pd.read_csv("sjsu_rmp.csv")
df.head()

Unnamed: 0,professor_first_name,professor_last_name,professor_department,avg_difficulty,avg_rating,num_ratings,would_take_again_percent,comment,difficulty_rating,grade,clarity_rating,rating_tags
0,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,Eric is one of the best professors I have ever...,1,A+,5,Respected--Inspirational--Amazing lectures
1,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,Great professor taught me a lot about microchi...,4,A,5,Inspirational--Hilarious--Amazing lectures
2,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,"Amazing Lectures, inspiring speeches, takes th...",3,A,5,Gives good feedback--Respected--Get ready to read
3,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,EE178 with Professor Crabill is a 100% practic...,3,Not sure yet,5,Gives good feedback--Clear grading criteria
4,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,Crabill is a perfect example of how a great pr...,2,A,4,


In [172]:
df = df.drop(columns=["comment", "rating_tags"])
df = df.dropna()
df.head()

Unnamed: 0,professor_first_name,professor_last_name,professor_department,avg_difficulty,avg_rating,num_ratings,would_take_again_percent,difficulty_rating,grade,clarity_rating
0,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,1,A+,5
1,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,4,A,5
2,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,3,A,5
3,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,3,Not sure yet,5
4,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,2,A,4


In [173]:
order = {
    "A+": 0,
    "A": 1,
    "A-": 2,
    "B+": 3,
    "B": 4,
    "B-": 5,
    "C+": 6,
    "C": 7,
    "C-": 8,
    "D": 9,
    "F": 10,
}
df = df[
    ~df["grade"].isin(
        [
            "Not sure yet",
            "Rather not say",
            "Not_Sure_Yet",
            "Audit/No Grade",
            "Rather_Not_Say",
            "Pass",
            "Fail",
            "Audit/No_Grade",
        ]
    )
]
df["grade_encoded"] = df["grade"].map(order)
df = df.drop(columns=["grade"])
df.head()

Unnamed: 0,professor_first_name,professor_last_name,professor_department,avg_difficulty,avg_rating,num_ratings,would_take_again_percent,difficulty_rating,clarity_rating,grade_encoded
0,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,1,5,0.0
1,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,4,5,1.0
2,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,3,5,1.0
4,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,2,4,1.0
5,Eric,Crabill,Electrical Engineering,2.8,4.9,14,100.0,4,4,2.0


In [174]:
df["professor_name"] = df["professor_first_name"] + " " + df["professor_last_name"]
prof_counts = df["professor_name"].value_counts()
df["professor_name"] = df["professor_name"].where(
    prof_counts[df["professor_name"]] >= 10, "Other"
)
df = df.drop(columns=["professor_first_name", "professor_last_name"])
df = pd.get_dummies(df, columns=["professor_department", "professor_name"], dtype=int)

df.head()

Unnamed: 0,avg_difficulty,avg_rating,num_ratings,would_take_again_percent,difficulty_rating,clarity_rating,grade_encoded,professor_department_Accounting,professor_department_Accounting Finance,professor_department_Accounting & Finance,...,professor_department_Special Education,professor_department_Technology,professor_department_Theater,professor_department_University Studies,professor_department_Urban & Regional Planning,professor_department_Urban Planning,professor_department_Vietnamese,professor_department_Women's Studies,professor_department_Writing,professor_name_Other
0,2.8,4.9,14,100.0,1,5,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2.8,4.9,14,100.0,4,5,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2.8,4.9,14,100.0,3,5,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2.8,4.9,14,100.0,2,4,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,2.8,4.9,14,100.0,4,4,2.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [175]:
x = df.drop(columns=["difficulty_rating"])
y = df["difficulty_rating"]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)
rf_regressor = RandomForestRegressor(n_estimators=200, random_state=42)
rf_regressor.fit(x_train, y_train)
y_pred = rf_regressor.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.8795136412748857
R^2 Score: 0.4103585970796281
