In [1]:
import requests
    
r = requests.get('https://www.ratemyprofessors.com/professor/2936075')
print(r.text)



    <!DOCTYPE html>
    <!-- SSR -->
    <html >
      <head>
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <meta name="theme-color" content="#000000" />
        <meta name="thumbnail" content="https://www.ratemyprofessors.com/build/thumbnail.svg" />

        

        <link rel="manifest" href="/build/manifest.json">
        <link rel="stylesheet" type="text/css" href="/static/css/main.1773c5b7.css">

        <!-- Nobid Prebid Wrapper -->
        <script async id="nobid-wrapper" src="https://public.servenobid.com/partner/66897/56514/wrapup_1.0.2.js"></script>
        <script>
          window.wrapup = window.wrapup || { cmd: [] };
        </script>

        <link data-react-helmet="true" rel="icon" href="/favicons/favicon-16.png" sizes/><link data-react-helmet="true" rel="icon" href="/favicons/favicon-32.png" sizes="32×32"/><link data-react-helmet="true" rel="apple-touch-icon" href="/favicons/favicon-57.png" sizes/><link data-react-helmet="tru

In [5]:
import json
import re
import pandas as pd

def extract_reviews_from_html(html_content):
    # Find the JavaScript data object that contains the reviews
    data_match = re.search(r'window\.__RELAY_STORE__ = ({.*?});', html_content, re.DOTALL)
    if not data_match:
        print("No data found in HTML")
        return []
    
    try:
        # Parse the JavaScript object
        data = json.loads(data_match.group(1))
        
        reviews = []
        # Look for rating objects in the data
        # Print all possible keys from the first rating found
        for key, value in data.items():
            if isinstance(value, dict) and value.get('__typename') == 'Rating':
                print("All keys in first rating:")
                for k in value.keys():
                    print(f"- {k}")
                
                review = {
                    'comment': value.get('comment'),
                    'date': value.get('date'), 
                    'class': value.get('class'),
                    'grade': value.get('grade'),
                    'difficulty': value.get('difficultyRating'),
                    'quality': value.get('helpfulRating'),
                    'tags': value.get('ratingTags'),
                    'would_take_again': value.get('wouldTakeAgain')
                }
                reviews.append(review)
                break  # Only need to print keys from first rating
                
        # Continue collecting all reviews
        for key, value in data.items():
            if isinstance(value, dict) and value.get('__typename') == 'Rating':
                review = {
                    'comment': value.get('comment'),
                    'date': value.get('date'),
                    'class': value.get('class'),
                    'grade': value.get('grade'),
                    'difficulty': value.get('difficultyRating'),
                    'quality': value.get('helpfulRating'),
                    'tags': value.get('ratingTags'),
                    'would_take_again': value.get('wouldTakeAgain')
                }
                reviews.append(review)
        
        return reviews
        
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON data: {e}")
        return []

# Get reviews from your HTML content
reviews = extract_reviews_from_html(r.text)

# Convert to DataFrame
df = pd.DataFrame(reviews)

# Basic cleaning
df['date'] = pd.to_datetime(df['date'].str.split(' ').str[0])
df = df.sort_values('date', ascending=False)

# Save to CSV
#df.to_csv('professor_reviews.csv', index=False)

# Print summary
print(f"Extracted {len(reviews)} reviews")
print("\nMost recent reviews:")
print(df[['date', 'class', 'difficulty', 'quality']].head())


All keys in first rating:
- __id
- __typename
- comment
- flagStatus
- createdByUser
- teacherNote
- legacyId
- date
- class
- helpfulRating
- clarityRating
- isForOnlineClass
- difficultyRating
- attendanceMandatory
- wouldTakeAgain
- grade
- textbookUse
- isForCredit
- ratingTags
- id
- adminReviewedAt
- thumbsUpTotal
- thumbsDownTotal
- thumbs
Extracted 21 reviews

Most recent reviews:
        date       class  difficulty  quality
0 2025-01-15       CS575           5        1
1 2025-01-15       CS575           5        1
2 2024-12-21       CS575           5        1
3 2024-12-11       CS575           5        1
4 2024-11-18  COMPSCI240           5        1


In [3]:
import requests
import json
import pandas as pd
import time

import base64

def get_all_reviews(professor_id):
    headers = {
        'Authorization': 'Basic dGVzdDp0ZXN0',
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    # Encode the professor_id in base64
    encoded_professor_id = base64.b64encode(f"Teacher-{professor_id}".encode()).decode()
    
    payload = {
        "query": "query GetTeacherRatings($id: ID!) { node(id: $id) { ... on Teacher { id firstName lastName school { name } ratings(first: 1000) { edges { node { comment class difficultyRating qualityRating createdByUser attendanceMandatory wouldTakeAgain isForCredit textbookUse date } } pageInfo { hasNextPage endCursor } } } } }",
        "variables": {
            "id": encoded_professor_id
        }
    }
    
    response = requests.post(
        "https://www.ratemyprofessors.com/graphql",
        json=payload,
        headers=headers
    )
    
    if response.status_code != 200:
        print(f"Error: Status code {response.status_code}")
        print(response.text)
        return None
        
    data = response.json()
    
    # Check for errors in the response
    if 'errors' in data:
        print("GraphQL Errors:", data['errors'])
        return None
        
    node_list = data['data']['node']['ratings']['edges']

    rev_list = []
    for node in node_list:
        node['node']['tid'] = professor_id
        rev_list.append(node['node'])
    return rev_list

# Test the function
professor_id = "2936075"  # Mark Wilson's ID
result = get_all_reviews(professor_id)
print(json.dumps(result, indent=2))


[
  {
    "attendanceMandatory": "",
    "class": "CS575",
    "comment": "Didn't offer a make-up exam for the midterm, which made the final exam 50% of my grade. The final exam was extremely hard, he didn't offer any regrade requests for the final, and closed the Piazza almost immediately after the final so we couldn't dispute anything. Allowing him to teach a CS course again was a huge mistake for the CS department.",
    "createdByUser": false,
    "date": "2025-01-15 21:34:45 +0000 UTC",
    "difficultyRating": 5,
    "isForCredit": true,
    "qualityRating": 1,
    "textbookUse": null,
    "wouldTakeAgain": null,
    "tid": "2936075"
  },
  {
    "attendanceMandatory": "mandatory",
    "class": "CS575",
    "comment": "If you can avoid him, please do. As said before, he is extremely condescending and lacking in patience when answering questions. He will call a new examinable concept simple or powerful or both, then not explain it. He will be unable to finish examples he's supposed

In [4]:
def get_umass_professors(school_id="U2Nob29zLTE1MTM="):  # Base64 encoded "School-1513"
    headers = {
        'Authorization': 'Basic dGVzdDp0ZXN0',
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    query = {
        "query": """
        query NewSearch($query: TeacherSearchQuery!) {
          newSearch {
            teachers(query: $query, first: 1000) {
              edges {
                node {
                  id
                  firstName
                  lastName
                  department
                }
              }
              pageInfo {
                hasNextPage
                endCursor
              }
            }
          }
        }
        """,
        "variables": {
            "query": {
                "text": "",
                "schoolID": school_id,
                "fallback": False,
                "departmentID": None
            }
        }
    }
    
    response = requests.post(
        "https://www.ratemyprofessors.com/graphql",
        json=query,
        headers=headers
    )
    
    if response.status_code == 200:
        data = response.json()
        professors = []
        
        if 'data' in data and 'newSearch' in data['data']:
            try: 
                for edge in data['data']['newSearch']['teachers']['edges']:
                    prof = edge['node']
                    # Check if 'id' is in the expected format (Base64 encoded) and decode it
                    if 'id' in prof:
                        try:
                            decoded_id = base64.b64decode(prof['id']).decode('utf-8')
                            professors.append({
                                'id': decoded_id.split('-')[1],  # Remove the "Teacher-" prefix
                                'name': f"{prof['firstName']} {prof['lastName']}",
                                'department': prof['department']
                            })
                        except Exception as e:
                            print(f"Error decoding ID for professor: {prof}, Error: {e}")
                    else:
                        print(f"Unexpected ID format for professor: {prof}")
            except Exception as e:
                print(f"Error processing data: {e}")

            print(f"Found {len(professors)} professors")
            return professors
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

# Get all UMass professors
professors = get_umass_professors()

# Print first few professors to verify
if professors:
    print("\nFirst 5 professors:")
    for prof in professors[:5]:
        print(f"ID: {prof['id']}, Name: {prof['name']}, Department: {prof['department']}")
    
    # Save to CSV for future use
    df = pd.DataFrame(professors)

professor_dict = {prof['id']: prof for prof in professors}


Found 1000 professors

First 5 professors:
ID: 203815, Name: John Bickford, Department: Psychology
ID: 77120, Name: Randall Phillis, Department: Biology
ID: 1621419, Name: Laura Francis, Department: Biology
ID: 192549, Name: Joanna Jeneralczuk, Department: Mathematics
ID: 1617241, Name: Chris McDaniel, Department: Chemistry


In [6]:
ids = [id['id'] for id in professors]
reviews = []

if 'reviews' not in locals() or reviews == []:
    reviews = []
    total_reviews = 0
    teacher_count = 0
    for id in ids:
        print(id)
        new_reviews = get_all_reviews(id)
        print(len(new_reviews))
        if new_reviews:
            reviews.append(new_reviews)
            total_reviews += len(new_reviews)
            teacher_count += 1
        
        time.sleep(10)  # Wait for 10 second between each review to help the server from robot.txt
    print(f"Total teachers processed: {teacher_count}")

# Reduce reviews to get the length
reviews_length = [len(review) for review in reviews]
print(f"Total number of reviews collected: {sum(reviews_length)}")
print(f"Length of each review: {reviews_length}")


203815
449
77120
388
1621419
370
192549
323
1617241
298
47704
280
887526
265
837070
261
1918813
269
2385263
257
752484
260
385698
240
287874
236
83019
237
82675
230
92488
223
83082
218
7541
202
1847647
202
2118960
195
1661417
191
59327
184
1412623
188
227364
184
101901
177
84404
176
82618
171
545173
173
92458
170
2416008
169
83243
168
98567
162
841483
162
1619359
163
77072
155
102012
151
92516
151
82779
148
82439
145
83040
145
193928
145
1760754
145
191694
143
1325070
134
2182132
140
84356
131
83050
131
191977
130
101369
126
64862
126
88142
124
100144
124
59525
121
935860
121
82723
120
2010156
123
1552014
114
82760
113
92440
112
2176331
117
2409689
108
79172
108
81706
108
305006
108
2086611
110
2936075
108
99943
107
83187
105
83087
106
1694321
104
1236721
102
2291200
104
1959208
100
84876
99
523814
99
1049909
100
53084
98
2272057
105
2420066
101
73927
98
99927
97
83126
96
82730
95
1124632
93
82462
92
82696
91
1769629
89
82438
88
88251
88
1689577
90
81745
89
84388
87
82435
87
75122
86
8

In [7]:

import pandas as pd

# Flatten the list of reviews and create a DataFrame
flattened_reviews = [review for sublist in reviews for review in sublist]
reviews_df = pd.DataFrame(flattened_reviews)

# Print the DataFrame to verify
print(reviews_df.head())

# View memory usage of the DataFrame
memory_usage_mb = reviews_df.memory_usage(deep=True).sum() / (1024 * 1024)  # Convert bytes to MB
print(f"Memory usage of reviews DataFrame: {memory_usage_mb:.2f} MB")

reviews_df.to_csv('RateMyProfAnalysis/UMassReviews/reviews.csv', index=False)

  attendanceMandatory     class  \
0       non mandatory  PSYCH100   
1       non mandatory  PSYCH100   
2       non mandatory  PSYCH370   
3       non mandatory  PSYCH100   
4       non mandatory       100   

                                             comment  createdByUser  \
0  Bickford was easily my favorite teacher. His l...          False   
1  He might just be the most relatable and humoro...          False   
2  exams are in person but are open note so just ...          False   
3  hands down the BEST professor i've had here so...          False   
4  Super funny guy, lectures are great. Gives LOT...          False   

                            date  difficultyRating  isForCredit  \
0  2025-01-13 18:51:48 +0000 UTC                 2         True   
1  2024-12-24 23:27:46 +0000 UTC                 2         True   
2  2024-12-16 02:54:06 +0000 UTC                 1         True   
3  2024-12-09 23:44:58 +0000 UTC                 2         True   
4  2024-12-07 00:00:23 +000

In [8]:
# Check for nulls and NaNs in the reviews DataFrame
nulls = reviews_df.isnull().sum()
print("Null values in reviews DataFrame:\n", nulls)

# Check for duplicates in the reviews DataFrame
duplicates = reviews_df.duplicated().sum()
print(f"Number of duplicate reviews: {duplicates}")

# Optionally, drop duplicates if needed
reviews_df = reviews_df.drop_duplicates()


Null values in reviews DataFrame:
 attendanceMandatory        0
class                      0
comment                    0
createdByUser              0
date                       0
difficultyRating           0
isForCredit                0
qualityRating              0
textbookUse             9596
wouldTakeAgain         29174
tid                        0
dtype: int64
Number of duplicate reviews: 1


In [9]:
# Populate the existing professors dictionary
for review in flattened_reviews:
    tid = review['tid']
    quality_rating = review['qualityRating']
    difficulty_rating = review['difficultyRating']
    classs = review['class']  # Assuming 'class' is a field in the review

    # Update total reviews count
    if 'total_reviews_count' not in professor_dict[tid]:
        professor_dict[tid]['total_reviews_count'] = 0

    professor_dict[tid]['total_reviews_count'] += 1

    # Update average quality rating
    if 'average_quality_rating' not in professor_dict[tid]:
        professor_dict[tid]['average_quality_rating'] = 0
    professor_dict[tid]['average_quality_rating'] += quality_rating

    # Update average difficulty rating
    if 'average_difficulty_rating' not in professor_dict[tid]:
        professor_dict[tid]['average_difficulty_rating'] = 0
    professor_dict[tid]['average_difficulty_rating'] += difficulty_rating

    # Update classes taught
    if 'classes_taught' not in professor_dict[tid]:
        professor_dict[tid]['classes_taught'] = []
    if classs not in professor_dict[tid]['classes_taught']:
        professor_dict[tid]['classes_taught'].append(classs)

# Calculate the average ratings, wont work with subset of professors
for tid in professor_dict:
    if professor_dict[tid]['total_reviews_count'] > 0:
        professor_dict[tid]['average_quality_rating'] /= professor_dict[tid]['total_reviews_count']
        professor_dict[tid]['average_difficulty_rating'] /= professor_dict[tid]['total_reviews_count']
    professor_dict[tid]['classes_taught'] = list(professor_dict[tid]['classes_taught'])  # Convert set to list

# Convert the dictionary to a DataFrame
professors_df = pd.DataFrame.from_dict(professor_dict, orient='index').reset_index()
professors_df.rename(columns={'index': 'tid'}, inplace=True)
professors_df.drop('tid', axis=1, inplace=True)


# Save the professors DataFrame to a CSV file
professors_df.to_csv('RateMyProfAnalysis/UMassReviews/professors.csv', index=False)


KeyError: "['tid'] not found in axis"

In [11]:
#TODO: Get more reviews from more colleges and then clean and post on kaggle

       tid       id                name   department  total_reviews_count  \
0   203815   203815       John Bickford   Psychology                  449   
1    77120    77120     Randall Phillis      Biology                  388   
2  1621419  1621419       Laura Francis      Biology                  370   
3   192549   192549  Joanna Jeneralczuk  Mathematics                  323   
4  1617241  1617241      Chris McDaniel    Chemistry                  298   

   average_quality_rating  average_difficulty_rating  \
0                4.077951                   2.031180   
1                3.920103                   3.298969   
2                2.659459                   4.000000   
3                3.780186                   3.142415   
4                2.104027                   4.342282   

                                      classes_taught  
0  [PSYCH100, PSYCH370, 100, PSCYH100, PERSONALIT...  
1  [BIO151, 151, 100, BIOLOGY151, BIO484, 101, IN...  
2  [161H, BIO151, 285, BIO285, 151,