In [None]:
import pandas as pd

import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df =  pd.read_csv("/content/drive/MyDrive/Data/sampled_professor_reviews.csv")
df.head()

Unnamed: 0,__typename_Rating,adminReviewedAt_Rating,attendanceMandatory_Rating,clarityRating_Rating,class_Rating,comment_Rating,createdByUser_Rating,date_Rating,difficultyRating_Rating,flagStatus_Rating,...,thumbs_Rating,thumbsDownTotal_Rating,thumbsUpTotal_Rating,wouldTakeAgain_Rating,ProfessorName,Department,SchoolName,LegacyId,NumRatings,predictedGender
0,Rating,2008-01-29 23:49:03 +0000 UTC,,5.0,INDV101,she very passionate about what she teaches but...,False,2007-12-12 18:40:36 +0000 UTC,1.0,UNFLAGGED,...,[],0.0,0.0,,Amy Fountain,Languages,University of Arizona,644785,199,Female
1,Rating,2013-03-29 00:55:18 +0000 UTC,,4.0,ECE6040,a very knowledgeable professor her accent may ...,False,2013-03-29 00:25:18 +0000 UTC,3.0,UNFLAGGED,...,[],0.0,0.0,,Vesna Zderic,Engineering,George Washington University,1760280,5,Female
2,Rating,,,2.0,UGC211,bad teacher he demeans students whom he does n...,False,2004-10-24 15:09:48 +0000 UTC,1.0,UNFLAGGED,...,[],0.0,0.0,,David Johnson,Languages,University at Buffalo (SUNY Buffalo),140727,23,Male
3,Rating,2021-12-09 14:08:35 +0000 UTC,non mandatory,5.0,HDFS3430,a very respectable and knowledgeable professor...,False,2021-12-06 17:36:30 +0000 UTC,2.0,UNFLAGGED,...,[],0.0,0.0,1.0,Russell Ravert,Human Development,University of Missouri - Columbia,1698828,3,Male
4,Rating,2012-11-07 18:40:56 +0000 UTC,,2.0,BUSX170,completely boring and likes to ramble on will ...,False,2012-11-07 17:20:33 +0000 UTC,3.0,UNFLAGGED,...,[{'computerId': 'C1529A92404BFE8EA2371E96DAE13...,0.0,5.0,,Paul Coulis,Business,Indiana University Bloomington,1197121,13,Male


In [None]:
# Take a random sample of 10,000 rows from the dataset
sampled_df = df.sample(n=10000, random_state=42)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def preprocess(text):
    return tokenizer(text, padding=True, truncation=True, return_tensors="pt")

def encode_text(text):
    inputs = preprocess(text)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Create a utility function to generate professor report
def generate_professor_report(course_code, top_n=5):
    # Filter the dataset for the given course code
    course_reviews = sampled_df[sampled_df['class_Rating'] == course_code]

    # Encode the comments
    course_reviews['encoded_comments'] = course_reviews['comment_Rating'].apply(encode_text)

    # Aggregate the encoded comments by professor
    professor_embeddings = course_reviews.groupby('ProfessorName')['encoded_comments'].apply(np.mean).reset_index()

    # Calculate the cosine similarity between professor embeddings
    embeddings = np.stack(professor_embeddings['encoded_comments'].values)
    similarities = cosine_similarity(embeddings)

    # Find the top N similar professors
    similar_professors = pd.DataFrame(similarities, index=professor_embeddings['ProfessorName'], columns=professor_embeddings['ProfessorName'])

    # Get the top N professors for the course
    top_professors = similar_professors.mean(axis=1).sort_values(ascending=False).head(top_n).index.tolist()

    # Generate the report
    report = sampled_df[sampled_df['ProfessorName'].isin(top_professors)]

    return report[['ProfessorName', 'SchoolName', 'Department', 'comment_Rating']]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import re
# Remove identifying columns
df_cleaned = sampled_df.drop(columns=['ProfessorName', 'predictedGender'], errors='ignore')

def remove_pronouns(text):
    if isinstance(text, str):  # Check if the input is a string
        pronouns = r'\b(he|she|him|her|his|hers|himself|herself|guy|man|woman|girl|this|that)\b'
        return re.sub(pronouns, '', text, flags=re.IGNORECASE)
    return text  # If it's not a string (e.g., NaN), return it unchanged

# Apply the function to remove pronouns from comment_Rating
df_cleaned['comment_Rating'] = df_cleaned['comment_Rating'].apply(remove_pronouns)

df_cleaned = pd.DataFrame(df_cleaned)
df_cleaned.head()

Unnamed: 0,__typename_Rating,adminReviewedAt_Rating,attendanceMandatory_Rating,clarityRating_Rating,class_Rating,comment_Rating,createdByUser_Rating,date_Rating,difficultyRating_Rating,flagStatus_Rating,...,teacherNote_Rating,textbookUse_Rating,thumbs_Rating,thumbsDownTotal_Rating,thumbsUpTotal_Rating,wouldTakeAgain_Rating,Department,SchoolName,LegacyId,NumRatings
6252,Rating,2019-03-13 19:46:47 +0000 UTC,non mandatory,5.0,ACCT215,is really funny and the lectures are not bori...,False,2019-03-13 19:46:15 +0000 UTC,2.0,UNFLAGGED,...,,5.0,[],0.0,0.0,1.0,Business,Iowa State University,1754338,144
4684,Rating,2013-01-24 22:23:42 +0000 UTC,,5.0,COM323,teacher was was great made the class fly by ...,False,2013-01-24 18:10:33 +0000 UTC,1.0,UNFLAGGED,...,,3.0,[{'computerId': '9FD4833FF016AF42C26E0DCF31158...,1.0,0.0,,Communication,University of Alabama,1794489,1
1731,Rating,,,5.0,GLY1000,funny teacher if you go to class it is easy be...,False,2004-11-23 02:02:13 +0000 UTC,1.0,UNFLAGGED,...,,,[],0.0,0.0,,Geology,Florida State University,406776,69
4742,Rating,2023-12-22 00:55:44 +0000 UTC,mandatory,5.0,ACC210,great professor has a slight accent but as lon...,False,2023-12-22 00:55:28 +0000 UTC,3.0,UNFLAGGED,...,,-1.0,[],0.0,0.0,1.0,Accounting,University of Hawaii at Manoa,1094192,45
4521,Rating,2014-12-30 12:54:14 +0000 UTC,,3.0,SPEAV261,professor morris is a great i ended up with a...,False,2014-12-23 01:47:51 +0000 UTC,2.0,UNFLAGGED,...,,1.0,"[{'computerId': 'upw52ksxp7x2pzo674y8ydwi', 'i...",0.0,0.0,,Public Environmental Affairs,Indiana University Bloomington,1969228,2


In [None]:
df_professor_info = sampled_df[['ProfessorName', 'predictedGender', 'SchoolName', 'Department', 'comment_Rating']]
df_professor_info['comment_Rating'] = df_professor_info['comment_Rating'].apply(remove_pronouns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_professor_info['comment_Rating'] = df_professor_info['comment_Rating'].apply(remove_pronouns)


In [None]:
# Example usage
course_code = 'INDV101'
report = generate_professor_report(course_code)
print(f"Professor report for the course {course_code}:")
print(report)

Professor report for the course INDV101:
        ProfessorName             SchoolName  Department  \
0        Amy Fountain  University of Arizona   Languages   
3259  Suzanne Delaney  University of Arizona  Psychology   

                                         comment_Rating  
0     she very passionate about what she teaches but...  
3259  structure of mind and behavior with delaney is...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  course_reviews['encoded_comments'] = course_reviews['comment_Rating'].apply(encode_text)


In [None]:
pd.set_option('display.max_colwidth', None)
report = generate_professor_report(course_code)
print(f"Professor report for the course {course_code}:")
display(report)

Professor report for the course INDV101:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  course_reviews['encoded_comments'] = course_reviews['comment_Rating'].apply(encode_text)


Unnamed: 0,ProfessorName,SchoolName,Department,comment_Rating
0,Amy Fountain,University of Arizona,Languages,she very passionate about what she teaches but the lectures are really boring i fell asleep all too often it a really easy class and the assignments seem kind of pointless but it not a terrible class super easy and super boring
3259,Suzanne Delaney,University of Arizona,Psychology,structure of mind and behavior with delaney is probably the best general education class that you can take here you can tell that she really loves the topic and is a very engaging lecturer she also loves to show movie clips the class was incredibly easy whether you take the honors section or not she certainly sparked my interest


In [None]:
# Take a random sample of 10,000 rows from the dataset
sampled_df = df.sample(n=10000, random_state=42)

In [9]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model for distilgpt2
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Function to get gender prediction from distilgpt2
def predict_gender(name, comment):
    prompt = f"Based on the following professor's name and comment, predict their likely gender:\n\nProfessor Name: {name}\nComment: {comment}\n\nWhat is the most likely gender of this professor? (Male, Female, Unknown)"

    # Encode the input and generate the model's response
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id)

    # Decode the output to get the predicted gender
    predicted_gender = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Parse output to capture only the expected gender response (Male, Female, Unknown)
    for gender in ["Male", "Female", "Unknown"]:
        if gender in predicted_gender:
            return gender
    return "Unknown"  # Default if no match is found

# Assuming df_professor_info is defined and has the necessary columns
df_professor_info['predictedGender_distilgpt2'] = df_professor_info.apply(lambda row: predict_gender(row['ProfessorName'], row['comment_Rating']), axis=1)

# Compare distilgpt2's predictions with the existing predicted gender
df_professor_info['GenderMatch'] = df_professor_info['predictedGender'] == df_professor_info['predictedGender_distilgpt2']

# Show the updated DataFrame with predictions and comparison
print(df_professor_info)

# Optional: Export the DataFrame with results to a CSV for further analysis
df_professor_info.to_csv('professor_gender_comparison_distilgpt2.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_professor_info['predictedGender_distilgpt2'] = df_professor_info.apply(lambda row: predict_gender(row['ProfessorName'], row['comment_Rating']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_professor_info['GenderMatch'] = df_professor_info['predictedGender'] == df_professor_info['predictedGender_distilgpt2']


         ProfessorName predictedGender  \
6252   Michael Bootsma            Male   
4684     Mallory Marsh          Female   
1731        Leroy Odom            Male   
4742     Boo Chun Jung            Male   
4521    Rodger  Morris            Male   
...                ...             ...   
5734  Jennifer Jenkins          Female   
5191       Paul Kaplan            Male   
5390   Mary Majadillas          Female   
860        Nadia Kader          Female   
7270      Beth Stephan          Female   

                                           SchoolName  \
6252                            Iowa State University   
4684                            University of Alabama   
1731                         Florida State University   
4742                    University of Hawaii at Manoa   
4521                   Indiana University Bloomington   
...                                               ...   
5734                    Grand Valley State University   
5191                    Stony Brook Uni

In [16]:
# Count the number of correct predictions
correct_predictions = df_professor_info['GenderMatch'].sum()

# Calculate the total number of predictions
total_predictions = len(df_professor_info)

# Calculate the percentage of correct predictions
accuracy_percentage = (correct_predictions / total_predictions) * 100

print(f"Percentage of correct predictions: {accuracy_percentage:.2f}%")


Percentage of correct predictions: 63.04%


In [None]:
import pandas as pd
import re


def clean_comments(comment, professor_name):

    if isinstance(comment, str) and isinstance(professor_name, str):

        cleaned_comment = re.sub(professor_name, '', comment, flags=re.IGNORECASE)
         cleaned_comment = re.sub(r'\b(he|she)\b', 'he', cleaned_comment, flags=re.IGNORECASE)


        cleaned_comment = re.sub(r'\b(his|her)\b', 'her', cleaned_comment, flags=re.IGNORECASE)


        cleaned_comment = re.sub(r'\b(he|she)\b', 'they', cleaned_comment, flags=re.IGNORECASE)


        cleaned_comment = re.sub(r'\b(his|her)\b', 'their', cleaned_comment, flags=re.IGNORECASE)

        return cleaned_comment
    return comment




sampled_df.head()


Unnamed: 0,__typename_Rating,adminReviewedAt_Rating,attendanceMandatory_Rating,clarityRating_Rating,class_Rating,comment_Rating,createdByUser_Rating,date_Rating,difficultyRating_Rating,flagStatus_Rating,...,thumbs_Rating,thumbsDownTotal_Rating,thumbsUpTotal_Rating,wouldTakeAgain_Rating,Department,SchoolName,LegacyId,NumRatings,predictedGender,cleaned_comment
6252,Rating,2019-03-13 19:46:47 +0000 UTC,non mandatory,5.0,ACCT215,he is really funny and the lectures are not bo...,False,2019-03-13 19:46:15 +0000 UTC,2.0,UNFLAGGED,...,[],0.0,0.0,1.0,Business,Iowa State University,1754338,144,Male,they is really funny and the lectures are not ...
4684,Rating,2013-01-24 22:23:42 +0000 UTC,,5.0,COM323,this teacher was was great she made the class ...,False,2013-01-24 18:10:33 +0000 UTC,1.0,UNFLAGGED,...,[{'computerId': '9FD4833FF016AF42C26E0DCF31158...,1.0,0.0,,Communication,University of Alabama,1794489,1,Female,this teacher was was great they made the class...
1731,Rating,,,5.0,GLY1000,funny teacher if you go to class it is easy be...,False,2004-11-23 02:02:13 +0000 UTC,1.0,UNFLAGGED,...,[],0.0,0.0,,Geology,Florida State University,406776,69,Male,funny teacher if you go to class it is easy be...
4742,Rating,2023-12-22 00:55:44 +0000 UTC,mandatory,5.0,ACC210,great professor has a slight accent but as lon...,False,2023-12-22 00:55:28 +0000 UTC,3.0,UNFLAGGED,...,[],0.0,0.0,1.0,Accounting,University of Hawaii at Manoa,1094192,45,Male,great professor has a slight accent but as lon...
4521,Rating,2014-12-30 12:54:14 +0000 UTC,,3.0,SPEAV261,professor morris is a great guy i ended up wit...,False,2014-12-23 01:47:51 +0000 UTC,2.0,UNFLAGGED,...,"[{'computerId': 'upw52ksxp7x2pzo674y8ydwi', 'i...",0.0,0.0,,Public Environmental Affairs,Indiana University Bloomington,1969228,2,Male,professor morris is a great guy i ended up wit...
