In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from that mount point below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the professor-review CSV from the mounted Drive and preview the first rows.
reviews_csv_path = "/content/drive/MyDrive/Data/sampled_professor_reviews.csv"
df = pd.read_csv(reviews_csv_path)
df.head()

Unnamed: 0,__typename_Rating,adminReviewedAt_Rating,attendanceMandatory_Rating,clarityRating_Rating,class_Rating,comment_Rating,createdByUser_Rating,date_Rating,difficultyRating_Rating,flagStatus_Rating,...,thumbs_Rating,thumbsDownTotal_Rating,thumbsUpTotal_Rating,wouldTakeAgain_Rating,ProfessorName,Department,SchoolName,LegacyId,NumRatings,predictedGender
0,Rating,2008-01-29 23:49:03 +0000 UTC,,5.0,INDV101,she very passionate about what she teaches but...,False,2007-12-12 18:40:36 +0000 UTC,1.0,UNFLAGGED,...,[],0.0,0.0,,Amy Fountain,Languages,University of Arizona,644785,199,Female
1,Rating,2013-03-29 00:55:18 +0000 UTC,,4.0,ECE6040,a very knowledgeable professor her accent may ...,False,2013-03-29 00:25:18 +0000 UTC,3.0,UNFLAGGED,...,[],0.0,0.0,,Vesna Zderic,Engineering,George Washington University,1760280,5,Female
2,Rating,,,2.0,UGC211,bad teacher he demeans students whom he does n...,False,2004-10-24 15:09:48 +0000 UTC,1.0,UNFLAGGED,...,[],0.0,0.0,,David Johnson,Languages,University at Buffalo (SUNY Buffalo),140727,23,Male
3,Rating,2021-12-09 14:08:35 +0000 UTC,non mandatory,5.0,HDFS3430,a very respectable and knowledgeable professor...,False,2021-12-06 17:36:30 +0000 UTC,2.0,UNFLAGGED,...,[],0.0,0.0,1.0,Russell Ravert,Human Development,University of Missouri - Columbia,1698828,3,Male
4,Rating,2012-11-07 18:40:56 +0000 UTC,,2.0,BUSX170,completely boring and likes to ramble on will ...,False,2012-11-07 17:20:33 +0000 UTC,3.0,UNFLAGGED,...,[{'computerId': 'C1529A92404BFE8EA2371E96DAE13...,0.0,5.0,,Paul Coulis,Business,Indiana University Bloomington,1197121,13,Male


In [None]:
# Draw a reproducible 10,000-row random subset of the reviews (fixed seed).
sampled_df = df.sample(random_state=42, n=10000)

# The tokenizer and the encoder are loaded from the same pre-trained BERT checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

def preprocess(text):
    """Tokenize ``text`` with the notebook-level BERT ``tokenizer``.

    Padding and truncation are both enabled and the encoding is returned as
    PyTorch tensors, so the result can be unpacked straight into the model.
    """
    tokenizer_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
    return tokenizer(text, **tokenizer_options)

def encode_text(text):
    """Embed ``text`` as a mean-pooled BERT vector.

    The text is tokenized with ``preprocess``, run through the notebook-level
    ``model`` without gradient tracking, and the token vectors of the last
    hidden layer are averaged.

    The average is weighted by the attention mask, so padding positions do not
    dilute the embedding when several texts of different lengths are encoded
    together. For a single string nothing is padded and the result is the
    plain mean over all tokens.

    Args:
        text: A string (or a list of strings) to embed.

    Returns:
        A NumPy array holding the pooled embedding, with singleton dimensions
        squeezed out (1-D for a single text).
    """
    # Keep the inputs on the same device as the model so the call also works
    # when the model has been moved to a GPU.
    inputs = preprocess(text).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    # clamp guards against a division by zero for an all-padding row.
    token_count = mask.sum(dim=1).clamp(min=1)
    pooled = token_sum / token_count

    # .cpu() is a no-op on CPU and required before .numpy() on GPU tensors.
    return pooled.squeeze().cpu().numpy()

# Create a utility function to generate professor report
def generate_professor_report(course_code, top_n=5):
    """Build a review report for the top professors of a course.

    Steps:
      1. Select the rows of ``sampled_df`` whose ``class_Rating`` equals
         ``course_code``.
      2. Embed every comment with ``encode_text`` and average the embeddings
         per professor.
      3. Compute the pairwise cosine similarity between professor embeddings
         and rank professors by their mean similarity to all professors of
         the course.
      4. Return the reviews of the ``top_n`` ranked professors.

    Note that step 4 selects rows from the whole of ``sampled_df`` by
    professor name, so reviews these professors received for other courses
    are included as well.

    Args:
        course_code: Course code to match against the ``class_Rating`` column.
        top_n: Maximum number of professors to include (default 5).

    Returns:
        A DataFrame with the columns ``ProfessorName``, ``SchoolName``,
        ``Department`` and ``comment_Rating``. It is empty when no usable
        review exists for ``course_code``.
    """
    report_columns = ['ProfessorName', 'SchoolName', 'Department', 'comment_Rating']

    # Work on an explicit copy so that adding the embedding column below does
    # not write into a slice of sampled_df (avoids SettingWithCopyWarning).
    course_reviews = sampled_df[sampled_df['class_Rating'] == course_code].copy()

    # Missing comments cannot be tokenized, and rows without a professor name
    # would be dropped by groupby anyway, so remove both up front.
    course_reviews = course_reviews.dropna(subset=['comment_Rating', 'ProfessorName'])

    if course_reviews.empty:
        # Unknown course code or nothing usable: return an empty report with
        # the expected columns instead of failing inside np.stack.
        return sampled_df.iloc[0:0][report_columns]

    # Encode the comments
    course_reviews['encoded_comments'] = (
        course_reviews['comment_Rating'].astype(str).apply(encode_text)
    )

    # Aggregate the encoded comments by professor: element-wise mean of the
    # stacked comment vectors. groupby iterates the names in sorted order.
    professor_names = []
    professor_vectors = []
    for name, vectors in course_reviews.groupby('ProfessorName')['encoded_comments']:
        professor_names.append(name)
        professor_vectors.append(np.mean(np.stack(vectors.to_list()), axis=0))

    # Calculate the cosine similarity between professor embeddings
    embeddings = np.stack(professor_vectors)
    similarities = cosine_similarity(embeddings)

    professor_index = pd.Index(professor_names, name='ProfessorName')
    similar_professors = pd.DataFrame(
        similarities, index=professor_index, columns=professor_index
    )

    # Rank professors by their average similarity and keep the top N
    top_professors = (
        similar_professors.mean(axis=1)
        .sort_values(ascending=False)
        .head(top_n)
        .index.tolist()
    )

    # Generate the report
    report = sampled_df[sampled_df['ProfessorName'].isin(top_professors)]
    return report[report_columns]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Demo: build the report for one course code and print it under a heading.
course_code = 'INDV101'
report = generate_professor_report(course_code)
print(f"Professor report for the course {course_code}:", report, sep="\n")

Professor report for the course INDV101:
        ProfessorName             SchoolName  Department  \
0        Amy Fountain  University of Arizona   Languages   
3259  Suzanne Delaney  University of Arizona  Psychology   

                                         comment_Rating  
0     she very passionate about what she teaches but...  
3259  structure of mind and behavior with delaney is...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  course_reviews['encoded_comments'] = course_reviews['comment_Rating'].apply(encode_text)


In [None]:
# Take a random sample of 10,000 rows from the dataset
# NOTE(review): this repeats the sampling statement of the earlier cell with the
# same seed; running it rebinds sampled_df to a fresh sample drawn from df.
sampled_df = df.sample(n=10000, random_state=42)

In [None]:
from transformers import pipeline
# Collect the sampled review comments as a list of plain strings
# (astype(str) turns missing values into the string 'nan').
# NOTE(review): `comments` is not referenced by the visible cells below.
comments = sampled_df['comment_Rating'].astype(str).tolist()

# Load a Hugging Face text-classification pipeline.
# NOTE(review): the checkpoint name indicates an emotion classifier, not a
# gender model — confirm which labels it emits before relying on predict_gender.
classifier = pipeline('text-classification', model='bhadresh-savani/bert-base-uncased-emotion')

# Placeholder function to infer gender from comments (basic assumption using emotion model)
def predict_gender(comment):
    """Map the classifier's label for ``comment`` to a gender string.

    This is a demonstration placeholder: the mapping from label to gender is
    arbitrary. A label containing ``positive`` maps to ``'Male'``, one
    containing ``negative`` maps to ``'Female'``, and anything else maps to
    ``'they/them'``.
    NOTE(review): verify that the loaded model actually emits labels
    containing 'positive'/'negative'; otherwise every comment falls through
    to 'they/them'.

    Args:
        comment: Review text. Non-string values (for example NaN for a
            missing comment) are not classified.

    Returns:
        ``'Male'``, ``'Female'``, ``'they/them'``, or ``'Unknown'`` when the
        input is not a string or the classifier call fails.
    """
    import warnings

    # Non-text values cannot be classified; say so explicitly instead of
    # relying on an exception raised further down.
    if not isinstance(comment, str):
        return 'Unknown'

    try:
        # Let the tokenizer truncate to 512 tokens. Slicing the string would
        # cut at 512 characters, which is not the same limit.
        prediction = classifier(comment, truncation=True, max_length=512)
        label = prediction[0]['label'].lower()
    except Exception as exc:
        # Stay best-effort (one bad row must not abort a long .apply run),
        # but surface the failure instead of hiding it.
        warnings.warn(
            f"predict_gender could not classify a comment: {exc!r}",
            RuntimeWarning,
        )
        return 'Unknown'

    if 'positive' in label:
        return 'Male'  # This is an arbitrary choice for the sake of example
    if 'negative' in label:
        return 'Female'  # Adjust based on real labels if a dedicated model is used
    return 'they/them'  # Defaulting to non-binary

# Apply gender prediction to the sampled comments only. Reading the column from
# sampled_df (not df) avoids running the classifier over every row of the
# full dataset when only the sample is kept.
sampled_df['predicted_gender_from_LLM'] = sampled_df['comment_Rating'].apply(predict_gender)

# Verify the predicted gender against the existing 'predictedGender' column.
# Both columns are read from sampled_df: the prediction column is only ever
# added there, so looking it up on df raises a KeyError.
sampled_df['is_gender_matching'] = (
    sampled_df['predictedGender'] == sampled_df['predicted_gender_from_LLM']
)

# Show mismatch cases
mismatch_df = sampled_df[~sampled_df['is_gender_matching']]

# Save the full results, and the mismatch cases on their own so that the
# message printed below is accurate.
sampled_df.to_csv("gender_prediction_verification.csv", index=False)
mismatch_df.to_csv("gender_prediction_mismatches.csv", index=False)

print("Gender prediction and verification completed. Mismatch cases saved.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



KeyboardInterrupt: 

In [None]:
import pandas as pd
import re


def clean_comments(comment, professor_name):
    """Strip the professor's name and neutralise gendered pronouns in a review.

    Every case-insensitive occurrence of ``professor_name`` is removed, the
    whole words ``he``/``she`` are replaced by ``they`` and the whole words
    ``his``/``her`` by ``their``. Replacements are inserted in lower case.

    Args:
        comment: The review text.
        professor_name: The name to remove. It is matched literally, so names
            containing characters such as ``.``, ``(`` or ``+`` are safe.

    Returns:
        The cleaned text, or ``comment`` unchanged when either argument is
        not a string.
    """
    if not (isinstance(comment, str) and isinstance(professor_name, str)):
        return comment

    cleaned_comment = comment
    if professor_name:
        # re.escape: treat the name as plain text, not as a regex pattern.
        cleaned_comment = re.sub(
            re.escape(professor_name), '', cleaned_comment, flags=re.IGNORECASE
        )

    # \b keeps words such as "the", "there" or "helped" untouched.
    cleaned_comment = re.sub(r'\b(?:he|she)\b', 'they', cleaned_comment, flags=re.IGNORECASE)
    cleaned_comment = re.sub(r'\b(?:his|her)\b', 'their', cleaned_comment, flags=re.IGNORECASE)

    return cleaned_comment




# Preview the first rows of the sample.
# NOTE(review): the saved output of this cell shows a 'cleaned_comment' column,
# but no visible cell applies clean_comments to sampled_df — confirm the cell
# that creates it is present so the notebook runs from a fresh kernel.
sampled_df.head()


Unnamed: 0,__typename_Rating,adminReviewedAt_Rating,attendanceMandatory_Rating,clarityRating_Rating,class_Rating,comment_Rating,createdByUser_Rating,date_Rating,difficultyRating_Rating,flagStatus_Rating,...,thumbs_Rating,thumbsDownTotal_Rating,thumbsUpTotal_Rating,wouldTakeAgain_Rating,Department,SchoolName,LegacyId,NumRatings,predictedGender,cleaned_comment
6252,Rating,2019-03-13 19:46:47 +0000 UTC,non mandatory,5.0,ACCT215,he is really funny and the lectures are not bo...,False,2019-03-13 19:46:15 +0000 UTC,2.0,UNFLAGGED,...,[],0.0,0.0,1.0,Business,Iowa State University,1754338,144,Male,they is really funny and the lectures are not ...
4684,Rating,2013-01-24 22:23:42 +0000 UTC,,5.0,COM323,this teacher was was great she made the class ...,False,2013-01-24 18:10:33 +0000 UTC,1.0,UNFLAGGED,...,[{'computerId': '9FD4833FF016AF42C26E0DCF31158...,1.0,0.0,,Communication,University of Alabama,1794489,1,Female,this teacher was was great they made the class...
1731,Rating,,,5.0,GLY1000,funny teacher if you go to class it is easy be...,False,2004-11-23 02:02:13 +0000 UTC,1.0,UNFLAGGED,...,[],0.0,0.0,,Geology,Florida State University,406776,69,Male,funny teacher if you go to class it is easy be...
4742,Rating,2023-12-22 00:55:44 +0000 UTC,mandatory,5.0,ACC210,great professor has a slight accent but as lon...,False,2023-12-22 00:55:28 +0000 UTC,3.0,UNFLAGGED,...,[],0.0,0.0,1.0,Accounting,University of Hawaii at Manoa,1094192,45,Male,great professor has a slight accent but as lon...
4521,Rating,2014-12-30 12:54:14 +0000 UTC,,3.0,SPEAV261,professor morris is a great guy i ended up wit...,False,2014-12-23 01:47:51 +0000 UTC,2.0,UNFLAGGED,...,"[{'computerId': 'upw52ksxp7x2pzo674y8ydwi', 'i...",0.0,0.0,,Public Environmental Affairs,Indiana University Bloomington,1969228,2,Male,professor morris is a great guy i ended up wit...
