In [1]:
import pandas as pd

# Load the data
professors_df = pd.read_csv('RateMyProfAnalysis/UMassReviews/professors_df_clean.csv')
reviews_df = pd.read_csv('RateMyProfAnalysis/UMassReviews/reviews_df_clean_sentiment.csv')

import json

def clean_and_parse_json(x):
    if not isinstance(x, str):
        return x
    try:
        # First attempt: direct JSON parse
        return json.loads(x)
    except json.JSONDecodeError:
        try:
            # Second attempt: Replace single quotes with double quotes
            x = x.replace("'", '"')
            return json.loads(x)
        except json.JSONDecodeError:
            try:
                # Third attempt: Use ast.literal_eval (safer than eval)
                import ast
                return ast.literal_eval(x)
            except:
                print(f"Failed to parse: {x[:100]}...")  # Print first 100 chars
                return {}  # Return empty dict if all parsing attempts fail


reviews_df['class_identifiers'] = reviews_df['class_identifiers'].apply(clean_and_parse_json)
print(reviews_df['class_identifiers'][0])



[{'key': '203815-100', 'name': 'PSYCH100'}]


In [2]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

from nltk.stem import PorterStemmer
from sumy.nlp.tokenizers import Tokenizer

def summarize_text(text, num_sentences=4):
    stemmer = PorterStemmer()  # Initialize the NLTK stemmer
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summarizer.stemmer = stemmer  # Set the NLTK stemmer for the summarizer
    summary = summarizer(parser.document, num_sentences)
    return " ".join([str(sentence) for sentence in summary])

def get_all_reviews_for_professor(professor_id):
    review_list = reviews_df[reviews_df['tid'] == professor_id]['comment'].tolist()
    return " ".join(review_list)

def get_raw_reviews_for_professor(professor_id):
    return reviews_df[reviews_df['tid'] == professor_id]

print(summarize_text(get_all_reviews_for_professor(2936075)))

His lectures are basically him saying "This should be obvious" while teaching students concepts instead of carefully taking time to offer thorough explanations. Wilson was clearly lacking in social skills, and he held grudges against students who were genuinely trying to learn something new. Discussions and office hours were also poorly run, but he refused to take the class's weaknesses as a reflection on his pedagogy. Gotcha questions on exams, apathetic,disinterested, there aren't enough adjectives to describe the way this man treats people and runs his classroom.


In [3]:
# Summarize by class
def get_reviews_for_class(class_id):
    review_list = []
    for index, review in reviews_df.iterrows():  # Use iterrows to iterate over DataFrame rows
        class_identifiers = review['class_identifiers']
        if isinstance(class_identifiers, list):  # Ensure class_identifiers is a list
            for item in class_identifiers:
                if item.get('key') == class_id:
                    review_list.append(review['comment'])
    return " ".join(review_list)

def get_raw_reviews_for_class(class_id):
    filtered_reviews = reviews_df[reviews_df['class_identifiers'].apply(lambda x: isinstance(x, list) and any(item.get('key') == class_id for item in x))]
    return filtered_reviews

print(summarize_text(get_reviews_for_class('83082-380')))



However the class is a lot of work you MUST attend all lectures and keep up with readings to do well. You'll enjoy going to lecture, even though it' an 8AM (don't  skip, because a large chunk of test material centers around case studies discussed, videos shown, and diagnostic criteria presented in class. He makes the class interesting by sharing stories of his actual cases and provides three review sessions a week if you need help or extra credit. Didn't live up to my expectations after what I read on here...Nice guy, bit of an ego, but knows his stuff.


In [4]:
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

def summarize_text_compare(text, summarizer_type='lsa', sentences_count=4, language="english"):
    stemmer = PorterStemmer()  # Initialize the NLTK stemmer
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    """
    Parameters:
    - text: string, the text to summarize
    - summarizer_type: string, type of summarizer ('lsa', 'lexrank', 'luhn', or 'textrank')
    - sentences_count: int, number of sentences in summary
    - language: string, language of text
    
    Returns:
    - string, the summarized text
    """
    
    # Choose summarizer
    if summarizer_type.lower() == 'lsa':
        summarizer = LsaSummarizer()
    elif summarizer_type.lower() == 'lexrank':
        summarizer = LexRankSummarizer()
    elif summarizer_type.lower() == 'luhn':
        summarizer = LuhnSummarizer()
    elif summarizer_type.lower() == 'textrank':
        summarizer = TextRankSummarizer()
    else:
        raise ValueError("Invalid summarizer type")
    
    # Add stop words
    summarizer.stemmer = stemmer
    
    # Create summary
    summary = summarizer(parser.document, sentences_count)
    
    # Join sentences and return
    return " ".join([str(sentence) for sentence in summary])

for item in ['lsa', 'lexrank', 'luhn', 'textrank']:
    print(summarize_text_compare(get_reviews_for_class('83082-380'), item))




However the class is a lot of work you MUST attend all lectures and keep up with readings to do well. You'll enjoy going to lecture, even though it' an 8AM (don't  skip, because a large chunk of test material centers around case studies discussed, videos shown, and diagnostic criteria presented in class. He makes the class interesting by sharing stories of his actual cases and provides three review sessions a week if you need help or extra credit. Didn't live up to my expectations after what I read on here...Nice guy, bit of an ego, but knows his stuff.
His lectures are very engaging and makes it easy to get up early to go to class. His lectures are so interesting. Halgin's class is the best! Very funny as well for abnormal psych his class was very interesting and was one of my favorite classes that i have taken.
I wish he was a professor for more psych classes because the amount I learned reflects how well he taught the material in a fun and interesting way, and made me want to go to 

In [5]:
def sumy_sentiment_by_class(class_id):
    # Get the reviews for the specified class
    reviews = get_raw_reviews_for_class(class_id)
    # Calculate the average sentiment of the reviews
    import numpy as np
    average_sentiment = np.mean(reviews['sentiment_polarity'].tolist())

    from textblob import TextBlob
    summary_sentiment = TextBlob(summarize_text_compare(get_reviews_for_class(class_id), 'lsa')).sentiment.polarity

    return {'average_sentiment': average_sentiment, 'summary_sentiment': summary_sentiment}

print(sumy_sentiment_by_class('83082-380'))

def sumy_sentiment_by_professor(professor_id):
    # Get the reviews for the specified professor
    reviews = get_raw_reviews_for_professor(professor_id)

    # Calculate the average sentiment of the reviews
    import numpy as np
    average_sentiment = np.mean(reviews['sentiment_polarity'].tolist())

    from textblob import TextBlob
    summary_sentiment = TextBlob(summarize_text_compare(get_all_reviews_for_professor(professor_id), 'lsa')).sentiment.polarity

    return {'average_sentiment': average_sentiment, 'summary_sentiment': summary_sentiment}

print(sumy_sentiment_by_professor(2936075))


{'average_sentiment': np.float64(0.36357141514265673), 'summary_sentiment': 0.20844155844155843}
{'average_sentiment': np.float64(-0.0009831377042025218), 'summary_sentiment': 0.02121212121212121}


In [7]:
def summary_evaluation(summary_functions):

    def run_summary_function(summary_function, data_list):
        evaluation_list = []
        from tqdm import tqdm
        progress_bar = tqdm(total=len(data_list), desc=f"Processing {summary_function.__name__}", bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
        for item in data_list:
            evaluation_list.append(summary_function(item))
            progress_bar.update(1)
        progress_bar.close()  # Close the progress bar after completion
        
        error_list = []
        absolute_error_list = []
        for item in evaluation_list:
            error_list.append(item['average_sentiment'] - item['summary_sentiment'])
            absolute_error_list.append(abs(item['average_sentiment'] - item['summary_sentiment']))

        return {'error_list': error_list, 'absolute_error_list': absolute_error_list}

    classes = []

    for review in reviews_df['class_identifiers']:
        for item in review:
            classes.append(item['key'])

    unique_classes = list(set(classes))
    return [run_summary_function(summary_functions[0], professors_df['id']), run_summary_function(summary_functions[1], unique_classes)]


errors = summary_evaluation([sumy_sentiment_by_professor, sumy_sentiment_by_class])
professor_errors = errors[0]
class_errors = errors[1]

import numpy as np

professor_absolute_error_list = professor_errors['absolute_error_list']
mean_absolute_error = np.mean(professor_absolute_error_list)
median_absolute_error = np.median(professor_absolute_error_list)
std_dev_absolute_error = np.std(professor_absolute_error_list)
print("Professor Summary Evaluation")
print(f"Mean Absolute Error: {mean_absolute_error}")
print(f"Median Absolute Error: {median_absolute_error}")
print(f"Standard Deviation of Absolute Error: {std_dev_absolute_error}")

class_absolute_error_list = class_errors['absolute_error_list']
mean_absolute_error = np.mean(class_absolute_error_list)
median_absolute_error = np.median(class_absolute_error_list)
std_dev_absolute_error = np.std(class_absolute_error_list)

print("Class Summary Evaluation")
print(f"Mean Absolute Error: {mean_absolute_error}")
print(f"Median Absolute Error: {median_absolute_error}")
print(f"Standard Deviation of Absolute Error: {std_dev_absolute_error}")



Processing sumy_sentiment_by_professor:   0%|                                                        | 0/1000 [00:00<?]

Processing sumy_sentiment_by_professor: 100%|█████████████████████████████████████████████████| 1000/1000 [02:09<00:00]
Processing sumy_sentiment_by_class: 100%|███████████████████████████████████████████████████| 5486/5486 [2:12:33<00:00]

Professor Summary Evaluation
Mean Absolute Error: 0.12279705164002853
Median Absolute Error: 0.10215078298824566
Standard Deviation of Absolute Error: 0.09396239068258952
Class Summary Evaluation
Mean Absolute Error: 0.08607122578162171
Median Absolute Error: 0.0522152627465128
Standard Deviation of Absolute Error: 0.10616727651244011





In [None]:
# Use modern approach with LLM to summarize reviews

