In [4]:
import pandas as pd

# Load the data
import os
print(os.getcwd())

# Now load the data
professors_df = pd.read_csv('../UMassReviews/professors_df_clean.csv')
reviews_df = pd.read_csv('../UMassReviews/reviews_df_clean_sentiment.csv')


import json

def clean_and_parse_json(x):
    if not isinstance(x, str):
        return x
    try:
        # First attempt: direct JSON parse
        return json.loads(x)
    except json.JSONDecodeError:
        try:
            # Second attempt: Replace single quotes with double quotes
            x = x.replace("'", '"')
            return json.loads(x)
        except json.JSONDecodeError:
            try:
                # Third attempt: Use ast.literal_eval (safer than eval)
                import ast
                return ast.literal_eval(x)
            except:
                print(f"Failed to parse: {x[:100]}...")  # Print first 100 chars
                return {}  # Return empty dict if all parsing attempts fail


reviews_df['class_identifiers'] = reviews_df['class_identifiers'].apply(clean_and_parse_json)
print(reviews_df['class_identifiers'][0])



c:\Users\austi\machinelearning\RateMyProfessorStats\Notebooks
[{'key': '203815-100', 'name': 'PSYCH100'}]


In [9]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from nltk.stem import PorterStemmer
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')

def summarize_text(text, num_sentences=4):
    stemmer = PorterStemmer()  # Initialize the NLTK stemmer
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summarizer.stemmer = stemmer  # Set the NLTK stemmer for the summarizer
    summary = summarizer(parser.document, num_sentences)
    return " ".join([str(sentence) for sentence in summary])

def get_all_reviews_for_professor(professor_id):
    review_list = reviews_df[reviews_df['tid'] == professor_id]['comment'].tolist()
    return " ".join(review_list)

def get_raw_reviews_for_professor(professor_id):
    return reviews_df[reviews_df['tid'] == professor_id]

print(summarize_text(get_all_reviews_for_professor(2936075)))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\austi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\austi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


His lectures are basically him saying "This should be obvious" while teaching students concepts instead of carefully taking time to offer thorough explanations. Wilson was clearly lacking in social skills, and he held grudges against students who were genuinely trying to learn something new. Discussions and office hours were also poorly run, but he refused to take the class's weaknesses as a reflection on his pedagogy. Gotcha questions on exams, apathetic,disinterested, there aren't enough adjectives to describe the way this man treats people and runs his classroom.


In [10]:
# Summarize by class
def get_reviews_for_class(class_id):
    review_list = []
    for index, review in reviews_df.iterrows():  # Use iterrows to iterate over DataFrame rows
        class_identifiers = review['class_identifiers']
        if isinstance(class_identifiers, list):  # Ensure class_identifiers is a list
            for item in class_identifiers:
                if item.get('key') == class_id:
                    review_list.append(review['comment'])
    return " ".join(review_list)

def get_raw_reviews_for_class(class_id):
    filtered_reviews = reviews_df[reviews_df['class_identifiers'].apply(lambda x: isinstance(x, list) and any(item.get('key') == class_id for item in x))]
    return filtered_reviews

print(summarize_text(get_reviews_for_class('83082-380')))



However the class is a lot of work you MUST attend all lectures and keep up with readings to do well. You'll enjoy going to lecture, even though it' an 8AM (don't  skip, because a large chunk of test material centers around case studies discussed, videos shown, and diagnostic criteria presented in class. He makes the class interesting by sharing stories of his actual cases and provides three review sessions a week if you need help or extra credit. Didn't live up to my expectations after what I read on here...Nice guy, bit of an ego, but knows his stuff.


In [11]:
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

def summarize_text_compare(text, summarizer_type='lsa', sentences_count=4, language="english"):
    stemmer = PorterStemmer()  # Initialize the NLTK stemmer
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    """
    Parameters:
    - text: string, the text to summarize
    - summarizer_type: string, type of summarizer ('lsa', 'lexrank', 'luhn', or 'textrank')
    - sentences_count: int, number of sentences in summary
    - language: string, language of text
    
    Returns:
    - string, the summarized text
    """
    
    # Choose summarizer
    if summarizer_type.lower() == 'lsa':
        summarizer = LsaSummarizer()
    elif summarizer_type.lower() == 'lexrank':
        summarizer = LexRankSummarizer()
    elif summarizer_type.lower() == 'luhn':
        summarizer = LuhnSummarizer()
    elif summarizer_type.lower() == 'textrank':
        summarizer = TextRankSummarizer()
    else:
        raise ValueError("Invalid summarizer type")
    
    # Add stop words
    summarizer.stemmer = stemmer
    
    # Create summary
    summary = summarizer(parser.document, sentences_count)
    
    # Join sentences and return
    return " ".join([str(sentence) for sentence in summary])

for item in ['lsa', 'lexrank', 'luhn', 'textrank']:
    print(summarize_text_compare(get_reviews_for_class('83082-380'), item))




However the class is a lot of work you MUST attend all lectures and keep up with readings to do well. You'll enjoy going to lecture, even though it' an 8AM (don't  skip, because a large chunk of test material centers around case studies discussed, videos shown, and diagnostic criteria presented in class. He makes the class interesting by sharing stories of his actual cases and provides three review sessions a week if you need help or extra credit. Didn't live up to my expectations after what I read on here...Nice guy, bit of an ego, but knows his stuff.
His lectures are very engaging and makes it easy to get up early to go to class. His lectures are so interesting. Halgin's class is the best! Very funny as well for abnormal psych his class was very interesting and was one of my favorite classes that i have taken.
I wish he was a professor for more psych classes because the amount I learned reflects how well he taught the material in a fun and interesting way, and made me want to go to 

In [12]:
def sumy_sentiment_by_class(class_id):
    # Get the reviews for the specified class
    reviews = get_raw_reviews_for_class(class_id)
    # Calculate the average sentiment of the reviews
    import numpy as np
    average_sentiment = np.mean(reviews['sentiment_polarity'].tolist())

    from textblob import TextBlob
    summary_sentiment = TextBlob(summarize_text_compare(get_reviews_for_class(class_id), 'lsa')).sentiment.polarity

    return {'average_sentiment': average_sentiment, 'summary_sentiment': summary_sentiment}

print(sumy_sentiment_by_class('83082-380'))

def sumy_sentiment_by_professor(professor_id):
    # Get the reviews for the specified professor
    reviews = get_raw_reviews_for_professor(professor_id)

    # Calculate the average sentiment of the reviews
    import numpy as np
    average_sentiment = np.mean(reviews['sentiment_polarity'].tolist())

    from textblob import TextBlob
    summary_sentiment = TextBlob(summarize_text_compare(get_all_reviews_for_professor(professor_id), 'lsa')).sentiment.polarity

    return {'average_sentiment': average_sentiment, 'summary_sentiment': summary_sentiment}

print(sumy_sentiment_by_professor(2936075))


{'average_sentiment': np.float64(0.36357141514265673), 'summary_sentiment': 0.20844155844155843}
{'average_sentiment': np.float64(-0.0009831377042025218), 'summary_sentiment': 0.02121212121212121}


In [None]:
import pandas as pd
import os

if os.path.exists('../UmassReviews/professor_errors.pkl') and os.path.exists('../UmassReviews/class_errors.pkl'):
    # Load error data from CSV files
    professor_errors = pd.read_pickle('../UmassReviews/professor_errors.pkl')
    class_errors = pd.read_pickle('../UmassReviews/class_errors.pkl')
else:
    def summary_evaluation(summary_functions):

        def run_summary_function(summary_function, data_list):
            evaluation_list = []
            from tqdm import tqdm
            progress_bar = tqdm(total=len(data_list), desc=f"Processing {summary_function.__name__}", bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
            for item in data_list:
                evaluation_list.append(summary_function(item))
                progress_bar.update(1)
            progress_bar.close()  # Close the progress bar after completion
            
            error_list = []
            absolute_error_list = []
            for item in evaluation_list:
                error_list.append(item['average_sentiment'] - item['summary_sentiment'])
                absolute_error_list.append(abs(item['average_sentiment'] - item['summary_sentiment']))

            return {'error_list': error_list, 'absolute_error_list': absolute_error_list}

        classes = []

        for review in reviews_df['class_identifiers']:
            for item in review:
                classes.append(item['key'])

        unique_classes = list(set(classes))
        return [run_summary_function(summary_functions[0], professors_df['id']), run_summary_function(summary_functions[1], unique_classes)]


    errors = summary_evaluation([sumy_sentiment_by_professor, sumy_sentiment_by_class])
    professor_errors = errors[0]
    class_errors = errors[1]

import numpy as np

professor_absolute_error_list = professor_errors['absolute_error_list']
mean_absolute_error = np.mean(professor_absolute_error_list)
median_absolute_error = np.median(professor_absolute_error_list)
std_dev_absolute_error = np.std(professor_absolute_error_list)
print("Professor Summary Evaluation")
print(f"Mean Absolute Error: {mean_absolute_error}")
print(f"Median Absolute Error: {median_absolute_error}")
print(f"Standard Deviation of Absolute Error: {std_dev_absolute_error}")

class_absolute_error_list = class_errors['absolute_error_list']
mean_absolute_error = np.mean(class_absolute_error_list)
median_absolute_error = np.median(class_absolute_error_list)
std_dev_absolute_error = np.std(class_absolute_error_list)

print("Class Summary Evaluation")
print(f"Mean Absolute Error: {mean_absolute_error}")
print(f"Median Absolute Error: {median_absolute_error}")
print(f"Standard Deviation of Absolute Error: {std_dev_absolute_error}")



  warn(message % (words_count, sentences_count))
Processing sumy_sentiment_by_professor: 100%|██████████| 1000/1000 [02:21<00:00]
Processing sumy_sentiment_by_class: 100%|██████████| 5486/5486 [1:55:41<00:00]  

Professor Summary Evaluation
Mean Absolute Error: 0.12208193490171151
Median Absolute Error: 0.0986419331367494
Standard Deviation of Absolute Error: 0.09374860007190475
Class Summary Evaluation
Mean Absolute Error: 0.08643856197440637
Median Absolute Error: 0.0522152627465128
Standard Deviation of Absolute Error: 0.1067269912822949





In [14]:
import pickle

# Save the error data to pickle files
with open('professor_errors.pkl', 'wb') as f:
    pickle.dump(professor_errors, f)
    
with open('class_errors.pkl', 'wb') as f:
    pickle.dump(class_errors, f)

print("Error data saved to professor_errors.pkl and class_errors.pkl")


Error data saved to professor_errors.pkl and class_errors.pkl


A. Semantic Understanding (sentence-transformers):

```
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')
```

Purpose: Converts text into meaningful numerical vectors that capture semantic meaning
Use case: Helps group similar reviews together, even if they use different words
Example: "Great teacher" and "Excellent instructor" would be recognized as similar
B. Topic Modeling (BERTopic):
```
from bertopic import BERTopic
topic_model = BERTopic()
```
- Purpose: Automatically discovers themes/topics in reviews
How it works: Groups similar discussions together and identifies representative words
Example: Might find clusters like "Teaching Style", "Grading Fairness", "Workload"
C. Sentiment Analysis (TextBlob):
- Purpose: Determines the emotional tone of reviews
Use: Helps separate positive and negative aspects of each theme
Example: "Tough but fair" vs "Unnecessarily difficult"
D. Summarization (Transformers):
```
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
```
- Purpose: Creates concise summaries of grouped reviews
Use: After finding themes, summarizes the main points
Example: Converting 20 similar reviews about "tough grading" into one representative summary

Process Flow:
```
class ReviewAnalyzer:
    def __init__(self):
        # Initialize models
        self.sentence_model = SentenceTransformer('all-mpnet-base-v2')
        self.topic_model = BERTopic()
        self.summarizer = pipeline("summarization")

    def process_reviews(self, reviews):
        # 1. Initial Processing
        # Clean and validate reviews
        valid_reviews = [r for r in reviews if isinstance(r, str) and len(r.strip()) > 10]

        # 2. Theme Discovery
        # Find main topics using BERTopic
        topics, probs = self.topic_model.fit_transform(valid_reviews)
        
        # 3. Sentiment Analysis
        # Analyze sentiment for each theme
        sentiments = {
            topic: [
                TextBlob(review).sentiment.polarity 
                for review, t in zip(valid_reviews, topics) 
                if t == topic
            ]
            for topic in set(topics)
        }

        # 4. Theme Summarization
        # Group reviews by theme and summarize
        theme_summaries = {}
        for topic in set(topics):
            theme_reviews = [r for r, t in zip(valid_reviews, topics) if t == topic]
            if theme_reviews:
                summary = self.summarizer(" ".join(theme_reviews[:3]))
                theme_summaries[topic] = summary

```

Key Features to Consider:
A. Aspect-Based Analysis:
```
aspects = {
    'teaching_style': ['lectures', 'explains', 'teaching'],
    'workload': ['homework', 'assignments', 'projects'],
    'grading': ['grades', 'tests', 'exams'],
    'personality': ['helpful', 'understanding', 'approachable']
}
```
- Purpose: Pre-defined categories to organize feedback
Use: Helps ensure important aspects aren't missed
B. Frequency Analysis:
'''
from collections import Counter
def get_key_phrases(reviews, min_count=2):
    phrases = []
    for review in reviews:
        blob = TextBlob(review)
        phrases.extend(blob.noun_phrases)
    return Counter(phrases)
'''


In [16]:
import pandas as pd
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from collections import Counter
from textblob import TextBlob
from bertopic import BERTopic
from tqdm import tqdm
import torch
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings

class ComprehensiveReviewAnalyzer:
    def __init__(self, device=None):
        """Initialize all necessary models and configurations"""
        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Initialize models
        print("Loading models...")
        self.sentence_model = SentenceTransformer('all-mpnet-base-v2')
        self.summarizer = pipeline("summarization", 
                                 model="facebook/bart-large-cnn",
                                 device=0 if self.device == 'cuda' else -1)
        self.topic_model = BERTopic(language="english", 
                                  calculate_probabilities=True,
                                  verbose=True)
        
        # Define aspect categories
        self.aspects = {
            'teaching_style': ['lectures', 'explains', 'teaching', 'presentation', 'clarity'],
            'difficulty': ['hard', 'easy', 'difficult', 'straightforward', 'challenging'],
            'workload': ['homework', 'assignments', 'workload', 'projects', 'work'],
            'grading': ['grades', 'tests', 'exams', 'quizzes', 'fair', 'marking'],
            'personality': ['helpful', 'understanding', 'approachable', 'office hours', 'kind'],
            'engagement': ['interesting', 'boring', 'engaging', 'interactive', 'enthusiasm']
        }

    def preprocess_reviews(self, reviews):
        """Clean and validate reviews"""
        # Remove empty or invalid reviews
        valid_reviews = [
            str(r).strip() 
            for r in reviews 
            if isinstance(r, (str, float, int)) and str(r).strip()
        ]
        
        # Remove very short reviews
        valid_reviews = [r for r in valid_reviews if len(r.split()) >= 5]
        
        # Basic cleaning
        cleaned_reviews = [
            r.replace('\n', ' ').replace('\r', ' ').strip()
            for r in valid_reviews
        ]
        
        return cleaned_reviews

    def extract_themes(self, reviews):
        """Extract main themes using BERTopic"""
        try:
            topics, probs = self.topic_model.fit_transform(reviews)
            topic_info = self.topic_model.get_topic_info()
            
            # Get representative docs for each topic
            topic_docs = {}
            for topic in set(topics):
                if topic != -1:  # Skip outlier topic
                    topic_docs[topic] = [
                        reviews[i] for i, t in enumerate(topics) if t == topic
                    ]
            
            return {
                'topic_info': topic_info,
                'review_topics': topics,
                'probabilities': probs,
                'topic_docs': topic_docs
            }
        except Exception as e:
            print(f"Error in theme extraction: {e}")
            return None

    def analyze_sentiment_patterns(self, reviews):
        """Analyze sentiment patterns in reviews"""
        sentiments = []
        for review in reviews:
            blob = TextBlob(review)
            sentiments.append({
                'polarity': blob.sentiment.polarity,
                'subjectivity': blob.sentiment.subjectivity,
                'text': review
            })
        
        # Group by sentiment
        sentiment_groups = {
            'positive': [s for s in sentiments if s['polarity'] > 0.2],
            'negative': [s for s in sentiments if s['polarity'] < -0.2],
            'neutral': [s for s in sentiments if -0.2 <= s['polarity'] <= 0.2]
        }
        
        # Calculate statistics
        stats = {
            'average_polarity': np.mean([s['polarity'] for s in sentiments]),
            'sentiment_distribution': {
                k: len(v) for k, v in sentiment_groups.items()
            }
        }
        
        return {'groups': sentiment_groups, 'stats': stats}

    def extract_aspect_sentiments(self, reviews):
        """Analyze sentiments for specific aspects"""
        aspect_sentiments = {aspect: [] for aspect in self.aspects}
        
        for review in reviews:
            review_lower = review.lower()
            
            # Check each aspect
            for aspect, keywords in self.aspects.items():
                # If any keyword is found, analyze sentiment for this aspect
                if any(keyword in review_lower for keyword in keywords):
                    sentiment = TextBlob(review).sentiment.polarity
                    aspect_sentiments[aspect].append({
                        'sentiment': sentiment,
                        'text': review
                    })
        
        # Calculate statistics for each aspect
        aspect_stats = {}
        for aspect, sentiments in aspect_sentiments.items():
            if sentiments:
                aspect_stats[aspect] = {
                    'average_sentiment': np.mean([s['sentiment'] for s in sentiments]),
                    'count': len(sentiments),
                    'example_positive': next((s['text'] for s in sentiments 
                                           if s['sentiment'] > 0.2), None),
                    'example_negative': next((s['text'] for s in sentiments 
                                           if s['sentiment'] < -0.2), None)
                }
        
        return aspect_stats

    def summarize_theme(self, reviews, max_length=130):
        """Summarize a group of related reviews"""
        try:
            # Combine reviews but limit total length
            combined = " ".join(reviews[:5])  # Take first 5 reviews as sample
            summary = self.summarizer(combined, 
                                    max_length=max_length, 
                                    min_length=30, 
                                    do_sample=False)[0]['summary_text']
            return summary
        except Exception as e:
            print(f"Error in summarization: {e}")
            return None

    def generate_comprehensive_analysis(self, reviews):
        """Generate a comprehensive analysis of all reviews"""
        print("Starting comprehensive review analysis...")
        
        # Preprocess reviews
        cleaned_reviews = self.preprocess_reviews(reviews)
        if not cleaned_reviews:
            return {"error": "No valid reviews found"}
        
        print(f"Analyzing {len(cleaned_reviews)} valid reviews...")
        
        # Extract themes
        themes = self.extract_themes(cleaned_reviews)
        
        # Analyze sentiments
        sentiment_analysis = self.analyze_sentiment_patterns(cleaned_reviews)
        
        # Analyze aspects
        aspect_analysis = self.extract_aspect_sentiments(cleaned_reviews)
        
        # Generate theme summaries
        theme_summaries = {}
        if themes and 'topic_docs' in themes:
            for topic, docs in themes['topic_docs'].items():
                theme_summaries[topic] = self.summarize_theme(docs)
        
        return {
            'review_count': len(cleaned_reviews),
            'themes': themes,
            'sentiment_analysis': sentiment_analysis,
            'aspect_analysis': aspect_analysis,
            'theme_summaries': theme_summaries
        }

def format_analysis_results(analysis):
    """Format analysis results for readable output"""
    if 'error' in analysis:
        return analysis['error']
    
    output = []
    output.append(f"Analysis of {analysis['review_count']} Reviews\n")
    
    # Overall sentiment
    sentiment_stats = analysis['sentiment_analysis']['stats']
    output.append("Overall Sentiment:")
    output.append(f"Average rating: {sentiment_stats['average_polarity']:.2f}")
    output.append("Distribution:")
    for sentiment, count in sentiment_stats['sentiment_distribution'].items():
        output.append(f"- {sentiment}: {count}")
    output.append("")
    
    # Aspect analysis
    output.append("Aspect Analysis:")
    for aspect, stats in analysis['aspect_analysis'].items():
        output.append(f"\n{aspect.replace('_', ' ').title()}:")
        output.append(f"- Mentioned in {stats['count']} reviews")
        output.append(f"- Average sentiment: {stats['average_sentiment']:.2f}")
        if stats['example_positive']:
            output.append(f"- Positive example: {stats['example_positive'][:100]}...")
        if stats['example_negative']:
            output.append(f"- Negative example: {stats['example_negative'][:100]}...")
    
    # Theme summaries
    if analysis['theme_summaries']:
        output.append("\nMain Themes:")
        for topic, summary in analysis['theme_summaries'].items():
            if summary:
                output.append(f"\nTheme {topic}:")
                output.append(f"- {summary}")
    
    return "\n".join(output)

# Example usage:
if __name__ == "__main__":
    # Load your reviews
    reviews_df = pd.read_csv('path_to_your_reviews.csv')
    
    # Initialize analyzer
    analyzer = ComprehensiveReviewAnalyzer()
    
    # Analyze reviews for a specific professor
    professor_reviews = reviews_df[reviews_df['professor_id'] == 'PROF_ID']['comments'].tolist()
    
    # Generate and format analysis
    analysis = analyzer.generate_comprehensive_analysis(professor_reviews)
    formatted_output = format_analysis_results(analysis)
    
    print(formatted_output)

KeyboardInterrupt: 

: 