In [3]:
import pandas as pd

# Load the data
import os
print(os.getcwd())

# Now load the data
professors_df = pd.read_csv('../UMassReviews/professors_df_clean.csv')
reviews_df = pd.read_csv('../UMassReviews/reviews_df_clean_sentiment.csv')


import json

def clean_and_parse_json(x):
    if not isinstance(x, str):
        return x
    try:
        # First attempt: direct JSON parse
        return json.loads(x)
    except json.JSONDecodeError:
        try:
            # Second attempt: Replace single quotes with double quotes
            x = x.replace("'", '"')
            return json.loads(x)
        except json.JSONDecodeError:
            try:
                # Third attempt: Use ast.literal_eval (safer than eval)
                import ast
                return ast.literal_eval(x)
            except:
                print(f"Failed to parse: {x[:100]}...")  # Print first 100 chars
                return {}  # Return empty dict if all parsing attempts fail


reviews_df['class_identifiers'] = reviews_df['class_identifiers'].apply(clean_and_parse_json)
print(reviews_df['class_identifiers'][0])



c:\Users\austi\machinelearning\RateMyProfessorStats\Notebooks
[{'key': '203815-100', 'name': 'PSYCH100'}]


In [4]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from nltk.stem import PorterStemmer
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')

def summarize_text(text, num_sentences=4):
    stemmer = PorterStemmer()  # Initialize the NLTK stemmer
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summarizer.stemmer = stemmer  # Set the NLTK stemmer for the summarizer
    summary = summarizer(parser.document, num_sentences)
    return " ".join([str(sentence) for sentence in summary])

def get_all_reviews_for_professor(professor_id):
    review_list = reviews_df[reviews_df['tid'] == professor_id]['comment'].tolist()
    return " ".join(review_list)

def get_raw_reviews_for_professor(professor_id):
    return reviews_df[reviews_df['tid'] == professor_id]

print(summarize_text(get_all_reviews_for_professor(2936075)))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\austi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\austi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


His lectures are basically him saying "This should be obvious" while teaching students concepts instead of carefully taking time to offer thorough explanations. Wilson was clearly lacking in social skills, and he held grudges against students who were genuinely trying to learn something new. Discussions and office hours were also poorly run, but he refused to take the class's weaknesses as a reflection on his pedagogy. Gotcha questions on exams, apathetic,disinterested, there aren't enough adjectives to describe the way this man treats people and runs his classroom.


In [5]:
# Summarize by class
def get_reviews_for_class(class_id):
    review_list = []
    for index, review in reviews_df.iterrows():  # Use iterrows to iterate over DataFrame rows
        class_identifiers = review['class_identifiers']
        if isinstance(class_identifiers, list):  # Ensure class_identifiers is a list
            for item in class_identifiers:
                if item.get('key') == class_id:
                    review_list.append(review['comment'])
    return " ".join(review_list)

def get_raw_reviews_for_class(class_id):
    filtered_reviews = reviews_df[reviews_df['class_identifiers'].apply(lambda x: isinstance(x, list) and any(item.get('key') == class_id for item in x))]
    return filtered_reviews

print(summarize_text(get_reviews_for_class('83082-380')))



However the class is a lot of work you MUST attend all lectures and keep up with readings to do well. You'll enjoy going to lecture, even though it' an 8AM (don't  skip, because a large chunk of test material centers around case studies discussed, videos shown, and diagnostic criteria presented in class. He makes the class interesting by sharing stories of his actual cases and provides three review sessions a week if you need help or extra credit. Didn't live up to my expectations after what I read on here...Nice guy, bit of an ego, but knows his stuff.


In [6]:
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

def summarize_text_compare(text, summarizer_type='lsa', sentences_count=4, language="english"):
    stemmer = PorterStemmer()  # Initialize the NLTK stemmer
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    """
    Parameters:
    - text: string, the text to summarize
    - summarizer_type: string, type of summarizer ('lsa', 'lexrank', 'luhn', or 'textrank')
    - sentences_count: int, number of sentences in summary
    - language: string, language of text
    
    Returns:
    - string, the summarized text
    """
    
    # Choose summarizer
    if summarizer_type.lower() == 'lsa':
        summarizer = LsaSummarizer()
    elif summarizer_type.lower() == 'lexrank':
        summarizer = LexRankSummarizer()
    elif summarizer_type.lower() == 'luhn':
        summarizer = LuhnSummarizer()
    elif summarizer_type.lower() == 'textrank':
        summarizer = TextRankSummarizer()
    else:
        raise ValueError("Invalid summarizer type")
    
    # Add stop words
    summarizer.stemmer = stemmer
    
    # Create summary
    summary = summarizer(parser.document, sentences_count)
    
    # Join sentences and return
    return " ".join([str(sentence) for sentence in summary])

for item in ['lsa', 'lexrank', 'luhn', 'textrank']:
    print(summarize_text_compare(get_reviews_for_class('83082-380'), item))




However the class is a lot of work you MUST attend all lectures and keep up with readings to do well. You'll enjoy going to lecture, even though it' an 8AM (don't  skip, because a large chunk of test material centers around case studies discussed, videos shown, and diagnostic criteria presented in class. He makes the class interesting by sharing stories of his actual cases and provides three review sessions a week if you need help or extra credit. Didn't live up to my expectations after what I read on here...Nice guy, bit of an ego, but knows his stuff.
His lectures are very engaging and makes it easy to get up early to go to class. His lectures are so interesting. Halgin's class is the best! Very funny as well for abnormal psych his class was very interesting and was one of my favorite classes that i have taken.
I wish he was a professor for more psych classes because the amount I learned reflects how well he taught the material in a fun and interesting way, and made me want to go to 

In [7]:
def sumy_sentiment_by_class(class_id):
    # Get the reviews for the specified class
    reviews = get_raw_reviews_for_class(class_id)
    # Calculate the average sentiment of the reviews
    import numpy as np
    average_sentiment = np.mean(reviews['sentiment_polarity'].tolist())

    from textblob import TextBlob
    summary_sentiment = TextBlob(summarize_text_compare(get_reviews_for_class(class_id), 'lsa')).sentiment.polarity

    return {'average_sentiment': average_sentiment, 'summary_sentiment': summary_sentiment}

print(sumy_sentiment_by_class('83082-380'))

def sumy_sentiment_by_professor(professor_id):
    # Get the reviews for the specified professor
    reviews = get_raw_reviews_for_professor(professor_id)

    # Calculate the average sentiment of the reviews
    import numpy as np
    average_sentiment = np.mean(reviews['sentiment_polarity'].tolist())

    from textblob import TextBlob
    summary_sentiment = TextBlob(summarize_text_compare(get_all_reviews_for_professor(professor_id), 'lsa')).sentiment.polarity

    return {'average_sentiment': average_sentiment, 'summary_sentiment': summary_sentiment}

print(sumy_sentiment_by_professor(2936075))


{'average_sentiment': 0.36357141514265673, 'summary_sentiment': 0.20844155844155843}
{'average_sentiment': -0.0009831377042025218, 'summary_sentiment': 0.02121212121212121}


In [8]:
import pandas as pd
import os

if os.path.exists('../UmassReviews/sumy_professor_errors.pkl') and os.path.exists('../UmassReviews/sumy_class_errors.pkl'):
    # Load error data from CSV files
    professor_errors = pd.read_pickle('../UmassReviews/sumy_professor_errors.pkl')
    class_errors = pd.read_pickle('../UmassReviews/sumy_class_errors.pkl')
else:
    def summary_evaluation(summary_functions):

        def run_summary_function(summary_function, data_list):
            evaluation_list = []
            from tqdm import tqdm
            progress_bar = tqdm(total=len(data_list), desc=f"Processing {summary_function.__name__}", bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
            for item in data_list:
                evaluation_list.append(summary_function(item))
                progress_bar.update(1)
            progress_bar.close()  # Close the progress bar after completion
            
            error_list = []
            absolute_error_list = []
            for item in evaluation_list:
                error_list.append(item['average_sentiment'] - item['summary_sentiment'])
                absolute_error_list.append(abs(item['average_sentiment'] - item['summary_sentiment']))

            return {'error_list': error_list, 'absolute_error_list': absolute_error_list}

        classes = []

        for review in reviews_df['class_identifiers']:
            for item in review:
                classes.append(item['key'])

        unique_classes = list(set(classes))
        return [run_summary_function(summary_functions[0], professors_df['id']), run_summary_function(summary_functions[1], unique_classes)]


    errors = summary_evaluation([sumy_sentiment_by_professor, sumy_sentiment_by_class])
    professor_errors = errors[0]
    class_errors = errors[1]

import numpy as np

professor_absolute_error_list = professor_errors['absolute_error_list']
mean_absolute_error = np.mean(professor_absolute_error_list)
median_absolute_error = np.median(professor_absolute_error_list)
std_dev_absolute_error = np.std(professor_absolute_error_list)
print("Professor Summary Evaluation")
print(f"Mean Absolute Error: {mean_absolute_error}")
print(f"Median Absolute Error: {median_absolute_error}")
print(f"Standard Deviation of Absolute Error: {std_dev_absolute_error}")

class_absolute_error_list = class_errors['absolute_error_list']
mean_absolute_error = np.mean(class_absolute_error_list)
median_absolute_error = np.median(class_absolute_error_list)
std_dev_absolute_error = np.std(class_absolute_error_list)

print("Class Summary Evaluation")
print(f"Mean Absolute Error: {mean_absolute_error}")
print(f"Median Absolute Error: {median_absolute_error}")
print(f"Standard Deviation of Absolute Error: {std_dev_absolute_error}")



Professor Summary Evaluation
Mean Absolute Error: 0.12208193490171151
Median Absolute Error: 0.0986419331367494
Standard Deviation of Absolute Error: 0.09374860007190475
Class Summary Evaluation
Mean Absolute Error: 0.08643856197440637
Median Absolute Error: 0.0522152627465128
Standard Deviation of Absolute Error: 0.1067269912822949


In [9]:
import pickle

# Save the error data to pickle files
with open('../UmassReviews/sumy_professor_errors.pkl', 'wb') as f:
    pickle.dump(professor_errors, f)
    
with open('../UmassReviews/sumy_class_errors.pkl', 'wb') as f:
    pickle.dump(class_errors, f)

print("Error data saved to professor_errors.pkl and class_errors.pkl")


Error data saved to professor_errors.pkl and class_errors.pkl


# Advanced Summarization Roadmap
Data Preprocessing: Clean the text data by removing stop words, punctuation, and performing tokenization.

Sentiment Analysis: Use a sentiment analysis model to determine the overall sentiment of each review. Libraries like TextBlob or VADER can be useful for this.

Topic Modeling: Apply topic modeling techniques such as Latent Dirichlet Allocation (LDA) or Non-Negative Matrix Factorization (NMF) to identify common themes. The gensim library is useful for LDA.

Keyword Extraction: Use TF-IDF or RAKE to extract important keywords that frequently appear in the reviews.

Clustering: Group similar reviews together using clustering algorithms like K-Means or DBSCAN. This can help in identifying common themes.

Summarization: Generate summaries based on the identified themes and sentiments. You can use pre-trained models like BERT for abstractive summarization.

In [10]:
display(professors_df)
display(reviews_df)

Unnamed: 0,id,name,department,total_reviews_count,average_quality_rating,average_difficulty_rating,classes_taught,class_map
0,203815,John Bickford,Psychology,449,4.077951,2.031180,"['PSYCH100', 'PSYCH370', '100', 'PSCYH100', 'P...","{""100"": {""prefix"": ""PSYCH"", ""suffix"": """", ""lab..."
1,77120,Randall Phillis,Biology,388,3.920103,3.298969,"['BIO151', '151', '100', 'BIOLOGY151', 'BIO484...","{""151"": {""prefix"": ""BIO"", ""suffix"": """", ""label..."
2,1621419,Laura Francis,Biology,370,2.659459,4.000000,"['161H', 'BIO151', '285', 'BIO285', '151', 'BI...","{""161"": {""prefix"": ""BIO"", ""suffix"": ""H"", ""labe..."
3,192549,Joanna Jeneralczuk,Mathematics,323,3.780186,3.142415,"['STAT240', 'STAT501', 'STAT515', '240', 'STAT...","{""240"": {""prefix"": ""STAT"", ""suffix"": """", ""labe..."
4,1617241,Chris McDaniel,Chemistry,298,2.104027,4.342282,"['CHEM269', 'CHEM261', 'CHEM266', 'CHEM262', '...","{""269"": {""prefix"": ""CHEM"", ""suffix"": """", ""labe..."
...,...,...,...,...,...,...,...,...
995,916197,Elaine Brigham,Education,16,2.875000,2.687500,"['EDU210', 'EDUC001', 'EDUC210', 'CUSP166', 'H...","{""210"": {""prefix"": ""EDUC"", ""suffix"": """", ""labe..."
996,25287,Bruce Laurie,History,16,4.437500,3.937500,"['391CL', 'HIST494K', 'HIST494', 'HIS494', 'HI...","{""391"": {""prefix"": """", ""suffix"": ""LC"", ""labels..."
997,1276772,John Ridgway,Computer Science,16,2.375000,2.750000,"['CSC262', 'COMPSCI320', 'CS320', 'CS230', 'CS...","{""262"": {""prefix"": ""CSC"", ""suffix"": """", ""label..."
998,920515,Amanda Johnson,Anthropology,16,3.000000,2.750000,"['ANTH397', 'ANTH364', 'ANTH205', 'ANTH397RE',...","{""397"": {""prefix"": ""ANTH"", ""suffix"": """", ""labe..."


Unnamed: 0,class,comment,date,difficultyRating,isForCredit,qualityRating,wouldTakeAgain,tid,class_identifiers,high_quality_difficulty_for_credit,...,year,month,day,day_of_week,hour,sentiment,sentiment_polarity,season,caps_ratio,exclamation_count
0,PSYCH100,Bickford was easily my favorite teacher. His l...,2025-01-13,2,True,5,1.0,203815,"[{'key': '203815-100', 'name': 'PSYCH100'}]",0,...,2025,1,13,Monday,0,"Sentiment(polarity=0.25, subjectivity=0.616666...",0.250000,Winter,0.011869,1
1,PSYCH100,He might just be the most relatable and humoro...,2024-12-24,2,True,5,1.0,203815,"[{'key': '203815-100', 'name': 'PSYCH100'}]",0,...,2024,12,24,Tuesday,0,"Sentiment(polarity=0.31888888888888894, subjec...",0.318889,Winter,0.025806,1
2,PSYCH370,exams are in person but are open note so just ...,2024-12-16,1,True,5,1.0,203815,"[{'key': '203815-370', 'name': 'PSYCH370'}]",0,...,2024,12,16,Monday,0,"Sentiment(polarity=0.2225, subjectivity=0.71)",0.222500,Winter,0.000000,2
3,PSYCH100,hands down the BEST professor i've had here so...,2024-12-09,2,True,5,1.0,203815,"[{'key': '203815-100', 'name': 'PSYCH100'}]",0,...,2024,12,9,Monday,0,"Sentiment(polarity=0.2743055555555556, subject...",0.274306,Winter,0.032573,0
4,100,"Super funny guy, lectures are great. Gives LOT...",2024-12-07,2,True,5,1.0,203815,"[{'key': '203815-100', 'name': 'PSYCH100'}]",0,...,2024,12,7,Saturday,0,"Sentiment(polarity=0.3458333333333333, subject...",0.345833,Winter,0.051724,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45169,MKTG201,"One of my favorite classes this semester, alth...",2015-12-16,2,True,4,-1.0,1732553,"[{'key': '1732553-201', 'name': 'MKTG201'}]",0,...,2015,12,16,Wednesday,0,"Sentiment(polarity=0.32916666666666666, subjec...",0.329167,Winter,0.023529,0
45170,MKTG201,"Professor Ross definitely knows his stuff, and...",2015-11-09,3,True,3,-1.0,1732553,"[{'key': '1732553-201', 'name': 'MKTG201'}]",0,...,2015,11,9,Monday,0,"Sentiment(polarity=0.32, subjectivity=0.4)",0.320000,Fall,0.018265,0
45171,MKT318,"He is not clear at all, so far its the midway ...",2014-10-23,1,True,2,-1.0,1732553,"[{'key': '1732553-318', 'name': 'MKT318'}]",0,...,2014,10,23,Thursday,0,"Sentiment(polarity=-0.2733072916666667, subjec...",-0.273307,Fall,0.021053,3
45172,MGMT250,Do not take ANY class with this guy! He gave 2...,2013-12-19,4,False,1,-1.0,1732553,"[{'key': '1732553-250', 'name': 'Marketing250'}]",0,...,2013,12,19,Thursday,0,"Sentiment(polarity=-0.009999999999999986, subj...",-0.010000,Winter,0.028571,1


In [11]:
display(professors_df.describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,1000.0,974707.402,926389.830037,7541.0,88280.75,658174.5,1854628.0,3047994.0
total_reviews_count,1000.0,45.174,44.61574,16.0,22.0,31.0,49.0,449.0
average_quality_rating,1000.0,3.600967,0.910661,1.0,3.0,3.678205,4.353416,5.0
average_difficulty_rating,1000.0,3.003176,0.689239,1.166667,2.5,3.035099,3.504032,4.833333


In [12]:
import pandas as pd
import re

# Common English stop words - you can modify this list
STOP_WORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were',
    'will', 'with', 'the', 'this', 'but', 'they', 'have', 'had', 'what', 'when',
    'where', 'who', 'which', 'why', 'how'
}

DOMAIN_STOP_WORDS = {
    'professor', 'prof', 'class', 'course', 'lecture', 'semester',
    'grade', 'exam', 'exams', 'test', 'quiz', 'homework', 'assignment', 'homeworks', 'quizzes'
}
STOP_WORDS.update(DOMAIN_STOP_WORDS)

def clean_text(text):
    """
    Clean text before tokenization:
    1. Convert to lowercase
    2. Remove URLs
    3. Remove email addresses
    4. Remove phone numbers
    5. Handle special characters
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove phone numbers
    text = re.sub(r'\d{3}[-\.\s]?\d{3}[-\.\s]?\d{4}', '', text)
    
    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', ' ', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())

     # Remove course numbers
    text = re.sub(r'\b[A-Z]{2,4}\s?\d{3}\b', '', text)
    
    # Remove semester references
    text = re.sub(r'\b(spring|fall|summer|winter)\s?\d{4}\b', '', text, flags=re.IGNORECASE)
    
    return text

def enhanced_tokenize(text):
    """
    Tokenize and remove stop words
    """
    # First clean the text
    cleaned_text = clean_text(text)
    
    # Split into tokens
    tokens = cleaned_text.split()
    
    # Remove stop words
    tokens = [token for token in tokens if token not in STOP_WORDS]
    
    return tokens

def process_reviews(reviews_df, text_column='comments'):
    """
    Process multiple reviews
    """
    processed_data = []
    
    for review in reviews_df[text_column]:
        try:
            if isinstance(review, str):
                # Clean and tokenize
                tokens = enhanced_tokenize(review)
                processed_text = ' '.join(tokens)
                
                # Calculate some basic metrics
                processed_data.append({
                    'original_text': review,
                    'cleaned_text': clean_text(review),
                    'tokens': tokens,
                    'processed_text': processed_text,
                    'token_count': len(tokens)
                })
            else:
                raise ValueError("Invalid review text")
                
        except Exception as e:
            print(f"Warning: Error processing review: {str(e)}")
            processed_data.append({
                'original_text': review if isinstance(review, str) else '',
                'cleaned_text': '',
                'tokens': [],
                'processed_text': '',
                'token_count': 0
            })
    
    return pd.DataFrame(processed_data)

# Test the preprocessing
test_reviews = pd.DataFrame({
    'comments': [
        "The professor's email is prof@university.edu and their phone is 123-456-7890. Check http://class.com for materials!",
        "This class was really great! The professor explained everything clearly.",
        "I didn't understand anything... The homework was too difficult :(",
    ]
})

processed_df = process_reviews(test_reviews)

# Show example
print("\nExample Processing:")
for i, row in processed_df.iterrows():
    print(f"\nOriginal: {row['original_text']}")
    print(f"Cleaned: {row['cleaned_text']}")
    print(f"Tokens without stop words: {row['tokens']}")
    print(f"Token count: {row['token_count']}")


Example Processing:

Original: The professor's email is prof@university.edu and their phone is 123-456-7890. Check http://class.com for materials!
Cleaned: the professor s email is and their phone is check for materials
Tokens without stop words: ['s', 'email', 'their', 'phone', 'check', 'materials']
Token count: 6

Original: This class was really great! The professor explained everything clearly.
Cleaned: this class was really great the professor explained everything clearly
Tokens without stop words: ['really', 'great', 'explained', 'everything', 'clearly']
Token count: 5

Original: I didn't understand anything... The homework was too difficult :(
Cleaned: i didn t understand anything the homework was too difficult
Tokens without stop words: ['i', 'didn', 't', 'understand', 'anything', 'too', 'difficult']
Token count: 7


In [13]:
reviews_processed = process_reviews(reviews_df, text_column='comment')
# Preview a sample of processed reviews
print("\nExample Processing:")
for _, row in reviews_processed.iloc[:5].iterrows():
    print(f"\nOriginal: {row['original_text']}")
    print(f"Tokens: {row['tokens']}")
    print(f"Token count: {row['token_count']}")




Example Processing:

Original: Bickford was easily my favorite teacher. His lectures were fun to attend even though they weren't mandatory. You have to complete 6 reaction papers in class, there are 4 midterms (no final) and you get a cheat sheet for all of them- pretty easy all multiple choice. The textbook assignments were quick, and helped to learn the material! 
Tokens: ['bickford', 'easily', 'my', 'favorite', 'teacher', 'his', 'lectures', 'fun', 'attend', 'even', 'though', 'weren', 't', 'mandatory', 'you', 'complete', '6', 'reaction', 'papers', 'there', '4', 'midterms', 'no', 'final', 'you', 'get', 'cheat', 'sheet', 'all', 'them', 'pretty', 'easy', 'all', 'multiple', 'choice', 'textbook', 'assignments', 'quick', 'helped', 'learn', 'material']
Token count: 41

Original: He might just be the most relatable and humorous professor I had. His lectures were very engaging and I learned something every time I left. He allows a cheat sheet for every exam, and you should do great as long a

In [14]:
# Add sentiment analysis using TextBlob
from textblob import TextBlob

def get_sentiment(text):
    try:
        return TextBlob(text).sentiment
    except:
        return TextBlob('').sentiment

# Apply sentiment analysis to the original text
reviews_processed['sentiment'] = reviews_processed['original_text'].apply(get_sentiment)

# Preview the sentiment scores
print("\nExample Reviews with Sentiment:")
for _, row in reviews_processed.iloc[:5].iterrows():
    print(f"\nText: {row['original_text'][:100]}...")
    print(f"Sentiment: {row['sentiment']}")



Example Reviews with Sentiment:

Text: Bickford was easily my favorite teacher. His lectures were fun to attend even though they weren't ma...
Sentiment: Sentiment(polarity=0.25, subjectivity=0.6166666666666667)

Text: He might just be the most relatable and humorous professor I had. His lectures were very engaging an...
Sentiment: Sentiment(polarity=0.31888888888888894, subjectivity=0.5788888888888889)

Text: exams are in person but are open note so just remember to study with textbooks because that helps so...
Sentiment: Sentiment(polarity=0.2225, subjectivity=0.71)

Text: hands down the BEST professor i've had here so far. every time i left his lectures i felt as if i ha...
Sentiment: Sentiment(polarity=0.2743055555555556, subjectivity=0.3861111111111111)

Text: Super funny guy, lectures are great. Gives LOTS of extra credit opportunities and allows cheat sheet...
Sentiment: Sentiment(polarity=0.3458333333333333, subjectivity=0.6291666666666667)


In [15]:
print(reviews_processed.head())
print(reviews_processed.describe())


                                       original_text  \
0  Bickford was easily my favorite teacher. His l...   
1  He might just be the most relatable and humoro...   
2  exams are in person but are open note so just ...   
3  hands down the BEST professor i've had here so...   
4  Super funny guy, lectures are great. Gives LOT...   

                                        cleaned_text  \
0  bickford was easily my favorite teacher his le...   
1  he might just be the most relatable and humoro...   
2  exams are in person but are open note so just ...   
3  hands down the best professor i ve had here so...   
4  super funny guy lectures are great gives lots ...   

                                              tokens  \
0  [bickford, easily, my, favorite, teacher, his,...   
1  [might, just, most, relatable, humorous, i, hi...   
2  [person, open, note, so, just, remember, study...   
3  [hands, down, best, i, ve, here, so, far, ever...   
4  [super, funny, guy, lectures, great, gives,

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np
import pandas as pd

def create_focused_topic_model(reviews_df):
    ASPECTS = {
        'LECTURE_DELIVERY': {
            'words': [
                'lectures', 'presentation', 'speaks', 'voice', 'pace',
                'delivery', 'stage presence', 'clarity', 'organization', 'structure'
            ],
            'weight': 2.0  # Increased weight
        },
        'STUDENT_ENGAGEMENT': {
            'words': [
                'interesting', 'engaging', 'fun', 'entertaining', 'discussion',
                'interactive', 'enthusiasm', 'exciting', 'participation', 'attention'
            ],
            'weight': 1.8
        },
        'TEACHING_RESOURCES': {
            'words': [
                'slides', 'examples', 'practice', 'questions', 'materials',
                'textbook', 'handouts', 'resources', 'tools', 'visuals'
            ],
            'weight': 1.6
        },
        'LEARNING_SUPPORT': {
            'words': [
                'help', 'office hours', 'feedback', 'guidance', 'explanation',
                'understanding', 'clarity', 'support', 'assistance', 'review'
            ],
            'weight': 1.4
        },
        'COURSE_REQUIREMENTS': {
            'words': [
                'assignments', 'exams', 'homework', 'grading', 'workload',
                'deadlines', 'attendance', 'requirements', 'schedule', 'difficulty'
            ],
            'weight': 1.2
        }
    }
    
    # Rest of the function remains the same
    vocab = {}
    for aspect, details in ASPECTS.items():
        for word in details['words']:
            vocab[word] = details['weight']
    
    # Create vectorizer with weighted vocabulary
    vectorizer = TfidfVectorizer(
        vocabulary=list(vocab.keys()),
        ngram_range=(1, 3),
        min_df=5,
        max_df=0.9
    )
    
    # Process documents using processed_text instead of comment
    print(f"Processing {len(reviews_df)} documents...")
    tfidf = vectorizer.fit_transform(reviews_df['processed_text'])  # Changed from 'comment' to 'processed_text'
    
    # Rest of the function remains the same
    feature_names = vectorizer.get_feature_names_out()
    weights = np.array([vocab[word] for word in feature_names])
    tfidf_weighted = tfidf.multiply(weights)
    
    nmf = NMF(
        n_components=5,
        init='nndsvd',
        random_state=42,
        max_iter=500
    )
    
    nmf_output = nmf.fit_transform(tfidf_weighted)
    
    return nmf, vectorizer, tfidf_weighted, nmf_output

def analyze_topics(nmf_model, vectorizer, nmf_output, processed_df, original_df):
    """
    Enhanced topic analysis with better coverage
    """
    feature_names = vectorizer.get_feature_names_out()
    doc_topics = nmf_output.argmax(axis=1)
    
    # Calculate topic confidence scores
    topic_strengths = []
    for doc_vector in nmf_output:
        sorted_topics = np.sort(doc_vector)
        # Lowered confidence threshold calculation
        strength = (sorted_topics[-1] - sorted_topics[-2]) / (sorted_topics[-1] + 1e-6)
        topic_strengths.append(strength)
    
    # Add topic information to processed_df
    processed_df['main_topic'] = doc_topics
    processed_df['topic_confidence'] = topic_strengths
    
    # Merge with original DataFrame to get metadata
    merged_df = processed_df.merge(
        original_df[['comment', 'qualityRating', 'difficultyRating', 'wouldTakeAgain']], 
        left_on='original_text',
        right_on='comment',
        how='left'
    )
    
    # Get representative reviews with adjusted confidence threshold
    topics_info = []
    for topic_idx, topic in enumerate(nmf_model.components_):
        top_words_idx = topic.argsort()[:-16:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        weights = [topic[i] for i in top_words_idx]
        
        # Lower confidence threshold to capture more reviews
        confident_reviews = merged_df[
            (merged_df['main_topic'] == topic_idx) & 
            (merged_df['topic_confidence'] > 0.3)  # Lowered from 0.6
        ]
        
        # Sort by both quality rating and confidence
        confident_reviews = confident_reviews.sort_values(
            by=['qualityRating', 'topic_confidence'], 
            ascending=[False, False]
        )
        
        topics_info.append({
            'topic_id': topic_idx,
            'words': top_words,
            'weights': weights,
            'doc_count': len(confident_reviews),
            'percentage': (len(confident_reviews)/len(processed_df)*100),
            'example_reviews': confident_reviews['original_text'].head(3).tolist()
        })
    
    return topics_info, merged_df

In [50]:
try:
    nmf_model, vectorizer, tfidf, nmf_output = create_focused_topic_model(reviews_processed)
    topics_info, reviews_with_topics = analyze_topics(
        nmf_model, 
        vectorizer, 
        nmf_output, 
        reviews_processed,
        reviews_df  # Pass in the original DataFrame with metadata
    )
    
    print("\nDetailed Topic Analysis Results:")
    for topic in topics_info:
        print(f"\nTopic {topic['topic_id']} ({topic['percentage']:.1f}% of reviews):")
        
        print("\nKey Phrases:")
        for word, weight in zip(topic['words'][:20], topic['weights'][:20]):
            print(f"- {word}: {weight:.4f}")
        
        print("\nRepresentative Reviews:")
        for review in topic['example_reviews']:
            print(f"- {review[:200]}...")  # Truncate long reviews
        print("-" * 80)
            
except Exception as e:
    print(f"Error during topic analysis: {e}")

Processing 45174 documents...

Detailed Topic Analysis Results:

Topic 0 (13.0% of reviews):

Key Phrases:
- lectures: 11.7078
- slides: 0.4143
- textbook: 0.3668
- assignments: 0.3111
- engaging: 0.2828
- attention: 0.2694
- attendance: 0.2609
- discussion: 0.2545
- practice: 0.1860
- review: 0.1753
- grading: 0.1745
- office hours: 0.1436
- entertaining: 0.1359
- understanding: 0.1195
- examples: 0.1132

Representative Reviews:
- hands down the BEST professor i've had here so far. every time i left his lectures i felt as if i had actually learned something and the way he teaches and explains things actually makes sense to me. ...
- Super funny guy, lectures are great. Gives LOTS of extra credit opportunities and allows cheat sheets on all exams. ...
- Extra credit opportunities with iClicker and reaction papers sometimes had extra credit. There are test make-ups, which the better score is always taken, so there's no harm. Very funny and gives good ...
--------------------------------

In [51]:
def analyze_topic_accuracy(reviews_with_topics, topic_id, sample_size=50):
    """
    Analyze a larger sample of reviews for a given topic
    """
    # Get reviews for this topic with high confidence
    topic_reviews = reviews_with_topics[
        (reviews_with_topics['main_topic'] == topic_id) & 
        (reviews_with_topics['topic_confidence'] > 0.2)
    ]
    
    # Sort by confidence and get sample
    sampled_reviews = topic_reviews.sort_values(
        by='topic_confidence', 
        ascending=False
    ).head(sample_size)
    
    # Print analysis
    print(f"\nTopic {topic_id} Analysis ({len(topic_reviews)} total reviews)")
    print("-" * 80)
    print(f"Sample of {sample_size} high-confidence reviews:\n")
    
    for idx, row in sampled_reviews.iterrows():
        conf = row['topic_confidence']
        quality = row['qualityRating']
        print(f"\nConfidence: {conf:.3f} | Quality Rating: {quality}")
        print(f"Review: {row['original_text'][:200]}...")
    
    return sampled_reviews

# Analyze each topic
for topic_id in range(5):
    sampled_data = analyze_topic_accuracy(reviews_with_topics, topic_id)


Topic 0 Analysis (5915 total reviews)
--------------------------------------------------------------------------------
Sample of 50 high-confidence reviews:


Confidence: 1.000 | Quality Rating: 4
Review: He is the greatest.  Extremely passionate, always interested to hear what students have to say, gets very into his lectures.  Genuine guy and great TA's - both he and the TA's are incredibly understan...

Confidence: 1.000 | Quality Rating: 2
Review: nash herself is very nice and loves her subject, but the class is horrible.  you have to read a chapter a week of the book, lectures put you to sleep, but you have to go to do well on the tests.  disc...

Confidence: 1.000 | Quality Rating: 5
Review: Great professor! I took Anatomy and Physiology 1 with her. Her lectures aren't mandatory but you should deff go. She is a great teacher and attending the lectures means way less time studying later. I...

Confidence: 1.000 | Quality Rating: 5
Review: She explains every thing down with such g