In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# 1. Load Data
df = pd.read_csv('../data/processed/sentiment_data.csv')

# 2. Convert Ratings & Sentiment to Numeric
score_cols = ['rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5', 'rating_6']
for col in score_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df['is_pos'] = (df['bert_label'] == 'Positive').astype(int)
df['is_neg'] = (df['bert_label'] == 'Negative').astype(int)
df['is_neu'] = (df['bert_label'] == 'Neutral').astype(int)

# --- Advanced Keyword Detection ---
def check_keywords(text, keywords):
    if not isinstance(text, str): return 0
    return 1 if any(k in text for k in keywords) else 0

# Detect "Project"
df['has_project'] = df['clean_comment_text'].apply(lambda x: check_keywords(x, ['پروژه']))

# Detect "Homework"
df['has_homework'] = df['clean_comment_text'].apply(lambda x: check_keywords(x, ['تمرین', 'تکلیف', 'هومورک']))

# Detect "Attendance"
df['has_attendance'] = df['clean_comment_text'].apply(lambda x: check_keywords(x, ['حضور', 'غیبت', 'لیست']))
# ---------------------------------------

# 3. Create Professor Profiles (Now with ALL features)
prof_profile = df.groupby('professor_name').agg({
    'rating_1': 'mean',
    'rating_2': 'mean',
    'rating_3': 'mean',
    'rating_4': 'mean',
    'rating_5': 'mean',
    'rating_6': 'mean',
    'is_pos': 'mean',
    'is_neg': 'mean',
    'is_neu': 'mean',
    'has_project': 'mean',    # Existing
    'has_homework': 'mean',   # New
    'has_attendance': 'mean', # New
    'id': 'count'
}).rename(columns={'id': 'comment_count'})

# 4. Filter (Keep profs with 5+ comments)
prof_profile = prof_profile[prof_profile['comment_count'] >= 5]

print(f"Total Professors: {len(prof_profile)}")
print("Columns:", prof_profile.columns.tolist())
# You will now see 'has_homework' and 'has_attendance' in the list!

Total Professors: 293
Columns: ['rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5', 'rating_6', 'is_pos', 'is_neg', 'is_neu', 'has_project', 'has_homework', 'has_attendance', 'comment_count']


In [28]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# 1. Define Feature Vectors
# We select the columns that represent the professor's "Content":
# - Numerical Ratings (rating_1 to rating_6)
# - Sentiment percentages (is_pos, is_neg, is_neu)
# - Keyword/Structure features (has_project)
feature_cols = [
    'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5', 'rating_6',
    'is_pos', 'is_neg', 'is_neu',
    'has_project', 'has_homework', 'has_attendance',
]

# Ensure no missing values (fill with 0 for safety)
X = prof_profile[feature_cols].fillna(0)

# 2. Calculate Similarity Matrix
# Compute cosine similarity between all professors based on these vectors
similarity_matrix = cosine_similarity(X)

# Convert into a readable DataFrame for easy lookup
sim_df = pd.DataFrame(
    similarity_matrix,
    index=prof_profile.index,
    columns=prof_profile.index
)

# 3. Define the Recommendation Function
def get_content_based_recommendations(prof_name, top_n=5):
    """
    Implementation of Scenario 1.
    Input: A target professor's name.
    Output: Top N most similar professors based on their ratings, sentiment, and keyword profile.
    """
    # Validation: Check if the professor exists
    if prof_name not in sim_df.index:
        print(f"Error: Professor '{prof_name}' not found in the database.")
        return None

    # Get similarity scores for this professor
    # Sort descending so the highest similarity (closest to 1.0) comes first
    similar_scores = sim_df[prof_name].sort_values(ascending=False)

    # Drop the professor themselves from the list
    similar_scores = similar_scores.drop(prof_name)

    # Return the top N results
    return similar_scores.head(top_n)

print("Content-Based Filtering system built successfully.")

Content-Based Filtering system built successfully.


In [29]:
# 1. Standard Test
valid_prof = prof_profile.index[0]
print(f"TEST 1: Standard Recommendation for '{valid_prof}'")
res1 = get_content_based_recommendations(valid_prof)
display(res1)
print("-" * 50)

# 2. Error Handling
print(f"TEST 2: Invalid Name Handling")
get_content_based_recommendations("Dr. Ghost")
print("-" * 50)

# 3. Project-Based Matching
# Find a professor with projects (has_project > 0.1)
proj_profs = prof_profile[prof_profile['has_project'] > 0.1].index
if len(proj_profs) > 0:
    p_prof = proj_profs[0]
    print(f"TEST 3: Project Structure Similarity for '{p_prof}'")
    res3 = get_content_based_recommendations(p_prof)
    # Check if recommendations also have projects
    if res3 is not None:
        display(prof_profile.loc[res3.index][['rating_1', 'has_project']])
else:
    print("Skipping Test 3 (No project professors found)")
print("-" * 50)

# 4. Homework Matching (NEW!)
# Find a professor where homework is mentioned often (has_homework > 0.1)
hw_profs = prof_profile[prof_profile['has_homework'] > 0.1].index
if len(hw_profs) > 0:
    h_prof = hw_profs[0]
    print(f"TEST 4: Homework Structure Similarity for '{h_prof}'")
    print("(Expect recommendations to also have homework mentions)")
    res4 = get_content_based_recommendations(h_prof)
    if res4 is not None:
        display(prof_profile.loc[res4.index][['rating_1', 'has_homework']])
else:
    print("Skipping Test 4 (No heavy-homework professors found)")
print("-" * 50)

# 5. Attendance Matching (NEW!)
# Find a professor where attendance is mentioned often (has_attendance > 0.1)
att_profs = prof_profile[prof_profile['has_attendance'] > 0.1].index
if len(att_profs) > 0:
    a_prof = att_profs[0]
    print(f"TEST 5: Attendance Policy Similarity for '{a_prof}'")
    print("(Expect recommendations to also have attendance mentions)")
    res5 = get_content_based_recommendations(a_prof)
    if res5 is not None:
        display(prof_profile.loc[res5.index][['rating_1', 'has_attendance']])
else:
    print("Skipping Test 5 (No strict-attendance professors found)")
print("-" * 50)

print("Enhanced Testing Complete.")

TEST 1: Standard Recommendation for 'ابراهیم اردشیر لاریجانی'


professor_name
رضا انتظاری ملکی       0.998454
محمدباقر قائمی         0.998053
محمد سعیدی مهرابادی    0.997739
محمد باقر قائمی        0.997047
فاطمه دباغ کاشانی      0.996873
Name: ابراهیم اردشیر لاریجانی, dtype: float64

--------------------------------------------------
TEST 2: Invalid Name Handling
Error: Professor 'Dr. Ghost' not found in the database.
--------------------------------------------------
TEST 3: Project Structure Similarity for 'ابراهیم اردشیر لاریجانی'


Unnamed: 0_level_0,rating_1,has_project
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1
رضا انتظاری ملکی,8.166667,0.416667
محمدباقر قائمی,7.7,0.0
محمد سعیدی مهرابادی,7.4,0.0
محمد باقر قائمی,7.44,0.038462
فاطمه دباغ کاشانی,4.8,0.0


--------------------------------------------------
TEST 4: Homework Structure Similarity for 'ابراهیم اردشیر لاریجانی'
(Expect recommendations to also have homework mentions)


Unnamed: 0_level_0,rating_1,has_homework
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1
رضا انتظاری ملکی,8.166667,0.583333
محمدباقر قائمی,7.7,0.2
محمد سعیدی مهرابادی,7.4,0.0
محمد باقر قائمی,7.44,0.269231
فاطمه دباغ کاشانی,4.8,0.0


--------------------------------------------------
TEST 5: Attendance Policy Similarity for 'ابوالفضل اسکندری'
(Expect recommendations to also have attendance mentions)


Unnamed: 0_level_0,rating_1,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1
حجت قاسمی,7.857143,0.125
رضا ترکاشون,5.4,0.0
ایت قره قانی,8.263158,0.083333
جواد غفاری,7.6875,0.424242
احسان دهقانی برسیانی,8.0,0.0


--------------------------------------------------
Enhanced Testing Complete.


In [31]:
def filter_professors_by_rules(
    min_score=None, 
    min_fairness=None, 
    project_based=None,    
    heavy_homework=None,   
    strict_attendance=None,
    top_n=5
):
    """
    Scenario 2 Implementation:
    Filters the professor list based on strict rules (Hard Filters).
    If no filters are applied, returns the top rated professors overall.
    """
    # Start with a copy of the full data
    results = prof_profile.copy()
    
    # --- Filter 1: High Score (rating_1) ---
    if min_score is not None:
        results = results[results['rating_1'] >= min_score]
        
    # --- Filter 2: Fair Grading (rating_3) ---
    if min_fairness is not None:
        results = results[results['rating_3'] >= min_fairness]

    # --- Filter 3: Project-Based ---
    if project_based is True:
        # User explicitly wants projects
        results = results[results['has_project'] >= 0.10]
    elif project_based is False:
        # User explicitly wants NO projects (Theory based)
        results = results[results['has_project'] < 0.10]
    
    # --- Filter 4: Homework ---
    if heavy_homework is True:
        results = results[results['has_homework'] >= 0.20]
    elif heavy_homework is False:
        results = results[results['has_homework'] < 0.20]

    # --- Filter 5: Attendance ---
    if strict_attendance is True:
        results = results[results['has_attendance'] >= 0.15]
    elif strict_attendance is False:
        results = results[results['has_attendance'] < 0.15]

    # Sort by Quality (rating_1) and return the top N
    results = results.sort_values(by='rating_1', ascending=False).head(top_n)
    
    # Return relevant columns to verify
    cols_to_show = ['rating_1', 'rating_3', 'has_project', 'has_homework', 'has_attendance']
    # Filter columns that exist in the dataframe to prevent errors
    cols_to_show = [c for c in cols_to_show if c in results.columns]
        
    return results[cols_to_show]

print("Rule-Based Filtering Complete.")

Rule-Based Filtering Complete.


In [32]:
# 1. Test: The "Ideal" Professor
print("TEST 1: The 'Ideal' Professor (Score>8, Fair>8, Projects)")
res1 = filter_professors_by_rules(min_score=8.0, min_fairness=8.0, project_based=True)
display(res1)
print("-" * 50)

# 2. Test: The "Lazy" Student
print("TEST 2: The 'Chill' Semester (No Homework, No Attendance Checks)")
# heavy_homework=False -> Must be light homework
# strict_attendance=False -> Must be chill attendance
res2 = filter_professors_by_rules(heavy_homework=False, strict_attendance=False, min_score=7.0)
display(res2)
print("-" * 50)

# 3. Test: The "Hardcore" Student
print("TEST 3: The 'Hardcore' Student (Projects + Heavy Homework)")
res3 = filter_professors_by_rules(project_based=True, heavy_homework=True)
display(res3)
print("-" * 50)

# 4. Test: The "Browsing" Student (No Filters)
print("TEST 4: No Filters (Just show me the Top 5 Professors overall)")
# All arguments are None by default, so it just sorts by rating
res4 = filter_professors_by_rules()
display(res4)
print("-" * 50)

# 5. Test: The "Pure Theory" Student
print("TEST 5: Pure Theory (Explicitly Project_Based = False)")
# This confirms our fix works: It should ONLY show professors with low project scores
res5 = filter_professors_by_rules(project_based=False, min_score=8.0)
display(res5)
print("-" * 50)

print("Rule-Based Tests Complete."

TEST 1: The 'Ideal' Professor (Score>8, Fair>8, Projects)


Unnamed: 0_level_0,rating_1,rating_3,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
سهیل گنجه فر,10.0,10.0,0.8,0.4,0.2
روح الدین میری,9.75,10.0,0.1,0.2,0.1
ذبیح الله ذبیحی لهرمی,9.7,9.4,0.1,0.0,0.1
محمد صدیقی,9.636364,9.818182,0.428571,0.142857,0.0
محمود مهرداد شکریه,9.625,9.25,0.222222,0.666667,0.111111


--------------------------------------------------
TEST 2: The 'Chill' Semester (No Homework, No Attendance Checks)


Unnamed: 0_level_0,rating_1,rating_3,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
محمدرضا جعفر فرد,10.0,10.0,0.0,0.0,0.0
مریم زارعشاهی,10.0,10.0,0.0,0.0,0.0
نرگس طاهری,9.727273,9.636364,0.0,0.178571,0.142857
ذبیح الله ذبیحی لهرمی,9.7,9.4,0.1,0.0,0.1
محمد صدیقی,9.636364,9.818182,0.428571,0.142857,0.0


--------------------------------------------------
TEST 3: The 'Hardcore' Student (Projects + Heavy Homework)


Unnamed: 0_level_0,rating_1,rating_3,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
سهیل گنجه فر,10.0,10.0,0.8,0.4,0.2
روح الدین میری,9.75,10.0,0.1,0.2,0.1
محمود مهرداد شکریه,9.625,9.25,0.222222,0.666667,0.111111
بیژن محمدی,9.5,9.5,0.2,0.2,0.0
اقای دکتز احمد رهبر,9.4,7.4,0.4,0.4,0.4


--------------------------------------------------
TEST 4: No Filters (Just show me the Top 5 Professors overall)


Unnamed: 0_level_0,rating_1,rating_3,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
مریم عبدی,10.0,8.857143,0.0,0.142857,0.428571
مریم زارعشاهی,10.0,10.0,0.0,0.0,0.0
محمدرضا جعفر فرد,10.0,10.0,0.0,0.0,0.0
مجید ایلچی قزاان,10.0,10.0,0.0,0.2,0.2
سهیل گنجه فر,10.0,10.0,0.8,0.4,0.2


--------------------------------------------------
TEST 5: Pure Theory (Explicitly Project_Based = False)


Unnamed: 0_level_0,rating_1,rating_3,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
مجید ایلچی قزاان,10.0,10.0,0.0,0.2,0.2
مریم عبدی,10.0,8.857143,0.0,0.142857,0.428571
محمدرضا جعفر فرد,10.0,10.0,0.0,0.0,0.0
مریم زارعشاهی,10.0,10.0,0.0,0.0,0.0
فائزه میرشفیعی,10.0,9.6,0.0,0.4,0.0


--------------------------------------------------
Rule-Based Tests Complete.


In [36]:
def get_hybrid_recommendation(
    target_prof=None,      # <--- Now OPTIONAL
    min_score=None, 
    project_based=None,    # Tri-State
    heavy_homework=None,   # Tri-State
    strict_attendance=None,# Tri-State
    top_n=5
):
    """
    Unified Recommender:
    - If 'target_prof' is given: Finds similar professors who meet the criteria.
    - If 'target_prof' is None: Finds top-rated professors who meet the criteria.
    """
    
    # --- STEP 1: Determine the Candidate Pool ---
    if target_prof:
        # PATH A: Similarity Search
        if target_prof not in sim_df.index:
            print(f"⚠️ Error: Professor '{target_prof}' not found.")
            return None
        # Get top 50 similar professors (sorted by similarity)
        similar_candidates_index = sim_df[target_prof].sort_values(ascending=False).drop(target_prof).head(50).index
        candidates = prof_profile.loc[similar_candidates_index].copy()
    else:
        # PATH B: Global Search (No Name)
        # Start with EVERYONE
        candidates = prof_profile.copy()
    
    # --- STEP 2: Rule-Based Filtering ---
    
    # 1. Score Filter
    if min_score is not None:
        candidates = candidates[candidates['rating_1'] >= min_score]
        
    # 2. Project Filter (Tri-State)
    if project_based is True:
        candidates = candidates[candidates['has_project'] >= 0.10]
    elif project_based is False:
        candidates = candidates[candidates['has_project'] < 0.10]

    # 3. Homework Filter (Tri-State)
    if heavy_homework is True:
        candidates = candidates[candidates['has_homework'] >= 0.20]
    elif heavy_homework is False:
        candidates = candidates[candidates['has_homework'] < 0.20]

    # 4. Attendance Filter (Tri-State)
    if strict_attendance is True:
        candidates = candidates[candidates['has_attendance'] >= 0.15]
    elif strict_attendance is False:
        candidates = candidates[candidates['has_attendance'] < 0.15]

    # --- STEP 3: Sorting & Return ---
    
    if target_prof is None:
        # If we didn't search by similarity, we must sort by Quality (Rating)
        candidates = candidates.sort_values(by='rating_1', ascending=False)
    
    # (If target_prof WAS provided, 'candidates' is already sorted by similarity from Step 1)

    cols_to_show = ['rating_1', 'has_project', 'has_homework', 'has_attendance']
    # Safety check
    cols_to_show = [c for c in cols_to_show if c in candidates.columns]
    
    return candidates[cols_to_show].head(top_n)

print("Hybrid System Finalized.")

Hybrid System Finalized.


In [37]:
target = 'اسماء سلیمانی'

# --- PART 1: WITH NAME (Similarity + Filter) ---
print("--- PART 1: WITH NAME (Similarity Mode) ---")

# Test 1.1: "Like Asma, but STRICTLY Theory (No Projects)"
print(f"TEST 1.1: Like '{target}' + Project_Based=False")
res1 = get_hybrid_recommendation(target_prof=target, project_based=False)
display(res1)
print("-" * 30)

# Test 1.2: "Like Asma, but STRICTLY Heavy Homework"
print(f"TEST 1.2: Like '{target}' + Heavy_Homework=True")
res2 = get_hybrid_recommendation(target_prof=target, heavy_homework=True)
display(res2)
print("-" * 30)

# Test 1.3: "Like Asma, but CHILL Attendance"
print(f"TEST 1.3: Like '{target}' + Strict_Attendance=False")
res3 = get_hybrid_recommendation(target_prof=target, strict_attendance=False)
display(res3)
print("-" * 50)


# --- PART 2: NO NAME (Discovery Mode) ---
print("--- PART 2: NO NAME (Discovery Mode) ---")

# Test 2.1: "Just find me Project Professors (Score > 9)"
print(f"TEST 2.1: Any Prof + Project_Based=True + Score > 9")
res4 = get_hybrid_recommendation(target_prof=None, min_score=9.0, project_based=True)
display(res4)
print("-" * 30)

# Test 2.2: "Just find me Light Homework Professors"
print(f"TEST 2.2: Any Prof + Heavy_Homework=False")
res5 = get_hybrid_recommendation(target_prof=None, heavy_homework=False)
display(res5)
print("-" * 30)

# Test 2.3: "The 'Strict' Search (Strict Attendance + Heavy Homework)"
print(f"TEST 2.3: Any Prof + Strict_Attendance=True + Heavy_Homework=True")
res6 = get_hybrid_recommendation(target_prof=None, strict_attendance=True, heavy_homework=True)
display(res6)
print("-" * 50)

print("Comprehensive Tests Complete.")

--- PART 1: WITH NAME (Similarity Mode) ---
TEST 1.1: Like 'اسماء سلیمانی' + Project_Based=False


Unnamed: 0_level_0,rating_1,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
هنگامه تمیمی,9.4,0.0,0.1875,0.1875
محمود جواد جنتی,9.8,0.0,0.2,0.0
مجید ایلچی قزاان,10.0,0.0,0.2,0.2
محسن کلانتر,9.5,0.0,0.666667,0.166667
نرگس هوشمند,9.1,0.0,0.3,0.3


------------------------------
TEST 1.2: Like 'اسماء سلیمانی' + Heavy_Homework=True


Unnamed: 0_level_0,rating_1,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
محمود جواد جنتی,9.8,0.0,0.2,0.0
مجید ایلچی قزاان,10.0,0.0,0.2,0.2
محسن کلانتر,9.5,0.0,0.666667,0.166667
نرگس هوشمند,9.1,0.0,0.3,0.3
روح الدین میری,9.75,0.1,0.2,0.1


------------------------------
TEST 1.3: Like 'اسماء سلیمانی' + Strict_Attendance=False


Unnamed: 0_level_0,rating_1,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
محمود جواد جنتی,9.8,0.0,0.2,0.0
سعید رستگاری,9.142857,0.285714,0.0,0.0
روح الدین میری,9.75,0.1,0.2,0.1
حمید سیفی,9.4375,0.0625,0.0625,0.0625
امین اوحدی اصفهانی,9.6,0.0,0.0,0.0


--------------------------------------------------
--- PART 2: NO NAME (Discovery Mode) ---
TEST 2.1: Any Prof + Project_Based=True + Score > 9


Unnamed: 0_level_0,rating_1,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
سهیل گنجه فر,10.0,0.8,0.4,0.2
روح الدین میری,9.75,0.1,0.2,0.1
ذبیح الله ذبیحی لهرمی,9.7,0.1,0.0,0.1
محمد صدیقی,9.636364,0.428571,0.142857,0.0
محمود مهرداد شکریه,9.625,0.222222,0.666667,0.111111


------------------------------
TEST 2.2: Any Prof + Heavy_Homework=False


Unnamed: 0_level_0,rating_1,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
مریم زارعشاهی,10.0,0.0,0.0,0.0
مریم عبدی,10.0,0.0,0.142857,0.428571
محمدرضا جعفر فرد,10.0,0.0,0.0,0.0
مجید جلیلی,9.928571,0.0,0.0,0.357143
محمدصادق عسگری,9.9,0.0,0.0,0.2


------------------------------
TEST 2.3: Any Prof + Strict_Attendance=True + Heavy_Homework=True


Unnamed: 0_level_0,rating_1,has_project,has_homework,has_attendance
professor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
سهیل گنجه فر,10.0,0.8,0.4,0.2
مجید ایلچی قزاان,10.0,0.0,0.2,0.2
مجید نوروزی - امید محمدیان,9.7,0.0,0.2,0.4
مجیدرضا ایت اللهی,9.6,0.0,1.0,0.6
محسن کلانتر,9.5,0.0,0.666667,0.166667


--------------------------------------------------
Comprehensive Tests Complete.
