In [None]:
import pandas as pd

In [None]:
# Columns that the scoring functions search with substring checks (`in`, `.lower()`).
# Casting to str turns missing values into the text 'nan', so those checks never hit a float.
text_dtypes = {
    'Prefers Underrepresented': str,
    'Underrepresented Group': str,
}

mentees_df = pd.read_csv('mentees_cleaned.csv').astype(text_dtypes)

# Keep only mentors who opted in. Chaining `.astype` onto the boolean-mask selection
# yields a new frame, so no column is ever assigned onto a slice of the raw data
# (which is what triggers pandas' SettingWithCopyWarning).
mentors_df = pd.read_csv('mentors_cleaned.csv')
mentors_df = mentors_df[mentors_df["Opt-in"] == "Yes"].astype(text_dtypes)

# Duplicate each mentor row 'Mentee Count' times so every row can be matched
# to one mentee, then renumber the rows 0..n-1.
df_repeated = mentors_df.loc[mentors_df.index.repeat(mentors_df['Mentee Count'])].reset_index(drop=True)
mentors_df = df_repeated

# NOTE(review): this list is not referenced by any visible cell, and 'Underpresented'
# looks like a typo for 'Underrepresented' — confirm there are no other users before renaming.
feature_columns = [
    'Year',
    'Major',
    'Education Background',
    'Mentorship Activities',
    'Roles',
    'Underpresented'  # consists of multiple columns
]

In [None]:
def compare_year(mentor, mentee):
    """Penalise pairings where the mentor is not in a later year than the mentee.

    Args:
        mentor: mapping/row with a 'Year' entry.
        mentee: mapping/row with a 'Year' entry.

    Returns:
        1000 when the mentor's year rank is less than or equal to the mentee's,
        otherwise 0.

    Raises:
        KeyError: if either 'Year' value is not one of the known labels.
    """
    rank_by_label = {
        'Poly': 0,
        'Year 1 (Incoming)': 1,
        'Year 2 (Matriculated 2024)': 2,
        'Year 3 (Matriculated 2023)': 3,
        'Year 4 (Matriculated 2022)': 4,
        'Year 5/Graduated (Matriculated 2021)': 5,
        'Year 5 (Matriculated 2021) / Graduated': 5,
        'MAX': 5,
    }
    mentor_rank = rank_by_label[mentor['Year']]
    mentee_rank = rank_by_label[mentee['Year']]
    if mentor_rank <= mentee_rank:
        return 1000
    return 0


def compare_major(mentor, mentee):
    """Score how far apart the two 'Major' values are.

    Returns:
        0 for identical majors, 5 when they differ but one of them is
        "Computer Science", and 10 when they differ and neither is.
    """
    mentor_major, mentee_major = mentor['Major'], mentee['Major']
    if mentor_major == mentee_major:
        return 0
    either_is_cs = mentor_major == "Computer Science" or mentee_major == "Computer Science"
    return 5 if either_is_cs else 10


def compare_education_background(mentor, mentee):
    """Return 10 when the two 'Education Background' values differ, else 0."""
    backgrounds_differ = mentor['Education Background'] != mentee['Education Background']
    return 10 if backgrounds_differ else 0


def compare_mentorship_activities(mentor, mentee):
    """Add 5 points for each activity keyword found exactly once across both rows.

    The two 'Mentorship Activities' strings are concatenated (no separator) and
    each keyword is counted in the combined text; a count of exactly one means
    the keyword was found on only one side of the pairing.

    Returns:
        An int between 0 and 25 in steps of 5.
    """
    combined = mentor['Mentorship Activities'] + mentee['Mentorship Activities']
    keywords = (
        'career paths',
        'resume review',
        'mock interview',
        'new to application',
        'feedback',
    )
    return sum(5 for keyword in keywords if combined.count(keyword) == 1)


def compare_roles(mentor, mentee):
    """Score how many of the mentee's roles the mentor does not list.

    Both 'Roles' values are comma-plus-space separated strings.

    Returns:
        5 if the mentee's roles contain 'not sure yet'; otherwise 10 points per
        mentee role missing from the mentor's roles, capped at 30.
    """
    wanted_text = mentee['Roles']
    if 'not sure yet' in wanted_text:
        return 5
    offered = set(mentor['Roles'].split(', '))
    unmet = set(wanted_text.split(', ')) - offered
    return 10 * min(3, len(unmet))


def compare_underrepresented(mentor, mentee):
    """Score how well the pair matches on shared underrepresented groups.

    Starts from a penalty of 60. If either side's 'Prefers Underrepresented'
    text contains "prefer to speak", 30 is subtracted for every group that
    appears in BOTH 'Underrepresented Group' texts (case-insensitive).

    Args:
        mentor: mapping/row with str 'Prefers Underrepresented' and
            'Underrepresented Group' entries.
        mentee: mapping/row with the same two entries.

    Returns:
        An int penalty, never below 0 (smaller is a better match).
    """
    match = 60
    preference_stated = (
        "prefer to speak" in mentee['Prefers Underrepresented']
        or "prefer to speak" in mentor['Prefers Underrepresented']
    )
    if preference_stated:
        mentee_groups = mentee['Underrepresented Group'].lower()
        mentor_groups = mentor['Underrepresented Group'].lower()
        # Every keyword must be lowercase because it is searched in lowercased text.
        # The previous "LBGTQ+" literal was uppercase (so it could never match) and
        # transposed; both spellings are accepted in case the data carries the typo.
        group_keywords = (
            ("female",),
            ("first generation",),
            ("lgbtq", "lbgtq"),
        )
        for spellings in group_keywords:
            mentee_in_group = any(word in mentee_groups for word in spellings)
            mentor_in_group = any(word in mentor_groups for word in spellings)
            if mentee_in_group and mentor_in_group:
                match -= 30
    return max(0, match)

def similarity(mentor, mentee):
    """Total mismatch score for a mentor/mentee pair (smaller is a better match).

    The score is the sum of the year, major, education-background,
    mentorship-activity, role and underrepresented-group component scores.
    """
    components = (
        compare_year,
        compare_major,
        compare_education_background,
        compare_mentorship_activities,
        compare_roles,
        compare_underrepresented,
    )
    total = 0
    for component in components:
        total += component(mentor, mentee)
    return total
           

In [None]:
import pandas as pd
import numpy as np
from munkres import Munkres
from manytomany.constrained_kmedoids import KMedoids


def match_mentees_to_mentors(mentors: pd.DataFrame, 
                             mentees: pd.DataFrame, 
                             similarity_func: callable):
    '''Match each mentor row to at most one mentee using the Hungarian method.

    Every row of ``mentors`` is treated as its own single-member mentor group,
    keyed by row position. A cost matrix is built by calling ``similarity_func``
    on every (mentor, mentee) pair and solved with ``Munkres``, which minimises
    the total cost. When the two sides differ in size, the surplus mentors or
    mentees are left unmatched.

    Both mentor and mentee POV are returned for convenience.

    Args:
        mentors: pd.DataFrame, representing the mentors (one group per row)
        mentees: pd.DataFrame, representing the mentees
        similarity_func: callable, a function that takes two pd.Series
            (mentor row, mentee row) and returns a number. Smaller is more similar.

    Returns:
        pd.DataFrame, indexed like ``mentors`` with column ``assignment_0``
            holding the index label of the assigned mentee (NaN if none).
        pd.DataFrame, indexed like ``mentees`` with column ``Mentor 0``
            holding the index label of the assigned mentor (NaN if none).
    '''
    n_rounds = 1  # a single matching round: each mentor row gets at most one mentee

    # Built from the `mentors` argument (not the notebook-level frame) so the
    # function works for whatever frame the caller passes in.
    mentor_groups = {position: {position} for position in range(len(mentors))}
    group_ids = list(mentor_groups)

    # Generate similarity matrix: rows are mentor groups, columns are mentees.
    similarity_matrix = pd.DataFrame(index=group_ids, columns=mentees.index)
    for mentor_group_id, mentor_group in mentor_groups.items():
        # Groups hold exactly one mentor, so that member represents the group.
        representative = mentors.iloc[next(iter(mentor_group))]
        for mentee_id, mentee in mentees.iterrows():
            similarity_matrix.loc[mentor_group_id, mentee_id] = similarity_func(representative, mentee)

    assignments = pd.DataFrame(index=group_ids, columns=[f'assigned_{i}' for i in range(n_rounds)])

    # Match mentees to mentor groups
    mentees_pool = mentees.copy()
    for round_number in range(n_rounds):
        if similarity_matrix.empty:
            # Nothing (left) to match on one of the two sides.
            break
        round_column = f'assigned_{round_number}'

        # Munkres is given a plain list of lists: its padding of rectangular
        # matrices extends each row with list concatenation.
        cost_rows = similarity_matrix.to_numpy().astype(np.float32).tolist()
        for mentor_group_index, mentee_index in Munkres().compute(cost_rows):
            # Single .loc[row, col] write; chained indexing may write to a temporary.
            assignments.loc[group_ids[mentor_group_index], round_column] = mentees_pool.index[mentee_index]

        # Mentor groups left without a mentee hold NaN, which is not a droppable label.
        matched_mentees = assignments[round_column].dropna().tolist()
        similarity_matrix = similarity_matrix.drop(columns=matched_mentees)
        mentees_pool = mentees_pool.drop(index=matched_mentees)

    # Generate table of assignments from mentor POV for convenience
    assignments_by_mentor = pd.DataFrame(
        index=mentors.index,
        columns=[f'assignment_{i}' for i in range(n_rounds)])

    for mentor_group_id, assigned in assignments.iterrows():
        for mentor_position in mentor_groups[mentor_group_id]:
            # Copy by position: the 'assigned_*' labels of the source row differ
            # from the 'assignment_*' columns of this table.
            assignments_by_mentor.iloc[mentor_position] = assigned.to_numpy()

    # Generate table of assignments from mentee POV for convenience
    max_mentor_group_size = max((len(group) for group in mentor_groups.values()), default=0)
    assignments_by_mentee = pd.DataFrame(
        index=mentees.index,
        columns=[f'Mentor {i}' for i in range(max_mentor_group_size)])

    for _, assigned_column in assignments.items():
        for mentor_group_id, mentee_label in assigned_column.items():
            if pd.isna(mentee_label):
                # Unmatched mentor group; writing to a NaN label would append a bogus row.
                continue
            mentor_labels = [mentors.index[position] for position in mentor_groups[mentor_group_id]]

            # Pad with NaNs to fit with assignments_by_mentee dimensions
            mentor_labels += [np.nan] * (max_mentor_group_size - len(mentor_labels))
            assignments_by_mentee.loc[mentee_label] = mentor_labels

    return assignments_by_mentor, assignments_by_mentee

# Run the matching. Only the mentee-POV table is kept; the mentor-POV table is discarded.
_, assignments = match_mentees_to_mentors(mentors_df, mentees_df, similarity)
# Bare expression so the notebook renders the table as this cell's output.
assignments

In [None]:
# "Mentor 0" holds index labels of mentors_df; mapping through the 'Telegram' column
# replaces each label with that mentor's Telegram value (NaN where no mentor was assigned).
mentees_df["Assigned Mentor"] = assignments["Mentor 0"].map(mentors_df['Telegram'])
# Disabled debugging/alternative exports, kept for reference:
# assignments["Telegram"] = assignments["Mentor 0"].map(mentors_df['Telegram'])
# mentors_df.to_csv('debug.csv')
# assignments.to_csv('assignments.csv', index=False)
# Write the mentee table, now including the assigned mentor, without the index column.
mentees_df.to_csv('matched.csv', index=False)