In [1]:
# !python -m pip install names

In [2]:
import pandas as pd
import numpy as np
import random

### Reference Tables

In [3]:
# Reference table of interests. Each row is
# (interest_name, interest_id, interest_group, interest_prob):
#   - interest_group clusters related interests under a shared group id,
#   - interest_prob is the sampling weight passed to np.random.choice when
#     drawing mentor interests (the weights below add up to 1).
interest_rows = [
    ('Volunteer work/Community involvement', 1, 1, 0.02),
    ('Sports, health & wellness', 2, 2, 0.04),
    ('Art, music & entertainment', 3, 3, 0.1),
    ('Gaming', 4, 3, 0.2),
    ('Traveling', 5, 2, 0.015),
    ('Competition', 6, 4, 0.1),
    ('Reading', 7, 5, 0.1),
    ('Investing', 8, 6, 0.2),
    ('Cooking & culinary', 9, 2, 0.015),
    ('Science & education', 10, 7, 0.02),
    ('Writing/blogging', 11, 5, 0.02),
    ('Business', 12, 6, 0.15),
    ('Technology advancements', 13, 7, 0.02),
]
interests_df = pd.DataFrame(
    interest_rows,
    columns=['interest_name', 'interest_id', 'interest_group', 'interest_prob'],
)

interests_df

Unnamed: 0,interest_name,interest_id,interest_group,interest_prob
0,Volunteer work/Community involvement,1,1,0.02
1,"Sports, health & wellness",2,2,0.04
2,"Art, music & entertainment",3,3,0.1
3,Gaming,4,3,0.2
4,Traveling,5,2,0.015
5,Competition,6,4,0.1
6,Reading,7,5,0.1
7,Investing,8,6,0.2
8,Cooking & culinary,9,2,0.015
9,Science & education,10,7,0.02


In [4]:
# Sizes of the synthetic reference data.
n_universities = 200
n_majors = 100
n_countries = 10
n_states = 10

# Seed both random number generators used by this notebook so that a
# Restart & Run All regenerates exactly the same dummy database.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Country weights decay exponentially with the country id, so low ids host
# most universities. The list does not depend on the loop variable, so it is
# built once here.
country_weights = [np.exp(n_countries - j) for j in range(n_countries)]

universities = []
for i in range(n_universities):
    # Weighted draw of the country, uniform draw of the state.
    country_id = random.choices(range(n_countries), weights=country_weights, k=1)[0]
    state_id = random.choice(range(n_states))
    universities.append([i, country_id, state_id])

universities_df = pd.DataFrame(universities, columns=['university_id', 'country_id', 'state_id'])

### Dummy Mentor and Mentee Database

In [5]:
import names

In [6]:
n_mentors = 500
n_mentees = 3000

In [7]:
def _unique_name(taken):
    """Return a random `first_last` name that is not in `taken`, and record it.

    `taken` is a set of names already handed out; it is updated in place.
    A set keeps the membership check constant-time as the database grows.
    """
    while True:
        candidate = names.get_full_name().lower().replace(' ', '_')
        if candidate not in taken:
            taken.add(candidate)
            return candidate


def _random_profile(interest_weights=None):
    """Draw the attributes shared by mentors and mentees.

    Parameters
    ----------
    interest_weights : array-like or None
        Per-interest sampling probabilities forwarded to `np.random.choice`
        as `p`. `None` samples interests uniformly.

    Returns
    -------
    tuple
        `(interests, university_id, major_id, country_id, state_id)`, where
        `interests` is a plain Python list of interest ids.
    """
    # floor(sqrt(k)) for k drawn from 1..15 yields 1, 2 or 3 interests.
    n_interest = int(np.floor(np.sqrt(np.random.randint(1, 16))))
    interests = np.random.choice(
        interests_df.interest_id, n_interest, replace=False, p=interest_weights
    ).tolist()
    # The ids double as sampling weights: higher ids are drawn more often and
    # id 0 (weight 0) is never drawn.
    university_id = random.choices(
        universities_df.university_id, universities_df.university_id
    )[0]
    major_id = random.choices(
        range(n_majors), range(n_majors)
    )[0]
    # A person's domicile is taken from the university they attend.
    country_id, state_id = universities_df.loc[university_id, ['country_id', 'state_id']]
    return interests, university_id, major_id, country_id, state_id


# Mentors: interests follow interests_df.interest_prob, and each mentor gets
# floor(sqrt(k)) slots for k drawn from 1..35, i.e. between 1 and 5 slots.
mentors = []
taken_mentor_names = set()
for i in range(n_mentors):
    name = _unique_name(taken_mentor_names)
    interests, university_id, major_id, country_id, state_id = _random_profile(
        interest_weights=interests_df.interest_prob
    )
    n_slot = int(np.floor(np.sqrt(np.random.randint(1, 36))))
    mentors.append([i, name, interests, university_id, major_id, country_id, state_id, n_slot])
mentors_name = [row[1] for row in mentors]

# Mentees: interests are drawn uniformly (no probability weights) and there is
# no slot count. Interests are stored as a list, the same as for mentors.
mentees = []
taken_mentee_names = set()
for i in range(n_mentees):
    name = _unique_name(taken_mentee_names)
    interests, university_id, major_id, country_id, state_id = _random_profile()
    mentees.append([i, name, interests, university_id, major_id, country_id, state_id])
mentees_name = [row[1] for row in mentees]


In [8]:
# Columns that mentors and mentees have in common, in record order.
shared_columns = ['interests', 'university_id', 'major_id', 'country_id', 'state_id']

# Turn the generated record lists into DataFrames; mentors additionally carry
# their number of available slots.
mentors_df = pd.DataFrame(
    mentors, columns=['mentor_id', 'mentor_name', *shared_columns, 'n_slot']
)
mentees_df = pd.DataFrame(
    mentees, columns=['mentee_id', 'mentee_name', *shared_columns]
)

### Compatibility Scoring

In [9]:
def score_academic(mentor_id, mentee_id):
    """Score the academic overlap between a mentor and a mentee.

    Both ids are used as `.loc` row labels into `mentors_df` / `mentees_df`.
    Sharing a university is worth 5 points and sharing a major is worth 15,
    so the result is one of 0, 5, 15 or 20.
    """
    points_per_field = {'university_id': 5, 'major_id': 15}
    fields = list(points_per_field)
    mentor = mentors_df.loc[mentor_id, fields]
    mentee = mentees_df.loc[mentee_id, fields]
    return sum(
        points
        for field, points in points_per_field.items()
        if mentor[field] == mentee[field]
    )
    

In [10]:
def score_domicile(mentor_id, mentee_id):
    """Score how close a mentor and a mentee live to each other.

    Both ids are used as `.loc` row labels into `mentors_df` / `mentees_df`.
    Returns 0 for different countries, 5 for the same country, and 20 when the
    state matches as well. The state is only compared inside a shared country.
    """
    location_fields = ['country_id', 'state_id']
    mentor_home = mentors_df.loc[mentor_id, location_fields]
    mentee_home = mentees_df.loc[mentee_id, location_fields]

    if not (mentor_home['country_id'] == mentee_home['country_id']):
        return 0
    if mentor_home['state_id'] == mentee_home['state_id']:
        return 20
    return 5

In [11]:
def score_interest(mentor_id, mentee_id):
    """Score how well a mentee's interests overlap with a mentor's.

    Both ids are used as `.loc` row labels into `mentors_df` / `mentees_df`,
    whose `interests` cells hold a collection of interest ids. For every
    mentee interest:

    * +0.5 if the mentor has exactly that interest,
    * +0.5 if the interest belongs to an interest group (per `interests_df`)
      that contains at least one of the mentor's interests.

    The subtotal is multiplied by `60 // len(mentor_interests)`.

    Returns 0 when the mentor has no interests at all, where the scaling
    factor would otherwise divide by zero.
    """
    mentor_interests = mentors_df.loc[mentor_id, 'interests']
    mentee_interests = mentees_df.loc[mentee_id, 'interests']

    if len(mentor_interests) == 0:
        return 0

    # interest_id -> interest_group lookup. The group check has to compare
    # group against group; comparing a raw interest id with group ids only
    # matches when the two numbers happen to coincide.
    group_of = dict(zip(interests_df['interest_id'], interests_df['interest_group']))
    mentor_interest_ids = set(mentor_interests)
    mentor_interest_groups = {
        group_of[interest] for interest in mentor_interest_ids if interest in group_of
    }

    score = 0
    for interest in mentee_interests:
        if interest in mentor_interest_ids:
            score += 0.5
        mentee_group = group_of.get(interest)
        if mentee_group is not None and mentee_group in mentor_interest_groups:
            score += 0.5
    score *= (60 // len(mentor_interests))
    return score

In [12]:
def score(mentor_id, mentee_id):
    """Total compatibility of a mentor/mentee pair.

    Adds up the academic, domicile and interest component scores, in that
    order, each evaluated for the same pair of ids.
    """
    components = (score_academic, score_domicile, score_interest)
    return sum(component(mentor_id, mentee_id) for component in components)

In [28]:
import itertools
# Enumerate every (mentor, mentee) combination and key the frame by that pair,
# so specific pairs can later be dropped by their (mentor_id, mentee_id) label.
pair_keys = ['mentor_id', 'mentee_id']
every_combination = itertools.product(mentors_df.mentor_id, mentees_df.mentee_id)
pairs = pd.DataFrame(every_combination, columns=pair_keys).set_index(pair_keys)

In [29]:
# Load the pairs that must not be proposed and key them the same way as `pairs`.
rejected_pairs = pd.read_csv('rejected_pairs.csv').set_index(['mentor_id', 'mentee_id'])

# Remove the rejected pairs, then turn mentor_id / mentee_id back into columns.
pairs = pairs.drop(rejected_pairs.index).reset_index()

In [13]:
from tqdm.auto import tqdm
# Enable the tqdm-backed `progress_apply` used below.
tqdm.pandas()

# Score every remaining pair, one row at a time.
# 200 it/s on google colab, 500 it/s on macbook pro m1
pairs['score'] = pairs.progress_apply(
    lambda row: score(row['mentor_id'], row['mentee_id']),
    axis=1,
)

100%|██████████| 1500000/1500000 [47:42<00:00, 524.09it/s] 


In [14]:
# Rank the pairs from most to least compatible and renumber the rows from 0.
pairs = pairs.sort_values(by='score', ascending=False, ignore_index=True)

### Matching

In [15]:
def match_rarest_interest_first(pairs, mentors_df, interests_df, verbose=True):
    """Greedily assign mentees to mentors, serving scarce mentor interests first.

    Repeats until no mentor is left:

    1. Count how many remaining mentors list each interest and pick the rarest
       one (ties go to the smallest interest id).
    2. Take the remaining mentors with that interest and walk their candidate
       pairs from highest to lowest score.
    3. Accept a pair when the mentor still has a free slot and the mentee has
       not been matched yet.
    4. Retire every mentor of that group, whether or not all slots were
       filled, which guarantees the loop terminates.

    A mentee is matched at most once. Without that rule each mentor would just
    receive its own top-scoring mentees and the rarest-interest ordering would
    not change the result.

    Parameters
    ----------
    pairs : DataFrame with columns `mentor_id`, `mentee_id`, `score`.
    mentors_df : DataFrame with columns `mentor_id`, `interests`, `n_slot`.
        It is not modified.
    interests_df : DataFrame with columns `interest_id`, `interest_name`.
    verbose : bool
        Print the name of each interest as it is processed.

    Returns
    -------
    DataFrame with columns `mentor_id`, `mentee_id`, `score`, one row per
    accepted match, in the order the matches were made.
    """
    mentor_info = mentors_df.set_index('mentor_id')
    # Remaining capacity per mentor; mentors without any slot never take part.
    slots_left = {
        mentor: int(slots)
        for mentor, slots in mentor_info['n_slot'].items()
        if slots > 0
    }
    interests_of = mentor_info['interests'].to_dict()
    interest_names = dict(zip(interests_df['interest_id'], interests_df['interest_name']))

    # Best-first candidate list; a stable sort keeps ties in their input order.
    ranked = pairs.sort_values('score', ascending=False, kind='stable')

    matched_mentees = set()
    matches = []
    while slots_left:
        # Distribution of interests among the mentors that are still available.
        counts = {}
        for mentor in slots_left:
            for interest in interests_of[mentor]:
                counts[interest] = counts.get(interest, 0) + 1
        if not counts:
            # The remaining mentors list no interests, so none can be selected.
            break

        rarest = min(counts, key=lambda interest: (counts[interest], interest))
        if verbose:
            print(interest_names.get(rarest, rarest))

        group = [mentor for mentor in slots_left if rarest in interests_of[mentor]]
        open_slots = sum(slots_left[mentor] for mentor in group)

        candidates = ranked.loc[
            ranked['mentor_id'].isin(group), ['mentor_id', 'mentee_id', 'score']
        ]
        for mentor_id, mentee_id, pair_score in candidates.itertuples(index=False, name=None):
            if open_slots == 0:
                break  # every mentor in this group is full
            if slots_left[mentor_id] == 0 or mentee_id in matched_mentees:
                continue
            matches.append((mentor_id, mentee_id, pair_score))
            matched_mentees.add(mentee_id)
            slots_left[mentor_id] -= 1  # subtract the slot just used
            open_slots -= 1

        for mentor in group:
            del slots_left[mentor]

    # Build the result once instead of appending to a DataFrame row by row.
    result = pd.DataFrame(matches, columns=['mentor_id', 'mentee_id', 'score'])
    return result.astype({'mentor_id': int, 'mentee_id': int})


final = match_rarest_interest_first(pairs, mentors_df, interests_df)  # final matching pairs

Cooking & culinary
Science & education
Traveling
Writing/blogging
Technology advancements
Volunteer work/Community involvement
Sports, health & wellness
Art, music & entertainment
Competition
Reading
Business
Gaming
Investing


In [16]:
# Save the matched pairs to final.csv, leaving out the DataFrame's row index.
final.to_csv('final.csv', index=False)