# Dating Algorithm Recommender System

In [1]:
# libraries
import pandas as pd
import random
import numpy as np

# Creating Data

## Profile Questions

In [2]:
# Creating a Dataset of men and women
men = pd.DataFrame()

women = pd.DataFrame()

# Number of users
num = 1000

# Dating profile questions for each
qs = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']

# Answers to profile questions
ans = ['A', 'B', 'C', 'D', 'E']

for q in qs:
    
    # Making them categorical for preprocessing later
    men[q] = pd.Categorical(random.choices(ans, k=num), categories=ans)
    
    women[q] = pd.Categorical(random.choices(ans, k=num), categories=ans)
    
    # IDs
    men['id'] = ["m"+str(i) for i in range(num)]
    
    women['id'] = ["w"+str(i) for i in range(num)]
    
# Setting index
men.set_index('id', inplace=True)

women.set_index('id', inplace=True)

# Showing the DFs
display(men)

display(women)

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
m0,C,B,D,E,E
m1,C,D,B,C,E
m2,B,C,B,B,D
m3,B,D,A,A,D
m4,D,B,C,D,A
...,...,...,...,...,...
m995,D,D,C,E,E
m996,B,D,E,A,E
m997,B,E,B,B,A
m998,B,B,A,D,B


Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
w0,B,E,D,E,E
w1,B,D,C,D,A
w2,B,A,A,D,E
w3,E,C,B,C,A
w4,A,E,B,D,E
...,...,...,...,...,...
w995,C,C,B,B,C
w996,D,B,B,D,E
w997,E,E,D,D,B
w998,D,D,E,B,A


## Compatibility Rating

In [3]:
# Creating match ratings between users
ratings = pd.DataFrame(index=men.index, columns=women.index)

for i in ratings.columns:
    ratings[i] = random.choices(range(0,6), k=num)
    
ratings

id,w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,...,w990,w991,w992,w993,w994,w995,w996,w997,w998,w999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m0,4,2,2,3,4,1,4,0,5,3,...,1,1,5,1,4,2,5,3,0,0
m1,2,2,4,2,1,5,1,0,5,5,...,2,4,3,2,5,0,4,4,5,3
m2,4,5,4,2,2,5,3,0,3,2,...,3,2,2,2,4,2,4,3,1,1
m3,0,2,4,3,0,0,3,3,4,4,...,2,3,4,5,2,5,1,5,0,0
m4,4,3,5,3,1,1,5,1,1,0,...,4,4,0,5,4,3,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m995,3,0,1,1,5,4,4,0,1,2,...,0,4,5,1,2,1,2,4,5,5
m996,4,1,4,3,0,1,4,5,4,1,...,2,0,1,2,4,0,4,1,4,3
m997,4,2,0,5,5,3,2,2,2,3,...,2,5,1,5,3,4,5,1,1,3
m998,2,0,4,5,2,0,5,0,1,4,...,3,1,2,0,0,0,0,2,3,4


# Finding Similar Users

### Finding Users who had the most 0 ratings
Meaning they haven't seen the other person yet, will make recommendations for them.

In [4]:
# Man
m_user = ratings.T.apply(pd.Series.value_counts).T.sort_values(by=0, ascending=False).iloc[0]

m_user

0    205
1    168
2    153
3    163
4    154
5    157
Name: m438, dtype: int64

In [5]:
# Woman
w_user = ratings.apply(pd.Series.value_counts).T.sort_values(by=0, ascending=False).iloc[0]

w_user

0    207
1    149
2    153
3    162
4    169
5    160
Name: w881, dtype: int64

## Users that haven't been rated/matched yet by the user above

In [6]:
# Man's unseen users (0 rating)
m_nrate = ratings.T[ratings.T[m_user.name]==0].index

m_nrate

Index(['w0', 'w7', 'w20', 'w22', 'w28', 'w31', 'w32', 'w33', 'w37', 'w50',
       ...
       'w944', 'w955', 'w967', 'w968', 'w969', 'w975', 'w980', 'w988', 'w994',
       'w998'],
      dtype='object', name='id', length=205)

In [7]:
# Woman's unseen users (0 rating)
w_nrate = ratings[ratings[w_user.name]==0].index

w_nrate

Index(['m0', 'm1', 'm2', 'm5', 'm31', 'm34', 'm43', 'm45', 'm52', 'm54',
       ...
       'm940', 'm944', 'm951', 'm952', 'm957', 'm963', 'm969', 'm982', 'm983',
       'm992'],
      dtype='object', name='id', length=207)

## Top 10 Similar Users

__Simple Data Preprocessing__

In [8]:
# First need to replace the answers with their numerical values
n_men = men.apply(lambda x: x.cat.codes)

n_women = women.apply(lambda x: x.cat.codes)

In [9]:
display(n_men)
display(n_women)

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
m0,2,1,3,4,4
m1,2,3,1,2,4
m2,1,2,1,1,3
m3,1,3,0,0,3
m4,3,1,2,3,0
...,...,...,...,...,...
m995,3,3,2,4,4
m996,1,3,4,0,4
m997,1,4,1,1,0
m998,1,1,0,3,1


Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
w0,1,4,3,4,4
w1,1,3,2,3,0
w2,1,0,0,3,4
w3,4,2,1,2,0
w4,0,4,1,3,4
...,...,...,...,...,...
w995,2,2,1,1,2
w996,3,1,1,3,4
w997,4,4,3,3,1
w998,3,3,4,1,0


In [10]:
# Similar men
m_sim = n_men.T.corrwith(n_men.T[m_user.name]).sort_values(ascending=False)[1:11]

m_sim

id
m965    1.000000
m695    1.000000
m438    1.000000
m393    0.979958
m278    0.975900
m861    0.963087
m545    0.952579
m281    0.952579
m680    0.952579
m737    0.952579
dtype: float64

In [11]:
# Simliar women
w_sim = n_women.T.corrwith(n_women.T[w_user.name]).sort_values(ascending=False)[1:11]

w_sim

id
w72     0.963087
w124    0.958373
w105    0.956522
w88     0.951734
w641    0.942168
w0      0.935585
w961    0.935585
w800    0.935585
w268    0.935585
w455    0.933564
dtype: float64

## The Similar Users' Ratings for the Unseen Users

In [12]:
# Similar mens' ratings
msim_rate = ratings.loc[list(m_sim.index)][m_nrate]

msim_rate

id,w0,w7,w20,w22,w28,w31,w32,w33,w37,w50,...,w944,w955,w967,w968,w969,w975,w980,w988,w994,w998
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m965,5,5,0,1,2,3,2,3,2,5,...,4,0,1,2,3,1,5,0,0,5
m695,1,2,2,1,4,5,3,1,5,5,...,4,3,3,2,0,2,3,5,0,3
m438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m393,4,5,0,0,2,3,3,3,2,3,...,2,3,3,1,4,0,1,3,2,3
m278,3,0,0,4,3,3,1,2,4,1,...,3,3,5,3,0,4,4,0,5,0
m861,0,0,5,4,3,0,1,4,4,1,...,2,1,1,2,1,5,5,4,5,5
m545,5,2,4,5,5,0,3,4,0,1,...,3,3,4,4,4,3,0,3,5,5
m281,5,3,5,2,4,0,3,0,0,2,...,5,1,5,3,1,5,0,4,3,2
m680,4,4,0,0,4,3,5,1,2,5,...,2,3,4,2,1,0,5,0,4,5
m737,0,3,3,2,2,5,1,5,0,2,...,4,0,1,1,5,1,4,0,5,3


In [13]:
# Similar womens' ratings
wsim_rate = ratings[list(w_sim.index)].T[w_nrate]

wsim_rate

id,m0,m1,m2,m5,m31,m34,m43,m45,m52,m54,...,m940,m944,m951,m952,m957,m963,m969,m982,m983,m992
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
w72,5,3,0,2,4,3,5,2,2,4,...,2,3,3,0,0,1,4,5,3,2
w124,3,1,1,4,5,0,2,0,3,0,...,5,1,4,4,2,5,4,5,2,0
w105,0,3,1,5,1,3,5,0,2,1,...,5,4,1,5,3,0,5,2,5,2
w88,0,5,1,5,2,5,3,5,4,4,...,3,0,1,5,4,1,3,3,0,0
w641,5,3,4,1,4,0,5,2,5,4,...,5,5,0,3,3,5,0,2,1,5
w0,4,2,4,3,4,0,3,5,0,3,...,0,3,0,3,2,3,4,4,2,5
w961,0,1,1,2,2,3,3,5,0,5,...,4,4,5,2,4,5,5,5,4,4
w800,1,4,3,0,3,0,0,5,4,1,...,1,3,2,0,0,1,0,1,5,0
w268,0,1,2,1,3,2,1,4,0,5,...,4,1,3,2,2,3,0,1,4,3
w455,2,2,2,1,1,2,1,3,2,4,...,2,4,0,0,0,3,1,2,1,4


# Comparing Methods of Prediction

In [14]:
# Man predictions
m_predict = pd.DataFrame()

# Average
m_predict['avg'] = msim_rate.mean().round()

# Frequency
m_predict['freq'] = msim_rate.mode().T[0]

# Median
m_predict['median'] = msim_rate.median().round()

m_predict

Unnamed: 0_level_0,avg,freq,median
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
w0,3.0,0.0,4.0
w7,2.0,0.0,2.0
w20,2.0,0.0,1.0
w22,2.0,0.0,2.0
w28,3.0,2.0,3.0
...,...,...,...
w975,2.0,0.0,2.0
w980,3.0,0.0,4.0
w988,2.0,0.0,2.0
w994,3.0,5.0,4.0


In [15]:
# Woman predictions
w_predict = pd.DataFrame()

# Average
w_predict['avg'] = wsim_rate.mean().round()

# Frequency
w_predict['freq'] = wsim_rate.mode().T[0]

# Median
w_predict['median'] = wsim_rate.median().round()

w_predict

Unnamed: 0_level_0,avg,freq,median
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
m0,2.0,0.0,2.0
m1,2.0,1.0,2.0
m2,2.0,1.0,2.0
m5,2.0,1.0,2.0
m31,3.0,4.0,3.0
...,...,...,...
m963,3.0,1.0,3.0
m969,3.0,0.0,4.0
m982,3.0,2.0,2.0
m983,3.0,1.0,2.0


# Handling a New User

## 1. Onboarding a New User
New user will answer questions to determine other profiles most similar to them.

In [16]:
# Dataframe of new user
m_new_user = pd.DataFrame(
    [random.choices(ans, k=5)],
    columns=men.columns,
    index=['m'+str(int(men.index[-1][1:])+1)] # New ID for the new user
)

# Categorizing the answers
m_new_user = m_new_user.apply(lambda x: pd.Categorical(x, categories=ans))

# The New User and their answers
m_new_user

Unnamed: 0,Q1,Q2,Q3,Q4,Q5
m1000,C,C,B,D,B


## 2. Finding Similar Users
These similar users' compatibility ratings will be used to find better recommendations for the new user.

In [17]:
# Categorizing the new user's answers to fit with the rest of the users
m_new_user = m_new_user.apply(lambda x: x.cat.codes, axis=1)

m_new_user

Unnamed: 0,Q1,Q2,Q3,Q4,Q5
m1000,2,2,1,3,1


In [19]:
# First need to replace the answers with their numerical values
n_men = men.apply(lambda x: x.cat.codes)

# Getting the correlation score and sorting by most correlated
new_sim = n_men.corrwith(m_new_user.iloc[0], axis=1).sort_values(ascending=False)[:10]

# The most similar users to the new user
new_sim

id
m714    0.979958
m214    0.975900
m838    0.975900
m899    0.968822
m257    0.968822
m601    0.964286
m7      0.964286
m643    0.962533
m511    0.945611
m409    0.944911
dtype: float64

## 3. Getting the Similar Users' Aggregated Ratings

In [20]:
# Getting the similar users' ratings
new_sim_rate = ratings.T[new_sim.index]

# Finding top recommended based on the median of the similar users' rating
new_sim_rate.median(axis=1).round().sort_values(ascending=False)[:10]

id
w949    5.0
w60     5.0
w225    4.0
w457    4.0
w349    4.0
w126    4.0
w828    4.0
w350    4.0
w452    4.0
w900    4.0
dtype: float64

__These are the top recommended users that may be the most compatible with the new user__

# Helper Functions
These functions will consolidate the process above.

In [31]:
def matchMan(men_df, women_df, ratings, new_man_answers, num_sim=10):
    """
    This function will return the most likely compatible women based on a few given
    dataframes for a new male user.  Will use the top N similar users' compatibility 
    ratings to find the potentially most compatible women.
    """
    
    # First need to replace the DF answers with their numerical values
    men_df = men_df.apply(lambda x: x.cat.codes)

    women_df = women_df.apply(lambda x: x.cat.codes)
    
    # Dataframe of new user
    new_man = pd.DataFrame(
        [new_man_answers],
        columns=men_df.columns,
        index=['m'+str(int(men_df.index[-1][1:])+1)] # Getting the new man's id 
    )
    
    # Categorical answers to the profile questions
    ans = ['A', 'B', 'C', 'D', 'E']
    
    # Categorizing the answers
    new_man = new_man.apply(
        lambda x: pd.Categorical(x, categories=ans)
    ).apply(
        lambda x: x.cat.codes, axis=1
    )
        
    # Getting the top N similar users
    sim_men = men_df.corrwith(
        new_man.iloc[0], 
        axis=1
    ).sort_values(ascending=False)[:num_sim].index
    
    # Getting the similar users' ratings
    sim_rate = ratings.T[sim_men]
    
    # The potentially most compatible women for the new man
    most_comp = sim_rate.median(axis=1).round().sort_values(ascending=False)
    
    return most_comp

In [32]:
# New man's answers
new_man_answers = random.choices(ans, k=5)

matchMan(
    men, 
    women, 
    ratings, 
    new_man_answers, 
    num_sim=10)

id
w609    4.0
w826    4.0
w215    4.0
w862    4.0
w217    4.0
       ... 
w880    0.0
w32     0.0
w214    0.0
w699    0.0
w356    0.0
Length: 1000, dtype: float64

In [33]:
def matchWoman(men_df, women_df, ratings, new_woman_answers, num_sim=10):
    """
    This function will return the most likely compatible men based on a few given
    dataframes for a new female user.  Will use the top N similar users' compatibility 
    ratings to find the potentially most compatible men.
    """
    # First need to replace the DF answers with their numerical values
    men_df = men_df.apply(lambda x: x.cat.codes)

    women_df = women_df.apply(lambda x: x.cat.codes)
    
    # Dataframe of new user
    new_woman = pd.DataFrame(
        [new_woman_answers],
        columns=women_df.columns,
        index=['m'+str(int(women_df.index[-1][1:])+1)] # Getting the new woman's id 
    )
    
    # Categorical answers to the profile questions
    ans = ['A', 'B', 'C', 'D', 'E']
    
    # Categorizing the answers
    new_woman = new_woman.apply(
        lambda x: pd.Categorical(x, categories=ans)
    ).apply(
        lambda x: x.cat.codes, axis=1
    )
        
    # Getting the top N similar users
    sim_women = women_df.corrwith(
        new_woman.iloc[0], 
        axis=1
    ).sort_values(ascending=False)[:num_sim].index
    
    # Getting the similar users' ratings
    sim_rate = ratings[sim_women]
    
    # The potentially most compatible men for the new woman
    most_comp = sim_rate.median(axis=1).round().sort_values(ascending=False)
    
    return most_comp

In [34]:
# New woman's answers
new_woman_answers = random.choices(ans, k=5)

matchWoman(
    men, 
    women, 
    ratings, 
    new_woman_answers, 
    num_sim=10)

id
m437    5.0
m280    4.0
m800    4.0
m295    4.0
m806    4.0
       ... 
m989    0.0
m153    0.0
m336    0.0
m74     0.0
m367    0.0
Length: 1000, dtype: float64

# Predicting Compatibility Rating 
Between Two Users (man and woman)

In [35]:
def predictComp(m, w, men_df=men, women_df=women, ratings=ratings, num_sim=10):
    """
    Compares two existing users with one another and returns
    their compatibility score or at least one existing user with
    a new set of answers to also return a compatibility score.  
    Defaults to the already established DFs.
    """
    
    try:
        answers = list(men_df.loc[m].values)
        
        comp_score = matchMan(
            men_df, 
            women_df, 
            ratings, 
            answers, 
            num_sim=10
        )[w]
        
        return comp_score
    
    except:
        answers = list(women_df.loc[w].values)
        
        comp_score = matchWoman(
            men_df, 
            women_df, 
            ratings, 
            answers, 
            num_sim=10
        )[m]
        
        return comp_score

    finally:
        if type(m)==list:
            
            comp_score = matchMan(
                men_df, 
                women_df, 
                ratings, 
                m, 
                num_sim=10
            )[w]
        
            return comp_score
        
        elif type(w)==list:
            
            comp_score = matchWoman(
                men_df, 
                women_df, 
                ratings, 
                answers, 
                num_sim=10
            )[m]

            return comp_score

In [36]:
# Example with two existing users
print("Two existing:", predictComp('m4', 'w13'))


# Answers to profile questions
ans = ['A', 'B', 'C', 'D', 'E']

# New user answers (man or woman)
new_user_ans = random.choices(ans, k=5)

# Example with an already existing male user
print("New female user:", predictComp('m20', new_user_ans))

# Example with an already existing female user
print("New male user:", predictComp(new_user_ans, 'w333'))

Two existing: 2.0
New female user: 2.0
New male user: 3.0


# Comparing Predictions with Actual Values

## 1. Find a Woman User
With the least amount of zeroes

In [28]:
# Finding the right user
user = ratings.apply(pd.Series.value_counts).T.sort_values(by=0, ascending=False).iloc[-1].name

# Getting their ratings and filling in 0s with Nans
user_ratings = ratings[user].replace(0, np.nan)

user_ratings

id
m0      5.0
m1      2.0
m2      4.0
m3      4.0
m4      NaN
       ... 
m995    4.0
m996    1.0
m997    NaN
m998    NaN
m999    2.0
Name: w724, Length: 1000, dtype: float64

# 2. Getting Predictions
For this user

In [43]:
# User's answers
user_ans = list(women.loc[user].values)

preds = matchWoman(
    men, 
    women, 
    ratings, 
    user_ans, 
    num_sim=10
)

preds

id
m0      4.0
m315    4.0
m259    4.0
m827    4.0
m824    4.0
       ... 
m342    0.0
m790    0.0
m143    0.0
m654    0.0
m44     0.0
Length: 1000, dtype: float64

# 3. Comparing Predictions with Actual Values

In [49]:
# Combining both
comb = pd.concat([user_ratings, preds], axis=1).rename(columns={"w724":"actual",
                                                                0:"preds"})

# Dropping Nans
comb.dropna(inplace=True)

comb

Unnamed: 0_level_0,actual,preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1
m0,5.0,4.0
m1,2.0,1.0
m2,4.0,3.0
m3,4.0,3.0
m5,4.0,4.0
...,...,...
m992,3.0,1.0
m993,4.0,2.0
m995,4.0,2.0
m996,1.0,3.0


In [52]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(comb['actual'], comb['preds']))

print(f"On average, predictions are off by {round(rmse,2)}")

On average, predictions are off by 1.49


__Since the underlying dataset is random, it is hard to tell if this evaluation is truly accurate.__