# Dating Algorithm Recommender System

In [1]:
# libraries
import pandas as pd
import random
import numpy as np

# Creating Data

## Profile Questions

In [47]:
# Creating a Dataset of men and women
men = pd.DataFrame()

women = pd.DataFrame()

# Number of users
num = 1000

# Dating profile questions for each
qs = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']

# Answers to profile questions
ans = ['A', 'B', 'C', 'D', 'E']

for q in qs:
    
    # Making them categorical for preprocessing later
    men[q] = pd.Categorical(random.choices(ans, k=num), categories=ans)
    
    women[q] = pd.Categorical(random.choices(ans, k=num), categories=ans)
    
    # IDs
    men['id'] = ["m"+str(i) for i in range(num)]
    
    women['id'] = ["w"+str(i) for i in range(num)]
    
# Setting index
men.set_index('id', inplace=True)

women.set_index('id', inplace=True)

# Showing the DFs
display(men)

display(women)

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
m0,E,A,A,E,E
m1,B,E,A,B,A
m2,A,C,C,E,B
m3,E,B,A,D,E
m4,A,E,C,D,C
...,...,...,...,...,...
m995,D,D,A,C,C
m996,B,E,B,A,E
m997,D,A,C,A,A
m998,B,E,B,B,C


Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
w0,A,A,A,C,E
w1,B,D,D,A,B
w2,C,C,A,D,E
w3,E,B,D,B,B
w4,D,E,E,B,A
...,...,...,...,...,...
w995,C,B,C,D,E
w996,D,C,A,A,E
w997,B,B,A,C,A
w998,B,C,B,E,B


## Compatibility Rating

In [6]:
# Creating match ratings between users
ratings = pd.DataFrame(index=men.index, columns=women.index)

for i in ratings.columns:
    ratings[i] = random.choices(range(0,6), k=num)
    
ratings

id,w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,...,w990,w991,w992,w993,w994,w995,w996,w997,w998,w999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m0,5,1,0,1,4,5,5,1,4,3,...,0,4,2,3,2,3,3,2,3,3
m1,0,3,0,0,4,0,3,1,5,4,...,3,4,3,3,0,0,0,0,3,4
m2,0,0,2,4,1,5,1,0,1,4,...,0,5,3,1,4,0,3,2,4,4
m3,0,3,3,5,3,2,3,4,1,2,...,0,1,0,0,1,4,5,4,0,3
m4,5,2,0,1,5,2,0,1,5,0,...,5,5,2,3,5,2,5,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m995,1,1,5,4,1,2,0,1,0,4,...,5,1,2,0,4,3,3,2,5,4
m996,4,1,2,0,1,4,5,5,4,1,...,5,3,0,4,3,1,3,3,1,2
m997,2,0,1,3,0,4,1,3,5,5,...,3,3,3,2,5,3,4,1,4,3
m998,5,5,4,2,3,4,4,4,2,5,...,4,3,0,2,5,1,1,3,2,5


# Finding Similar Users

### Finding Users who had the most 0 ratings
Meaning they haven't seen the other person yet, will make recommendations for them.

In [7]:
# Man
m_user = ratings.T.apply(pd.Series.value_counts).T.sort_values(by=0, ascending=False).iloc[0]

m_user

0    206
1    155
2    167
3    162
4    143
5    167
Name: m63, dtype: int64

In [8]:
# Woman
w_user = ratings.apply(pd.Series.value_counts).T.sort_values(by=0, ascending=False).iloc[0]

w_user

0    197
1    149
2    167
3    166
4    162
5    159
Name: w459, dtype: int64

## Users that haven't been rated/matched yet by the user above

In [9]:
# Man's unseen users (0 rating)
m_nrate = ratings.T[ratings.T[m_user.name]==0].index

m_nrate

Index(['w13', 'w15', 'w20', 'w23', 'w32', 'w38', 'w40', 'w44', 'w47', 'w57',
       ...
       'w959', 'w965', 'w968', 'w979', 'w986', 'w987', 'w991', 'w993', 'w994',
       'w996'],
      dtype='object', name='id', length=206)

In [10]:
# Woman's unseen users (0 rating)
w_nrate = ratings[ratings[w_user.name]==0].index

w_nrate

Index(['m0', 'm1', 'm4', 'm5', 'm10', 'm12', 'm13', 'm21', 'm24', 'm26',
       ...
       'm964', 'm965', 'm970', 'm975', 'm979', 'm981', 'm982', 'm983', 'm988',
       'm999'],
      dtype='object', name='id', length=197)

## Top 10 Similar Users

__Simple Data Preprocessing__

In [58]:
# First need to replace the answers with their numerical values
men = men.apply(lambda x: x.cat.codes)

women = women.apply(lambda x: x.cat.codes)

In [61]:
display(men)
display(women)

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
m0,4,0,0,4,4
m1,1,4,0,1,0
m2,0,2,2,4,1
m3,4,1,0,3,4
m4,0,4,2,3,2
...,...,...,...,...,...
m995,3,3,0,2,2
m996,1,4,1,0,4
m997,3,0,2,0,0
m998,1,4,1,1,2


Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
w0,0,0,0,2,4
w1,1,3,3,0,1
w2,2,2,0,3,4
w3,4,1,3,1,1
w4,3,4,4,1,0
...,...,...,...,...,...
w995,2,1,2,3,4
w996,3,2,0,0,4
w997,1,1,0,2,0
w998,1,2,1,4,1


In [59]:
# Similar men
m_sim = men.T.corrwith(men.T[m_user.name]).sort_values(ascending=False)[1:11]

m_sim

id
m63     1.000000
m811    1.000000
m397    1.000000
m994    0.946100
m358    0.943456
m862    0.943456
m496    0.943456
m984    0.943456
m619    0.943456
m794    0.941742
dtype: float64

In [60]:
# Simliar women
w_sim = women.T.corrwith(women.T[w_user.name]).sort_values(ascending=False)[1:11]

w_sim

id
w91     1.000000
w893    0.979958
w158    0.975260
w850    0.975260
w628    0.958373
w626    0.952579
w250    0.952579
w54     0.947895
w529    0.943701
w920    0.943242
dtype: float64

## The Similar Users' Ratings for the Unseen Users

In [64]:
# Similar mens' ratings
msim_rate = ratings.loc[list(m_sim.index)][m_nrate]

msim_rate

id,w13,w15,w20,w23,w32,w38,w40,w44,w47,w57,...,w959,w965,w968,w979,w986,w987,w991,w993,w994,w996
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m63,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m811,0,3,4,1,1,4,3,2,1,0,...,5,2,5,3,5,3,1,1,1,1
m397,5,1,4,3,1,0,4,2,3,5,...,0,1,3,5,3,5,3,5,3,4
m994,5,3,3,4,1,5,1,5,4,0,...,3,1,2,4,5,4,2,5,3,2
m358,1,0,3,0,3,4,2,1,2,2,...,4,5,4,0,3,4,5,4,3,0
m862,0,1,5,2,4,1,1,4,1,2,...,4,3,0,1,1,1,2,0,4,5
m496,4,5,5,1,1,4,0,5,5,0,...,5,4,4,5,1,4,0,1,1,4
m984,5,2,2,0,4,5,0,4,0,4,...,1,3,0,4,1,4,2,1,2,4
m619,0,3,2,2,2,2,4,4,4,1,...,4,5,3,2,1,1,1,2,3,5
m794,5,2,2,4,5,3,4,3,1,1,...,1,0,0,5,2,5,4,2,3,1


In [65]:
# Similar womens' ratings
wsim_rate = ratings[list(w_sim.index)].T[w_nrate]

wsim_rate

id,m0,m1,m4,m5,m10,m12,m13,m21,m24,m26,...,m964,m965,m970,m975,m979,m981,m982,m983,m988,m999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
w91,3,2,0,3,3,1,1,5,5,4,...,1,2,5,3,1,3,0,5,0,0
w893,0,1,2,3,3,1,0,0,5,0,...,2,1,3,4,3,2,0,1,2,3
w158,5,5,4,2,2,4,2,1,5,3,...,5,5,0,5,0,4,2,2,5,0
w850,5,4,5,2,3,1,2,1,2,0,...,5,1,1,3,4,0,5,2,2,4
w628,0,4,0,0,1,4,2,3,3,4,...,2,1,2,5,4,2,0,4,0,5
w626,0,4,2,2,1,5,4,5,5,3,...,2,5,3,5,2,1,4,3,2,5
w250,0,2,1,3,3,3,5,2,0,5,...,4,4,4,1,3,2,3,5,2,4
w54,3,1,3,1,4,0,0,4,2,1,...,1,4,0,4,3,4,4,5,4,3
w529,2,1,0,1,2,2,1,0,1,0,...,3,0,5,0,2,0,3,2,4,4
w920,0,2,1,2,5,3,0,5,5,1,...,4,4,5,2,3,2,2,2,2,0


# Comparing Methods of Prediction

In [66]:
# Man predictions
m_predict = pd.DataFrame()

# Average
m_predict['avg'] = msim_rate.mean().round()

# Frequency
m_predict['freq'] = msim_rate.mode().T[0]

# Median
m_predict['median'] = msim_rate.median().round()

m_predict

Unnamed: 0_level_0,avg,freq,median
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
w13,2.0,0.0,2.0
w15,2.0,3.0,2.0
w20,3.0,2.0,3.0
w23,2.0,0.0,2.0
w32,2.0,1.0,2.0
...,...,...,...
w987,3.0,4.0,4.0
w991,2.0,2.0,2.0
w993,2.0,1.0,2.0
w994,2.0,3.0,3.0


In [67]:
# Woman predictions
w_predict = pd.DataFrame()

# Average
w_predict['avg'] = wsim_rate.mean().round()

# Frequency
w_predict['freq'] = wsim_rate.mode().T[0]

# Median
w_predict['median'] = wsim_rate.median().round()

w_predict

Unnamed: 0_level_0,avg,freq,median
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
m0,2.0,0.0,1.0
m1,3.0,1.0,2.0
m4,2.0,0.0,2.0
m5,2.0,2.0,2.0
m10,3.0,3.0,3.0
...,...,...,...
m981,2.0,2.0,2.0
m982,2.0,0.0,2.0
m983,3.0,2.0,2.0
m988,2.0,2.0,2.0


# Handling a New User

In [84]:
# Dataframe of new user
m_new_user = pd.DataFrame(
    [random.choices(ans, k=5)],
    columns=men.columns,
    index=['m'+str(int(men.index[-1][1:])+1)]
)

# Categorizing the answers
m_new_user = m_new_user.apply(lambda x: pd.Categorical(x, categories=ans))

m_new_user

Unnamed: 0,Q1,Q2,Q3,Q4,Q5
m1000,A,D,E,B,E


## Finding the Top 10 Most Similar Users for the New User

In [86]:
m_new_user = m_new_user.apply(lambda x: x.cat.codes, axis=1)

m_new_user

Unnamed: 0,Q1,Q2,Q3,Q4,Q5
m1000,0,3,4,1,4


In [87]:
new_sim = men.corrwith(m_new_user.iloc[0], axis=1).sort_values(ascending=False)[:10]

new_sim

id
m613    1.000000
m336    0.980038
m136    0.974250
m206    0.973124
m308    0.973124
m957    0.963343
m270    0.957427
m949    0.957427
m369    0.954786
m249    0.954786
dtype: float64

## Getting Recommendations

In [88]:
# Getting the similar users' ratings
new_sim_rate = ratings.T[new_sim.index]

# Finding top recommended based on the median of the similar users' rating
new_sim_rate.median(axis=1).round().sort_values(ascending=False)

id
w466    5.0
w175    4.0
w223    4.0
w198    4.0
w623    4.0
       ... 
w677    0.0
w934    0.0
w455    0.0
w586    0.0
w836    0.0
Length: 1000, dtype: float64