In [75]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import scipy.sparse as sparse
from sklearn.preprocessing import MinMaxScaler

import implicit

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [76]:
df = pd.read_parquet('./data/train_mfti.parquet', engine='pyarrow')
df

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type
0,2022-08-01,1659323026,129850,97990f1a021d4be19aa3f955b7eacab4,951f53de61764ea0b51317200a0dbbfc,show_vacancy
1,2022-08-01,1659377255,108347,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
2,2022-08-01,1659376695,109069,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
3,2022-08-01,1659376722,171425,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
4,2022-08-01,1659374929,252384,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
...,...,...,...,...,...,...
12292583,2022-08-11,1660246479,162851,0d3c58c6864546c689fa0997de484b30,18124998fd644e80b667a1d84bf63846,preview_click_response
12292584,2022-08-11,1660246106,136818,ef559274a81e4ff4b92aa3e9f6805886,5c71c1f1b7cf492ba05871c400a05250,preview_click_response
12292585,2022-08-11,1660250139,143721,c4453128ae584dca8f2a8d20a7c74263,33c968125459420e89ffbc8cffece317,preview_click_response
12292586,2022-08-11,1660250318,154423,c4453128ae584dca8f2a8d20a7c74263,33c968125459420e89ffbc8cffece317,preview_click_response


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12292588 entries, 0 to 12292587
Data columns (total 6 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   event_date       object
 1   event_timestamp  int64 
 2   vacancy_id_      int64 
 3   cookie_id        object
 4   user_id          object
 5   event_type       object
dtypes: int64(2), object(4)
memory usage: 562.7+ MB


In [78]:
value = df['event_type'].value_counts()
value

event_type
show_vacancy              6198889
preview_click_vacancy     4781280
click_response             384090
click_contacts             277584
preview_click_response     190635
click_favorite             155844
preview_click_favorite     107016
preview_click_contacts     102050
click_phone                 79191
preview_click_phone         16009
Name: count, dtype: int64

In [79]:
k = []
for element in value:
    k.append(1 - (element / sum(value)))
k = pd.Series(k)
k.index = value.index
k

event_type
show_vacancy              0.495721
preview_click_vacancy     0.611044
click_response            0.968754
click_contacts            0.977419
preview_click_response    0.984492
click_favorite            0.987322
preview_click_favorite    0.991294
preview_click_contacts    0.991698
click_phone               0.993558
preview_click_phone       0.998698
dtype: float64

In [80]:
def rating(events, koef):
    data = []
    for event in events:
        data.append(k[event])
    return data

In [81]:
df['rating'] = rating(df.event_type, k)
df

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type,rating
0,2022-08-01,1659323026,129850,97990f1a021d4be19aa3f955b7eacab4,951f53de61764ea0b51317200a0dbbfc,show_vacancy,0.495721
1,2022-08-01,1659377255,108347,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0.495721
2,2022-08-01,1659376695,109069,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0.495721
3,2022-08-01,1659376722,171425,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0.495721
4,2022-08-01,1659374929,252384,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0.495721
...,...,...,...,...,...,...,...
12292583,2022-08-11,1660246479,162851,0d3c58c6864546c689fa0997de484b30,18124998fd644e80b667a1d84bf63846,preview_click_response,0.984492
12292584,2022-08-11,1660246106,136818,ef559274a81e4ff4b92aa3e9f6805886,5c71c1f1b7cf492ba05871c400a05250,preview_click_response,0.984492
12292585,2022-08-11,1660250139,143721,c4453128ae584dca8f2a8d20a7c74263,33c968125459420e89ffbc8cffece317,preview_click_response,0.984492
12292586,2022-08-11,1660250318,154423,c4453128ae584dca8f2a8d20a7c74263,33c968125459420e89ffbc8cffece317,preview_click_response,0.984492


In [82]:
df_new = df[['cookie_id', 'vacancy_id_', 'rating']]
df_new

Unnamed: 0,cookie_id,vacancy_id_,rating
0,97990f1a021d4be19aa3f955b7eacab4,129850,0.495721
1,03bf8c511fa949c79845a5d81b09aa1d,108347,0.495721
2,03bf8c511fa949c79845a5d81b09aa1d,109069,0.495721
3,03bf8c511fa949c79845a5d81b09aa1d,171425,0.495721
4,03bf8c511fa949c79845a5d81b09aa1d,252384,0.495721
...,...,...,...
12292583,0d3c58c6864546c689fa0997de484b30,162851,0.984492
12292584,ef559274a81e4ff4b92aa3e9f6805886,136818,0.984492
12292585,c4453128ae584dca8f2a8d20a7c74263,143721,0.984492
12292586,c4453128ae584dca8f2a8d20a7c74263,154423,0.984492


In [83]:
df_g = df_new.copy()
df_g['vacancy_id'] = df_g['vacancy_id_']
df_g['client_id'] = df_g['cookie_id']
df_g

Unnamed: 0,cookie_id,vacancy_id_,rating,vacancy_id,client_id
0,97990f1a021d4be19aa3f955b7eacab4,129850,0.495721,129850,97990f1a021d4be19aa3f955b7eacab4
1,03bf8c511fa949c79845a5d81b09aa1d,108347,0.495721,108347,03bf8c511fa949c79845a5d81b09aa1d
2,03bf8c511fa949c79845a5d81b09aa1d,109069,0.495721,109069,03bf8c511fa949c79845a5d81b09aa1d
3,03bf8c511fa949c79845a5d81b09aa1d,171425,0.495721,171425,03bf8c511fa949c79845a5d81b09aa1d
4,03bf8c511fa949c79845a5d81b09aa1d,252384,0.495721,252384,03bf8c511fa949c79845a5d81b09aa1d
...,...,...,...,...,...
12292583,0d3c58c6864546c689fa0997de484b30,162851,0.984492,162851,0d3c58c6864546c689fa0997de484b30
12292584,ef559274a81e4ff4b92aa3e9f6805886,136818,0.984492,136818,ef559274a81e4ff4b92aa3e9f6805886
12292585,c4453128ae584dca8f2a8d20a7c74263,143721,0.984492,143721,c4453128ae584dca8f2a8d20a7c74263
12292586,c4453128ae584dca8f2a8d20a7c74263,154423,0.984492,154423,c4453128ae584dca8f2a8d20a7c74263


In [84]:
df_g['cookie_id'] = df_g['cookie_id'].astype("category")
df_g['vacancy_id_'] = df_g['vacancy_id_'].astype("category")
df_g['cookie_id'] = df_g['cookie_id'].cat.codes
df_g['vacancy_id_'] = df_g['vacancy_id_'].cat.codes
df_g

Unnamed: 0,cookie_id,vacancy_id_,rating,vacancy_id,client_id
0,196572,29849,0.495721,129850,97990f1a021d4be19aa3f955b7eacab4
1,4860,8346,0.495721,108347,03bf8c511fa949c79845a5d81b09aa1d
2,4860,9068,0.495721,109069,03bf8c511fa949c79845a5d81b09aa1d
3,4860,71424,0.495721,171425,03bf8c511fa949c79845a5d81b09aa1d
4,4860,152383,0.495721,252384,03bf8c511fa949c79845a5d81b09aa1d
...,...,...,...,...,...
12292583,17191,62850,0.984492,162851,0d3c58c6864546c689fa0997de484b30
12292584,308843,36817,0.984492,136818,ef559274a81e4ff4b92aa3e9f6805886
12292585,253538,43720,0.984492,143721,c4453128ae584dca8f2a8d20a7c74263
12292586,253538,54422,0.984492,154423,c4453128ae584dca8f2a8d20a7c74263


In [85]:
df_g.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12292588 entries, 0 to 12292587
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   cookie_id    int32  
 1   vacancy_id_  int32  
 2   rating       float64
 3   vacancy_id   int64  
 4   client_id    object 
dtypes: float64(1), int32(2), int64(1), object(1)
memory usage: 375.1+ MB


In [86]:
df_grouped = df_g.copy()
df_grouped = df_grouped.drop(['vacancy_id', 'client_id'], axis=1)
df_grouped = df_grouped.groupby(['cookie_id', 'vacancy_id_']).sum().reset_index()
df_grouped.head(10)

Unnamed: 0,cookie_id,vacancy_id_,rating
0,0,37658,1.106765
1,0,53974,1.106765
2,0,74952,2.094087
3,0,76170,1.106765
4,0,82444,2.21353
5,0,87528,1.106765
6,1,6675,1.106765
7,1,8689,1.106765
8,1,15743,1.106765
9,1,69614,1.106765


In [13]:
sparse_vacancy_person = sparse.csr_matrix((df_grouped['rating'].astype(float), (df_grouped['vacancy_id_'], df_grouped['cookie_id'])))
sparse_person_vacancy = sparse.csr_matrix((df_grouped['rating'].astype(float), (df_grouped['cookie_id'], df_grouped['vacancy_id_'])))

model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_person_vacancy * alpha).astype('double')
model.fit(data)

  0%|          | 0/50 [00:00<?, ?it/s]

In [15]:
content_id = 450
n_similar = 6

person_vecs = model.user_factors
vacancy_vecs = model.item_factors

vacancy_norms = np.sqrt((vacancy_vecs * vacancy_vecs).sum(axis=1))

scores = vacancy_vecs.dot(vacancy_vecs[content_id]) / vacancy_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / vacancy_norms[content_id]), key=lambda x: -x[1])

for content in similar:
    idx, score = content
    print(df_grouped.vacancy_id_.loc[df_grouped.vacancy_id_ == idx].iloc[0])

450
34986
116302
70363
17385
42923


In [19]:
def recommend(person_id, sparse_person_vacancy, person_vecs, vacancy_vecs, num_contents=6):
    person_interactions = sparse_person_vacancy[person_id,:].toarray()
    person_interactions = person_interactions.reshape(-1) + 1
    person_interactions[person_interactions > 1] = 0
    rec_vector = person_vecs[person_id,:].dot(vacancy_vecs.T).toarray()
    
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = person_interactions * rec_vector_scaled
    vacancy_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    vacancy = []
    scores = []

    for idx in vacancy_idx:
        vacancy.append(df_grouped.vacancy_id_.loc[df_grouped.vacancy_id_ == idx].iloc[0])
        scores.append(recommend_vector[idx])
    
    return pd.DataFrame({'vacancy_ID': vacancy, 'score': scores})

In [20]:
person_vecs = sparse.csr_matrix(model.user_factors)
vacancy_vecs = sparse.csr_matrix(model.item_factors)

In [21]:
person_id = 0
recommendations = recommend(person_id, sparse_person_vacancy, person_vecs, vacancy_vecs)
print(recommendations)

   vacancy_ID     score
0       82869  1.000000
1       16822  0.993399
2       80381  0.901961
3       69193  0.876425
4       58241  0.857534
5       14582  0.844272


In [22]:
def user_id(cookie, dic):
    return dic.cookie_id.loc[dic.client_id == cookie].iloc[0]

In [23]:
df_sample = pd.read_parquet('./data/test_private_sample_submission_mfti.parquet', engine='pyarrow')
df_sample

Unnamed: 0,cookie_id,predictions
0,0018914ba3e54011b28fa715583d3354,"[100100, 100101, 100102, 100103, 100104]"
1,0035c298d8c64f368ae730a9cca9bb20,"[100100, 100101, 100102, 100103, 100104]"
2,00956458877448ec9fba87fb97443fdf,"[100100, 100101, 100102, 100103, 100104]"
3,0099387c921b41e7bae6c99dd8254b60,"[100100, 100101, 100102, 100103, 100104]"
4,009f65e8ae99413a8da94a491320580a,"[100100, 100101, 100102, 100103, 100104]"
...,...,...
3081,ffadd195859444d2ade2479b0611c5c1,"[100100, 100101, 100102, 100103, 100104]"
3082,ffbc08b528c64f22996873fc63872202,"[100100, 100101, 100102, 100103, 100104]"
3083,ffdeaf3c34544529880aebf17c103f6c,"[100100, 100101, 100102, 100103, 100104]"
3084,ffefa79a74804ee69e6c131e0d05b948,"[100100, 100101, 100102, 100103, 100104]"


In [24]:
print(user_id(df_sample['cookie_id'][1], df_g))

275


In [25]:
df_g.cookie_id.loc[df_g.client_id == 'ffefa79a74804ee69e6c131e0d05b948'].iloc[0]

330110

In [26]:
df_sample_users = []
for elm in df_sample['cookie_id']:
    df_sample_users.append(user_id(elm, df_g))
df_sample_users = pd.DataFrame(df_sample_users)

In [27]:
rec = []
for elm in df_sample_users[0]:
    rec.append(recommend(elm, sparse_person_vacancy, person_vecs, vacancy_vecs))
rec

[   vacancy_ID     score
 0       37701  1.000000
 1       17531  0.991740
 2      153816  0.988264
 3      150384  0.987057
 4      153945  0.979486
 5       38633  0.977630,
    vacancy_ID     score
 0       38122  1.000000
 1       50515  0.919170
 2        7525  0.915083
 3       35429  0.906054
 4       32615  0.883341
 5      154513  0.880125,
    vacancy_ID     score
 0      160153  1.000000
 1      153677  0.904712
 2      150326  0.874791
 3       71331  0.859919
 4      117682  0.859047
 5       16899  0.839755,
    vacancy_ID     score
 0       73336  1.000000
 1       53969  0.915769
 2       76397  0.816927
 3        3880  0.809677
 4       97406  0.807664
 5       15219  0.807066,
    vacancy_ID     score
 0       38122  1.000000
 1       50515  0.906573
 2         430  0.845846
 3       48685  0.838981
 4        7525  0.836924
 5       35429  0.818556,
    vacancy_ID     score
 0       98113  1.000000
 1      102607  0.997576
 2       64601  0.918392
 3       11504  0.91

In [35]:
predicted_vacancies = []
for elm in rec:
    predicted_vacancies.append(elm['vacancy_ID'])
predicted_vacancies

[0     37701
 1     17531
 2    153816
 3    150384
 4    153945
 5     38633
 Name: vacancy_ID, dtype: int32,
 0     38122
 1     50515
 2      7525
 3     35429
 4     32615
 5    154513
 Name: vacancy_ID, dtype: int32,
 0    160153
 1    153677
 2    150326
 3     71331
 4    117682
 5     16899
 Name: vacancy_ID, dtype: int32,
 0    73336
 1    53969
 2    76397
 3     3880
 4    97406
 5    15219
 Name: vacancy_ID, dtype: int32,
 0    38122
 1    50515
 2      430
 3    48685
 4     7525
 5    35429
 Name: vacancy_ID, dtype: int32,
 0     98113
 1    102607
 2     64601
 3     11504
 4    103403
 5     82869
 Name: vacancy_ID, dtype: int32,
 0     10792
 1    147534
 2    129688
 3     10791
 4    146508
 5     82083
 Name: vacancy_ID, dtype: int32,
 0      4639
 1    144317
 2      2254
 3    142144
 4     15588
 5    151533
 Name: vacancy_ID, dtype: int32,
 0      2793
 1      4639
 2     15588
 3    110228
 4      6675
 5     56860
 Name: vacancy_ID, dtype: int32,
 0    160153


In [62]:
predicted_vacancies[0]

0     37701
1     17531
2    153816
3    150384
4    153945
5     38633
Name: vacancy_ID, dtype: int32

In [88]:
predict = pd.Series(predicted_vacancies)
predict

0       0     37701
1     17531
2    153816
3    15038...
1       0     38122
1     50515
2      7525
3     3542...
2       0    160153
1    153677
2    150326
3     7133...
3       0    73336
1    53969
2    76397
3     3880
4 ...
4       0    38122
1    50515
2      430
3    48685
4 ...
                              ...                        
3081    0    160153
1    150326
2    117682
3    14264...
3082    0     81744
1     17896
2     71079
3    14066...
3083    0    147534
1    157630
2     20251
3     1250...
3084    0    160153
1    103403
2    107422
3    13734...
3085    0    145951
1    147534
2      1395
3     6424...
Length: 3086, dtype: object