In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import scipy.sparse as sparse
from sklearn.preprocessing import MinMaxScaler

import implicit

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
df = pd.read_parquet('./data/train_mfti.parquet', engine='pyarrow')
df

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type
0,2022-08-01,1659323026,129850,97990f1a021d4be19aa3f955b7eacab4,951f53de61764ea0b51317200a0dbbfc,show_vacancy
1,2022-08-01,1659377255,108347,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
2,2022-08-01,1659376695,109069,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
3,2022-08-01,1659376722,171425,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
4,2022-08-01,1659374929,252384,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
...,...,...,...,...,...,...
12292583,2022-08-11,1660246479,162851,0d3c58c6864546c689fa0997de484b30,18124998fd644e80b667a1d84bf63846,preview_click_response
12292584,2022-08-11,1660246106,136818,ef559274a81e4ff4b92aa3e9f6805886,5c71c1f1b7cf492ba05871c400a05250,preview_click_response
12292585,2022-08-11,1660250139,143721,c4453128ae584dca8f2a8d20a7c74263,33c968125459420e89ffbc8cffece317,preview_click_response
12292586,2022-08-11,1660250318,154423,c4453128ae584dca8f2a8d20a7c74263,33c968125459420e89ffbc8cffece317,preview_click_response


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12292588 entries, 0 to 12292587
Data columns (total 6 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   event_date       object
 1   event_timestamp  int64 
 2   vacancy_id_      int64 
 3   cookie_id        object
 4   user_id          object
 5   event_type       object
dtypes: int64(2), object(4)
memory usage: 562.7+ MB


In [4]:
value = df['event_type'].value_counts()
value

event_type
show_vacancy              6198889
preview_click_vacancy     4781280
click_response             384090
click_contacts             277584
preview_click_response     190635
click_favorite             155844
preview_click_favorite     107016
preview_click_contacts     102050
click_phone                 79191
preview_click_phone         16009
Name: count, dtype: int64

In [5]:
k = []
for element in value:
    k.append(1 - (element / sum(value)))
k = pd.Series(k)
k.index = value.index
k

event_type
show_vacancy              0.495721
preview_click_vacancy     0.611044
click_response            0.968754
click_contacts            0.977419
preview_click_response    0.984492
click_favorite            0.987322
preview_click_favorite    0.991294
preview_click_contacts    0.991698
click_phone               0.993558
preview_click_phone       0.998698
dtype: float64

In [6]:
def rating(events, koef):
    data = []
    for event in events:
        data.append(k[event])
    return data

In [7]:
df['rating'] = rating(df.event_type, k)
df

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type,rating
0,2022-08-01,1659323026,129850,97990f1a021d4be19aa3f955b7eacab4,951f53de61764ea0b51317200a0dbbfc,show_vacancy,0.495721
1,2022-08-01,1659377255,108347,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0.495721
2,2022-08-01,1659376695,109069,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0.495721
3,2022-08-01,1659376722,171425,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0.495721
4,2022-08-01,1659374929,252384,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0.495721
...,...,...,...,...,...,...,...
12292583,2022-08-11,1660246479,162851,0d3c58c6864546c689fa0997de484b30,18124998fd644e80b667a1d84bf63846,preview_click_response,0.984492
12292584,2022-08-11,1660246106,136818,ef559274a81e4ff4b92aa3e9f6805886,5c71c1f1b7cf492ba05871c400a05250,preview_click_response,0.984492
12292585,2022-08-11,1660250139,143721,c4453128ae584dca8f2a8d20a7c74263,33c968125459420e89ffbc8cffece317,preview_click_response,0.984492
12292586,2022-08-11,1660250318,154423,c4453128ae584dca8f2a8d20a7c74263,33c968125459420e89ffbc8cffece317,preview_click_response,0.984492


In [8]:
df_new = df[['cookie_id', 'vacancy_id_', 'rating']]
df_new

Unnamed: 0,cookie_id,vacancy_id_,rating
0,97990f1a021d4be19aa3f955b7eacab4,129850,0.495721
1,03bf8c511fa949c79845a5d81b09aa1d,108347,0.495721
2,03bf8c511fa949c79845a5d81b09aa1d,109069,0.495721
3,03bf8c511fa949c79845a5d81b09aa1d,171425,0.495721
4,03bf8c511fa949c79845a5d81b09aa1d,252384,0.495721
...,...,...,...
12292583,0d3c58c6864546c689fa0997de484b30,162851,0.984492
12292584,ef559274a81e4ff4b92aa3e9f6805886,136818,0.984492
12292585,c4453128ae584dca8f2a8d20a7c74263,143721,0.984492
12292586,c4453128ae584dca8f2a8d20a7c74263,154423,0.984492


In [9]:
df_g = df_new.copy()
df_g['vacancy_id'] = df_g['vacancy_id_']
df_g['client_id'] = df_g['cookie_id']
df_g

Unnamed: 0,cookie_id,vacancy_id_,rating,vacancy_id,client_id
0,97990f1a021d4be19aa3f955b7eacab4,129850,0.495721,129850,97990f1a021d4be19aa3f955b7eacab4
1,03bf8c511fa949c79845a5d81b09aa1d,108347,0.495721,108347,03bf8c511fa949c79845a5d81b09aa1d
2,03bf8c511fa949c79845a5d81b09aa1d,109069,0.495721,109069,03bf8c511fa949c79845a5d81b09aa1d
3,03bf8c511fa949c79845a5d81b09aa1d,171425,0.495721,171425,03bf8c511fa949c79845a5d81b09aa1d
4,03bf8c511fa949c79845a5d81b09aa1d,252384,0.495721,252384,03bf8c511fa949c79845a5d81b09aa1d
...,...,...,...,...,...
12292583,0d3c58c6864546c689fa0997de484b30,162851,0.984492,162851,0d3c58c6864546c689fa0997de484b30
12292584,ef559274a81e4ff4b92aa3e9f6805886,136818,0.984492,136818,ef559274a81e4ff4b92aa3e9f6805886
12292585,c4453128ae584dca8f2a8d20a7c74263,143721,0.984492,143721,c4453128ae584dca8f2a8d20a7c74263
12292586,c4453128ae584dca8f2a8d20a7c74263,154423,0.984492,154423,c4453128ae584dca8f2a8d20a7c74263


In [10]:
df_g['cookie_id'] = df_g['cookie_id'].astype("category")
df_g['vacancy_id_'] = df_g['vacancy_id_'].astype("category")
df_g['cookie_id'] = df_g['cookie_id'].cat.codes
df_g['vacancy_id_'] = df_g['vacancy_id_'].cat.codes
df_g

Unnamed: 0,cookie_id,vacancy_id_,rating,vacancy_id,client_id
0,196572,29849,0.495721,129850,97990f1a021d4be19aa3f955b7eacab4
1,4860,8346,0.495721,108347,03bf8c511fa949c79845a5d81b09aa1d
2,4860,9068,0.495721,109069,03bf8c511fa949c79845a5d81b09aa1d
3,4860,71424,0.495721,171425,03bf8c511fa949c79845a5d81b09aa1d
4,4860,152383,0.495721,252384,03bf8c511fa949c79845a5d81b09aa1d
...,...,...,...,...,...
12292583,17191,62850,0.984492,162851,0d3c58c6864546c689fa0997de484b30
12292584,308843,36817,0.984492,136818,ef559274a81e4ff4b92aa3e9f6805886
12292585,253538,43720,0.984492,143721,c4453128ae584dca8f2a8d20a7c74263
12292586,253538,54422,0.984492,154423,c4453128ae584dca8f2a8d20a7c74263


In [48]:
dic_cat2vac = df_g[['vacancy_id_', 'vacancy_id']]
dic_cat2vac = dic_cat2vac.set_index('vacancy_id_')
dic_cat2vac = dic_cat2vac.to_dict()
dic_cat2vac

{'vacancy_id': {29849: 129850,
  8346: 108347,
  9068: 109069,
  71424: 171425,
  152383: 252384,
  153832: 253833,
  158440: 258441,
  2913: 102914,
  8009: 108010,
  8241: 108242,
  11026: 111027,
  11866: 111867,
  19436: 119437,
  23084: 123085,
  23269: 123270,
  23350: 123351,
  23359: 123360,
  23517: 123518,
  24881: 124882,
  24947: 124948,
  26018: 126019,
  26120: 126121,
  8330: 108331,
  26250: 126251,
  147534: 247535,
  29786: 129787,
  64601: 164602,
  13304: 113305,
  142471: 242472,
  53559: 153560,
  156471: 256472,
  25459: 125460,
  42144: 142145,
  50234: 150235,
  7341: 107342,
  49257: 149258,
  106543: 206544,
  30259: 130260,
  34032: 134033,
  20541: 120542,
  21541: 121542,
  31395: 131396,
  151044: 251045,
  158897: 258898,
  5593: 105594,
  20848: 120849,
  27453: 127454,
  28446: 128447,
  29204: 129205,
  29780: 129781,
  37502: 137503,
  142473: 242474,
  13493: 113494,
  156620: 256621,
  34014: 134015,
  20364: 120365,
  20328: 120329,
  42213: 14221

In [60]:
dic_vac2cat = df_g[['vacancy_id', 'vacancy_id_']]
dic_vac2cat = dic_vac2cat.set_index('vacancy_id')
dic_vac2cat = dic_vac2cat.to_dict()
dic_vac2cat

{'vacancy_id_': {129850: 29849,
  108347: 8346,
  109069: 9068,
  171425: 71424,
  252384: 152383,
  253833: 153832,
  258441: 158440,
  102914: 2913,
  108010: 8009,
  108242: 8241,
  111027: 11026,
  111867: 11866,
  119437: 19436,
  123085: 23084,
  123270: 23269,
  123351: 23350,
  123360: 23359,
  123518: 23517,
  124882: 24881,
  124948: 24947,
  126019: 26018,
  126121: 26120,
  108331: 8330,
  126251: 26250,
  247535: 147534,
  129787: 29786,
  164602: 64601,
  113305: 13304,
  242472: 142471,
  153560: 53559,
  256472: 156471,
  125460: 25459,
  142145: 42144,
  150235: 50234,
  107342: 7341,
  149258: 49257,
  206544: 106543,
  130260: 30259,
  134033: 34032,
  120542: 20541,
  121542: 21541,
  131396: 31395,
  251045: 151044,
  258898: 158897,
  105594: 5593,
  120849: 20848,
  127454: 27453,
  128447: 28446,
  129205: 29204,
  129781: 29780,
  137503: 37502,
  242474: 142473,
  113494: 13493,
  256621: 156620,
  134015: 34014,
  120365: 20364,
  120329: 20328,
  142214: 422

In [49]:
dic_cat2client = df_g[['cookie_id', 'client_id']]
dic_cat2client = dic_cat2client.set_index('cookie_id')
dic_cat2client = dic_cat2client.to_dict()
dic_cat2client

{'client_id': {196572: '97990f1a021d4be19aa3f955b7eacab4',
  4860: '03bf8c511fa949c79845a5d81b09aa1d',
  43324: '21471b812ded41d2a56470e4783ccb98',
  56957: '2bcf980413854540be443220c7c144da',
  65206: '322fc17ac2004ab9a6aa18537731a4ca',
  210358: 'a266dfddc47d433d93cfe3bbf1880bf7',
  45956: '235893f630df4a0a98ff7b1b932d017d',
  22732: '117d80930340423d908ab92e66b3e955',
  293679: 'e39e9cb97784467fab85e22c3bda492d',
  225188: 'ae0baf127c6d472fa228cba9fb662c2e',
  266813: 'ce798b80b3214a3b94cedd1a23b6c44c',
  83893: '40b7985c82d64d4aa0af0b05e5b76feb',
  259852: 'c9214bbd3c864a458d7bd8783c521bbc',
  8069: '06409b09cc2048e6ba73b79a3c98e0cc',
  231546: 'b30bb97d2e8841e3b271b1e6404c0834',
  25799: '13e3ab21e5c94143a3aea84669472f8b',
  309354: 'efbff68cebed42e5abd629e158eb3a97',
  251716: 'c2d6068d35f1459285682b3bef900a8d',
  119733: '5c88a35fa9494d92a0ad532b8a8b67e8',
  102896: '4f756ed1101f4bcba6fbaa622f3c2988',
  306257: 'ed52305ad4604a92bf11394e97050995',
  138750: '6b5d27a547d0469cada01

In [62]:
dic_client2cat = df_g[['client_id', 'cookie_id']]
dic_client2cat = dic_client2cat.set_index('client_id')
dic_client2cat = dic_client2cat.to_dict()
dic_client2cat

{'cookie_id': {'97990f1a021d4be19aa3f955b7eacab4': 196572,
  '03bf8c511fa949c79845a5d81b09aa1d': 4860,
  '21471b812ded41d2a56470e4783ccb98': 43324,
  '2bcf980413854540be443220c7c144da': 56957,
  '322fc17ac2004ab9a6aa18537731a4ca': 65206,
  'a266dfddc47d433d93cfe3bbf1880bf7': 210358,
  '235893f630df4a0a98ff7b1b932d017d': 45956,
  '117d80930340423d908ab92e66b3e955': 22732,
  'e39e9cb97784467fab85e22c3bda492d': 293679,
  'ae0baf127c6d472fa228cba9fb662c2e': 225188,
  'ce798b80b3214a3b94cedd1a23b6c44c': 266813,
  '40b7985c82d64d4aa0af0b05e5b76feb': 83893,
  'c9214bbd3c864a458d7bd8783c521bbc': 259852,
  '06409b09cc2048e6ba73b79a3c98e0cc': 8069,
  'b30bb97d2e8841e3b271b1e6404c0834': 231546,
  '13e3ab21e5c94143a3aea84669472f8b': 25799,
  'efbff68cebed42e5abd629e158eb3a97': 309354,
  'c2d6068d35f1459285682b3bef900a8d': 251716,
  '5c88a35fa9494d92a0ad532b8a8b67e8': 119733,
  '4f756ed1101f4bcba6fbaa622f3c2988': 102896,
  'ed52305ad4604a92bf11394e97050995': 306257,
  '6b5d27a547d0469cada015478cabd

In [50]:
df_g.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12292588 entries, 0 to 12292587
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   cookie_id    int32  
 1   vacancy_id_  int32  
 2   rating       float64
 3   vacancy_id   int64  
 4   client_id    object 
dtypes: float64(1), int32(2), int64(1), object(1)
memory usage: 375.1+ MB


In [51]:
df_grouped = df_g.copy()
df_grouped = df_grouped.drop(['vacancy_id', 'client_id'], axis=1)
df_grouped = df_grouped.groupby(['cookie_id', 'vacancy_id_']).sum().reset_index()
df_grouped.head(10)

Unnamed: 0,cookie_id,vacancy_id_,rating
0,0,37658,1.106765
1,0,53974,1.106765
2,0,74952,2.094087
3,0,76170,1.106765
4,0,82444,2.21353
5,0,87528,1.106765
6,1,6675,1.106765
7,1,8689,1.106765
8,1,15743,1.106765
9,1,69614,1.106765


In [52]:
sparse_vacancy_person = sparse.csr_matrix((df_grouped['rating'].astype(float), (df_grouped['vacancy_id_'], df_grouped['cookie_id'])))
sparse_person_vacancy = sparse.csr_matrix((df_grouped['rating'].astype(float), (df_grouped['cookie_id'], df_grouped['vacancy_id_'])))

model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_person_vacancy * alpha).astype('double')
model.fit(data)

  0%|          | 0/50 [00:00<?, ?it/s]

In [53]:
content_id = 450
n_similar = 6

person_vecs = model.user_factors
vacancy_vecs = model.item_factors

vacancy_norms = np.sqrt((vacancy_vecs * vacancy_vecs).sum(axis=1))

scores = vacancy_vecs.dot(vacancy_vecs[content_id]) / vacancy_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / vacancy_norms[content_id]), key=lambda x: -x[1])

for content in similar:
    idx, score = content
    print(df_grouped.vacancy_id_.loc[df_grouped.vacancy_id_ == idx].iloc[0])

450
28014
116302
75068
42923
34986


In [54]:
def recommend(person_id, sparse_person_vacancy, person_vecs, vacancy_vecs, num_contents=6):
    person_interactions = sparse_person_vacancy[person_id,:].toarray()
    person_interactions = person_interactions.reshape(-1) + 1
    person_interactions[person_interactions > 1] = 0
    rec_vector = person_vecs[person_id,:].dot(vacancy_vecs.T).toarray()
    
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = person_interactions * rec_vector_scaled
    vacancy_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    vacancy = []
    scores = []

    for idx in vacancy_idx:
        vacancy.append(df_grouped.vacancy_id_.loc[df_grouped.vacancy_id_ == idx].iloc[0])
        scores.append(recommend_vector[idx])
    
    return pd.DataFrame({'vacancy_ID': vacancy, 'score': scores})

In [55]:
person_vecs = sparse.csr_matrix(model.user_factors)
vacancy_vecs = sparse.csr_matrix(model.item_factors)

In [56]:
person_id = 0
recommendations = recommend(person_id, sparse_person_vacancy, person_vecs, vacancy_vecs)
print(recommendations)

   vacancy_ID     score
0       82869  1.000000
1       16822  0.983143
2       80381  0.899961
3       69193  0.862494
4       58241  0.837227
5       14582  0.821518


In [57]:
df_sample = pd.read_parquet('./data/test_private_sample_submission_mfti.parquet', engine='pyarrow')
df_sample

Unnamed: 0,cookie_id,predictions
0,0018914ba3e54011b28fa715583d3354,"[100100, 100101, 100102, 100103, 100104]"
1,0035c298d8c64f368ae730a9cca9bb20,"[100100, 100101, 100102, 100103, 100104]"
2,00956458877448ec9fba87fb97443fdf,"[100100, 100101, 100102, 100103, 100104]"
3,0099387c921b41e7bae6c99dd8254b60,"[100100, 100101, 100102, 100103, 100104]"
4,009f65e8ae99413a8da94a491320580a,"[100100, 100101, 100102, 100103, 100104]"
...,...,...
3081,ffadd195859444d2ade2479b0611c5c1,"[100100, 100101, 100102, 100103, 100104]"
3082,ffbc08b528c64f22996873fc63872202,"[100100, 100101, 100102, 100103, 100104]"
3083,ffdeaf3c34544529880aebf17c103f6c,"[100100, 100101, 100102, 100103, 100104]"
3084,ffefa79a74804ee69e6c131e0d05b948,"[100100, 100101, 100102, 100103, 100104]"


In [63]:
df_sample_users = []
for elm in df_sample['cookie_id']:
    df_sample_users.append(dic_client2cat['cookie_id'][elm])
df_sample_users = pd.DataFrame(df_sample_users)
df_sample_users

Unnamed: 0,0
0,109
1,275
2,722
3,739
4,766
...,...
3081,329779
3082,329841
3083,330021
3084,330110


In [64]:
rec = []
for elm in df_sample_users[0]:
    rec.append(recommend(elm, sparse_person_vacancy, person_vecs, vacancy_vecs))
rec

[   vacancy_ID     score
 0      153945  1.000000
 1      153816  0.995775
 2       37701  0.992364
 3       82381  0.981677
 4      108760  0.976131
 5      150384  0.971785,
    vacancy_ID     score
 0       38122  1.000000
 1       50515  0.908731
 2        7525  0.898453
 3       35429  0.889484
 4       20187  0.870247
 5       32615  0.866199,
    vacancy_ID     score
 0      160153  1.000000
 1      153677  0.872087
 2      150326  0.823673
 3       71331  0.820376
 4      117682  0.807915
 5      146284  0.802926,
    vacancy_ID     score
 0       73336  1.000000
 1       53969  0.890580
 2      107422  0.841942
 3       15219  0.839849
 4       97406  0.838299
 5       76397  0.829991,
    vacancy_ID     score
 0       38122  1.000000
 1       50515  0.903641
 2         430  0.849478
 3        7525  0.842777
 4       35429  0.828110
 5       48685  0.826756,
    vacancy_ID     score
 0      102607  1.000000
 1       98113  0.950537
 2       82869  0.936631
 3       64601  0.89

In [65]:
predicted_vacancies = []
for elm in rec:
    predicted_vacancies.append(elm['vacancy_ID'])
predicted_vacancies

[0    153945
 1    153816
 2     37701
 3     82381
 4    108760
 5    150384
 Name: vacancy_ID, dtype: int32,
 0    38122
 1    50515
 2     7525
 3    35429
 4    20187
 5    32615
 Name: vacancy_ID, dtype: int32,
 0    160153
 1    153677
 2    150326
 3     71331
 4    117682
 5    146284
 Name: vacancy_ID, dtype: int32,
 0     73336
 1     53969
 2    107422
 3     15219
 4     97406
 5     76397
 Name: vacancy_ID, dtype: int32,
 0    38122
 1    50515
 2      430
 3     7525
 4    35429
 5    48685
 Name: vacancy_ID, dtype: int32,
 0    102607
 1     98113
 2     82869
 3     64601
 4     11504
 5    103403
 Name: vacancy_ID, dtype: int32,
 0     48713
 1     13304
 2    137340
 3    144076
 4    141652
 5     76140
 Name: vacancy_ID, dtype: int32,
 0      4639
 1     10791
 2    144317
 3    160153
 4    147534
 5      2254
 Name: vacancy_ID, dtype: int32,
 0      2793
 1      4639
 2     15588
 3      6675
 4     52045
 5    110228
 Name: vacancy_ID, dtype: int32,
 0    160153


In [89]:
df_pred = pd.DataFrame(predicted_vacancies, index=range(len(predicted_vacancies)))
df_pred

In [100]:
result = df_sample[['cookie_id']]

In [101]:
result[1] = df_pred[1]
result[2] = df_pred[2]
result[3] = df_pred[3]
result[4] = df_pred[4]
result[5] = df_pred[5]
result

Unnamed: 0,cookie_id,1,2,3,4,5
0,0018914ba3e54011b28fa715583d3354,153816,37701,82381,108760,150384
1,0035c298d8c64f368ae730a9cca9bb20,50515,7525,35429,20187,32615
2,00956458877448ec9fba87fb97443fdf,153677,150326,71331,117682,146284
3,0099387c921b41e7bae6c99dd8254b60,53969,107422,15219,97406,76397
4,009f65e8ae99413a8da94a491320580a,50515,430,7525,35429,48685
...,...,...,...,...,...,...
3081,ffadd195859444d2ade2479b0611c5c1,150326,117682,11504,8241,142641
3082,ffbc08b528c64f22996873fc63872202,81744,71079,140662,117682,65932
3083,ffdeaf3c34544529880aebf17c103f6c,20251,147534,81975,93330,80381
3084,ffefa79a74804ee69e6c131e0d05b948,107422,103403,137340,127707,64601
