In [3]:
import ast
import dill
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import tensorflow as tf
import tensorflow.keras.backend as K
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from random import randint, random
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances, cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances as ED
# from tensorflow import keras
from tqdm import tqdm

# import requests
# import gc

from scipy import spatial

In [5]:
RANDOM_SEED = 42
K_RECOS = 10

DATA_PATH = "data/kion_train"
RESULTS_PATH = "results/hw5"

In [None]:
# url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

In [None]:

# req = requests.get(url, stream=True)

# with open("kion.zip", "wb") as fd:
#     total_size_in_bytes = int(req.headers.get("Content-Length", 0))
#     progress_bar = tqdm(desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True)
#     for chunk in req.iter_content(chunk_size=2**20):
#         progress_bar.update(len(chunk))
#         fd.write(chunk)

kion dataset download:  92%|█████████▏| 72.4M/78.8M [00:01<00:00, 74.4MiB/s]

In [None]:
# import zipfile as zf

# files = zf.ZipFile("kion.zip", "r")
# files.extractall()
# files.close()

In [8]:
interactions = pd.read_csv(f'{DATA_PATH}/interactions.csv', parse_dates=["last_watch_dt"])

interactions.rename(
    columns={
        "total_dur": "weight", 
        "last_watch_dt": "datetime"
    }, 
    inplace=True) 

users = pd.read_csv(f'{DATA_PATH}/users.csv')

items = pd.read_csv(f'{DATA_PATH}/items.csv')
items = items.rename(columns = {'id' : 'item_id'})

# Prepare user features

In [9]:
user_cat_feats = ["age", "income", "sex", "kids_flg"]
users_ohe = users.user_id
for feat in user_cat_feats:
    ohe_feat = pd.get_dummies(users[feat], prefix=feat)
    users_ohe = pd.concat([users_ohe, ohe_feat], axis=1)

users_ohe.head()

Unnamed: 0,user_id,age_age_18_24,age_age_25_34,age_age_35_44,age_age_45_54,age_age_55_64,age_age_65_inf,income_income_0_20,income_income_150_inf,income_income_20_40,income_income_40_60,income_income_60_90,income_income_90_150,sex_Ж,sex_М,kids_flg_0,kids_flg_1
0,973171,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,True
1,962099,True,False,False,False,False,False,False,False,True,False,False,False,False,True,True,False
2,1047345,False,False,False,True,False,False,False,False,False,True,False,False,True,False,True,False
3,721985,False,False,False,True,False,False,False,False,True,False,False,False,True,False,True,False
4,704055,False,False,True,False,False,False,False,False,False,False,True,False,True,False,True,False


# Prepare item features

In [10]:
items["country"] = items["countries"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
items["country"] = items["country"].fillna(' ')
items["country"] = items["country"].apply(lambda x: x[0])
items.head(1)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords,country
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ...",испания


In [11]:
items['country'].nunique()

86

In [12]:
item_cat_feats = ['content_type', 'for_kids', 'age_rating', 'studios', 'country']

items_ohe = items.item_id

for feat in item_cat_feats:
    ohe_feat = pd.get_dummies(items[feat], prefix=feat)
    items_ohe = pd.concat([items_ohe, ohe_feat], axis=1)

items_ohe.head()

Unnamed: 0,item_id,content_type_film,content_type_series,for_kids_0.0,for_kids_1.0,age_rating_0.0,age_rating_6.0,age_rating_12.0,age_rating_16.0,age_rating_18.0,...,country_хорватия,country_чехия,country_чили,country_швейцария,country_швеция,country_эквадор,country_эстония,country_юар,country_югославия,country_япония
0,10711,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,2508,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,10716,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,7868,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,16268,True,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


# Filter interactions

In [13]:
print(f"N users before: {interactions.user_id.nunique()}")
print(f"N items before: {interactions.item_id.nunique()}\n")

interactions_df = interactions[interactions.watched_pct > 10]


valid_users = []

c = Counter(interactions_df.user_id)
for user_id, entries in c.most_common():
    if entries > 5:
        valid_users.append(user_id)

valid_items = []

c = Counter(interactions_df.item_id)
for item_id, entries in c.most_common():
    if entries > 10:
        valid_items.append(item_id)

interactions = interactions[interactions.user_id.isin(valid_users)]
interactions = interactions[interactions.item_id.isin(valid_items)]

print(f"N users after: {interactions.user_id.nunique()}")
print(f"N items after: {interactions.item_id.nunique()}")

N users before: 962179
N items before: 15706

N users after: 170682
N items after: 6901


In [14]:
common_users = set(interactions.user_id.unique()).intersection(set(users_ohe.user_id.unique()))
common_items = set(interactions.item_id.unique()).intersection(set(items_ohe.item_id.unique()))

print(len(common_users))
print(len(common_items))

interactions = interactions[interactions.item_id.isin(common_items)]
interactions = interactions[interactions.user_id.isin(common_users)]

items_ohe = items_ohe[items_ohe.item_id.isin(common_items)]
users_ohe = users_ohe[users_ohe.user_id.isin(common_users)]

139210
6901


In [16]:
interactions["uid"] = interactions["user_id"].astype("category")
interactions["uid"] = interactions["uid"].cat.codes

interactions["iid"] = interactions["item_id"].astype("category")
interactions["iid"] = interactions["iid"].cat.codes

interactions.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct,uid,iid
0,176549,9506,2021-05-11,4250,72.0,22413,3945
1,699317,1659,2021-05-29,8317,100.0,88768,675
3,864613,7638,2021-07-05,14483,100.0,109925,3163
6,1016458,354,2021-08-14,1672,25.0,128826,139
7,884009,693,2021-08-04,703,14.0,112355,279


In [17]:
interactions_vec = np.zeros((interactions.uid.nunique(), interactions.iid.nunique()))

for user_id, item_id in zip(interactions.uid, interactions.iid):
    interactions_vec[user_id, item_id] += 1

res = interactions_vec.sum(axis=1)
for i in range(len(interactions_vec)):
    interactions_vec[i] /= res[i]

In [18]:
items_ohe = items_ohe[items_ohe['item_id'] != 11805]
print(interactions.item_id.nunique())
print(items_ohe.item_id.nunique())
print(interactions.user_id.nunique())
print(users_ohe.user_id.nunique())

print(set(items_ohe.item_id.unique()) - set(interactions.item_id.unique()))

6901
6900
139210
139210
set()


# Mappings

In [19]:
iid_to_item_id = interactions[["iid", "item_id"]].drop_duplicates().set_index("iid").to_dict()["item_id"]
item_id_to_iid = interactions[["iid", "item_id"]].drop_duplicates().set_index("item_id").to_dict()["iid"]

uid_to_user_id = interactions[["uid", "user_id"]].drop_duplicates().set_index("uid").to_dict()["user_id"]
user_id_to_uid = interactions[["uid", "user_id"]].drop_duplicates().set_index("user_id").to_dict()["uid"]

# Loss

In [20]:
def triplet_loss(y_true, y_pred, n_dims=128, alpha=0.4):
    anchor = y_pred[:, 0:n_dims]
    positive = y_pred[:, n_dims:n_dims*2]
    negative = y_pred[:, n_dims*2:n_dims*3]

    pos_dist = K.sum(K.square(anchor - positive), axis=1)
    neg_dist = K.sum(K.square(anchor - negative), axis=1)

    basic_loss = pos_dist - neg_dist + alpha
    loss = K.maximum(basic_loss, 0.0)

    return loss

# Text features

In [21]:
items['description'].head()

0    Мелодрама легендарного Педро Альмодовара «Пого...
1    Уморительная современная комедия на популярную...
2    Профессиональный рестлер Стив Остин («Все или ...
3    Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...
4    Расчетливая чаровница из советского кинохита «...
Name: description, dtype: object

# Normalization

In [23]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/elizaveta/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [24]:
def normalize(text):
    res = [w.strip(punct) for w in word_tokenize(text)]
    res = [w.lower() for w in res if w != '']
    return res

In [25]:
from nltk.tokenize import word_tokenize

items['description'] = items['description'].fillna('')

punct = '!"#$%&()*\+,-\./:;<=>?@\[\]^_`{|}~„“«»†*\—/\-‘’–'

items['clean_description'] = items['description'].apply(lambda x: normalize(x))

# Stop words

In [26]:
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elizaveta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
def filter_sw(text):
    return [w for w in text if w not in sw]

In [28]:
sw = stopwords.words('russian')
items['clean_description'] = items['clean_description'].apply(lambda x: filter_sw(x))

# Lemmatization

In [29]:
# !pip install pymorphy2

In [30]:
from pymorphy2 import MorphAnalyzer

In [31]:
%%capture

morph = MorphAnalyzer()

tqdm.pandas()
items['clean_description'] = items['clean_description'].progress_apply(lambda x: [morph.parse(w)[0].normalized.word for w in x])

# Text vectorization

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
items['clean_description'] = items['clean_description'].apply(lambda x: ' '.join(x))

In [34]:
vectorizer = CountVectorizer(min_df=5, max_features=350)
text_vectorized = pd.DataFrame(vectorizer.fit_transform(items['clean_description']).toarray(), columns=vectorizer.get_feature_names_out())
text_vectorized.shape

(15963, 350)

In [35]:
text_vectorized['item_id'] = items['item_id']
items_ohe = items_ohe.merge(text_vectorized, on='item_id', how='left')
items_ohe = items_ohe.fillna(0)

# Generator

In [37]:
tree = spatial.cKDTree(items_ohe.drop(["item_id"], axis=1))

In [77]:
def generator(items, users, interactions, batch_size=32):
    while True:
        uid_meta = []
        uid_interaction = []
        pos = []
        neg = []
        for _ in range(batch_size):
            while True:
                try:
                    # Random user
                    uid_iid = randint(0, interactions.shape[0]-1)

                    # Sample positive item
                    pos_iid = np.random.choice(range(interactions.shape[1]), p=interactions[uid_iid])

                    # Get 20 most closest items from KDTree
                    distances, indices = tree.query(items.iloc[pos_iid].values, k=30)


                    # Filter to sample only negative item
                    pos_ids = np.nonzero(interactions[uid_iid])
                    neg_items_candidates = indices[np.in1d(indices, pos_ids, invert=True)]
                    neg_iid = np.random.choice(neg_items_candidates)

                    #neg_iid = np.random.choice(range(interactions.shape[1]))

                    # Negative item features
                    neg.append(items.iloc[neg_iid - 1])

                    # User features
                    uid_meta.append(users.iloc[uid_iid])

                    # User's items
                    uid_interaction.append(interactions_vec[uid_iid])

                    # Positive item features
                    pos.append(items.iloc[pos_iid])
                    break
                except:
                    continue
        yield [np.array(uid_meta).astype('float32'), np.array(uid_interaction).astype('float32'), np.array(pos).astype('float32'), np.array(neg).astype('float32')], [np.array(uid_meta).astype('float32'), np.array(uid_interaction).astype('float32')]

In [78]:
gen = generator(
    items=items_ohe.drop(["item_id"], axis=1),
    users=users_ohe.drop(["user_id"], axis=1),
    interactions=interactions_vec,
)

ret = next(gen)


print(f"вектор фичей юзера: {ret[0][0].shape}")
print(f"вектор взаимодействий юзера с айтемами: {ret[0][1].shape}")
print(f"вектор 'хорошего' айтема: {ret[0][2].shape}")
print(f"вектор 'плохого' айтема: {ret[0][3].shape}")
print()
print(f"вектор фичей юзера: {ret[1][0].shape}")
print(f"вектор взаимодействий юзера с айтемами: {ret[1][1].shape}")

вектор фичей юзера: (32, 16)
вектор взаимодействий юзера с айтемами: (32, 6901)
вектор 'хорошего' айтема: (32, 484)
вектор 'плохого' айтема: (32, 484)

вектор фичей юзера: (32, 16)
вектор взаимодействий юзера с айтемами: (32, 6901)


In [79]:
N_FACTORS = 128

ITEM_MODEL_SHAPE = (items_ohe.drop(["item_id"], axis=1).shape[1], )
USER_META_MODEL_SHAPE = (users_ohe.drop(["user_id"], axis=1).shape[1], )

USER_INTERACTION_MODEL_SHAPE = (interactions_vec.shape[1], )

print(f"N_FACTORS: {N_FACTORS}")
print(f"ITEM_MODEL_SHAPE: {ITEM_MODEL_SHAPE}") # add text features
print(f"USER_META_MODEL_SHAPE: {USER_META_MODEL_SHAPE}")
print(f"USER_INTERACTION_MODEL_SHAPE: {USER_INTERACTION_MODEL_SHAPE}")

N_FACTORS: 128
ITEM_MODEL_SHAPE: (484,)
USER_META_MODEL_SHAPE: (16,)
USER_INTERACTION_MODEL_SHAPE: (6901,)


In [80]:
def item_model(n_factors=N_FACTORS):
    inp = keras.layers.Input(shape=ITEM_MODEL_SHAPE)

    layer_1 = keras.layers.Dense(N_FACTORS, activation='elu', use_bias=False,
                               kernel_regularizer=keras.regularizers.l2(1e-6),
                               activity_regularizer=keras.regularizers.l2(l2=1e-6))(inp)

    layer_2 = keras.layers.Dense(N_FACTORS, activation='elu', use_bias=False,
                             kernel_regularizer=keras.regularizers.l2(1e-6),
                             activity_regularizer=keras.regularizers.l2(l2=1e-6))(layer_1)

    add = keras.layers.Add()([layer_1, layer_2])

    out = keras.layers.Dense(N_FACTORS, activation='linear', use_bias=False,
                             kernel_regularizer=keras.regularizers.l2(1e-6),
                             activity_regularizer=keras.regularizers.l2(l2=1e-6))(add)

    return keras.models.Model(inp, out)


def user_model(n_factors=N_FACTORS):
    inp_meta = keras.layers.Input(shape=USER_META_MODEL_SHAPE)
    inp_interaction = keras.layers.Input(shape=USER_INTERACTION_MODEL_SHAPE)

    layer_1_meta = keras.layers.Dense(N_FACTORS, activation='elu', use_bias=False,
                                 kernel_regularizer=keras.regularizers.l2(1e-6),
                                 activity_regularizer=keras.regularizers.l2(l2=1e-6))(inp_meta)

    layer_1_interaction = keras.layers.Dense(N_FACTORS, activation='elu', use_bias=False,
                                 kernel_regularizer=keras.regularizers.l2(1e-6),
                                 activity_regularizer=keras.regularizers.l2(l2=1e-6))(inp_interaction)

    layer_2_meta = keras.layers.Dense(N_FACTORS, activation='elu', use_bias=False,
                                 kernel_regularizer=keras.regularizers.l2(1e-6),
                                 activity_regularizer=keras.regularizers.l2(l2=1e-6))(layer_1_meta)


    add = keras.layers.Add()([layer_1_meta, layer_2_meta])

    concat_meta_interaction = keras.layers.Concatenate()([add, layer_1_interaction])

    out = keras.layers.Dense(N_FACTORS, activation='linear', use_bias=False,
                             kernel_regularizer=keras.regularizers.l2(1e-6),
                             activity_regularizer=keras.regularizers.l2(l2=1e-6))(concat_meta_interaction)

    return keras.models.Model([inp_meta, inp_interaction], out)

In [81]:
i2v = item_model()
u2v = user_model()

In [82]:
ancor_meta_in = keras.layers.Input(shape=USER_META_MODEL_SHAPE)
ancor_interaction_in = keras.layers.Input(shape=USER_INTERACTION_MODEL_SHAPE)

pos_in = keras.layers.Input(shape=ITEM_MODEL_SHAPE)
neg_in = keras.layers.Input(shape=ITEM_MODEL_SHAPE)

ancor = u2v([ancor_meta_in, ancor_interaction_in])
pos = i2v(pos_in)
neg = i2v(neg_in)

res = keras.layers.Concatenate(name="concat_ancor_pos_neg")([ancor, pos, neg])

model = keras.models.Model([ancor_meta_in, ancor_interaction_in, pos_in, neg_in], res)

In [83]:
model_name = 'recsys_resnet_linear'

decay = keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=2, factor=0.8, verbose=1)
early_stopping =  keras.callbacks.EarlyStopping(monitor='loss', patience=4)

opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss=triplet_loss, optimizer=opt, run_eagerly=True)

In [84]:
item_model().summary()

Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_38 (InputLayer)          [(None, 484)]        0           []                               
                                                                                                  
 dense_49 (Dense)               (None, 128)          61952       ['input_38[0][0]']               
                                                                                                  
 dense_50 (Dense)               (None, 128)          16384       ['dense_49[0][0]']               
                                                                                                  
 add_14 (Add)                   (None, 128)          0           ['dense_49[0][0]',               
                                                                  'dense_50[0][0]']        

In [85]:
user_model().summary()

Model: "model_19"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_39 (InputLayer)          [(None, 16)]         0           []                               
                                                                                                  
 dense_52 (Dense)               (None, 128)          2048        ['input_39[0][0]']               
                                                                                                  
 dense_54 (Dense)               (None, 128)          16384       ['dense_52[0][0]']               
                                                                                                  
 input_40 (InputLayer)          [(None, 6901)]       0           []                               
                                                                                           

In [86]:
model.summary()

Model: "model_17"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_34 (InputLayer)          [(None, 16)]         0           []                               
                                                                                                  
 input_35 (InputLayer)          [(None, 6901)]       0           []                               
                                                                                                  
 input_36 (InputLayer)          [(None, 484)]        0           []                               
                                                                                                  
 input_37 (InputLayer)          [(None, 484)]        0           []                               
                                                                                           

In [87]:
%%time
model.fit(
    generator(
        items=items_ohe.drop(["item_id"], axis=1),
        users=users_ohe.drop(["user_id"], axis=1),
        interactions=interactions_vec,
        batch_size=16,
    ),
    steps_per_epoch=100,
    epochs=100,
    initial_epoch=0,
    callbacks=[decay, early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 14: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 20: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 24: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 28: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 31: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 35: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 36/100
Epoch 37/100
Epoch 37: ReduceLROnPlateau reducing learning rate to 0.000209715

<keras.callbacks.History at 0x7fb862bfa910>

In [99]:
items_feats = items_ohe.drop(["item_id"], axis=1).to_numpy().astype('float32')
items_vecs = i2v.predict(items_feats)



In [108]:
def get_recs_for_user_batch(user_ids):
  uids = [user_id_to_uid[user_id] for user_id in user_ids]
  user_meta_feats = users_ohe.drop(["user_id"], axis=1).iloc[uids]
  user_interaction_vec = interactions_vec[uids]

  user_vecs = u2v.predict([np.array(user_meta_feats), np.array(user_interaction_vec)])

  dists = ED(user_vecs, items_vecs)
  top10_iids = np.argsort(dists, axis=1)[:,:10]
  top10_iids_items = [iid_to_item_id[iid] for iid in top10_iids.reshape(-1)]
  top10_iids_items = np.array(top10_iids_items).reshape(top10_iids.shape)
  return top10_iids_items

# Prepare offline predictions

In [101]:
def get_dssm_reco(user_id):
    uid = user_id_to_uid[user_id]
    user_meta_feats = users_ohe.drop(["user_id"], axis=1).iloc[uid]
    user_interaction_vec = interactions_vec[uid]
    user_vec = u2v.predict(
        [np.array(user_meta_feats).reshape(1, -1), np.array(user_interaction_vec).reshape(1, -1)],
        verbose=False,
    )
    dists = ED(user_vec, items_vecs)
    top10_iids = np.argsort(dists, axis=1)[:, :10]
    top10_iids_item = [iid_to_item_id[iid] for iid in top10_iids.reshape(-1)]
    return top10_iids_item

In [102]:
recos = []

for user_id in tqdm(interactions.user_id.unique()):
    recos.append(get_dssm_reco(user_id))

df_recos = pd.DataFrame({'user_id':interactions.user_id.unique(), 'reco': recos})


100%|██████████| 139210/139210 [1:45:06<00:00, 22.08it/s]   


KeyError: "Column(s) ['rec'] do not exist"

In [106]:
recos = (
    df_recos.groupby(["user_id"])
    .agg({"reco": lambda x: x})
    .reset_index()
    .set_index("user_id")
    .to_dict()["reco"]
)

In [107]:
with open(f"{RESULTS_PATH}/dssm_recs.pkl", "wb") as file:
    pickle.dump(recos, file)

df_recos.to_csv(f"{RESULTS_PATH}/dssm_preds.csv")

In [56]:
recos = pd.read_csv(f"{RESULTS_PATH}/dssm_preds.csv")
recos.head()

Unnamed: 0.1,Unnamed: 0,user_id,reco
0,0,176549,"[4702, 8444, 341, 1000, 11982, 14359, 5490, 20..."
1,1,699317,"[8444, 14359, 4702, 11982, 2028, 14, 1000, 445..."
2,2,864613,"[3598, 8444, 1000, 4702, 2028, 7745, 1436, 138..."
3,3,1016458,"[4702, 8444, 1000, 11982, 14359, 3598, 2028, 3..."
4,4,884009,"[8444, 4702, 2028, 11982, 341, 3598, 14359, 52..."


In [57]:
recos.reco = recos.reco.apply(lambda x: x.replace(" ", "")[1:-1].split(','))
recos.reco = recos.reco.apply(lambda x: [int(value) for value in x])
recos_dict = {key: value for key, value in zip(recos.index, recos.reco.to_numpy() )}

In [58]:
with open(f"{RESULTS_PATH}/dssm_recs.dill", "wb") as file:
    pickle.dump(recos_dict, file)