In [1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("../../")
print(os.getcwd())

C:\Users\Milosz\thesis-recsys


In [2]:
import pandas as pd
import numpy as np
import functools
import operator
import json
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import pickle

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch_geometric
import torch_geometric.transforms as T
from torch_geometric.sampler import NegativeSampling
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.data import HeteroData
from torch_geometric.utils import to_scipy_sparse_matrix

from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

from models.gnn.sage import GraphSAGE
from scripts.train_graph import train_epoch, test

torch.set_printoptions(precision=2, sci_mode=False)
torch.manual_seed(0)

<torch._C.Generator at 0x24104af99d0>

In [3]:
from ast import literal_eval
from time import time

In [4]:
dir_art = 'data/steam'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [5]:
with open(os.path.join(dir_art, 'data.pkl'), "rb") as f:
    data = pd.read_pickle(f)

train_set = data['relations_datastore'].dataframe.train
supervision_set = data['relations_datastore'].dataframe.supervision
valid_set = data['relations_datastore'].dataframe.valid
item_attr = data['items_datastore'].dataframe.df
user_attr = data['users_datastore'].dataframe.df

train_set = train_set.sort_values(by=['user_id', 'app_id']).reset_index(drop=True)
supervision_set = supervision_set.sort_values(by=['user_id', 'app_id']).reset_index(drop=True)
valid_set = valid_set.sort_values(by=['user_id', 'app_id']).reset_index(drop=True)

train_set['app_id'] = train_set['app_id'] + 1
supervision_set['app_id'] =supervision_set['app_id'] + 1
valid_set['app_id'] = valid_set['app_id'] + 1

train_users = supervision_set['user_id'].unique()
valid_users = valid_set['user_id'].unique()

In [6]:
RELATIONS = {"developer": "developed", "publisher": "published", "OS": "system", "tags": "tag"}
RELATIONS_MAP = {k: v for v, k in enumerate(["developed", "published", "system", "tag"])}
ENTITY_MAP = None

In [7]:
def read_items_data(path: str, path_additional: str, data: dict):
    df_items = pd.read_csv(path)
    df_items_additional_info = pd.read_csv(path_additional)[['appid', 'developer', 'publisher']].rename(
        columns={"appid": "app_id"})
    df_items[['win', 'mac', 'linux']] = df_items[['win', 'mac', 'linux']].astype(int)
    df_items['OS'] = df_items[['win', 'mac', 'linux']].apply(lambda row: [row.index[i] for i, v in enumerate(row) if v],
                                                             axis=1)
    df_items['tags'] = df_items['tags'].apply(literal_eval)
    df_merged = df_items.merge(df_items_additional_info, on="app_id", how="left")[
        ['app_id', 'title', 'developer', 'publisher', 'OS', 'tags']]
    df_merged['app_id'] = df_merged['app_id'].map(data['relations_datastore'].mapping['app_id'])
    df_merged = df_merged.dropna(subset='app_id').sort_values(by='app_id')
    df_merged['developer'] = df_merged['developer'].str.split(';')
    df_merged['publisher'] = df_merged['publisher'].str.split(';')
    return df_merged

In [8]:
def create_exploded_df(df: pd.DataFrame):
    global ENTITY_MAP
    df_exploded = []
    for k, v in RELATIONS.items():
        df_temp = df[['app_id', k]].explode(k).dropna()
        df_temp['relation'] = v
        df_temp['tail'] = df_temp[k]
        df_temp['head'] = df_temp['app_id']
        df_temp = df_temp[['head', 'relation', 'tail']]
        df_exploded.append(df_temp)
    df_exploded = pd.concat(df_exploded)
    df_exploded['head'] = df_exploded['head'] + 1
    df_exploded['relation'] = df_exploded['relation'].map(RELATIONS_MAP)
    n_entities = df_exploded['head'].max()
    ENTITY_MAP = {k: v for v, k in enumerate(df_exploded['tail'].unique(), int(n_entities)+1)}
    df_exploded['tail'] = df_exploded['tail'].map(ENTITY_MAP)
    return df_exploded.astype(int)

In [9]:
def get_user_item_list(edge_index: pd.DataFrame):
    tqdm.pandas()
    user_apps = edge_index.groupby("user_id").progress_apply(lambda x: x['app_id'].values).values
    return user_apps

In [10]:
def sample_paths(x, K=0.3):
    unique, counts = np.unique(x['tail'].values, return_counts=True)
    probabilities = counts / counts.sum()
    random_tails = np.random.choice(np.arange(counts.shape[0]), size=min(max(int(counts.shape[0] * K), 1), 3), p=probabilities)
    heads = unique[random_tails]
    return heads

In [11]:
def get_ripple_set1(df_exploded, user_apps, users):
    if users is None:
        users = np.arange(user_apps.shape[0])
    
    ripple_set1 = []
    n_users = len(users)
    for u in tqdm(users):
        df_ripple_set1 = df_exploded[np.isin(df_exploded.values[:, 0], user_apps[u])] \
            .groupby("head") \
            .apply(lambda x: x.sample(n=min(10, len(x)))).values
        ripple_set1.append(df_ripple_set1)
        
    ripple_index = [i.shape[0] for i in ripple_set1]
    ripple_index_1 = np.repeat(users, ripple_index)
    ripple_index_2 = np.concatenate([np.arange(i) for i in ripple_index])
    multiindex = pd.MultiIndex.from_arrays([ripple_index_1, ripple_index_2], names=['user_id', 'id'])
    
    df_ripple_set1 = pd.DataFrame(np.concatenate(ripple_set1), index=multiindex, columns=['heads', 'relations', 'tails'])

    return df_ripple_set1

In [12]:
def get_ripple_set2(df_exploded, df_ripple_set1, users):
    if users is None:
        users = np.arange(df_ripple_set1.shape[0])
    
    ripple_set2 = []
    n_users = df_ripple_set1.index.get_level_values(0).nunique()
    for u in tqdm(users):
        dff = df_ripple_set1.loc[u]
        heads = dff.groupby('relation').apply(sample_paths).explode().values
        df_ripple_set2 = df_exploded[df_exploded['head'].isin(heads)] \
            .groupby('head') \
            .apply(lambda x: x.sample(n=min(10, len(x))))
        df_ripple_set2 = df_ripple_set2[~df_ripple_set2['tail'].isin(dff['head'])].values
        ripple_set2.append(df_ripple_set2)

    ripple_index = [i.shape[0] for i in ripple_set2]
    ripple_index_1 = np.repeat(users, ripple_index)
    ripple_index_2 = np.concatenate([np.arange(i) for i in ripple_index])
    multiindex = pd.MultiIndex.from_arrays([ripple_index_1, ripple_index_2], names=['user_id', 'id'])

    df_ripple_set2 = pd.DataFrame(np.concatenate(ripple_set2), index=multiindex, columns=['head', 'relation', 'tail'])

    return df_ripple_set2

In [13]:
df_items = read_items_data(path="data/items.csv", path_additional="data/steam.csv", data=data)
df_exploded = create_exploded_df(df_items)
df_exploded2 = df_exploded.copy(deep=True)
df_exploded2[['head', 'tail']] = df_exploded2[['tail', 'head']].values
print("> Exploded DataFrame created")

> Exploded DataFrame created


In [15]:
df_exploded

Unnamed: 0,head,relation,tail
9695,1,0,1232
9695,1,0,1233
9695,1,0,1234
17005,2,0,1235
11952,3,0,1236
...,...,...,...
5484,1231,3,2326
5484,1231,3,2231
5484,1231,3,2240
5484,1231,3,2267


In [87]:
user_item_list_train = get_user_item_list(train_set)
user_item_list_valid = get_user_item_list(pd.concat([train_set, supervision_set]))
print("> User-Items lists created")

  0%|          | 0/3066721 [00:00<?, ?it/s]

  0%|          | 0/3066721 [00:00<?, ?it/s]

> User-Items lists created


In [7]:
with open('data/steam/knowledge_graph.pkl', "rb") as f:
    knowledge_graph = pd.read_pickle(f)
    
train_set = knowledge_graph["train_set"]
supervision_set = knowledge_graph["supervision_set"]
valid_set = knowledge_graph["valid_set"]
ripple_sets_train = knowledge_graph['ripple_sets_train']
ripple_sets_valid = knowledge_graph['ripple_sets_valid']

In [111]:
ripple_sets_train = []
ripple_sets_train.append(get_ripple_set1(df_exploded=df_exploded, user_apps=user_item_list_train, users=train_users))
ripple_sets_train.append(get_ripple_set2(df_exploded=df_exploded2, df_ripple_set1=ripple_sets_train[0], users=train_users))

ripple_sets_valid = []
ripple_sets_valid.append(get_ripple_set1(df_exploded=df_exploded, user_apps=user_item_list_valid, users=valid_users))
ripple_sets_valid.append(get_ripple_set2(df_exploded=df_exploded2, df_ripple_set1=ripple_sets_valid[0], users=valid_users))
print("> Ripple Sets created")

  0%|          | 0/3066721 [00:00<?, ?it/s]


KeyboardInterrupt



In [82]:
knowledge_graph = {
    "train_set": train_set,
    "supervision_set": supervision_set,
    "valid_set": valid_set,
    "ripple_sets_train": ripple_sets_train,
    "ripple_sets_valid": ripple_sets_valid,
    "relations_map": RELATIONS_MAP,
    "entity_map": ENTITY_MAP
}

In [83]:
# with open(os.path.join(dir_art, "knowledge_graph.pkl"), "wb") as f:
#     pickle.dump(knowledge_graph, f)
# print("> Knowledge Graph Saved")

> Knowledge Graph Saved


In [8]:
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

In [9]:
def collate_fn(batch):
    edge_index, ripple_sets = zip(*batch)
    edge_index = torch.tensor(np.array(edge_index))
    
    rs_pad_tensor = []
    for i in range(2):
        rs = [torch.tensor(rs[i]) for rs in ripple_sets]
        rs_pad_tensor.append(pad_sequence(rs, batch_first=True, padding_value=0))
            
    return edge_index, rs_pad_tensor

class RippleDataset(Dataset):
    def __init__(self, edge_index, edge_label, ripple_sets):
        super(RippleDataset).__init__()
        self.edge_index = edge_index
        self.edge_label = edge_label
        self.df_ripple_set1 = ripple_sets[0]
        self.df_ripple_set2 = ripple_sets[1]

        self.n_users = ripple_sets[0].index.get_level_values(0).nunique()

    def __len__(self):
        return self.n_users

    def __getitem__(self, idx):
        #ei = self.edge_index.loc[idx].values
        el = self.edge_label.loc[idx].values
        ripple1 = self.df_ripple_set1.loc[el[0]]
        ripple2 = self.df_ripple_set2.loc[el[0]]
        
        ripple1 = ripple1.sample(n=min(750, ripple1.shape[0])).values
        ripple2 = ripple2.sample(n=min(750, ripple2.shape[0])).values

        return el, [ripple1, ripple2]

In [10]:
#vals = [train_dataset[i][1][0].shape for i in tqdm(df_ripple_set1.index.get_level_values(0).unique())]
# import matplotlib.pyplot as plt

# # Sample data
# x = df_ripple_set1.index.get_level_values(0).unique()
# y = vals

# # Create a line plot
# plt.plot(x, y)

# # Add labels and title
# plt.xlabel('X-axis Label')
# plt.ylabel('Y-axis Label')
# plt.title('Simple Line Plot')

# # Show the plot
# plt.show()


In [11]:
class RippleNet(nn.Module):
    def __init__(self, emb_dim, n_relations, n_entities):
        super(RippleNet, self).__init__()
        self.relation_emb = nn.Embedding(n_relations, emb_dim * emb_dim, padding_idx=0)
        self.entity_emb = nn.Embedding(n_entities, emb_dim, padding_idx=0)

        self.emb_dim = emb_dim
        self.n_relations = n_relations
        self.n_items = n_entities

    def forward(self, edge_index, ripple_sets):
        item_emb = self.entity_emb(edge_index[:, 1]).unsqueeze(-1)
        u = torch.zeros_like(item_emb)
        for ripple_set in ripple_sets:
            R = self.relation_emb(ripple_set[:, :, 1]).view(ripple_set.shape[0], -1, self.emb_dim, self.emb_dim)
            h = self.entity_emb(ripple_set[:, :, 0]).unsqueeze(-1)
            Rh = torch.matmul(R, h).squeeze(-1)
            vRh = torch.matmul(Rh, item_emb).squeeze(-1)
            p = F.softmax(vRh, dim=1).unsqueeze(-1)
            t = self.entity_emb(ripple_set[:, :, 0])
            u += torch.sum(p * t, axis=1).unsqueeze(-1)

        return torch.sum(u * item_emb, axis=1)

In [12]:
train_rs0_users, train_rs1_users = (ripple_sets_train[i].index.get_level_values(0).unique() for i in range(2))
valid_rs0_users, valid_rs1_users = (ripple_sets_valid[i].index.get_level_values(0).unique() for i in range(2))

print(np.setdiff1d(train_rs0_users, train_rs1_users))
print(np.setdiff1d(valid_rs0_users, valid_rs1_users))

[1966277 2653187]
[]


In [13]:
max(knowledge_graph['entity_map'].values())

2563

In [14]:
ripple_sets_train[0] = ripple_sets_train[0].drop([1966277, 2653187])#.loc[1966277]
ripple_sets_valid[0] = ripple_sets_valid[0].drop([1966277, 2653187])
ripple_sets_valid[1] = ripple_sets_valid[1].drop([1966277, 2653187])
supervision_set = supervision_set[~supervision_set['user_id'].isin([1966277, 2653187])].reset_index(drop=True)
valid_set = valid_set[~valid_set['user_id'].isin([1966277, 2653187])].reset_index(drop=True)

In [15]:
train_dataset = RippleDataset(train_set, supervision_set, ripple_sets_train)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, collate_fn=collate_fn, drop_last=True, pin_memory=True, num_workers=2)

valid_dataset = RippleDataset(pd.concat([train_set, supervision_set]), valid_set, ripple_sets_valid)
valid_loader = DataLoader(valid_dataset, batch_size=1024, shuffle=True, collate_fn=collate_fn, drop_last=True)

In [None]:
if __name__ == "__main__":
    next(iter(train_loader))

In [None]:
%%timeit -r 2
next(iter(train_loader))

In [69]:
def train_epoch(model, criterion, optimizer, train_loader, device):
    model.train()

    running_loss = 0.
    preds, ground_truths = [], []
    for i_batch, (edge_index, ripple_sets) in enumerate(tqdm(train_loader)):
        neg_sampl = edge_index.clone().repeat(2, 1)
        neg_sampl[:, 1] = torch.randint(low=1, high=1232, size=(train_loader.batch_size*2, ))
        edge_index = torch.cat([edge_index, neg_sampl]).to(device)
        
        ripple_sets = [rs.repeat(1+2,1,1).to(device) for rs in ripple_sets]
        y_true = torch.cat([
            torch.tensor([1.0] * train_loader.batch_size),
            torch.tensor([0.0] * (train_loader.batch_size * 2))
        ]).unsqueeze(-1).to(device)
        
        y_pred = model(edge_index, ripple_sets)
        loss = criterion(y_pred, y_true)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        preds.append(y_pred)
        ground_truths.append(y_true)
        running_loss += loss.item()

    pred = torch.cat(preds, dim=0).detach().sigmoid().cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).detach().cpu().numpy()
    train_loss = running_loss / len(train_loader)
    train_roc_auc = roc_auc_score(ground_truth, pred)

    return train_loss, train_roc_auc


@torch.no_grad()
def test(model, criterion, val_loader, device):
    model.eval()

    running_loss = 0.
    preds, ground_truths = [], []

    for i_batch, (edge_index, ripple_sets) in enumerate(tqdm(val_loader)):
        neg_sampl = edge_index.clone().repeat(2, 1)
        neg_sampl[:, 1] = torch.randint(low=1, high=1232, size=(train_loader.batch_size*2, ))
        edge_index = torch.cat([edge_index, neg_sampl]).to(device)
        
        ripple_sets = [rs.repeat(1+2,1,1).to(device) for rs in ripple_sets]
        y_true = torch.cat([
            torch.tensor([1.0] * train_loader.batch_size),
            torch.tensor([0.0] * (train_loader.batch_size * 2))
        ]).unsqueeze(-1).to(device)

        y_pred = model(edge_index, ripple_sets)
        loss = criterion(y_pred, y_true)

        preds.append(y_pred)
        ground_truths.append(y_true)
        running_loss += loss.item()

    pred = torch.cat(preds, dim=0).sigmoid().cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()

    test_loss = running_loss / len(val_loader)
    test_roc_auc = roc_auc_score(ground_truth, pred)

    return test_loss, test_roc_auc

In [70]:
model = RippleNet(emb_dim=16, n_relations=4, n_entities=max(ENTITY_MAP.values())).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=1e-4, momentum=0.9)

In [71]:
for epoch in tqdm(range(20)):
    train_loss, train_roc_auc = train_epoch(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        device=device
    )
    test_loss, test_roc_auc = test(
        model=model,
        criterion=criterion,
        val_loader=valid_loader,
        device=device
    )
    print(f"""Epoch <{epoch}>\ntrain_loss: {train_loss} - train_roc_auc: {train_roc_auc}
test_loss: {test_loss} - test_roc_auc: {test_roc_auc}\n""")

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <0>
train_loss: 1.7840057186196359 - train_roc_auc: 0.5353931910107345
test_loss: 1.5635305389444878 - test_roc_auc: 0.5586266506696201



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <1>
train_loss: 1.3789446611752672 - train_roc_auc: 0.5810771933665646
test_loss: 1.2580251676816467 - test_roc_auc: 0.601430214648815



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <2>
train_loss: 1.1174290836527105 - train_roc_auc: 0.6261641845387381
test_loss: 1.0353109039736133 - test_roc_auc: 0.6466166950058848



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <3>
train_loss: 0.9309341026825851 - train_roc_auc: 0.6700285990710603
test_loss: 0.8792063659387277 - test_roc_auc: 0.6870114052747195



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <4>
train_loss: 0.7936314496766316 - train_roc_auc: 0.709599626092895
test_loss: 0.7644487238944845 - test_roc_auc: 0.7215143107329388



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <5>
train_loss: 0.6948664760991429 - train_roc_auc: 0.7423466259655724
test_loss: 0.677873303070136 - test_roc_auc: 0.7516070281612005



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <6>
train_loss: 0.6216177644019716 - train_roc_auc: 0.7699526596413224
test_loss: 0.6179547153466137 - test_roc_auc: 0.7736429890320639



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <7>
train_loss: 0.5701959985360671 - train_roc_auc: 0.7920141243946672
test_loss: 0.5732923644654294 - test_roc_auc: 0.7923428764328153



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <8>
train_loss: 0.5323665001419153 - train_roc_auc: 0.809674580325052
test_loss: 0.5411522081980469 - test_roc_auc: 0.8070395936621044



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <9>
train_loss: 0.50560608054145 - train_roc_auc: 0.8234293548416841
test_loss: 0.5175669661439057 - test_roc_auc: 0.8187377501299165



  0%|          | 0/356 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

Epoch <10>
train_loss: 0.4855605529600315 - train_roc_auc: 0.8345403916651275
test_loss: 0.5011881678662402 - test_roc_auc: 0.8275190556584802



  0%|          | 0/356 [00:00<?, ?it/s]


KeyboardInterrupt



In [None]:
n_entites

In [None]:
df_ripple_set2

In [None]:
# knowledge_graph = {}
# for name, relation in RELATIONS.items():
#     idx = df_merged[['app_id', name]].explode(name).dropna()
#     idx[name] = idx[name].map(entity_maps[RELATIONS[name]])
#     idx = idx.values.astype(int)
#     knowledge_graph[relation] = np.zeros((n_items, (len(sorted(df_merged[name].explode().dropna().unique())))))
#     knowledge_graph[relation][idx[:, 0], idx[:, 1]] = 1

In [None]:
#train_set_1m = train_set[:(train_set.user_id == int(1e6)).values.nonzero()[0][0]]

In [None]:
# F.softmax((torch.rand(25, 73, 16) @ torch.rand(25, 16, 1)).squeeze(-1), dim=1).shape

# (torch.rand(25, 10, 16, 16) @ torch.rand(25, 10, 16, 1)).shape

# torch.tensor([[[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]], [[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]]]).shape

# torch.tensor([[1,1,1], [2,2,2]]).unsqueeze(-1).shape

# (torch.tensor([[1,1,1], [2,2,2]]).unsqueeze(-1) * torch.tensor([[[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]], [[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]]]))

# torch.sum((torch.rand(25, 73, 1) * torch.rand(25, 73, 16)), axis=1).shape

In [None]:
# ripple_set1_dict = {"head":[], "relation": [], "tail": []}
# ripple_index = [[], []]
# for u in tqdm(range(0, 1000000)):
#     apps = train_set[train_set.user_id == u].app_id.values
#     user_apps = df_merged[df_merged.app_id.isin(apps)]
#     heads_u, relations_u, tails_u = [], [], []
#     for key, rel in RELATIONS.items():
#         user_apps_relation = user_apps[['app_id', key]].dropna()['app_id'].values.astype(int)
#         if not len(user_apps_relation):
#             continue
#         tails = [i.nonzero()[0] for i in knowledge_graph[RELATIONS[key]][user_apps_relation]]
#         offsets = [len(i) for i in tails]
        
#         heads = list(np.repeat(user_apps_relation, offsets))
#         relations = [rel] * len(heads)
#         tails = list(np.concatenate(tails))

#         heads_u.extend(heads)
#         relations_u.extend(relations)
#         tails_u.extend(tails)

#     ripple_index[0].extend(np.repeat(u, len(heads_u)))
#     ripple_index[1].extend(np.arange(len(heads_u)))

#     ripple_set1_dict['head'].extend(heads_u)
#     ripple_set1_dict['relation'].extend(relations_u)
#     ripple_set1_dict['tail'].extend(tails_u)
    
# df_ripple_set1 = pd.DataFrame(ripple_set1_dict, index=ripple_index)
# df_ripple_set1.index = df_ripple_set1.index.set_names(['user_id', 'id'])