In [68]:
import os
import random

import numpy as np
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as plt
import seaborn as sns

# Random seed for reproducibility
SEED = 42

random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [69]:
item_type = pd.read_csv(
    "data/data_ICM_type.csv",
    usecols=["item_id", "feature_id"],
    dtype={0: "Int64", 1: "Int64"},
).set_index("item_id")
item_length = pd.read_csv(
    "data/data_ICM_length.csv",
    usecols=["item_id", "data"],
    dtype={0: "Int64", 1: "Int64"},
).set_index("item_id")
item_length = item_length.rename(columns={"data": "length"})
interactions = pd.read_csv(
    "data/interactions_and_impressions.csv",
    dtype={0: "Int64", 1: "Int64", 2: str, 3: "Int64"},
)
interactions = interactions.rename(
    columns={
        "UserID": "user_id",
        "ItemID": "item_id",
        "Data": "data",
        "Impressions": "impressions",
    }
)
target_users = pd.read_csv(
    "data/data_target_users_test.csv",
    usecols=["user_id"],
    dtype={0: "Int64"},
)
views = interactions[interactions["data"] == 0].drop(["data", "impressions"], axis=1)
details = interactions[interactions["data"] == 1].drop(["data", "impressions"], axis=1)
print("item_type", item_type.shape)
print("item_length", item_length.shape)
print("interactions", interactions.shape)
print("views", views.shape)
print("details", details.shape)


item_type (23091, 1)
item_length (23091, 1)
interactions (5826506, 4)
views (3567479, 2)
details (2259027, 2)


In [70]:
views["views_count"] = 1
views_count = views.groupby(["user_id", "item_id"], as_index = False)["views_count"].sum()
details["details_count"] = 1
details_count = details.groupby(["user_id", "item_id"], as_index = False)["details_count"].sum()

view_details_count = views_count.set_index(['user_id', 'item_id']).join(
    details_count.set_index(['user_id', 'item_id']), on=["user_id", "item_id"], how='outer'
)
view_details_count.loc[view_details_count['details_count'].isna(), 'details_count'] = 0
view_details_count.loc[view_details_count['views_count'].isna(), 'views_count'] = 0
view_details_count = view_details_count.join(item_length, on="item_id")
view_details_count = view_details_count.join(item_type, on="item_id")
view_details_count = view_details_count.reset_index()
view_details_count

Unnamed: 0,user_id,item_id,views_count,details_count,length,feature_id
0,0,21,3.0,9.0,27.0,4
1,0,124,1.0,0.0,3.0,1
2,0,808,1.0,0.0,1.0,1
3,0,1326,1.0,3.0,3.0,1
4,0,1995,1.0,2.0,17.0,4
...,...,...,...,...,...,...
1554635,41628,11228,0.0,1.0,30.0,4
1554636,41628,15033,0.0,1.0,1.0,3
1554637,41628,15181,0.0,1.0,8.0,4
1554638,41628,20896,0.0,1.0,,


In [71]:
from ast import literal_eval
mask_with_impressions = ~interactions["impressions"].isna()
interactions.loc[mask_with_impressions, "impressions"] = interactions.loc[mask_with_impressions, "impressions"].apply(literal_eval)
interactions

Unnamed: 0,user_id,item_id,impressions,data
0,0,11,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
1,0,21,,0
2,0,21,,0
3,0,21,"(20, 21, 22, 23, 24, 25, 26, 27, 28, 29)",0
4,0,21,,1
...,...,...,...,...
5826501,41628,20448,,0
5826502,41628,20896,,1
5826503,41628,21506,,1
5826504,41628,22882,,0


In [72]:
interactions.loc[interactions["user_id"] == 0].head(30)

Unnamed: 0,user_id,item_id,impressions,data
0,0,11,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
1,0,21,,0
2,0,21,,0
3,0,21,"(20, 21, 22, 23, 24, 25, 26, 27, 28, 29)",0
4,0,21,,1
5,0,21,,1
6,0,21,,1
7,0,21,,1
8,0,21,,1
9,0,21,,1


In [73]:
exploded = interactions[mask_with_impressions].explode("impressions")
exploded

Unnamed: 0,user_id,item_id,impressions,data
0,0,11,0,1
0,0,11,1,1
0,0,11,2,1
0,0,11,3,1
0,0,11,4,1
...,...,...,...,...
5443853,36018,65,2341,0
5443853,36018,65,361,0
5443853,36018,65,1300,0
5443853,36018,65,1503,0


In [79]:
exploded["impressions_count"] = 1
impressions_count = exploded.groupby(["user_id", "impressions"], as_index = False)["impressions_count"].sum()
impressions_count = impressions_count.rename(columns={"impressions": "item_id"})
impressions_count

Unnamed: 0,user_id,item_id,impressions_count
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
1605037,36018,1675,1
1605038,36018,1906,1
1605039,36018,2340,1
1605040,36018,2341,1


In [80]:
# Remove impressions of seen items
merged = pd.merge(impressions_count, view_details_count, on=["user_id", "item_id"], how="left", indicator="exists")
impressions_count = impressions_count.loc[merged["exists"] != "both"]
impressions_count.reset_index()
impressions_count.head(30)

Unnamed: 0,user_id,item_id,impressions_count
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
5,0,5,1
6,0,6,1
7,0,7,1
8,0,8,1
9,0,9,1
