In [1]:
import os
import random

import numpy as np
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as plt
import seaborn as sns

# One seed shared by every random number generator used in this notebook
SEED = 42

# Environment values must be strings, hence the str() conversion
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [6]:
# Raw log: read with positional dtypes, then normalise the headers to snake_case
interactions_and_impressions = pd.read_csv(
    "data/interactions_and_impressions.csv",
    dtype={0: "Int64", 1: "Int64", 2: str, 3: "Int64"},
).rename(
    columns={
        "UserID": "user_id",
        "ItemID": "item_id",
        "Data": "data",
        "Impressions": "impressions",
    }
)

# Previously processed interactions and their train/validation split
interactions = pd.read_csv("data/processed/interactions.csv")
interactions_train = pd.read_csv("data/processed/splitted/interactions_train.csv")
interactions_val = pd.read_csv("data/processed/splitted/interactions_val.csv")

# One shape per line, in loading order
print(
    interactions_and_impressions.shape,
    interactions.shape,
    interactions_train.shape,
    interactions_val.shape,
    sep="\n",
)

(5826506, 4)
(1554640, 7)
(1321444, 7)
(233196, 7)


In [7]:
from ast import literal_eval

# Rows that actually carry an impressions list
mask_with_impressions = ~interactions_and_impressions["impressions"].isna()

# Turn the textual impressions into Python objects. The column is overwritten
# in place, so only values that are still strings are parsed: this keeps the
# cell safe to re-run (literal_eval raises ValueError when handed an
# already-parsed tuple).
interactions_and_impressions.loc[mask_with_impressions, "impressions"] = (
    interactions_and_impressions.loc[mask_with_impressions, "impressions"].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )
)
interactions_and_impressions

Unnamed: 0,user_id,item_id,impressions,data
0,0,11,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
1,0,21,,0
2,0,21,,0
3,0,21,"(20, 21, 22, 23, 24, 25, 26, 27, 28, 29)",0
4,0,21,,1
...,...,...,...,...
5826501,41628,20448,,0
5826502,41628,20896,,1
5826503,41628,21506,,1
5826504,41628,22882,,0


In [8]:
# Keep only rows that have impressions, then emit one row per impressed item
exploded = (
    interactions_and_impressions
    .loc[mask_with_impressions]
    .explode(column="impressions")
)
exploded

Unnamed: 0,user_id,item_id,impressions,data
0,0,11,0,1
0,0,11,1,1
0,0,11,2,1
0,0,11,3,1
0,0,11,4,1
...,...,...,...,...
5443853,36018,65,2341,0
5443853,36018,65,361,0
5443853,36018,65,1300,0
5443853,36018,65,1503,0


In [11]:
# Attach the exploded impressions to each split through the (user, item) pair
exploded_train = interactions_train.merge(
    exploded, how="inner", on=["user_id", "item_id"]
)
exploded_val = interactions_val.merge(
    exploded, how="inner", on=["user_id", "item_id"]
)

# One shape per line: full set, train, validation
print(exploded.shape, exploded_train.shape, exploded_val.shape, sep="\n")

(2914990, 4)
(2484632, 9)
(430358, 9)


In [12]:
def generate_impressions(exploded_df):
    """Count how many rows pair each user with each impressed item.

    Parameters
    ----------
    exploded_df : pandas.DataFrame
        Frame with at least the columns ``user_id`` and ``impressions``,
        one row per (user, impressed item) occurrence. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (user, impressed item) pair with the columns
        ``user_id``, ``item_id`` (the former ``impressions`` value) and
        ``impressions_count`` (number of rows in that group).
    """
    # size() counts the rows of each group directly, so no helper column of
    # ones has to be written into the caller's frame.
    impressions_count = (
        exploded_df.groupby(["user_id", "impressions"])
        .size()
        .reset_index(name="impressions_count")
    )
    return impressions_count.rename(columns={"impressions": "item_id"})

# Per-user impression counts for the full set and for each split
impressions, impressions_train, impressions_val = map(
    generate_impressions,
    (exploded, exploded_train, exploded_val),
)

In [16]:
# Persist the impression counts; to_csv's default also writes the index as
# the first (unnamed) column
impressions.to_csv(path_or_buf="data/processed/impressions.csv")
impressions_train.to_csv(
    path_or_buf="data/processed/splitted/impressions_train.csv"
)
impressions_val.to_csv(
    path_or_buf="data/processed/splitted/impressions_val.csv"
)

In [80]:
# Remove impressions of seen items (not needed?)
# NOTE(review): `impressions_count` and `view_details_count` are not defined
# in any cell visible above - confirm where they come from before relying on
# this cell after a kernel restart.

# Only the key pairs matter for the anti-join; de-duplicating them guarantees
# the left merge yields exactly one row per `impressions_count` row.
seen_pairs = view_details_count[["user_id", "item_id"]].drop_duplicates()
merged = pd.merge(
    impressions_count,
    seen_pairs,
    on=["user_id", "item_id"],
    how="left",
    indicator="exists",
)

# The merge result carries a fresh RangeIndex, so the mask is applied by
# position instead of by label; this stays correct even when
# `impressions_count` does not have a default index.
unseen_mask = (merged["exists"] != "both").to_numpy()

# reset_index returns a new frame, so its result has to be assigned;
# drop=True avoids keeping the old labels as an extra column.
impressions_count = impressions_count.loc[unseen_mask].reset_index(drop=True)
impressions_count.head(30)

Unnamed: 0,user_id,item_id,impressions_count
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
5,0,5,1
6,0,6,1
7,0,7,1
8,0,8,1
9,0,9,1
